model pass1 pass@count win_rate count SE(A) SE_x(A) SE_pred(A)
qwen3-32b 78.5 89 40.4 11 3.2 2.7 1.8
qwen3-14b 77.8 86.6 39.9 12 3.2 2.9 1.4
qwen2.5-coder-14b-instruct 76 88.4 39 12 3.3 2.7 2
google_gemma_3_27b_it 75.6 80.5 37.7 12 3.4 3.2 1.1
qwen2.5-coder-32b-instruct 75.1 86.6 38.6 11 3.4 2.7 2
google_gemma_3_12b_it 73.2 77.4 35.9 11 3.5 3.3 1.1
qwen3-8b 69.9 87.2 35.4 12 3.6 2.9 2.1
qwen3-4b 69.7 82.9 34 12 3.6 3.2 1.6
google_gemma_2_27b_it 67.1 75 31.7 10 3.7 3.4 1.5
mistralai_mixtral_8x22b_instruct_v0.1 65.6 83.5 31.1 11 3.7 3 2.1
qwen2-math-72b-instruct 62.1 82.3 29.5 11 3.8 2.9 2.4
deepseek_r1_distill_qwen_32b 61 87.8 30.9 11 3.8 2.3 3
google_gemma_3_4b_it 60.6 68.9 28.2 13 3.8 3.6 1.3
llama-3.1-8B-instruct 57.3 57.3 26 15 3.9 3.9 0
google_gemma_2_9b_it 55.9 65.9 25.2 11 3.9 3.6 1.4
deepseek_r1_distill_qwen_14b 53 86 26.7 11 3.9 2.3 3.1
qwen3-1.7b 50.4 73.8 22.6 12 3.9 3.3 2
qwen2-7b-instruct 50 84.8 22.9 11 3.9 2.5 3
deepseek_r1_distill_llama_70b 49.6 86 24.9 11 3.9 2.2 3.2
qwen2-72b-instruct 48.2 84.1 23.3 11 3.9 2.2 3.2
qwen2.5-coder-7b-instruct 48.1 86.6 22.5 10 3.9 2.2 3.3
google_codegemma_1.1_7b_it 47.4 70.1 20 13 3.9 3.1 2.3
qwen1.5-14b-chat 45.1 75.6 19.1 12 3.9 3 2.5
qwen2.5-coder-3b-instruct 43.5 81.7 19.7 12 3.9 2.3 3.1
deepseek_v2_lite_chat 42.8 73.8 18 11 3.9 2.8 2.7
qwen1.5-32b-chat 40.9 65.2 17.5 11 3.8 3.1 2.2
mistralai_ministral_8b_instruct_2410 39.7 80.5 17.5 11 3.8 2.3 3.1
llama-3.2-3B-instruct 37.8 37.8 14.7 17 3.8 3.8 0
qwen1.5-72b-chat 37.7 62.2 15.6 11 3.8 2.9 2.4
deepseek_r1_distill_llama_8b 37 76.8 16.6 13 3.8 2.4 2.9
google_gemma_3_1b_it 36 43.3 14.5 13 3.7 3.5 1.3
mistralai_mathstral_7b_v0.1 35.9 74.4 14.9 11 3.7 2.3 2.9
qwen2.5-coder-1.5b-instruct 34.8 76.8 14.4 11 3.7 2.3 2.9
qwen1.5-7b-chat 34.1 65.2 13.4 12 3.7 2.9 2.3
mistralai_mistral_7b_instruct_v0.3 31.6 59.1 11.8 11 3.6 2.8 2.3
mistralai_mixtral_8x7b_instruct_v0.1 31.6 54.9 13 12 3.6 2.8 2.3
qwen2.5-coder-0.5b-instruct 31 68.3 12.7 13 3.6 2.3 2.8
qwen2-math-7b-instruct 29.6 56.1 12 6 3.6 2.5 2.5
deepseek_r1_distill_qwen_7b 26.9 71.3 11.9 11 3.5 2 2.9
llama-3.2-1B-instruct 25.6 25.6 9.17 12 3.4 3.4 0
mistralai_mistral_7b_instruct_v0.1 22.4 51.8 7.7 11 3.3 2.3 2.3
qwen3-0.6b 21.4 48.2 7.64 13 3.2 2.3 2.3
google_gemma_7b_it 20.3 40.2 6.93 13 3.1 2.6 1.8
qwen2-1.5b-instruct 14.1 57.9 5.9 13 2.7 1.2 2.4
google_gemma_2b_it 13.9 26.2 4.02 13 2.7 2.3 1.4
qwen2-0.5b-instruct 7.41 30.5 2.27 13 2 1.1 1.7
mistralai_mistral_7b_instruct_v0.2 6.95 28.7 2.49 10 2 0.98 1.7
qwen1.5-1.8b-chat 4.99 22.6 1.51 11 1.7 0.88 1.5
qwen2-math-1.5b-instruct 3.35 8.54 1.35 4 1.4 0.89 1.1
deepseek_r1_distill_qwen_1.5b 2.58 19.5 0.799 13 1.2 0.42 1.2
qwen1.5-0.5b-chat 1.92 7.93 0.349 13 1.1 0.77 0.74