model pass1 pass@count win_rate count SE(A) SE_x(A) SE_pred(A)
google_gemma_3_12b_it 27.6 56.9 22.9 11 2 1.4 1.4
qwen2-72b-instruct 25.4 59 21 10 1.9 1.3 1.4
qwen3-32b 24.9 58.6 20.6 10 1.9 1.3 1.4
qwen3-14b 24.6 53.4 20.1 12 1.9 1.4 1.3
qwen2.5-coder-32b-instruct 23.7 51.1 19.2 10 1.9 1.3 1.3
qwen3-4b 21.7 49.9 17.5 12 1.8 1.3 1.3
qwen3-8b 17.4 46.4 14 12 1.7 1.2 1.2
qwen2.5-coder-14b-instruct 16.8 48.5 13.3 12 1.6 1 1.3
qwen1.5-32b-chat 13.5 44.1 10.9 11 1.5 0.93 1.2
qwen1.5-72b-chat 13.4 41.4 10.7 10 1.5 0.92 1.2
google_gemma_7b_it 13 33.4 10.9 13 1.5 1.1 1
google_gemma_2_27b_it 12.7 37.7 10.4 10 1.5 0.99 1.1
qwen2-math-72b-instruct 11.4 22.9 9.05 10 1.4 1.1 0.85
qwen2.5-coder-7b-instruct 11.2 39.8 8.76 10 1.4 0.75 1.2
google_gemma_3_4b_it 11.1 38.1 8.85 13 1.4 0.86 1.1
llama-3.1-8B-instruct 10.5 10.5 8.59 15 1.4 1.4 0
google_gemma_2_9b_it 9.29 29.5 7.51 11 1.3 0.87 0.94
qwen1.5-14b-chat 9.17 38.8 7.42 12 1.3 0.69 1.1
mistralai_mixtral_8x22b_instruct_v0.1 8.93 37.5 7.07 11 1.3 0.65 1.1
qwen2-1.5b-instruct 7.57 38.4 6.5 13 1.2 0.59 1
google_codegemma_1.1_7b_it 7.45 38.3 6 13 1.2 0.5 1
qwen2-7b-instruct 7.13 31.1 5.52 11 1.1 0.59 0.97
deepseek_r1_distill_qwen_32b 6.64 19.8 4.9 10 1.1 0.73 0.82
mistralai_mistral_7b_instruct_v0.3 6.62 28.7 5.33 11 1.1 0.55 0.95
qwen3-1.7b 6.62 22.7 5.05 12 1.1 0.74 0.81
qwen1.5-7b-chat 6.57 34.4 5.29 12 1.1 0.5 0.97
llama-3.2-3B-instruct 6.41 6.41 5.07 17 1.1 1.1 0
deepseek_v2_lite_chat 6.41 31.5 5.21 11 1.1 0.47 0.97
qwen2-math-7b-instruct 6.15 14 4.61 6 1.1 0.76 0.74
deepseek_r1_distill_qwen_14b 6.02 19.2 4.48 11 1 0.68 0.79
mistralai_ministral_8b_instruct_2410 5.63 31.1 4.41 11 1 0.45 0.91
mistralai_mistral_7b_instruct_v0.1 5.53 28.2 4.49 11 1 0.45 0.9
deepseek_r1_distill_llama_70b 5.48 17.3 3.98 10 1 0.66 0.75
mistralai_mistral_7b_instruct_v0.2 5.32 23.5 4.33 10 0.99 0.5 0.85
qwen2.5-coder-3b-instruct 5.16 29.5 3.92 12 0.97 0.42 0.88
google_gemma_2b_it 5.11 13.2 4.1 13 0.97 0.74 0.62
mistralai_mathstral_7b_v0.1 5.07 28 3.98 11 0.97 0.38 0.89
qwen2-0.5b-instruct 4.9 27 4.03 13 0.95 0.41 0.86
qwen1.5-1.8b-chat 4.31 22.3 3.5 11 0.89 0.35 0.82
llama-3.2-1B-instruct 4.27 4.27 3.47 12 0.89 0.89 0
google_gemma_3_1b_it 4.22 19.2 3.15 12 0.89 0.51 0.72
deepseek_r1_distill_qwen_7b 4.17 12.6 2.95 11 0.88 0.59 0.66
qwen2-math-1.5b-instruct 4.03 9.13 2.95 4 0.87 0.54 0.67
qwen1.5-0.5b-chat 3.91 20.6 3.26 13 0.85 0.37 0.77
deepseek_r1_distill_llama_8b 3.54 17.1 2.57 12 0.81 0.43 0.69
qwen3-0.6b 2.66 15.5 1.94 13 0.71 0.37 0.61
deepseek_r1_distill_qwen_1.5b 2.25 10.3 1.57 12 0.65 0.35 0.55
qwen2.5-coder-1.5b-instruct 1.8 11.8 1.33 11 0.59 0.23 0.54
qwen2.5-coder-0.5b-instruct 1.69 13.2 1.33 13 0.57 0.17 0.54