model pass1 pass@count win_rate count SE(A) SE_x(A) SE_pred(A)
qwen2.5-coder-32b-instruct 82.4 92.2 44.4 10 1.3 1.1 0.72
qwen3-14b 78.7 92.4 41.3 12 1.4 1.2 0.87
qwen2.5-coder-14b-instruct 76.5 89.5 39.5 12 1.5 1.3 0.79
google_gemma_3_27b_it 76 87 38.9 9 1.5 1.3 0.77
llama-3.1-70B-instruct 70.2 70.2 34.6 13 1.6 1.6 0
google_gemma_3_12b_it 69.7 83.6 34.3 11 1.6 1.4 0.85
qwen3-32b 69.5 84 34.4 11 1.6 1.4 0.89
deepseek_r1_distill_llama_70b 66.5 87 32.3 10 1.7 1.3 1.1
qwen2-72b-instruct 64.1 85.5 30.5 10 1.7 1.3 1.1
google_gemma_2_27b_it 62.7 81.8 29.2 10 1.7 1.4 0.97
mistralai_mixtral_8x22b_instruct_v0.1 62.3 83 28.8 11 1.7 1.4 1
qwen2-math-72b-instruct 59.5 82 27.3 10 1.7 1.3 1.1
qwen3-4b 57.4 77.5 26.1 12 1.7 1.4 1
qwen2.5-coder-7b-instruct 56.2 80.6 25.1 11 1.8 1.4 1.1
google_gemma_3_4b_it 54.5 74.5 24 13 1.8 1.5 0.96
deepseek_r1_distill_qwen_14b 54.5 88.2 26.7 11 1.8 1 1.4
deepseek_r1_distill_llama_8b 52.3 78.6 22.8 12 1.8 1.3 1.2
qwen2.5-coder-3b-instruct 50.1 79.1 21.3 12 1.8 1.3 1.2
google_gemma_2_9b_it 48.6 69.6 19.9 12 1.8 1.5 0.98
qwen1.5-32b-chat 48.3 75.5 20.1 11 1.8 1.4 1.1
qwen1.5-72b-chat 46.6 75.6 19.6 10 1.8 1.3 1.2
mistralai_mixtral_8x7b_instruct_v0.1 45.7 73.6 18.5 11 1.8 1.3 1.1
llama-3.1-8B-instruct 44.2 44.2 17.9 15 1.8 1.8 0
mistralai_ministral_8b_instruct_2410 44.2 74.4 17.7 11 1.8 1.3 1.2
mistralai_mathstral_7b_v0.1 40.9 72.4 15.8 11 1.7 1.3 1.1
qwen2-7b-instruct 39.3 69.4 15.1 11 1.7 1.3 1.1
google_codegemma_1.1_7b_it 38.2 67.6 14.8 13 1.7 1.3 1.1
qwen1.5-14b-chat 37.8 68.8 14.3 12 1.7 1.3 1.1
mistralai_mistral_7b_instruct_v0.3 34.2 62.4 12.4 11 1.7 1.3 1.1
qwen3-1.7b 33.9 74.4 14.8 12 1.7 0.94 1.4
qwen3-0.6b 31.4 64.5 12 13 1.6 1.1 1.2
deepseek_r1_distill_qwen_32b 30.5 75.1 13.4 10 1.6 0.83 1.4
llama-3.2-3B-instruct 30.1 30.1 10.5 18 1.6 1.6 0
qwen3-8b 29.5 62.5 11.3 12 1.6 1.1 1.2
qwen2.5-coder-1.5b-instruct 29 68.2 10.7 12 1.6 1 1.2
deepseek_v2_lite_chat 28.1 59.6 9.99 11 1.6 1.1 1.1
google_gemma_3_1b_it 27.5 52.6 10.3 12 1.6 1.2 1
qwen2-math-7b-instruct 26.3 59.1 9.53 12 1.6 1 1.2
qwen1.5-7b-chat 26.2 57.5 9.23 12 1.6 1.1 1.1
google_gemma_7b_it 25.5 47.1 9.26 12 1.5 1.2 0.95
mistralai_mistral_7b_instruct_v0.1 23.6 54.1 8.26 11 1.5 1 1.1
mistralai_mistral_7b_instruct_v0.2 22.7 59 8.24 11 1.5 0.87 1.2
qwen2-math-1.5b-instruct 21.6 51.4 7.64 12 1.5 1 1
qwen2.5-coder-0.5b-instruct 20.5 53.1 7.57 13 1.4 0.9 1.1
deepseek_r1_distill_qwen_7b 19.3 58.9 7.45 11 1.4 0.71 1.2
deepseek_r1_distill_qwen_1.5b 14.1 51.6 4.94 12 1.2 0.59 1.1
google_gemma_2b_it 12.5 37.9 4.61 13 1.2 0.73 0.92
qwen2-1.5b-instruct 12.5 43.4 4.12 12 1.2 0.64 0.98
llama-3.2-1B-instruct 11.4 11.4 3.77 21 1.1 1.1 0
qwen1.5-0.5b-chat 6.23 28.6 2.32 13 0.85 0.43 0.74
qwen2-0.5b-instruct 1.84 16.2 0.666 13 0.47 0.11 0.46
qwen1.5-1.8b-chat 1.17 10.4 0.378 12 0.38 0.1 0.37