model pass1 pass@count win_rate count SE(A) SE_x(A) SE_pred(A)
qwen3-14b 77.6 91.5 38 1.1e+03 3.3 2.9 1.4
qwen3-32b 77.3 93.3 37.7 1.1e+03 3.3 2.7 1.8
google_gemma_3_27b_it 75.7 78.7 36.2 7 3.3 3.2 0.93
qwen2.5-coder-32b-instruct 75 90.9 36.8 1.1e+03 3.4 2.7 2.1
qwen2.5-coder-14b-instruct 74.8 93.3 36.6 1.1e+03 3.4 2.6 2.1
google_gemma_3_12b_it 72.9 85.4 34 1.1e+03 3.5 3.3 1.1
llama-3.1-70B-instruct 70.7 88.4 33 1.1e+03 3.6 2.9 2
qwen3-8b 70.7 92.1 34.1 1.1e+03 3.6 2.9 2.1
qwen3-4b 69.7 89.6 32.3 1.1e+03 3.6 3.2 1.7
google_gemma_2_27b_it 66.4 86.6 29.8 1.1e+03 3.7 3.4 1.5
mistralai_mixtral_8x22b_instruct_v0.1 65.9 92.7 29.8 8.9e+02 3.7 2.9 2.3
deepseek_r1_distill_qwen_32b 62.7 92.7 30.5 1.1e+03 3.8 2.3 3
qwen2-math-72b-instruct 61.7 89 27.8 1.4e+02 3.8 2.8 2.5
google_gemma_3_4b_it 61 77.4 27.1 1.1e+03 3.8 3.6 1.4
google_gemma_2_9b_it 55.2 79.3 23.4 1.1e+03 3.9 3.6 1.5
llama-3.1-8B-instruct 54.8 90.2 22.9 1.1e+03 3.9 3.1 2.4
deepseek_r1_distill_qwen_14b 51.9 92.1 24.7 1.1e+03 3.9 2.3 3.1
qwen2-7b-instruct 50.1 92.7 21.6 1.1e+03 3.9 2.5 3
deepseek_r1_distill_llama_70b 49.9 93.9 23.6 1.1e+03 3.9 2.3 3.2
qwen3-1.7b 49.8 86.6 21.2 1.1e+03 3.9 3.3 2.1
qwen2-72b-instruct 48.4 93.3 22.1 1.1e+03 3.9 2.4 3.1
google_codegemma_1.1_7b_it 47.4 84.8 18.7 1.1e+03 3.9 3.2 2.3
qwen2.5-coder-7b-instruct 47.4 93.9 20.8 1.1e+03 3.9 2.3 3.2
qwen1.5-14b-chat 44.7 89.6 17.7 1.1e+03 3.9 3 2.4
qwen2.5-coder-3b-instruct 44.3 92.1 19.2 1.1e+03 3.9 2.3 3.1
llama-3.2-3B-instruct 44.2 87.2 17.4 1.1e+03 3.9 2.9 2.5
deepseek_v2_lite_chat 42.2 90.2 16.8 1.1e+03 3.9 2.8 2.7
qwen1.5-32b-chat 41.6 83.5 16.6 1.1e+03 3.8 3.2 2.2
mistralai_ministral_8b_instruct_2410 40.1 92.1 16.5 1.1e+03 3.8 2.4 3
deepseek_r1_distill_llama_8b 37.9 91.5 16.1 1.1e+03 3.8 2.3 3
qwen1.5-72b-chat 36.9 84.8 14.4 1.1e+03 3.8 3 2.3
mistralai_mathstral_7b_v0.1 36.3 92.1 14 1.1e+03 3.8 2.4 2.9
google_gemma_3_1b_it 36.1 56.1 13.8 1.1e+03 3.8 3.5 1.3
qwen2.5-coder-1.5b-instruct 35.3 92.1 13.8 1.1e+03 3.7 2.3 3
qwen1.5-7b-chat 33.2 84.1 12.2 1.1e+03 3.7 2.8 2.4
mistralai_mistral_7b_instruct_v0.3 31.7 87.2 11 1.1e+03 3.6 2.8 2.3
qwen2.5-coder-0.5b-instruct 30.7 88.4 11.8 1.1e+03 3.6 2.2 2.8
deepseek_r1_distill_qwen_7b 28.3 92.1 11.7 1.1e+03 3.5 2 2.9
llama-3.2-1B-instruct 25.9 76.8 8.72 1.1e+03 3.4 2.5 2.3
mistralai_mistral_7b_instruct_v0.1 22.4 84.1 7.24 1.1e+03 3.3 2.3 2.3
qwen3-0.6b 21.2 82.3 6.99 1.1e+03 3.2 2.3 2.2
google_gemma_7b_it 20.4 65.9 6.5 1.1e+03 3.1 2.6 1.8
qwen2-1.5b-instruct 14 85.4 5.2 1.1e+03 2.7 1.2 2.4
google_gemma_2b_it 13.7 45.7 3.6 1.1e+03 2.7 2.3 1.4
mistralai_mistral_7b_instruct_v0.2 7.76 69.5 2.45 1.1e+03 2.1 1.2 1.7
qwen2-0.5b-instruct 7.41 67.7 2.12 1.1e+03 2 1.1 1.7
qwen1.5-1.8b-chat 5.42 62.2 1.52 1.1e+03 1.8 0.93 1.5
deepseek_r1_distill_qwen_1.5b 2.32 71.3 0.722 1.1e+03 1.2 0.38 1.1
qwen1.5-0.5b-chat 1.79 23.8 0.302 1e+03 1 0.64 0.81