model pass1 pass@count win_rate count SE(A) SE_x(A) SE_pred(A)
deepseek_r1_distill_qwen_32b 37.9 73.3 32.4 11 8.9 6.4 6.1
deepseek_r1_distill_llama_70b 34.2 66.7 29.1 11 8.7 6.3 5.9
deepseek_r1_distill_qwen_7b 28.5 46.7 23.8 11 8.2 6.5 5.1
deepseek_r1_distill_qwen_14b 27.6 43.3 22.7 11 8.2 6.9 4.4
google_gemma_3_27b_it 25.6 56.7 20.8 12 8 5.4 5.9
qwen3-14b 24.4 33.3 19.5 12 7.8 6.7 4
qwen3-32b 22.7 43.3 18.1 11 7.7 5.9 4.9
llama-3.1-70B-instruct 20 20 16.6 13 7.3 7.3 0
deepseek_r1_distill_llama_8b 19 40 15.3 13 7.2 5.2 4.9
qwen2-math-72b-instruct 18.3 36.7 14.4 6 7.1 5.1 4.9
google_gemma_3_12b_it 18.2 46.7 14.6 11 7 5.2 4.7
qwen3-8b 17.5 50 14.2 12 6.9 5.3 4.4
qwen2.5-coder-32b-instruct 13 30 9.68 11 6.1 4.6 4
qwen3-4b 11.9 23.3 8.86 12 5.9 4.5 3.9
deepseek_r1_distill_qwen_1.5b 11.5 23.3 8.78 13 5.8 4.7 3.5
qwen2.5-coder-14b-instruct 10.8 20 7.77 12 5.7 4.5 3.4
llama-3.2-3B-instruct 10 10 7.48 18 5.5 5.5 0
qwen3-1.7b 9.44 23.3 7.16 12 5.3 3.9 3.7
qwen2-math-7b-instruct 8.33 13.3 6.38 6 5 4.3 2.7
qwen2-math-1.5b-instruct 7.5 13.3 5.05 4 4.8 3.8 2.9
google_gemma_3_4b_it 7.18 23.3 5.57 13 4.7 2.5 4
qwen2-72b-instruct 6.06 20 4.19 11 4.4 2.8 3.3
qwen2.5-coder-7b-instruct 4.55 10 3.19 11 3.8 2.4 2.9
llama-3.1-8B-instruct 3.33 3.33 2.36 15 3.3 3.3 0
llama-3.2-1B-instruct 3.33 3.33 2.29 21 3.3 3.3 0
qwen2-7b-instruct 3.03 6.67 2.23 11 3.1 2 2.4
google_gemma_2_27b_it 3 13.3 2 10 3.1 1.8 2.5
qwen2.5-coder-3b-instruct 2.22 6.67 1.53 12 2.7 1.4 2.3
google_gemma_3_1b_it 1.54 3.33 1.04 13 2.2 1.4 1.7
mistralai_mathstral_7b_v0.1 1.52 6.67 1.12 11 2.2 0.86 2.1
mistralai_mixtral_8x22b_instruct_v0.1 1.52 13.3 1.06 11 2.2 0.35 2.2
deepseek_v2_lite_chat 1.21 3.33 0.818 11 2 1.1 1.7
qwen3-0.6b 1.03 3.33 0.691 13 1.8 0.91 1.6
qwen1.5-72b-chat 0.606 3.33 0.363 11 1.4 0.44 1.3
qwen1.5-32b-chat 0.606 6.67 0.384 11 1.4 0 1.4
google_gemma_2_9b_it 0.303 3.33 0.203 11 1 0 1
mistralai_mistral_7b_instruct_v0.2 0.303 3.33 0.252 11 1 0 1
mistralai_mixtral_8x7b_instruct_v0.1 0.278 3.33 0.231 12 0.96 0 0.96
google_codegemma_1.1_7b_it 0 0 0 13 0 0 0
google_gemma_2b_it 0 0 0 13 0 0 0
google_gemma_7b_it 0 0 0 13 0 0 0
qwen2-1.5b-instruct 0 0 0 13 0 0 0
mistralai_mistral_7b_instruct_v0.3 0 0 0 11 0 0 0
qwen1.5-0.5b-chat 0 0 0 13 0 0 0
mistralai_mistral_7b_instruct_v0.1 0 0 0 11 0 0 0
qwen2-0.5b-instruct 0 0 0 13 0 0 0
qwen1.5-7b-chat 0 0 0 12 0 0 0
qwen1.5-1.8b-chat 0 0 0 11 0 0 0
qwen1.5-14b-chat 0 0 0 12 0 0 0
mistralai_ministral_8b_instruct_2410 0 0 0 11 0 0 0
qwen2.5-coder-0.5b-instruct 0 0 0 13 0 0 0
qwen2.5-coder-1.5b-instruct 0 0 0 11 0 0 0