model pass1 pass@count win_rate count SE(A) SE_x(A) SE_pred(A)
qwen3-32b 82.1 90 31.3 10 0.37 0.33 0.18
qwen3-14b 81.2 87.1 30.6 10 0.38 0.35 0.15
qwen3-8b 78.9 87.4 29.1 10 0.4 0.35 0.18
llama-3.1-70B-instruct 78.3 78.3 29 12 0.4 0.4 0
deepseek_r1_distill_llama_70b 78.1 90.3 29.1 9 0.4 0.33 0.23
qwen2-72b-instruct 76.5 88.8 27.7 7 0.41 0.34 0.24
google_gemma_3_27b_it 75.1 81.4 26.5 10 0.42 0.39 0.15
deepseek_r1_distill_qwen_32b 75.1 91.1 28.6 10 0.42 0.31 0.29
qwen2-math-72b-instruct 74.9 88.8 26.6 10 0.42 0.35 0.24
qwen2.5-coder-32b-instruct 74.7 86.5 26.3 10 0.42 0.37 0.21
deepseek_r1_distill_qwen_14b 74.5 90.9 27.5 12 0.42 0.32 0.28
google_gemma_2_27b_it 74.3 86.5 26.3 9 0.43 0.36 0.22
google_gemma_3_12b_it 74.2 83.2 25.9 11 0.43 0.39 0.18
qwen3-4b 73.6 82.7 25.7 12 0.43 0.39 0.18
deepseek_r1_distill_qwen_7b 73.3 90.6 26.5 12 0.43 0.33 0.28
qwen2.5-coder-14b-instruct 70.9 88.7 24.2 10 0.44 0.35 0.27
google_gemma_2_9b_it 70.6 85.9 24.2 11 0.44 0.37 0.24
mistralai_mixtral_8x22b_instruct_v0.1 69.6 90.8 24.2 10 0.45 0.33 0.3
qwen1.5-32b-chat 67.1 88.9 22.7 10 0.46 0.34 0.3
qwen1.5-72b-chat 66.9 85.6 22.5 7 0.46 0.36 0.28
deepseek_r1_distill_llama_8b 66.8 90.6 23.5 12 0.46 0.32 0.33
qwen2-math-7b-instruct 65.5 86.9 21.4 12 0.46 0.36 0.28
mistralai_ministral_8b_instruct_2410 64.6 87.6 21 11 0.47 0.36 0.3
google_gemma_3_4b_it 64.3 79.2 20.4 13 0.47 0.41 0.22
qwen2-7b-instruct 64.2 88.8 20.9 12 0.47 0.35 0.31
llama-3.1-8B-instruct 64.1 64.1 21.2 16 0.47 0.47 0
qwen2-math-1.5b-instruct 61.5 84.2 19.2 12 0.47 0.38 0.28
qwen2.5-coder-7b-instruct 61.4 88.2 19.6 12 0.47 0.35 0.32
qwen3-1.7b 58.8 76.9 18.4 12 0.48 0.41 0.25
mistralai_mathstral_7b_v0.1 57.7 87.3 17.8 12 0.48 0.35 0.33
deepseek_r1_distill_qwen_1.5b 57 88.3 19.2 12 0.48 0.32 0.36
qwen1.5-14b-chat 55.7 82.9 16.9 10 0.48 0.37 0.32
llama-3.2-3B-instruct 55.3 55.3 16.9 18 0.48 0.48 0
deepseek_v2_lite_chat 49.9 80.6 14.5 10 0.49 0.36 0.33
qwen2.5-coder-3b-instruct 49.3 82.5 14.1 12 0.49 0.35 0.34
mistralai_mixtral_8x7b_instruct_v0.1 49.2 83.1 14.8 10 0.49 0.34 0.35
qwen1.5-7b-chat 43 79.1 12.2 11 0.48 0.33 0.35
google_codegemma_1.1_7b_it 37.5 71.4 9.58 13 0.47 0.35 0.32
mistralai_mistral_7b_instruct_v0.3 36.3 76.7 9.99 12 0.47 0.31 0.35
qwen2.5-coder-1.5b-instruct 34.9 74 8.96 12 0.46 0.32 0.34
qwen3-0.6b 30 63.1 7.67 13 0.45 0.33 0.3
mistralai_mistral_7b_instruct_v0.2 29.8 68.1 8.21 12 0.45 0.3 0.32
google_gemma_3_1b_it 29.6 56.5 7.25 12 0.44 0.35 0.27
qwen2-1.5b-instruct 25.8 70.4 6.56 12 0.43 0.26 0.34
llama-3.2-1B-instruct 24.3 24.3 5.75 22 0.42 0.42 0
mistralai_mistral_7b_instruct_v0.1 23.7 64 5.77 12 0.41 0.26 0.32
google_gemma_7b_it 19.1 47.9 4.54 12 0.38 0.28 0.27
qwen1.5-1.8b-chat 15.1 53.6 4.3 11 0.35 0.19 0.3
qwen2-0.5b-instruct 11.6 48.6 2.76 12 0.31 0.16 0.27
qwen2.5-coder-0.5b-instruct 9.04 41.8 2.19 13 0.28 0.15 0.24
google_gemma_2b_it 6.26 23.9 1.45 12 0.24 0.15 0.18
qwen1.5-0.5b-chat 4.29 27.9 1.22 13 0.2 0.083 0.18