model pass1 pass@count win_rate count SE(A) SE_x(A) SE_pred(A)
qwen3-32b 70 85.5 36.4 9 0.42 0.35 0.23
qwen3-14b 67.3 81.6 34.4 10 0.43 0.37 0.21
llama-3.1-70B-instruct 63.9 63.9 32.3 12 0.44 0.44 0
qwen3-8b 62.6 78.2 31 10 0.44 0.38 0.22
qwen2-72b-instruct 62.2 78.1 31 4 0.44 0.35 0.27
qwen2.5-coder-32b-instruct 60.5 80 29.5 8 0.45 0.36 0.26
google_gemma_3_12b_it 58.7 77.6 28.2 11 0.45 0.38 0.23
deepseek_r1_distill_llama_70b 57.7 75.8 27.6 9 0.45 0.37 0.25
qwen3-4b 57.6 74.4 27.8 11 0.45 0.39 0.23
mistralai_mixtral_8x22b_instruct_v0.1 51.8 82.2 24 9 0.46 0.34 0.31
deepseek_r1_distill_qwen_32b 51.5 73.5 23.6 8 0.46 0.36 0.28
qwen2-math-72b-instruct 51.1 79.8 23.8 8 0.46 0.34 0.31
deepseek_r1_distill_qwen_14b 48.1 75.5 21.5 12 0.46 0.35 0.29
qwen1.5-72b-chat 47.5 47.5 21.7 1 0.46 NaN NaN
qwen2.5-coder-14b-instruct 47.4 81.9 21.3 10 0.46 0.33 0.32
qwen1.5-32b-chat 46.1 75 20.6 8 0.45 0.34 0.31
llama-3.1-8B-instruct 45 45 19.9 15 0.45 0.45 0
qwen2-7b-instruct 44.2 78.1 19.5 12 0.45 0.34 0.3
qwen3-1.7b 42.9 68 19.2 12 0.45 0.37 0.26
mistralai_mixtral_8x7b_instruct_v0.1 42.3 75.9 18.8 10 0.45 0.33 0.31
google_gemma_3_4b_it 41.5 66.2 17.9 13 0.45 0.38 0.25
mistralai_ministral_8b_instruct_2410 39.2 78.4 16.7 11 0.45 0.31 0.32
qwen1.5-14b-chat 37.9 69.6 16 10 0.44 0.33 0.29
qwen2.5-coder-7b-instruct 37.7 78.4 16 12 0.44 0.3 0.32
mistralai_mathstral_7b_v0.1 36.7 79.2 15.7 12 0.44 0.29 0.33
deepseek_r1_distill_llama_8b 36.1 68.5 14.7 12 0.44 0.32 0.3
deepseek_r1_distill_qwen_7b 36.1 68.1 15.1 12 0.44 0.33 0.29
llama-3.2-3B-instruct 35 35 15 19 0.43 0.43 0
mistralai_mistral_7b_instruct_v0.3 33.7 71.8 14.3 12 0.43 0.31 0.3
qwen2-math-7b-instruct 33.1 72.4 14.6 12 0.43 0.3 0.31
qwen2.5-coder-3b-instruct 29.4 74.6 12.1 12 0.42 0.27 0.31
mistralai_mistral_7b_instruct_v0.2 29.1 65.4 11.9 12 0.41 0.3 0.28
deepseek_v2_lite_chat 29 67.5 11.9 10 0.41 0.28 0.3
qwen1.5-7b-chat 25.1 62.5 10.1 10 0.4 0.26 0.3
qwen2-math-1.5b-instruct 25.1 69.3 11.5 12 0.4 0.25 0.31
qwen3-0.6b 23.8 54 10.7 13 0.39 0.29 0.26
mistralai_mistral_7b_instruct_v0.1 23.8 67.2 9.96 12 0.39 0.25 0.3
llama-3.2-1B-instruct 21.5 21.5 9.44 21 0.37 0.37 0
deepseek_r1_distill_qwen_1.5b 20.5 59.4 8.21 12 0.37 0.22 0.29
qwen2.5-coder-1.5b-instruct 20.3 64.8 8.48 12 0.37 0.22 0.29
qwen2-1.5b-instruct 17.2 63 7.59 12 0.34 0.18 0.29
qwen1.5-1.8b-chat 12.4 45.6 5.71 10 0.3 0.16 0.25
qwen2-0.5b-instruct 11.7 55.2 6.16 13 0.29 0.13 0.26
qwen2.5-coder-0.5b-instruct 10.4 53.1 5.79 13 0.28 0.1 0.26
qwen1.5-0.5b-chat 10.3 57 5.83 13 0.28 0.098 0.26