model pass1 pass@count win_rate count SE(A) SE_x(A) SE_pred(A)
llama-3.1-70B-instruct 83.4 83.4 37 12 0.46 0.46 0
qwen3-14b 82.5 90.4 36.2 11 0.47 0.42 0.22
google_gemma_3_12b_it 81.8 92.6 36 12 0.48 0.4 0.26
qwen2-72b-instruct 79 92.4 34.2 10 0.5 0.4 0.31
qwen2-math-72b-instruct 78.5 92.6 34 10 0.51 0.4 0.32
qwen2.5-coder-32b-instruct 78.3 93.4 33.9 10 0.51 0.39 0.33
mistralai_mixtral_8x22b_instruct_v0.1 77.4 91.6 33 9 0.52 0.4 0.32
qwen3-32b 76.2 90.5 33.1 9 0.53 0.42 0.32
qwen3-4b 75.2 86 31.9 12 0.54 0.47 0.25
qwen3-8b 72.9 85.8 30.6 11 0.55 0.48 0.27
qwen2.5-coder-14b-instruct 71.4 92.2 29.9 11 0.56 0.41 0.38
qwen1.5-72b-chat 68.3 88.2 28.2 10 0.58 0.45 0.37
llama-3.1-8B-instruct 66.3 66.3 27.1 15 0.59 0.59 0
qwen1.5-32b-chat 66.2 87.5 26.8 9 0.59 0.44 0.39
google_gemma_3_4b_it 64.4 84.3 27.1 11 0.59 0.49 0.34
mistralai_mathstral_7b_v0.1 62.2 90.9 24.8 12 0.6 0.42 0.43
mistralai_mixtral_8x7b_instruct_v0.1 61.9 83.8 24.6 11 0.6 0.48 0.37
mistralai_ministral_8b_instruct_2410 60.2 90.2 23.8 11 0.61 0.41 0.45
qwen2.5-coder-7b-instruct 59.9 90.8 23.6 12 0.61 0.41 0.45
qwen2-math-7b-instruct 56.9 83.1 22.1 11 0.61 0.47 0.39
llama-3.2-3B-instruct 56.8 56.8 22 18 0.61 0.61 0
qwen2.5-coder-3b-instruct 54.2 86.9 20.9 12 0.62 0.43 0.44
mistralai_mistral_7b_instruct_v0.3 51.3 81.8 19.1 12 0.62 0.47 0.41
qwen3-1.7b 49.3 69.8 18.3 12 0.62 0.53 0.32
deepseek_v2_lite_chat 46.1 77 17.5 9 0.62 0.44 0.43
qwen1.5-14b-chat 42.3 82 15.9 11 0.61 0.39 0.47
mistralai_mistral_7b_instruct_v0.1 42.2 80.5 15.4 12 0.61 0.42 0.45
qwen2-math-1.5b-instruct 41.5 76.6 15.5 12 0.61 0.44 0.42
mistralai_mistral_7b_instruct_v0.2 40.5 75.7 14.3 12 0.61 0.45 0.41
qwen2-7b-instruct 38.5 80.5 13.9 12 0.6 0.38 0.46
qwen2.5-coder-1.5b-instruct 35.9 79.1 13.6 12 0.59 0.36 0.47
deepseek_r1_distill_llama_70b 35.6 71 12.9 9 0.59 0.37 0.46
llama-3.2-1B-instruct 34.9 34.9 13.3 21 0.59 0.59 0
deepseek_r1_distill_qwen_7b 33.1 73.8 11.5 12 0.58 0.37 0.45
qwen3-0.6b 33.1 68.1 12.2 12 0.58 0.42 0.4
deepseek_r1_distill_llama_8b 29.1 73.7 10.1 12 0.56 0.31 0.47
qwen2.5-coder-0.5b-instruct 27 69.5 10.5 12 0.55 0.34 0.43
qwen1.5-7b-chat 26.6 70.9 9.37 11 0.55 0.31 0.45
deepseek_r1_distill_qwen_14b 23.4 62.1 7.79 12 0.52 0.3 0.43
qwen2-1.5b-instruct 22.4 72.8 8.29 12 0.52 0.25 0.45
qwen1.5-0.5b-chat 21.7 63.9 8.47 12 0.51 0.29 0.42
deepseek_r1_distill_qwen_32b 21.5 58.4 7.26 7 0.51 0.26 0.44
qwen1.5-1.8b-chat 19.8 58.9 7.08 11 0.49 0.29 0.4
qwen2-0.5b-instruct 18.2 61.9 7.01 12 0.48 0.25 0.41
deepseek_r1_distill_qwen_1.5b 14 53.1 4.81 12 0.43 0.21 0.38