model pass1 pass@count win_rate count SE(A) SE_x(A) SE_pred(A)
google_gemma_3_27b_it 94.2 96.5 28 11 0.65 0.59 0.27
qwen3-32b 93.9 97.7 28 11 0.66 0.54 0.38
llama-3.1-70B-instruct 93.8 93.8 28 11 0.66 0.66 0
qwen3-14b 93.5 96.5 27.7 11 0.68 0.6 0.32
deepseek_r1_distill_llama_70b 92.6 97.6 27.6 10 0.72 0.53 0.48
qwen2-math-72b-instruct 92.5 98.4 27.3 9 0.73 0.49 0.53
qwen3-8b 92.2 96.1 26.8 11 0.74 0.64 0.38
qwen2.5-coder-32b-instruct 92.1 97.5 27 9 0.74 0.56 0.48
google_gemma_3_12b_it 92.1 96.2 26.9 12 0.74 0.65 0.36
qwen2-72b-instruct 91.6 98.1 26.8 10 0.76 0.55 0.53
google_gemma_2_27b_it 90.2 95.8 25.6 9 0.82 0.69 0.45
qwen3-4b 89.2 93.4 25.3 11 0.85 0.78 0.35
google_gemma_2_9b_it 87.6 95.1 24.3 11 0.91 0.74 0.52
qwen2.5-coder-14b-instruct 86.4 98 24 11 0.94 0.63 0.7
deepseek_r1_distill_qwen_7b 86.4 97.3 24.2 11 0.94 0.65 0.68
deepseek_r1_distill_qwen_32b 85.9 98 25.1 10 0.96 0.57 0.77
mistralai_mixtral_8x22b_instruct_v0.1 84.8 98.1 23.3 11 0.99 0.67 0.73
deepseek_r1_distill_qwen_14b 84.7 96 24.1 11 0.99 0.7 0.7
qwen1.5-72b-chat 84.2 96.7 23 10 1 0.73 0.69
google_gemma_3_4b_it 83.4 93.4 22.5 13 1 0.86 0.56
qwen1.5-32b-chat 83.2 96.9 22.6 11 1 0.71 0.74
qwen2-math-7b-instruct 83 96.1 22.2 11 1 0.76 0.7
llama-3.1-8B-instruct 81 81 21.3 15 1.1 1.1 0.035
qwen2-math-1.5b-instruct 80.4 95.1 21.1 11 1.1 0.81 0.73
mistralai_ministral_8b_instruct_2410 79.8 96.1 20.8 12 1.1 0.78 0.79
deepseek_r1_distill_llama_8b 79.7 96.4 21.4 12 1.1 0.72 0.84
qwen2-7b-instruct 78 96.8 20.4 12 1.1 0.74 0.87
qwen2.5-coder-7b-instruct 77.1 96.7 20 12 1.2 0.77 0.87
mistralai_mathstral_7b_v0.1 74.9 95.9 18.9 11 1.2 0.78 0.9
llama-3.2-3B-instruct 73.4 73.4 18 17 1.2 1.2 0
qwen3-1.7b 73.4 87.3 18.1 11 1.2 1 0.64
qwen1.5-14b-chat 72.8 92.8 17.9 10 1.2 0.9 0.83
deepseek_r1_distill_qwen_1.5b 68.6 94.2 17.3 12 1.3 0.79 1
deepseek_v2_lite_chat 67.7 92.3 16 11 1.3 0.92 0.9
qwen2.5-coder-3b-instruct 66.2 92.5 15.5 11 1.3 0.89 0.95
mistralai_mixtral_8x7b_instruct_v0.1 65.7 93.9 15.7 11 1.3 0.87 0.98
qwen1.5-7b-chat 59.1 88.9 13.2 12 1.4 0.93 0.99
google_codegemma_1.1_7b_it 54.8 87.5 11.8 13 1.4 0.97 0.97
qwen2.5-coder-1.5b-instruct 49.9 84.8 10.3 11 1.4 0.93 1
mistralai_mistral_7b_instruct_v0.3 48.3 84.8 10 11 1.4 0.93 1
google_gemma_3_1b_it 44.7 70.7 8.82 12 1.4 1.1 0.79
qwen3-0.6b 41.6 74.2 7.92 13 1.4 1 0.91
mistralai_mistral_7b_instruct_v0.2 40 75.7 7.87 11 1.3 0.94 0.97
llama-3.2-1B-instruct 38 38 7.06 12 1.3 1.3 0
qwen2-1.5b-instruct 37.9 81.2 7.42 11 1.3 0.78 1.1
mistralai_mistral_7b_instruct_v0.1 34.3 74.8 6.37 11 1.3 0.84 1
google_gemma_7b_it 28.9 60 5.11 12 1.2 0.92 0.85
qwen2-0.5b-instruct 19.5 62.7 3.12 13 1.1 0.62 0.9
qwen1.5-1.8b-chat 15.8 57.1 2.53 12 1 0.5 0.87
qwen2.5-coder-0.5b-instruct 14 51.4 2.04 13 0.95 0.55 0.78
google_gemma_2b_it 9.9 30.6 1.46 12 0.82 0.58 0.58
qwen1.5-0.5b-chat 6.86 34.7 1.02 13 0.7 0.34 0.61