model pass1 pass@count win_rate count SE(A) SE_x(A) SE_pred(A)
qwen3-14b 72.9 77.6 35.9 3 2 1.8 0.79
google_gemma_3_12b_it 69.5 75 33.1 4 2.1 1.9 0.8
qwen2.5-coder-14b-instruct 68.1 80.8 34 3 2.1 1.6 1.3
qwen3-8b 65.3 73 30.2 3 2.1 1.9 1
deepseek_r1_distill_llama_70b 64.1 72 28.9 2 2.1 1.7 1.3
qwen3-4b 64 71.4 29.1 4 2.1 1.9 0.9
google_gemma_3_4b_it 60.3 68 26.3 5 2.2 2 0.89
mistralai_mixtral_8x22b_instruct_v0.1 60.1 72.8 26.4 3 2.2 1.7 1.4
qwen2.5-coder-7b-instruct 57.3 80.8 26.7 4 2.2 1.5 1.7
deepseek_r1_distill_qwen_14b 53.3 71 22.1 4 2.2 1.7 1.4
llama-3.1-8B-instruct 52.2 52.2 21 7 2.2 2.2 0
mistralai_mixtral_8x7b_instruct_v0.1 49.1 61 19.3 3 2.2 1.8 1.3
qwen2-7b-instruct 48 65.6 18.7 4 2.2 1.7 1.4
qwen2.5-coder-3b-instruct 47.9 70.6 19.9 4 2.2 1.5 1.6
qwen3-1.7b 47.8 59 18.5 4 2.2 1.9 1.1
llama-3.2-3B-instruct 45.8 45.8 17.3 8 2.2 2.2 0
mistralai_ministral_8b_instruct_2410 44.9 63.8 16.7 4 2.2 1.7 1.5
mistralai_mathstral_7b_v0.1 41.1 61.4 14.7 4 2.2 1.6 1.5
deepseek_v2_lite_chat 40.3 55.8 14.6 3 2.2 1.7 1.4
qwen1.5-14b-chat 37.2 49.6 12.3 3 2.2 1.7 1.3
mistralai_mistral_7b_instruct_v0.3 36.8 53.8 12.1 4 2.2 1.7 1.3
deepseek_r1_distill_llama_8b 34.2 53.8 11.8 4 2.1 1.5 1.5
mistralai_mistral_7b_instruct_v0.2 34.1 50.8 11.3 4 2.1 1.6 1.4
qwen2.5-coder-1.5b-instruct 32.8 57.6 11.3 4 2.1 1.3 1.6
deepseek_r1_distill_qwen_7b 32.3 55.8 11.5 4 2.1 1.4 1.6
qwen1.5-7b-chat 31.9 45.4 9.96 3 2.1 1.6 1.3
llama-3.2-1B-instruct 26.2 26.2 8.06 11 2 2 0
qwen2.5-coder-0.5b-instruct 25 48 7.9 5 1.9 1.3 1.5
qwen3-0.6b 24.1 40.4 6.9 5 1.9 1.5 1.2
mistralai_mistral_7b_instruct_v0.1 22.9 43 6.63 4 1.9 1.2 1.4
qwen2-1.5b-instruct 12.4 27.2 3 4 1.5 0.9 1.2
deepseek_r1_distill_qwen_1.5b 11.2 24.4 2.86 4 1.4 0.88 1.1
qwen1.5-1.8b-chat 7.13 14.2 1.58 3 1.2 0.7 0.92
qwen2-0.5b-instruct 6.72 19.2 1.44 5 1.1 0.59 0.95
qwen1.5-0.5b-chat 1.12 4.2 0.179 5 0.47 0.18 0.43