model	pass1	pass@count	win_rate	count	SE(A)	SE_x(A)	SE_pred(A)
qwen3-32b	70	85.5	36.4	9	0.42	0.35	0.23
qwen3-14b	67.3	81.6	34.4	10	0.43	0.37	0.21
llama-3.1-70B-instruct	63.9	63.9	32.3	12	0.44	0.44	0
qwen3-8b	62.6	78.2	31	10	0.44	0.38	0.22
qwen2-72b-instruct	62.2	78.1	31	4	0.44	0.35	0.27
qwen2.5-coder-32b-instruct	60.5	80	29.5	8	0.45	0.36	0.26
google_gemma_3_12b_it	58.7	77.6	28.2	11	0.45	0.38	0.23
deepseek_r1_distill_llama_70b	57.7	75.8	27.6	9	0.45	0.37	0.25
qwen3-4b	57.6	74.4	27.8	11	0.45	0.39	0.23
mistralai_mixtral_8x22b_instruct_v0.1	51.8	82.2	24	9	0.46	0.34	0.31
deepseek_r1_distill_qwen_32b	51.5	73.5	23.6	8	0.46	0.36	0.28
qwen2-math-72b-instruct	51.1	79.8	23.8	8	0.46	0.34	0.31
deepseek_r1_distill_qwen_14b	48.1	75.5	21.5	12	0.46	0.35	0.29
qwen1.5-72b-chat	47.5	47.5	21.7	1	0.46	NaN	NaN
qwen2.5-coder-14b-instruct	47.4	81.9	21.3	10	0.46	0.33	0.32
qwen1.5-32b-chat	46.1	75	20.6	8	0.45	0.34	0.31
llama-3.1-8B-instruct	45	45	19.9	15	0.45	0.45	0
qwen2-7b-instruct	44.2	78.1	19.5	12	0.45	0.34	0.3
qwen3-1.7b	42.9	68	19.2	12	0.45	0.37	0.26
mistralai_mixtral_8x7b_instruct_v0.1	42.3	75.9	18.8	10	0.45	0.33	0.31
google_gemma_3_4b_it	41.5	66.2	17.9	13	0.45	0.38	0.25
mistralai_ministral_8b_instruct_2410	39.2	78.4	16.7	11	0.45	0.31	0.32
qwen1.5-14b-chat	37.9	69.6	16	10	0.44	0.33	0.29
qwen2.5-coder-7b-instruct	37.7	78.4	16	12	0.44	0.3	0.32
mistralai_mathstral_7b_v0.1	36.7	79.2	15.7	12	0.44	0.29	0.33
deepseek_r1_distill_llama_8b	36.1	68.5	14.7	12	0.44	0.32	0.3
deepseek_r1_distill_qwen_7b	36.1	68.1	15.1	12	0.44	0.33	0.29
llama-3.2-3B-instruct	35	35	15	19	0.43	0.43	0
mistralai_mistral_7b_instruct_v0.3	33.7	71.8	14.3	12	0.43	0.31	0.3
qwen2-math-7b-instruct	33.1	72.4	14.6	12	0.43	0.3	0.31
qwen2.5-coder-3b-instruct	29.4	74.6	12.1	12	0.42	0.27	0.31
mistralai_mistral_7b_instruct_v0.2	29.1	65.4	11.9	12	0.41	0.3	0.28
deepseek_v2_lite_chat	29	67.5	11.9	10	0.41	0.28	0.3
qwen1.5-7b-chat	25.1	62.5	10.1	10	0.4	0.26	0.3
qwen2-math-1.5b-instruct	25.1	69.3	11.5	12	0.4	0.25	0.31
qwen3-0.6b	23.8	54	10.7	13	0.39	0.29	0.26
mistralai_mistral_7b_instruct_v0.1	23.8	67.2	9.96	12	0.39	0.25	0.3
llama-3.2-1B-instruct	21.5	21.5	9.44	21	0.37	0.37	0
deepseek_r1_distill_qwen_1.5b	20.5	59.4	8.21	12	0.37	0.22	0.29
qwen2.5-coder-1.5b-instruct	20.3	64.8	8.48	12	0.37	0.22	0.29
qwen2-1.5b-instruct	17.2	63	7.59	12	0.34	0.18	0.29
qwen1.5-1.8b-chat	12.4	45.6	5.71	10	0.3	0.16	0.25
qwen2-0.5b-instruct	11.7	55.2	6.16	13	0.29	0.13	0.26
qwen2.5-coder-0.5b-instruct	10.4	53.1	5.79	13	0.28	0.1	0.26
qwen1.5-0.5b-chat	10.3	57	5.83	13	0.28	0.098	0.26