{"model":"deepseek_r1_distill_llama_70b","pass1":0.2575757576,"pass@count":0.5,"win_rate":0.2124945811,"count":11.0,"SE(A)":0.0798395238,"SE_x(A)":0.0665954624,"SE_pred(A)":0.0440385506}
{"model":"deepseek_r1_distill_qwen_32b","pass1":0.2545454545,"pass@count":0.4333333333,"win_rate":0.20917899,"count":11.0,"SE(A)":0.0795303016,"SE_x(A)":0.0677325264,"SE_pred(A)":0.0416818154}
{"model":"deepseek_r1_distill_qwen_14b","pass1":0.2454545455,"pass@count":0.3333333333,"win_rate":0.2016022975,"count":11.0,"SE(A)":0.0785719652,"SE_x(A)":0.0736619235,"SE_pred(A)":0.0273399844}
{"model":"google_gemma_3_27b_it","pass1":0.2444444444,"pass@count":0.3666666667,"win_rate":0.1986956798,"count":12.0,"SE(A)":0.0784625936,"SE_x(A)":0.0648345328,"SE_pred(A)":0.0441911977}
{"model":"deepseek_r1_distill_llama_8b","pass1":0.2153846154,"pass@count":0.2666666667,"win_rate":0.1701191987,"count":13.0,"SE(A)":0.075054221,"SE_x(A)":0.0676683459,"SE_pred(A)":0.0324673845}
{"model":"google_gemma_3_12b_it","pass1":0.2090909091,"pass@count":0.3666666667,"win_rate":0.1664736007,"count":11.0,"SE(A)":0.0742455163,"SE_x(A)":0.0616053973,"SE_pred(A)":0.0414387707}
{"model":"qwen3-32b","pass1":0.1909090909,"pass@count":0.3666666667,"win_rate":0.154439493,"count":11.0,"SE(A)":0.0717548628,"SE_x(A)":0.0531556959,"SE_pred(A)":0.0481999204}
{"model":"deepseek_r1_distill_qwen_7b","pass1":0.1878787879,"pass@count":0.2666666667,"win_rate":0.1458620554,"count":11.0,"SE(A)":0.0713162789,"SE_x(A)":0.066020746,"SE_pred(A)":0.0269679945}
{"model":"qwen3-14b","pass1":0.1805555556,"pass@count":0.4,"win_rate":0.1446681305,"count":12.0,"SE(A)":0.0702270714,"SE_x(A)":0.0532529996,"SE_pred(A)":0.0457816513}
{"model":"qwen3-8b","pass1":0.175,"pass@count":0.3333333333,"win_rate":0.1381782188,"count":12.0,"SE(A)":0.0693721846,"SE_x(A)":0.052281693,"SE_pred(A)":0.0455974186}
{"model":"deepseek_r1_distill_qwen_1.5b","pass1":0.1717948718,"pass@count":0.2,"win_rate":0.1321513077,"count":13.0,"SE(A)":0.0688673589,"SE_x(A)":0.0632611444,"SE_pred(A)":0.0272165527}
{"model":"qwen3-4b","pass1":0.1638888889,"pass@count":0.2666666667,"win_rate":0.1282653712,"count":12.0,"SE(A)":0.0675843475,"SE_x(A)":0.0556289498,"SE_pred(A)":0.0383805155}
{"model":"qwen2-math-72b-instruct","pass1":0.12,"pass@count":0.3333333333,"win_rate":0.0947722397,"count":10.0,"SE(A)":0.0593295879,"SE_x(A)":0.0383486282,"SE_pred(A)":0.0452701084}
{"model":"qwen2.5-coder-32b-instruct","pass1":0.1151515152,"pass@count":0.3,"win_rate":0.0895104964,"count":11.0,"SE(A)":0.0582785391,"SE_x(A)":0.0370969824,"SE_pred(A)":0.0449466575}
{"model":"qwen2-math-7b-instruct","pass1":0.1,"pass@count":0.2333333333,"win_rate":0.0787162347,"count":6.0,"SE(A)":0.0547722558,"SE_x(A)":0.0370185139,"SE_pred(A)":0.0403686714}
{"model":"google_gemma_3_4b_it","pass1":0.0923076923,"pass@count":0.2,"win_rate":0.0697128965,"count":13.0,"SE(A)":0.0528478893,"SE_x(A)":0.0415273806,"SE_pred(A)":0.0326860225}
{"model":"qwen3-1.7b","pass1":0.0805555556,"pass@count":0.3,"win_rate":0.0646745992,"count":12.0,"SE(A)":0.0496878114,"SE_x(A)":0.0278739916,"SE_pred(A)":0.0411329453}
{"model":"qwen2.5-coder-14b-instruct","pass1":0.0777777778,"pass@count":0.2333333333,"win_rate":0.0583456105,"count":12.0,"SE(A)":0.0488973057,"SE_x(A)":0.0333389445,"SE_pred(A)":0.035769558}
{"model":"qwen2-72b-instruct","pass1":0.0363636364,"pass@count":0.2666666667,"win_rate":0.0299436224,"count":11.0,"SE(A)":0.0341766598,"SE_x(A)":0.0060606061,"SE_pred(A)":0.0336349986}
{"model":"llama-3.1-70B-instruct","pass1":0.0333333333,"pass@count":0.0333333333,"win_rate":0.0233977787,"count":13.0,"SE(A)":0.0327730693,"SE_x(A)":0.0327730693,"SE_pred(A)":0.0}
{"model":"qwen2-math-1.5b-instruct","pass1":0.0333333333,"pass@count":0.1,"win_rate":0.0260628152,"count":4.0,"SE(A)":0.0327730693,"SE_x(A)":0.0121716124,"SE_pred(A)":0.030429031}
{"model":"qwen2.5-coder-7b-instruct","pass1":0.0181818182,"pass@count":0.2,"win_rate":0.0140722528,"count":11.0,"SE(A)":0.0243934688,"SE_x(A)":0.0,"SE_pred(A)":0.0246182982}
{"model":"mistralai_ministral_8b_instruct_2410","pass1":0.0121212121,"pass@count":0.1333333333,"win_rate":0.0094340775,"count":11.0,"SE(A)":0.0199785621,"SE_x(A)":0.0,"SE_pred(A)":0.0201007563}
{"model":"qwen2.5-coder-3b-instruct","pass1":0.0111111111,"pass@count":0.1,"win_rate":0.0083067033,"count":12.0,"SE(A)":0.0191377936,"SE_x(A)":0.0035664815,"SE_pred(A)":0.0188025358}
{"model":"google_gemma_2_27b_it","pass1":0.01,"pass@count":0.0666666667,"win_rate":0.0081224081,"count":10.0,"SE(A)":0.0181659021,"SE_x(A)":0.0046214743,"SE_pred(A)":0.0175682092}
{"model":"qwen3-0.6b","pass1":0.0076923077,"pass@count":0.1,"win_rate":0.0059554473,"count":13.0,"SE(A)":0.0159511087,"SE_x(A)":0.0,"SE_pred(A)":0.0160128154}
{"model":"google_gemma_2_9b_it","pass1":0.0060606061,"pass@count":0.0333333333,"win_rate":0.004211732,"count":11.0,"SE(A)":0.0141702448,"SE_x(A)":0.0043563351,"SE_pred(A)":0.0134839972}
{"model":"mistralai_mathstral_7b_v0.1","pass1":0.0060606061,"pass@count":0.0666666667,"win_rate":0.0043821251,"count":11.0,"SE(A)":0.0141702448,"SE_x(A)":0.0,"SE_pred(A)":0.0142133811}
{"model":"qwen2-1.5b-instruct","pass1":0.0051282051,"pass@count":0.0333333333,"win_rate":0.0038676005,"count":13.0,"SE(A)":0.0130408418,"SE_x(A)":0.003656282,"SE_pred(A)":0.0125177936}
{"model":"google_gemma_3_1b_it","pass1":0.0051282051,"pass@count":0.0333333333,"win_rate":0.0035145749,"count":13.0,"SE(A)":0.0130408418,"SE_x(A)":0.003656282,"SE_pred(A)":0.0125177936}
{"model":"qwen1.5-72b-chat","pass1":0.003030303,"pass@count":0.0333333333,"win_rate":0.003030303,"count":11.0,"SE(A)":0.0100351388,"SE_x(A)":0.0,"SE_pred(A)":0.0100503782}
{"model":"mistralai_mixtral_8x22b_instruct_v0.1","pass1":0.003030303,"pass@count":0.0333333333,"win_rate":0.0020730547,"count":11.0,"SE(A)":0.0100351388,"SE_x(A)":0.0,"SE_pred(A)":0.0100503782}
{"model":"qwen2-7b-instruct","pass1":0.003030303,"pass@count":0.0333333333,"win_rate":0.0028071819,"count":11.0,"SE(A)":0.0100351388,"SE_x(A)":0.0,"SE_pred(A)":0.0100503782}
{"model":"qwen1.5-32b-chat","pass1":0.003030303,"pass@count":0.0333333333,"win_rate":0.0028071819,"count":11.0,"SE(A)":0.0100351388,"SE_x(A)":0.0,"SE_pred(A)":0.0100503782}
{"model":"mistralai_mixtral_8x7b_instruct_v0.1","pass1":0.0027777778,"pass@count":0.0333333333,"win_rate":0.0024281846,"count":12.0,"SE(A)":0.0096091306,"SE_x(A)":0.0,"SE_pred(A)":0.0096225045}
{"model":"google_codegemma_1.1_7b_it","pass1":0.0025641026,"pass@count":0.0333333333,"win_rate":0.0019299328,"count":13.0,"SE(A)":0.0092331431,"SE_x(A)":0.0,"SE_pred(A)":0.0092450033}
{"model":"llama-3.2-3B-instruct","pass1":0.0,"pass@count":0.0,"win_rate":0.0,"count":18.0,"SE(A)":0.0,"SE_x(A)":0.0,"SE_pred(A)":0.0}
{"model":"llama-3.2-1B-instruct","pass1":0.0,"pass@count":0.0,"win_rate":0.0,"count":21.0,"SE(A)":0.0,"SE_x(A)":0.0,"SE_pred(A)":0.0}
{"model":"llama-3.1-8B-instruct","pass1":0.0,"pass@count":0.0,"win_rate":0.0,"count":15.0,"SE(A)":0.0,"SE_x(A)":0.0,"SE_pred(A)":0.0}
{"model":"google_gemma_7b_it","pass1":0.0,"pass@count":0.0,"win_rate":0.0,"count":13.0,"SE(A)":0.0,"SE_x(A)":0.0,"SE_pred(A)":0.0}
{"model":"deepseek_v2_lite_chat","pass1":0.0,"pass@count":0.0,"win_rate":0.0,"count":11.0,"SE(A)":0.0,"SE_x(A)":0.0,"SE_pred(A)":0.0}
{"model":"google_gemma_2b_it","pass1":0.0,"pass@count":0.0,"win_rate":0.0,"count":13.0,"SE(A)":0.0,"SE_x(A)":0.0,"SE_pred(A)":0.0}
{"model":"mistralai_mistral_7b_instruct_v0.3","pass1":0.0,"pass@count":0.0,"win_rate":0.0,"count":11.0,"SE(A)":0.0,"SE_x(A)":0.0,"SE_pred(A)":0.0}
{"model":"qwen1.5-0.5b-chat","pass1":0.0,"pass@count":0.0,"win_rate":0.0,"count":13.0,"SE(A)":0.0,"SE_x(A)":0.0,"SE_pred(A)":0.0}
{"model":"qwen1.5-7b-chat","pass1":0.0,"pass@count":0.0,"win_rate":0.0,"count":12.0,"SE(A)":0.0,"SE_x(A)":0.0,"SE_pred(A)":0.0}
{"model":"qwen2-0.5b-instruct","pass1":0.0,"pass@count":0.0,"win_rate":0.0,"count":13.0,"SE(A)":0.0,"SE_x(A)":0.0,"SE_pred(A)":0.0}
{"model":"qwen1.5-14b-chat","pass1":0.0,"pass@count":0.0,"win_rate":0.0,"count":12.0,"SE(A)":0.0,"SE_x(A)":0.0,"SE_pred(A)":0.0}
{"model":"qwen1.5-1.8b-chat","pass1":0.0,"pass@count":0.0,"win_rate":0.0,"count":11.0,"SE(A)":0.0,"SE_x(A)":0.0,"SE_pred(A)":0.0}
{"model":"mistralai_mistral_7b_instruct_v0.2","pass1":0.0,"pass@count":0.0,"win_rate":0.0,"count":11.0,"SE(A)":0.0,"SE_x(A)":0.0,"SE_pred(A)":0.0}
{"model":"mistralai_mistral_7b_instruct_v0.1","pass1":0.0,"pass@count":0.0,"win_rate":0.0,"count":11.0,"SE(A)":0.0,"SE_x(A)":0.0,"SE_pred(A)":0.0}
{"model":"qwen2.5-coder-0.5b-instruct","pass1":0.0,"pass@count":0.0,"win_rate":0.0,"count":13.0,"SE(A)":0.0,"SE_x(A)":0.0,"SE_pred(A)":0.0}
{"model":"qwen2.5-coder-1.5b-instruct","pass1":0.0,"pass@count":0.0,"win_rate":0.0,"count":11.0,"SE(A)":0.0,"SE_x(A)":0.0,"SE_pred(A)":0.0}
