{"model":"qwen2.5-coder-32b-instruct","pass1":0.76075,"pass@count":0.9275,"win_rate":0.4458446268,"count":10.0,"SE(A)":0.0150834942,"SE_x(A)":0.0115535284,"SE_pred(A)":0.0096967921}
{"model":"google_gemma_3_27b_it","pass1":0.6897222222,"pass@count":0.87,"win_rate":0.3870754725,"count":9.0,"SE(A)":0.0163556366,"SE_x(A)":0.0128735129,"SE_pred(A)":0.0100885833}
{"model":"qwen2.5-coder-14b-instruct","pass1":0.6611458333,"pass@count":0.91,"win_rate":0.3700795379,"count":12.0,"SE(A)":0.0167343965,"SE_x(A)":0.0122979799,"SE_pred(A)":0.0113489963}
{"model":"google_gemma_3_12b_it","pass1":0.6254545455,"pass@count":0.85625,"win_rate":0.3374759361,"count":11.0,"SE(A)":0.0171121725,"SE_x(A)":0.0131927267,"SE_pred(A)":0.0108985508}
{"model":"llama-3.1-70B-instruct","pass1":0.61375,"pass@count":0.61375,"win_rate":0.3294143166,"count":13.0,"SE(A)":0.0172141271,"SE_x(A)":0.0172141271,"SE_pred(A)":0.0}
{"model":"google_gemma_2_27b_it","pass1":0.579875,"pass@count":0.85,"win_rate":0.3062977914,"count":10.0,"SE(A)":0.0174506441,"SE_x(A)":0.0134161742,"SE_pred(A)":0.0111593571}
{"model":"qwen2-72b-instruct","pass1":0.569125,"pass@count":0.8425,"win_rate":0.2982733024,"count":10.0,"SE(A)":0.0175079173,"SE_x(A)":0.0129146268,"SE_pred(A)":0.0118211498}
{"model":"qwen2-math-72b-instruct","pass1":0.565625,"pass@count":0.84875,"win_rate":0.2978991784,"count":10.0,"SE(A)":0.0175247453,"SE_x(A)":0.012837744,"SE_pred(A)":0.0119293348}
{"model":"mistralai_mixtral_8x22b_instruct_v0.1","pass1":0.5605681818,"pass@count":0.8675,"win_rate":0.2958974662,"count":11.0,"SE(A)":0.017547489,"SE_x(A)":0.0123071323,"SE_pred(A)":0.012507952}
{"model":"qwen2.5-coder-7b-instruct","pass1":0.5493181818,"pass@count":0.85375,"win_rate":0.2878644183,"count":11.0,"SE(A)":0.0175914652,"SE_x(A)":0.0126457183,"SE_pred(A)":0.0122288779}
{"model":"google_gemma_2_9b_it","pass1":0.479375,"pass@count":0.77,"win_rate":0.2342276876,"count":12.0,"SE(A)":0.0176626233,"SE_x(A)":0.0138114496,"SE_pred(A)":0.0110096377}
{"model":"deepseek_r1_distill_qwen_14b","pass1":0.4704545455,"pass@count":0.7825,"win_rate":0.2380032756,"count":11.0,"SE(A)":0.0176467797,"SE_x(A)":0.0126565099,"SE_pred(A)":0.0122972188}
{"model":"qwen1.5-72b-chat","pass1":0.47025,"pass@count":0.795,"win_rate":0.232776491,"count":10.0,"SE(A)":0.0176463501,"SE_x(A)":0.012570824,"SE_pred(A)":0.0123841857}
{"model":"deepseek_r1_distill_llama_70b","pass1":0.468125,"pass@count":0.79125,"win_rate":0.236408607,"count":10.0,"SE(A)":0.0176417114,"SE_x(A)":0.0128883068,"SE_pred(A)":0.0120466397}
{"model":"google_codegemma_1.1_7b_it","pass1":0.4467307692,"pass@count":0.82625,"win_rate":0.2220183138,"count":13.0,"SE(A)":0.0175770585,"SE_x(A)":0.0120885265,"SE_pred(A)":0.0127601141}
{"model":"mistralai_mathstral_7b_v0.1","pass1":0.4397727273,"pass@count":0.7825,"win_rate":0.2124071087,"count":11.0,"SE(A)":0.0175489557,"SE_x(A)":0.0124206547,"SE_pred(A)":0.0123973054}
{"model":"google_gemma_3_4b_it","pass1":0.4370192308,"pass@count":0.7975,"win_rate":0.2160522909,"count":13.0,"SE(A)":0.0175368691,"SE_x(A)":0.0125371927,"SE_pred(A)":0.0122621604}
{"model":"mistralai_ministral_8b_instruct_2410","pass1":0.4330681818,"pass@count":0.77875,"win_rate":0.2084423377,"count":11.0,"SE(A)":0.0175185663,"SE_x(A)":0.0123144014,"SE_pred(A)":0.0124601638}
{"model":"qwen2.5-coder-3b-instruct","pass1":0.4330208333,"pass@count":0.82875,"win_rate":0.2167299663,"count":12.0,"SE(A)":0.0175183401,"SE_x(A)":0.0112422408,"SE_pred(A)":0.0134351874}
{"model":"llama-3.1-8B-instruct","pass1":0.419,"pass@count":0.42,"win_rate":0.2048285481,"count":15.0,"SE(A)":0.0174441609,"SE_x(A)":0.0174364817,"SE_pred(A)":0.0005175492}
{"model":"qwen1.5-32b-chat","pass1":0.4144318182,"pass@count":0.7725,"win_rate":0.1981197376,"count":11.0,"SE(A)":0.0174168771,"SE_x(A)":0.0123523505,"SE_pred(A)":0.0122787233}
{"model":"qwen2-7b-instruct","pass1":0.40375,"pass@count":0.76875,"win_rate":0.1897089756,"count":11.0,"SE(A)":0.0173470436,"SE_x(A)":0.0122339659,"SE_pred(A)":0.0122983739}
{"model":"qwen1.5-14b-chat","pass1":0.395625,"pass@count":0.735,"win_rate":0.1872125959,"count":12.0,"SE(A)":0.0172882134,"SE_x(A)":0.0124199991,"SE_pred(A)":0.0120260528}
{"model":"qwen3-1.7b","pass1":0.38625,"pass@count":0.72125,"win_rate":0.1843979532,"count":12.0,"SE(A)":0.0172141271,"SE_x(A)":0.0121570545,"SE_pred(A)":0.0121873786}
{"model":"qwen3-4b","pass1":0.3788541667,"pass@count":0.6475,"win_rate":0.1809824202,"count":12.0,"SE(A)":0.0171509361,"SE_x(A)":0.0133796461,"SE_pred(A)":0.0107303158}
{"model":"deepseek_r1_distill_llama_8b","pass1":0.3604166667,"pass@count":0.715,"win_rate":0.16738611,"count":12.0,"SE(A)":0.0169748525,"SE_x(A)":0.0119334396,"SE_pred(A)":0.0120722258}
{"model":"mistralai_mixtral_8x7b_instruct_v0.1","pass1":0.3447916667,"pass@count":0.7525,"win_rate":0.1612922386,"count":12.0,"SE(A)":0.0168044032,"SE_x(A)":0.011113511,"SE_pred(A)":0.0126046754}
{"model":"qwen2.5-coder-1.5b-instruct","pass1":0.3430208333,"pass@count":0.78,"win_rate":0.1688629901,"count":12.0,"SE(A)":0.0167838293,"SE_x(A)":0.0100205496,"SE_pred(A)":0.0134642308}
{"model":"qwen3-14b","pass1":0.3234375,"pass@count":0.51625,"win_rate":0.1492366553,"count":12.0,"SE(A)":0.016538806,"SE_x(A)":0.0139232339,"SE_pred(A)":0.0089261225}
{"model":"llama-3.2-3B-instruct","pass1":0.32125,"pass@count":0.32125,"win_rate":0.154198019,"count":18.0,"SE(A)":0.0165094078,"SE_x(A)":0.0165094078,"SE_pred(A)":0.0}
{"model":"qwen3-32b","pass1":0.3194318182,"pass@count":0.5675,"win_rate":0.1489884209,"count":11.0,"SE(A)":0.0164846569,"SE_x(A)":0.0132366848,"SE_pred(A)":0.0098251764}
{"model":"mistralai_mistral_7b_instruct_v0.3","pass1":0.3171590909,"pass@count":0.65625,"win_rate":0.1396183121,"count":11.0,"SE(A)":0.0164533128,"SE_x(A)":0.0120085728,"SE_pred(A)":0.0112474745}
{"model":"qwen2-math-7b-instruct","pass1":0.3109375,"pass@count":0.65,"win_rate":0.1475971463,"count":12.0,"SE(A)":0.016365183,"SE_x(A)":0.0114669564,"SE_pred(A)":0.0116759636}
{"model":"google_gemma_7b_it","pass1":0.2975,"pass@count":0.63375,"win_rate":0.1462589016,"count":12.0,"SE(A)":0.0161629882,"SE_x(A)":0.0105885851,"SE_pred(A)":0.012211636}
{"model":"qwen2.5-coder-0.5b-instruct","pass1":0.2821153846,"pass@count":0.7275,"win_rate":0.1470683287,"count":13.0,"SE(A)":0.0159109355,"SE_x(A)":0.0090854217,"SE_pred(A)":0.0130618904}
{"model":"qwen1.5-7b-chat","pass1":0.2716666667,"pass@count":0.655,"win_rate":0.1202377412,"count":12.0,"SE(A)":0.0157267244,"SE_x(A)":0.0106023749,"SE_pred(A)":0.0116154857}
{"model":"deepseek_v2_lite_chat","pass1":0.2652272727,"pass@count":0.65625,"win_rate":0.1204283446,"count":11.0,"SE(A)":0.0156077612,"SE_x(A)":0.0100394078,"SE_pred(A)":0.0119504184}
{"model":"mistralai_mistral_7b_instruct_v0.1","pass1":0.2631818182,"pass@count":0.63125,"win_rate":0.1199653111,"count":11.0,"SE(A)":0.0155690859,"SE_x(A)":0.010148548,"SE_pred(A)":0.0118069221}
{"model":"mistralai_mistral_7b_instruct_v0.2","pass1":0.2470454545,"pass@count":0.6025,"win_rate":0.1070835035,"count":11.0,"SE(A)":0.0152485244,"SE_x(A)":0.0097607981,"SE_pred(A)":0.011715132}
{"model":"qwen3-0.6b","pass1":0.2274038461,"pass@count":0.59875,"win_rate":0.1061651625,"count":13.0,"SE(A)":0.014819385,"SE_x(A)":0.0090151442,"SE_pred(A)":0.0117618598}
{"model":"qwen3-8b","pass1":0.211875,"pass@count":0.475,"win_rate":0.0966142589,"count":12.0,"SE(A)":0.0144474905,"SE_x(A)":0.0099117249,"SE_pred(A)":0.0105113124}
{"model":"google_gemma_2b_it","pass1":0.1871153846,"pass@count":0.52125,"win_rate":0.0975612741,"count":13.0,"SE(A)":0.0137887281,"SE_x(A)":0.0085300195,"SE_pred(A)":0.0108336415}
{"model":"deepseek_r1_distill_qwen_7b","pass1":0.1690909091,"pass@count":0.525,"win_rate":0.0714899993,"count":11.0,"SE(A)":0.0132523193,"SE_x(A)":0.0073526503,"SE_pred(A)":0.0110255385}
{"model":"qwen2-math-1.5b-instruct","pass1":0.1685416667,"pass@count":0.46375,"win_rate":0.0728150168,"count":12.0,"SE(A)":0.0132351508,"SE_x(A)":0.0083216979,"SE_pred(A)":0.0102916743}
{"model":"qwen2-1.5b-instruct","pass1":0.1508333333,"pass@count":0.60125,"win_rate":0.0703825831,"count":12.0,"SE(A)":0.0126531932,"SE_x(A)":0.0058016523,"SE_pred(A)":0.0112447378}
{"model":"llama-3.2-1B-instruct","pass1":0.08875,"pass@count":0.08875,"win_rate":0.0430084423,"count":21.0,"SE(A)":0.0100544416,"SE_x(A)":0.0100544416,"SE_pred(A)":0.0}
{"model":"deepseek_r1_distill_qwen_1.5b","pass1":0.06125,"pass@count":0.2725,"win_rate":0.0209348614,"count":12.0,"SE(A)":0.0084777973,"SE_x(A)":0.0042022651,"SE_pred(A)":0.0073630167}
{"model":"qwen2-0.5b-instruct","pass1":0.0485576923,"pass@count":0.29125,"win_rate":0.021820709,"count":13.0,"SE(A)":0.0075993291,"SE_x(A)":0.0027109339,"SE_pred(A)":0.0070993409}
{"model":"qwen1.5-1.8b-chat","pass1":0.030625,"pass@count":0.24375,"win_rate":0.0142353246,"count":12.0,"SE(A)":0.0060917064,"SE_x(A)":0.0014716334,"SE_pred(A)":0.0059112758}
{"model":"qwen1.5-0.5b-chat","pass1":0.0173076923,"pass@count":0.16625,"win_rate":0.0086689973,"count":13.0,"SE(A)":0.0046108752,"SE_x(A)":0.001080751,"SE_pred(A)":0.0044824265}
{"model":"deepseek_r1_distill_qwen_32b","pass1":0.015625,"pass@count":0.1075,"win_rate":0.0056352176,"count":10.0,"SE(A)":0.0043847548,"SE_x(A)":0.0012803393,"SE_pred(A)":0.0041936625}
{"model":"google_gemma_3_1b_it","pass1":0.0008653846,"pass@count":0.005,"win_rate":0.0003543845,"count":13.0,"SE(A)":0.0010396127,"SE_x(A)":0.0005473067,"SE_pred(A)":0.0008838835}
