{"model":"google_gemma_3_27b_it","pass1":0.861225,"pass@count":0.9326,"win_rate":0.4663080621,"count":8.0,"SE(A)":0.0048891001,"SE_x(A)":0.0041215132,"SE_pred(A)":0.0026299104}
{"model":"qwen3-14b","pass1":0.82848,"pass@count":0.9342,"win_rate":0.4400032,"count":10.0,"SE(A)":0.0053310579,"SE_x(A)":0.004284411,"SE_pred(A)":0.0031723808}
{"model":"google_gemma_3_12b_it","pass1":0.8013090909,"pass@count":0.9218,"win_rate":0.4188534747,"count":11.0,"SE(A)":0.0056429218,"SE_x(A)":0.0046141702,"SE_pred(A)":0.0032483842}
{"model":"qwen3-4b","pass1":0.7856833333,"pass@count":0.9212,"win_rate":0.4068847533,"count":12.0,"SE(A)":0.0058031893,"SE_x(A)":0.0046888366,"SE_pred(A)":0.0034193301}
{"model":"qwen3-8b","pass1":0.78188,"pass@count":0.9278,"win_rate":0.4050823304,"count":10.0,"SE(A)":0.0058402682,"SE_x(A)":0.0045721208,"SE_pred(A)":0.003633792}
{"model":"qwen3-32b","pass1":0.76858,"pass@count":0.9424,"win_rate":0.4017986335,"count":10.0,"SE(A)":0.0059643069,"SE_x(A)":0.0041123204,"SE_pred(A)":0.004319928}
{"model":"deepseek_r1_distill_llama_70b","pass1":0.7356222222,"pass@count":0.9238,"win_rate":0.3939617116,"count":9.0,"SE(A)":0.0062367005,"SE_x(A)":0.0045597015,"SE_pred(A)":0.0042550623}
{"model":"google_gemma_3_4b_it","pass1":0.7202461538,"pass@count":0.8792,"win_rate":0.3595719687,"count":13.0,"SE(A)":0.0063480963,"SE_x(A)":0.0052742595,"SE_pred(A)":0.0035327769}
{"model":"deepseek_r1_distill_qwen_7b","pass1":0.7184166667,"pass@count":0.956,"win_rate":0.3881392595,"count":12.0,"SE(A)":0.0063607257,"SE_x(A)":0.0036801328,"SE_pred(A)":0.0051880107}
{"model":"deepseek_r1_distill_llama_8b","pass1":0.6910333333,"pass@count":0.9462,"win_rate":0.3620027942,"count":12.0,"SE(A)":0.0065346196,"SE_x(A)":0.0039534463,"SE_pred(A)":0.0052030294}
{"model":"qwen3-1.7b","pass1":0.6386666667,"pass@count":0.8652,"win_rate":0.3053432875,"count":12.0,"SE(A)":0.0067936964,"SE_x(A)":0.0053399361,"SE_pred(A)":0.0041999278}
{"model":"llama-3.1-70B-instruct","pass1":0.6292166667,"pass@count":0.6312,"win_rate":0.2983767339,"count":12.0,"SE(A)":0.0068308572,"SE_x(A)":0.0068041212,"SE_pred(A)":0.000603776}
{"model":"deepseek_r1_distill_qwen_1.5b","pass1":0.6258333333,"pass@count":0.934,"win_rate":0.3160753693,"count":12.0,"SE(A)":0.0068434782,"SE_x(A)":0.0038767974,"SE_pred(A)":0.0056394713}
{"model":"deepseek_r1_distill_qwen_14b","pass1":0.6133818182,"pass@count":0.9256,"win_rate":0.3274013568,"count":11.0,"SE(A)":0.0068868652,"SE_x(A)":0.0041018625,"SE_pred(A)":0.0055320553}
{"model":"qwen2-72b-instruct","pass1":0.59312,"pass@count":0.856,"win_rate":0.2741739165,"count":10.0,"SE(A)":0.0069473544,"SE_x(A)":0.0051329827,"SE_pred(A)":0.0046816901}
{"model":"deepseek_r1_distill_qwen_32b","pass1":0.5801777778,"pass@count":0.9018,"win_rate":0.3063521002,"count":9.0,"SE(A)":0.0069795634,"SE_x(A)":0.0043787206,"SE_pred(A)":0.0054351735}
{"model":"qwen2.5-coder-32b-instruct","pass1":0.55148,"pass@count":0.8612,"win_rate":0.2606972482,"count":10.0,"SE(A)":0.0070334886,"SE_x(A)":0.004953177,"SE_pred(A)":0.0049935959}
{"model":"google_gemma_2_27b_it","pass1":0.5194285714,"pass@count":0.7156,"win_rate":0.2249734908,"count":7.0,"SE(A)":0.0070657276,"SE_x(A)":0.0058212036,"SE_pred(A)":0.0040047591}
{"model":"qwen2.5-coder-14b-instruct","pass1":0.4941333333,"pass@count":0.823,"win_rate":0.2170713175,"count":9.0,"SE(A)":0.0070705811,"SE_x(A)":0.004773981,"SE_pred(A)":0.005215575}
{"model":"qwen2.5-coder-7b-instruct","pass1":0.4589818182,"pass@count":0.8338,"win_rate":0.2011169084,"count":11.0,"SE(A)":0.0070472336,"SE_x(A)":0.0044583172,"SE_pred(A)":0.0054577385}
{"model":"google_gemma_2_9b_it","pass1":0.44878,"pass@count":0.6698,"win_rate":0.1823829278,"count":10.0,"SE(A)":0.0070338682,"SE_x(A)":0.0058722863,"SE_pred(A)":0.003871893}
{"model":"qwen1.5-72b-chat","pass1":0.43464,"pass@count":0.7266,"win_rate":0.1780802487,"count":10.0,"SE(A)":0.0070103933,"SE_x(A)":0.0053320678,"SE_pred(A)":0.0045513368}
{"model":"qwen1.5-32b-chat","pass1":0.4224,"pass@count":0.7242,"win_rate":0.1713996524,"count":10.0,"SE(A)":0.0069853882,"SE_x(A)":0.0052286479,"SE_pred(A)":0.0046321581}
{"model":"mistralai_mixtral_8x22b_instruct_v0.1","pass1":0.41762,"pass@count":0.7536,"win_rate":0.1709496295,"count":10.0,"SE(A)":0.0069744324,"SE_x(A)":0.0050107037,"SE_pred(A)":0.0048513457}
{"model":"mistralai_mathstral_7b_v0.1","pass1":0.3972727273,"pass@count":0.7486,"win_rate":0.1593628553,"count":11.0,"SE(A)":0.0069202183,"SE_x(A)":0.0048899864,"SE_pred(A)":0.0048966779}
{"model":"mistralai_ministral_8b_instruct_2410","pass1":0.3830181818,"pass@count":0.7426,"win_rate":0.1523388288,"count":11.0,"SE(A)":0.0068748128,"SE_x(A)":0.004811705,"SE_pred(A)":0.004910249}
{"model":"llama-3.2-3B-instruct","pass1":0.3777789474,"pass@count":0.3802,"win_rate":0.1509440137,"count":19.0,"SE(A)":0.0068565591,"SE_x(A)":0.0068261969,"SE_pred(A)":0.0006445453}
{"model":"llama-3.1-8B-instruct","pass1":0.374575,"pass@count":0.3768,"win_rate":0.1496652338,"count":16.0,"SE(A)":0.0068449773,"SE_x(A)":0.0068150603,"SE_pred(A)":0.0006392704}
{"model":"qwen3-0.6b","pass1":0.3704615385,"pass@count":0.7084,"win_rate":0.1477884389,"count":13.0,"SE(A)":0.0068296382,"SE_x(A)":0.0050320367,"SE_pred(A)":0.0046176362}
{"model":"qwen2.5-coder-3b-instruct","pass1":0.3666666667,"pass@count":0.7584,"win_rate":0.148570696,"count":12.0,"SE(A)":0.0068150161,"SE_x(A)":0.0043957509,"SE_pred(A)":0.0052078612}
{"model":"qwen2-7b-instruct","pass1":0.3624,"pass@count":0.7262,"win_rate":0.1502876868,"count":11.0,"SE(A)":0.0067980327,"SE_x(A)":0.0044392641,"SE_pred(A)":0.0051484155}
{"model":"google_gemma_3_1b_it","pass1":0.3329333333,"pass@count":0.6032,"win_rate":0.129665002,"count":12.0,"SE(A)":0.006664664,"SE_x(A)":0.0052876804,"SE_pred(A)":0.0040568685}
{"model":"qwen1.5-14b-chat","pass1":0.31522,"pass@count":0.6398,"win_rate":0.1182225892,"count":10.0,"SE(A)":0.0065704848,"SE_x(A)":0.0047236689,"SE_pred(A)":0.0045670803}
{"model":"qwen2.5-coder-1.5b-instruct","pass1":0.2717166667,"pass@count":0.6594,"win_rate":0.1014011258,"count":12.0,"SE(A)":0.0062910527,"SE_x(A)":0.004045208,"SE_pred(A)":0.0048180532}
{"model":"deepseek_v2_lite_chat","pass1":0.2573,"pass@count":0.573,"win_rate":0.0909566002,"count":10.0,"SE(A)":0.0061821794,"SE_x(A)":0.0043684993,"SE_pred(A)":0.0043744206}
{"model":"mistralai_mixtral_8x7b_instruct_v0.1","pass1":0.2559,"pass@count":0.5856,"win_rate":0.0915643119,"count":10.0,"SE(A)":0.0061711456,"SE_x(A)":0.0042580035,"SE_pred(A)":0.0044668159}
{"model":"google_codegemma_1.1_7b_it","pass1":0.2054307692,"pass@count":0.5326,"win_rate":0.0711860677,"count":13.0,"SE(A)":0.0057136498,"SE_x(A)":0.0040019417,"SE_pred(A)":0.0040780211}
{"model":"qwen1.5-7b-chat","pass1":0.20542,"pass@count":0.515,"win_rate":0.0701869066,"count":10.0,"SE(A)":0.0057135387,"SE_x(A)":0.0037919904,"SE_pred(A)":0.0042737961}
{"model":"llama-3.2-1B-instruct","pass1":0.1871238095,"pass@count":0.1886,"win_rate":0.0637867497,"count":21.0,"SE(A)":0.0055155868,"SE_x(A)":0.0054920534,"SE_pred(A)":0.0005089672}
{"model":"qwen2-1.5b-instruct","pass1":0.15145,"pass@count":0.5006,"win_rate":0.0505804437,"count":12.0,"SE(A)":0.0050697711,"SE_x(A)":0.0029696195,"SE_pred(A)":0.0041090071}
{"model":"mistralai_mistral_7b_instruct_v0.3","pass1":0.1318181818,"pass@count":0.4272,"win_rate":0.0425652669,"count":11.0,"SE(A)":0.0047841854,"SE_x(A)":0.0030091486,"SE_pred(A)":0.0037193352}
{"model":"google_gemma_7b_it","pass1":0.1188,"pass@count":0.3168,"win_rate":0.0390723078,"count":12.0,"SE(A)":0.0045757308,"SE_x(A)":0.0034067282,"SE_pred(A)":0.0030547529}
{"model":"mistralai_mistral_7b_instruct_v0.2","pass1":0.1033636364,"pass@count":0.3382,"win_rate":0.0319819436,"count":11.0,"SE(A)":0.0043053361,"SE_x(A)":0.0027833124,"SE_pred(A)":0.0032846752}
{"model":"qwen2-0.5b-instruct","pass1":0.0761846154,"pass@count":0.3438,"win_rate":0.0240980004,"count":13.0,"SE(A)":0.0037518134,"SE_x(A)":0.0020284392,"SE_pred(A)":0.0031561905}
{"model":"qwen2.5-coder-0.5b-instruct","pass1":0.0721692308,"pass@count":0.3398,"win_rate":0.0229150394,"count":13.0,"SE(A)":0.0036595309,"SE_x(A)":0.0018608965,"SE_pred(A)":0.0031510682}
{"model":"mistralai_mistral_7b_instruct_v0.1","pass1":0.0705818182,"pass@count":0.2878,"win_rate":0.0220804859,"count":11.0,"SE(A)":0.0036221547,"SE_x(A)":0.0020863203,"SE_pred(A)":0.0029609581}
{"model":"google_gemma_2b_it","pass1":0.06335,"pass@count":0.2224,"win_rate":0.0211280972,"count":12.0,"SE(A)":0.0034449028,"SE_x(A)":0.0023176054,"SE_pred(A)":0.0025487371}
{"model":"qwen1.5-1.8b-chat","pass1":0.0522,"pass@count":0.2538,"win_rate":0.0167246614,"count":10.0,"SE(A)":0.003145637,"SE_x(A)":0.0014550788,"SE_pred(A)":0.0027888668}
{"model":"qwen1.5-0.5b-chat","pass1":0.0129846154,"pass@count":0.1116,"win_rate":0.0048579655,"count":13.0,"SE(A)":0.0016010006,"SE_x(A)":0.000488979,"SE_pred(A)":0.0015245008}
