{"model":"google_gemma_3_12b_it","pass1":0.2755516328,"pass@count":0.5689320388,"win_rate":0.2288878495,"count":11.0,"SE(A)":0.0196879999,"SE_x(A)":0.014170814,"SE_pred(A)":0.0136676761}
{"model":"qwen2-72b-instruct","pass1":0.253592233,"pass@count":0.5902912621,"win_rate":0.2098659787,"count":10.0,"SE(A)":0.0191713385,"SE_x(A)":0.0131970236,"SE_pred(A)":0.0139060702}
{"model":"qwen3-32b","pass1":0.2485436893,"pass@count":0.586407767,"win_rate":0.2063326464,"count":10.0,"SE(A)":0.0190436251,"SE_x(A)":0.0128390107,"SE_pred(A)":0.0140648307}
{"model":"qwen3-14b","pass1":0.2459546926,"pass@count":0.5339805825,"win_rate":0.2013683724,"count":12.0,"SE(A)":0.0189767861,"SE_x(A)":0.0136610262,"SE_pred(A)":0.0131717415}
{"model":"qwen2.5-coder-32b-instruct","pass1":0.2365048544,"pass@count":0.5106796117,"win_rate":0.1917075922,"count":10.0,"SE(A)":0.0187249021,"SE_x(A)":0.0131813911,"SE_pred(A)":0.0132993566}
{"model":"qwen3-4b","pass1":0.2165048544,"pass@count":0.4990291262,"win_rate":0.1754175699,"count":12.0,"SE(A)":0.0181488186,"SE_x(A)":0.0128172741,"SE_pred(A)":0.0128490117}
{"model":"qwen3-8b","pass1":0.1737864078,"pass@count":0.4640776699,"win_rate":0.139599977,"count":12.0,"SE(A)":0.0166974617,"SE_x(A)":0.0115506255,"SE_pred(A)":0.0120577062}
{"model":"qwen2.5-coder-14b-instruct","pass1":0.167961165,"pass@count":0.4854368932,"win_rate":0.1332175949,"count":12.0,"SE(A)":0.0164729972,"SE_x(A)":0.0102972824,"SE_pred(A)":0.0128579007}
{"model":"qwen1.5-32b-chat","pass1":0.1353927626,"pass@count":0.440776699,"win_rate":0.1085289268,"count":11.0,"SE(A)":0.0150766045,"SE_x(A)":0.0092893303,"SE_pred(A)":0.011874862}
{"model":"qwen1.5-72b-chat","pass1":0.133592233,"pass@count":0.413592233,"win_rate":0.1072753789,"count":10.0,"SE(A)":0.014991606,"SE_x(A)":0.0091963995,"SE_pred(A)":0.0118395306}
{"model":"google_gemma_7b_it","pass1":0.1299477222,"pass@count":0.3339805825,"win_rate":0.1090581549,"count":13.0,"SE(A)":0.0148167651,"SE_x(A)":0.0108639863,"SE_pred(A)":0.0100752334}
{"model":"google_gemma_2_27b_it","pass1":0.1273786408,"pass@count":0.3766990291,"win_rate":0.1044510065,"count":10.0,"SE(A)":0.0146912115,"SE_x(A)":0.0099473515,"SE_pred(A)":0.0108111929}
{"model":"qwen2-math-72b-instruct","pass1":0.114368932,"pass@count":0.2291262136,"win_rate":0.0904538533,"count":10.0,"SE(A)":0.0140241594,"SE_x(A)":0.011187736,"SE_pred(A)":0.0084564537}
{"model":"qwen2.5-coder-7b-instruct","pass1":0.112038835,"pass@count":0.3980582524,"win_rate":0.0876113459,"count":10.0,"SE(A)":0.0138988115,"SE_x(A)":0.0075375561,"SE_pred(A)":0.011677423}
{"model":"google_gemma_3_4b_it","pass1":0.110978342,"pass@count":0.3805825243,"win_rate":0.0884945066,"count":13.0,"SE(A)":0.013841134,"SE_x(A)":0.0086482237,"SE_pred(A)":0.0108067209}
{"model":"llama-3.1-8B-instruct","pass1":0.1048543689,"pass@count":0.1048543689,"win_rate":0.0859160971,"count":15.0,"SE(A)":0.0135000849,"SE_x(A)":0.0135000849,"SE_pred(A)":0.0}
{"model":"google_gemma_2_9b_it","pass1":0.0928508385,"pass@count":0.2951456311,"win_rate":0.0750784069,"count":11.0,"SE(A)":0.0127887663,"SE_x(A)":0.0086869253,"SE_pred(A)":0.0093856205}
{"model":"qwen1.5-14b-chat","pass1":0.0917475728,"pass@count":0.3883495146,"win_rate":0.0741863518,"count":12.0,"SE(A)":0.0127202885,"SE_x(A)":0.0068854101,"SE_pred(A)":0.0106956471}
{"model":"mistralai_mixtral_8x22b_instruct_v0.1","pass1":0.0893203883,"pass@count":0.3747572816,"win_rate":0.0707336687,"count":11.0,"SE(A)":0.012567662,"SE_x(A)":0.0065300032,"SE_pred(A)":0.0107380253}
{"model":"qwen2-1.5b-instruct","pass1":0.0757281553,"pass@count":0.3844660194,"win_rate":0.0649669749,"count":13.0,"SE(A)":0.0116580238,"SE_x(A)":0.0058609599,"SE_pred(A)":0.010077632}
{"model":"google_codegemma_1.1_7b_it","pass1":0.0745332338,"pass@count":0.3825242718,"win_rate":0.0599514465,"count":13.0,"SE(A)":0.0115731553,"SE_x(A)":0.0050224691,"SE_pred(A)":0.0104265396}
{"model":"qwen2-7b-instruct","pass1":0.0713150927,"pass@count":0.3106796117,"win_rate":0.0551575904,"count":11.0,"SE(A)":0.0113402154,"SE_x(A)":0.005929884,"SE_pred(A)":0.0096662796}
{"model":"deepseek_r1_distill_qwen_32b","pass1":0.066407767,"pass@count":0.1980582524,"win_rate":0.0490120265,"count":10.0,"SE(A)":0.0109719656,"SE_x(A)":0.0073044499,"SE_pred(A)":0.0081871266}
{"model":"mistralai_mistral_7b_instruct_v0.3","pass1":0.06619594,"pass@count":0.2873786408,"win_rate":0.0532959805,"count":11.0,"SE(A)":0.0109556952,"SE_x(A)":0.0055162697,"SE_pred(A)":0.0094656234}
{"model":"qwen3-1.7b","pass1":0.0661812298,"pass@count":0.227184466,"win_rate":0.0505170438,"count":12.0,"SE(A)":0.0109545641,"SE_x(A)":0.0073944386,"SE_pred(A)":0.008082373}
{"model":"qwen1.5-7b-chat","pass1":0.0656957929,"pass@count":0.3436893204,"win_rate":0.0528995629,"count":12.0,"SE(A)":0.010917151,"SE_x(A)":0.0050095884,"SE_pred(A)":0.0096999077}
{"model":"llama-3.2-3B-instruct","pass1":0.0640776699,"pass@count":0.0640776699,"win_rate":0.0507438934,"count":17.0,"SE(A)":0.0107911976,"SE_x(A)":0.0107911976,"SE_pred(A)":0.0}
{"model":"deepseek_v2_lite_chat","pass1":0.0640776699,"pass@count":0.3145631068,"win_rate":0.0521368404,"count":11.0,"SE(A)":0.0107911976,"SE_x(A)":0.0047396798,"SE_pred(A)":0.0096946058}
{"model":"qwen2-math-7b-instruct","pass1":0.0614886731,"pass@count":0.1398058252,"win_rate":0.0460968237,"count":6.0,"SE(A)":0.0105855568,"SE_x(A)":0.007550259,"SE_pred(A)":0.0074194071}
{"model":"deepseek_r1_distill_qwen_14b","pass1":0.0601941748,"pass@count":0.1922330097,"win_rate":0.0448407697,"count":11.0,"SE(A)":0.0104807578,"SE_x(A)":0.0068444577,"SE_pred(A)":0.007937234}
{"model":"mistralai_ministral_8b_instruct_2410","pass1":0.0563106796,"pass@count":0.3106796117,"win_rate":0.0440999384,"count":11.0,"SE(A)":0.0101579551,"SE_x(A)":0.0044524485,"SE_pred(A)":0.0091301563}
{"model":"mistralai_mistral_7b_instruct_v0.1","pass1":0.0552515446,"pass@count":0.2815533981,"win_rate":0.0448523086,"count":11.0,"SE(A)":0.0100676172,"SE_x(A)":0.0044934217,"SE_pred(A)":0.0090092218}
{"model":"deepseek_r1_distill_llama_70b","pass1":0.0547572816,"pass@count":0.172815534,"win_rate":0.0398155566,"count":10.0,"SE(A)":0.0100251065,"SE_x(A)":0.0066292538,"SE_pred(A)":0.007520356}
{"model":"mistralai_mistral_7b_instruct_v0.2","pass1":0.0532038835,"pass@count":0.2349514563,"win_rate":0.0433215906,"count":10.0,"SE(A)":0.0098899999,"SE_x(A)":0.0050252522,"SE_pred(A)":0.0085181535}
{"model":"qwen2.5-coder-3b-instruct","pass1":0.051618123,"pass@count":0.2951456311,"win_rate":0.0391790957,"count":12.0,"SE(A)":0.009749652,"SE_x(A)":0.0042112095,"SE_pred(A)":0.0087932604}
{"model":"google_gemma_2b_it","pass1":0.0510828977,"pass@count":0.132038835,"win_rate":0.0409910376,"count":13.0,"SE(A)":0.0097017099,"SE_x(A)":0.007420638,"SE_pred(A)":0.0062495846}
{"model":"mistralai_mathstral_7b_v0.1","pass1":0.0506619594,"pass@count":0.2796116505,"win_rate":0.0398069405,"count":11.0,"SE(A)":0.0096637974,"SE_x(A)":0.0037606346,"SE_pred(A)":0.0089020564}
{"model":"qwen2-0.5b-instruct","pass1":0.0489917849,"pass@count":0.2699029126,"win_rate":0.0402645208,"count":13.0,"SE(A)":0.0095115249,"SE_x(A)":0.004116628,"SE_pred(A)":0.0085745251}
{"model":"qwen1.5-1.8b-chat","pass1":0.0430714916,"pass@count":0.2233009709,"win_rate":0.0349673928,"count":11.0,"SE(A)":0.0089460452,"SE_x(A)":0.0035269008,"SE_pred(A)":0.0082214777}
{"model":"llama-3.2-1B-instruct","pass1":0.0427184466,"pass@count":0.0427184466,"win_rate":0.0347269653,"count":12.0,"SE(A)":0.008910949,"SE_x(A)":0.008910949,"SE_pred(A)":0.0}
{"model":"google_gemma_3_1b_it","pass1":0.0422330097,"pass@count":0.1922330097,"win_rate":0.0314646021,"count":12.0,"SE(A)":0.0088624201,"SE_x(A)":0.0051227629,"SE_pred(A)":0.0072318595}
{"model":"deepseek_r1_distill_qwen_7b","pass1":0.0416593116,"pass@count":0.1262135922,"win_rate":0.0294995944,"count":11.0,"SE(A)":0.008804656,"SE_x(A)":0.0058712045,"SE_pred(A)":0.0065613204}
{"model":"qwen2-math-1.5b-instruct","pass1":0.0402912621,"pass@count":0.0912621359,"win_rate":0.0294938969,"count":4.0,"SE(A)":0.0086650594,"SE_x(A)":0.0054336408,"SE_pred(A)":0.0067497261}
{"model":"qwen1.5-0.5b-chat","pass1":0.0391336818,"pass@count":0.2058252427,"win_rate":0.0326338422,"count":13.0,"SE(A)":0.008544826,"SE_x(A)":0.0037017587,"SE_pred(A)":0.0077013658}
{"model":"deepseek_r1_distill_llama_8b","pass1":0.0354368932,"pass@count":0.1708737864,"win_rate":0.0257378368,"count":12.0,"SE(A)":0.0081468464,"SE_x(A)":0.0042910336,"SE_pred(A)":0.0069251814}
{"model":"qwen3-0.6b","pass1":0.0265870052,"pass@count":0.1553398058,"win_rate":0.019362876,"count":13.0,"SE(A)":0.0070889133,"SE_x(A)":0.0036565839,"SE_pred(A)":0.0060730623}
{"model":"deepseek_r1_distill_qwen_1.5b","pass1":0.0224919094,"pass@count":0.1029126214,"win_rate":0.015690216,"count":12.0,"SE(A)":0.0065338586,"SE_x(A)":0.0034702534,"SE_pred(A)":0.0055361222}
{"model":"qwen2.5-coder-1.5b-instruct","pass1":0.0180052957,"pass@count":0.1184466019,"win_rate":0.0133131544,"count":11.0,"SE(A)":0.0058593722,"SE_x(A)":0.0023244607,"SE_pred(A)":0.0053785802}
{"model":"qwen2.5-coder-0.5b-instruct","pass1":0.0168782674,"pass@count":0.132038835,"win_rate":0.0132642722,"count":13.0,"SE(A)":0.005676282,"SE_x(A)":0.0017389288,"SE_pred(A)":0.0054033604}
