{"model":"qwen3-32b","pass1":0.9253292418,"pass@count":0.9774964838,"win_rate":0.3191709049,"count":11.0,"SE(A)":0.0098579957,"SE_x(A)":0.0080861423,"SE_pred(A)":0.0056386506}
{"model":"qwen3-14b","pass1":0.9028257256,"pass@count":0.9563994374,"win_rate":0.3030328157,"count":11.0,"SE(A)":0.0111081776,"SE_x(A)":0.0095931656,"SE_pred(A)":0.0056002485}
{"model":"qwen2-72b-instruct","pass1":0.8895139866,"pass@count":0.9563994374,"win_rate":0.2964038484,"count":9.0,"SE(A)":0.0117569655,"SE_x(A)":0.0095665954,"SE_pred(A)":0.0068342146}
{"model":"llama-3.1-70B-instruct","pass1":0.8874824191,"pass@count":0.8874824191,"win_rate":0.3000403007,"count":11.0,"SE(A)":0.0118510075,"SE_x(A)":0.0118510075,"SE_pred(A)":0.0}
{"model":"google_gemma_3_27b_it","pass1":0.8864978903,"pass@count":0.94092827,"win_rate":0.2939375811,"count":10.0,"SE(A)":0.0118961387,"SE_x(A)":0.0101352975,"SE_pred(A)":0.0062284718}
{"model":"deepseek_r1_distill_llama_70b","pass1":0.8863884982,"pass@count":0.9592123769,"win_rate":0.2979420842,"count":9.0,"SE(A)":0.0119011357,"SE_x(A)":0.0094715576,"SE_pred(A)":0.0072060132}
{"model":"deepseek_r1_distill_qwen_32b","pass1":0.8718549773,"pass@count":0.964838256,"win_rate":0.288621387,"count":9.0,"SE(A)":0.0125354,"SE_x(A)":0.0096156904,"SE_pred(A)":0.0080420613}
{"model":"qwen3-8b","pass1":0.8684311469,"pass@count":0.9578059072,"win_rate":0.2829840659,"count":11.0,"SE(A)":0.0126767942,"SE_x(A)":0.0104583423,"SE_pred(A)":0.0071640901}
{"model":"qwen2.5-coder-32b-instruct","pass1":0.8640412564,"pass@count":0.9549929677,"win_rate":0.2794948298,"count":9.0,"SE(A)":0.0128539324,"SE_x(A)":0.0103164117,"SE_pred(A)":0.0076678046}
{"model":"deepseek_r1_distill_qwen_14b","pass1":0.8525550867,"pass@count":0.9549929677,"win_rate":0.2763561988,"count":12.0,"SE(A)":0.013296622,"SE_x(A)":0.0103684478,"SE_pred(A)":0.0083243886}
{"model":"google_gemma_3_12b_it","pass1":0.8518092316,"pass@count":0.9310829817,"win_rate":0.2718918744,"count":11.0,"SE(A)":0.013324378,"SE_x(A)":0.0113682034,"SE_pred(A)":0.0069500361}
{"model":"qwen3-4b","pass1":0.833141542,"pass@count":0.9226441632,"win_rate":0.2670686071,"count":11.0,"SE(A)":0.0139829469,"SE_x(A)":0.0120142616,"SE_pred(A)":0.0071540423}
{"model":"google_gemma_2_27b_it","pass1":0.8308016878,"pass@count":0.929676512,"win_rate":0.2612579711,"count":10.0,"SE(A)":0.0140608604,"SE_x(A)":0.0118586244,"SE_pred(A)":0.0075551853}
{"model":"qwen1.5-72b-chat","pass1":0.8154399125,"pass@count":0.9381153305,"win_rate":0.2540183476,"count":9.0,"SE(A)":0.0145488972,"SE_x(A)":0.0113831844,"SE_pred(A)":0.0090605476}
{"model":"google_gemma_2_9b_it","pass1":0.8086029067,"pass@count":0.9240506329,"win_rate":0.2499832722,"count":12.0,"SE(A)":0.0147536854,"SE_x(A)":0.0123343943,"SE_pred(A)":0.0080953042}
{"model":"qwen2-math-72b-instruct","pass1":0.8051258009,"pass@count":0.9479606188,"win_rate":0.2532444253,"count":9.0,"SE(A)":0.0148550543,"SE_x(A)":0.0111066779,"SE_pred(A)":0.0098648033}
{"model":"qwen2.5-coder-14b-instruct","pass1":0.804756425,"pass@count":0.94092827,"win_rate":0.248332376,"count":11.0,"SE(A)":0.014865715,"SE_x(A)":0.0115223119,"SE_pred(A)":0.0093928596}
{"model":"qwen1.5-32b-chat","pass1":0.7905638665,"pass@count":0.94092827,"win_rate":0.2427061119,"count":11.0,"SE(A)":0.015260174,"SE_x(A)":0.011788316,"SE_pred(A)":0.0096906407}
{"model":"mistralai_mixtral_8x22b_instruct_v0.1","pass1":0.7680603503,"pass@count":0.9437412096,"win_rate":0.2486593075,"count":11.0,"SE(A)":0.0158288867,"SE_x(A)":0.0108751501,"SE_pred(A)":0.0115015115}
{"model":"llama-3.1-8B-instruct","pass1":0.7341772152,"pass@count":0.7341772152,"win_rate":0.2181004293,"count":15.0,"SE(A)":0.0165676821,"SE_x(A)":0.0165676821,"SE_pred(A)":0.0}
{"model":"qwen2-7b-instruct","pass1":0.7259728083,"pass@count":0.9367088608,"win_rate":0.2140062884,"count":12.0,"SE(A)":0.0167271598,"SE_x(A)":0.0123262723,"SE_pred(A)":0.0113075589}
{"model":"qwen1.5-14b-chat","pass1":0.7181946043,"pass@count":0.8902953586,"win_rate":0.2068450048,"count":11.0,"SE(A)":0.0168717807,"SE_x(A)":0.0137367527,"SE_pred(A)":0.0097958464}
{"model":"mistralai_mixtral_8x7b_instruct_v0.1","pass1":0.7098836466,"pass@count":0.9240506329,"win_rate":0.2088344768,"count":11.0,"SE(A)":0.0170194253,"SE_x(A)":0.0127511259,"SE_pred(A)":0.0112725163}
{"model":"qwen2.5-coder-7b-instruct","pass1":0.7092123769,"pass@count":0.9465541491,"win_rate":0.2069353747,"count":12.0,"SE(A)":0.0170310456,"SE_x(A)":0.0121262006,"SE_pred(A)":0.011958753}
{"model":"qwen3-1.7b","pass1":0.7075821506,"pass@count":0.876230661,"win_rate":0.2093026212,"count":11.0,"SE(A)":0.0170590787,"SE_x(A)":0.0139210177,"SE_pred(A)":0.0098598902}
{"model":"mistralai_ministral_8b_instruct_2410","pass1":0.6924519456,"pass@count":0.9578059072,"win_rate":0.2010888486,"count":12.0,"SE(A)":0.0173067895,"SE_x(A)":0.0119160278,"SE_pred(A)":0.0125512248}
{"model":"google_gemma_3_4b_it","pass1":0.6850589635,"pass@count":0.864978903,"win_rate":0.1971478251,"count":13.0,"SE(A)":0.0174198256,"SE_x(A)":0.0145601715,"SE_pred(A)":0.0095630397}
{"model":"deepseek_r1_distill_llama_8b","pass1":0.6322081575,"pass@count":0.9254571027,"win_rate":0.1823087125,"count":12.0,"SE(A)":0.0180840748,"SE_x(A)":0.0116345312,"SE_pred(A)":0.0138445456}
{"model":"qwen2-math-7b-instruct","pass1":0.618590973,"pass@count":0.9268635724,"win_rate":0.1857111348,"count":11.0,"SE(A)":0.0182163966,"SE_x(A)":0.0124393764,"SE_pred(A)":0.0133078555}
{"model":"qwen1.5-7b-chat","pass1":0.6150961088,"pass@count":0.900140647,"win_rate":0.1710834184,"count":12.0,"SE(A)":0.0182478978,"SE_x(A)":0.0135312752,"SE_pred(A)":0.0122429721}
{"model":"llama-3.2-3B-instruct","pass1":0.6033755274,"pass@count":0.6033755274,"win_rate":0.1643248336,"count":18.0,"SE(A)":0.018346313,"SE_x(A)":0.018346313,"SE_pred(A)":0.0}
{"model":"mistralai_mistral_7b_instruct_v0.3","pass1":0.5971636193,"pass@count":0.8691983122,"win_rate":0.1654916229,"count":12.0,"SE(A)":0.0183940013,"SE_x(A)":0.0139963203,"SE_pred(A)":0.0119349195}
{"model":"mistralai_mathstral_7b_v0.1","pass1":0.5764181903,"pass@count":0.9226441632,"win_rate":0.1591648452,"count":12.0,"SE(A)":0.0185311636,"SE_x(A)":0.0119849956,"SE_pred(A)":0.0141337859}
{"model":"deepseek_v2_lite_chat","pass1":0.5715381665,"pass@count":0.9029535865,"win_rate":0.1582721235,"count":11.0,"SE(A)":0.0185585435,"SE_x(A)":0.0129449714,"SE_pred(A)":0.0132983929}
{"model":"qwen2.5-coder-3b-instruct","pass1":0.5529983378,"pass@count":0.929676512,"win_rate":0.1524204258,"count":11.0,"SE(A)":0.0186458283,"SE_x(A)":0.0119454537,"SE_pred(A)":0.0143168799}
{"model":"deepseek_r1_distill_qwen_7b","pass1":0.5437177684,"pass@count":0.8860759494,"win_rate":0.1553524135,"count":12.0,"SE(A)":0.0186796503,"SE_x(A)":0.0118515206,"SE_pred(A)":0.0144385177}
{"model":"google_codegemma_1.1_7b_it","pass1":0.5288326301,"pass@count":0.8340365682,"win_rate":0.1417718583,"count":13.0,"SE(A)":0.0187202621,"SE_x(A)":0.0139727934,"SE_pred(A)":0.0124583007}
{"model":"mistralai_mistral_7b_instruct_v0.2","pass1":0.5229723394,"pass@count":0.8073136428,"win_rate":0.1424706719,"count":12.0,"SE(A)":0.0187316632,"SE_x(A)":0.0138874911,"SE_pred(A)":0.0125703142}
{"model":"mistralai_mistral_7b_instruct_v0.1","pass1":0.4896858884,"pass@count":0.8888888889,"win_rate":0.131307735,"count":12.0,"SE(A)":0.018747475,"SE_x(A)":0.0124294537,"SE_pred(A)":0.0140348317}
{"model":"google_gemma_7b_it","pass1":0.4812470699,"pass@count":0.7735583685,"win_rate":0.131362947,"count":12.0,"SE(A)":0.0187382716,"SE_x(A)":0.0142362607,"SE_pred(A)":0.0121840759}
{"model":"qwen3-0.6b","pass1":0.4718706048,"pass@count":0.8227848101,"win_rate":0.1285951354,"count":12.0,"SE(A)":0.0187217668,"SE_x(A)":0.0136212419,"SE_pred(A)":0.0128439215}
{"model":"qwen2-1.5b-instruct","pass1":0.3924050633,"pass@count":0.841068917,"win_rate":0.104367479,"count":11.0,"SE(A)":0.01831216,"SE_x(A)":0.0115507274,"SE_pred(A)":0.0142097114}
{"model":"google_gemma_3_1b_it","pass1":0.3537271449,"pass@count":0.7130801688,"win_rate":0.0938835104,"count":12.0,"SE(A)":0.0179311173,"SE_x(A)":0.0131329289,"SE_pred(A)":0.0122086505}
{"model":"qwen2-math-1.5b-instruct","pass1":0.342795039,"pass@count":0.8565400844,"win_rate":0.1069330778,"count":11.0,"SE(A)":0.0178005278,"SE_x(A)":0.0095256202,"SE_pred(A)":0.0150373319}
{"model":"qwen2.5-coder-1.5b-instruct","pass1":0.3390870733,"pass@count":0.8860759494,"win_rate":0.0975069509,"count":11.0,"SE(A)":0.0177538661,"SE_x(A)":0.0080722173,"SE_pred(A)":0.0158126237}
{"model":"google_gemma_2b_it","pass1":0.3257149555,"pass@count":0.5091420534,"win_rate":0.0927587024,"count":12.0,"SE(A)":0.0175754241,"SE_x(A)":0.0152394862,"SE_pred(A)":0.0087552037}
{"model":"llama-3.2-1B-instruct","pass1":0.3192686357,"pass@count":0.3192686357,"win_rate":0.0866606167,"count":21.0,"SE(A)":0.0174836139,"SE_x(A)":0.0174836139,"SE_pred(A)":0.0}
{"model":"qwen1.5-1.8b-chat","pass1":0.3050867323,"pass@count":0.7946554149,"win_rate":0.0828957743,"count":12.0,"SE(A)":0.0172680049,"SE_x(A)":0.0098303992,"SE_pred(A)":0.0141967335}
{"model":"deepseek_r1_distill_qwen_1.5b","pass1":0.2648851383,"pass@count":0.776371308,"win_rate":0.073373563,"count":12.0,"SE(A)":0.016548994,"SE_x(A)":0.0080934115,"SE_pred(A)":0.0144348845}
{"model":"qwen2-0.5b-instruct","pass1":0.2226549821,"pass@count":0.7805907173,"win_rate":0.0707563239,"count":13.0,"SE(A)":0.015602289,"SE_x(A)":0.007862657,"SE_pred(A)":0.0134762772}
{"model":"qwen2.5-coder-0.5b-instruct","pass1":0.1967885607,"pass@count":0.8593530239,"win_rate":0.0681680435,"count":12.0,"SE(A)":0.0149100832,"SE_x(A)":0.003931702,"SE_pred(A)":0.0143823607}
{"model":"qwen1.5-0.5b-chat","pass1":0.1966713549,"pass@count":0.7341772152,"win_rate":0.0623394695,"count":12.0,"SE(A)":0.0149067299,"SE_x(A)":0.0068245303,"SE_pred(A)":0.0132527877}
