{"model":"qwen3-32b","pass1":0.9416996047,"pass@count":1.0,"win_rate":0.4073871468,"count":11.0,"SE(A)":0.0244285692,"SE_x(A)":0.018235524,"SE_pred(A)":0.0162548656}
{"model":"qwen3-14b","pass1":0.9094202899,"pass@count":0.9891304348,"win_rate":0.3820967894,"count":12.0,"SE(A)":0.029922913,"SE_x(A)":0.0240898497,"SE_pred(A)":0.0177499257}
{"model":"qwen3-8b","pass1":0.8704710145,"pass@count":1.0,"win_rate":0.3628115879,"count":12.0,"SE(A)":0.0350079537,"SE_x(A)":0.0258098151,"SE_pred(A)":0.0236518554}
{"model":"qwen2.5-coder-32b-instruct","pass1":0.8695652174,"pass@count":0.9565217391,"win_rate":0.3546016888,"count":11.0,"SE(A)":0.0351118632,"SE_x(A)":0.0291260484,"SE_pred(A)":0.0196090856}
{"model":"qwen2-72b-instruct","pass1":0.847826087,"pass@count":0.9673913043,"win_rate":0.3480440725,"count":11.0,"SE(A)":0.0374480819,"SE_x(A)":0.0284134709,"SE_pred(A)":0.0243933087}
{"model":"google_gemma_3_27b_it","pass1":0.839673913,"pass@count":0.9239130435,"win_rate":0.3327258919,"count":12.0,"SE(A)":0.038252825,"SE_x(A)":0.0337397856,"SE_pred(A)":0.0180251347}
{"model":"deepseek_r1_distill_qwen_32b","pass1":0.8359683794,"pass@count":0.9239130435,"win_rate":0.3456029511,"count":11.0,"SE(A)":0.0386068883,"SE_x(A)":0.0311551449,"SE_pred(A)":0.0228001924}
{"model":"deepseek_r1_distill_llama_70b","pass1":0.831027668,"pass@count":0.9239130435,"win_rate":0.3437494219,"count":11.0,"SE(A)":0.0390680403,"SE_x(A)":0.0323616683,"SE_pred(A)":0.0218868499}
{"model":"deepseek_r1_distill_qwen_14b","pass1":0.8063241107,"pass@count":0.9239130435,"win_rate":0.3271789551,"count":11.0,"SE(A)":0.0412001397,"SE_x(A)":0.0328121986,"SE_pred(A)":0.0249160819}
{"model":"qwen3-4b","pass1":0.7998188406,"pass@count":0.9347826087,"win_rate":0.3128948114,"count":12.0,"SE(A)":0.0417170415,"SE_x(A)":0.0328188169,"SE_pred(A)":0.0257533844}
{"model":"google_gemma_3_12b_it","pass1":0.7845849802,"pass@count":0.8695652174,"win_rate":0.306522877,"count":11.0,"SE(A)":0.0428611749,"SE_x(A)":0.0377443812,"SE_pred(A)":0.0203086681}
{"model":"qwen2.5-coder-14b-instruct","pass1":0.7817028986,"pass@count":0.9673913043,"win_rate":0.3065045417,"count":12.0,"SE(A)":0.0430676259,"SE_x(A)":0.0304702642,"SE_pred(A)":0.0304365471}
{"model":"google_gemma_2_27b_it","pass1":0.75,"pass@count":0.9456521739,"win_rate":0.2941347041,"count":10.0,"SE(A)":0.0451446949,"SE_x(A)":0.0343821299,"SE_pred(A)":0.0292559844}
{"model":"qwen2-math-72b-instruct","pass1":0.7460474308,"pass@count":1.0,"win_rate":0.2992003556,"count":11.0,"SE(A)":0.045380117,"SE_x(A)":0.0266922107,"SE_pred(A)":0.0366998761}
{"model":"mistralai_mixtral_8x22b_instruct_v0.1","pass1":0.7401185771,"pass@count":0.9347826087,"win_rate":0.2971635617,"count":11.0,"SE(A)":0.0457240148,"SE_x(A)":0.0330424358,"SE_pred(A)":0.0316051099}
{"model":"qwen1.5-72b-chat","pass1":0.7055335968,"pass@count":0.9239130435,"win_rate":0.2707845205,"count":11.0,"SE(A)":0.0475206981,"SE_x(A)":0.0331718518,"SE_pred(A)":0.0340271214}
{"model":"qwen1.5-32b-chat","pass1":0.70256917,"pass@count":0.9347826087,"win_rate":0.2683965399,"count":11.0,"SE(A)":0.0476588569,"SE_x(A)":0.0331433961,"SE_pred(A)":0.034247364}
{"model":"google_gemma_2_9b_it","pass1":0.6986166008,"pass@count":0.8913043478,"win_rate":0.2634521338,"count":11.0,"SE(A)":0.0478393423,"SE_x(A)":0.0360022792,"SE_pred(A)":0.0315029929}
{"model":"deepseek_r1_distill_qwen_7b","pass1":0.6729249012,"pass@count":0.9239130435,"win_rate":0.258480942,"count":11.0,"SE(A)":0.0489117416,"SE_x(A)":0.0357417988,"SE_pred(A)":0.033389853}
{"model":"deepseek_r1_distill_llama_8b","pass1":0.6680602007,"pass@count":0.9239130435,"win_rate":0.2550007621,"count":13.0,"SE(A)":0.0490957105,"SE_x(A)":0.0342913901,"SE_pred(A)":0.0351353008}
{"model":"google_gemma_3_4b_it","pass1":0.6605351171,"pass@count":0.8695652174,"win_rate":0.2482642491,"count":13.0,"SE(A)":0.0493686747,"SE_x(A)":0.0375134996,"SE_pred(A)":0.0320936659}
{"model":"llama-3.1-8B-instruct","pass1":0.652173913,"pass@count":0.652173913,"win_rate":0.2422916033,"count":15.0,"SE(A)":0.0496556731,"SE_x(A)":0.0496556731,"SE_pred(A)":0.0}
{"model":"qwen2.5-coder-7b-instruct","pass1":0.625,"pass@count":0.9565217391,"win_rate":0.2333725322,"count":10.0,"SE(A)":0.0504733033,"SE_x(A)":0.0323586427,"SE_pred(A)":0.0387359341}
{"model":"qwen3-1.7b","pass1":0.6032608696,"pass@count":0.902173913,"win_rate":0.2208206902,"count":12.0,"SE(A)":0.0510048157,"SE_x(A)":0.0355818358,"SE_pred(A)":0.0365434561}
{"model":"mistralai_mathstral_7b_v0.1","pass1":0.5830039526,"pass@count":0.9130434783,"win_rate":0.215842936,"count":11.0,"SE(A)":0.051405289,"SE_x(A)":0.0326461688,"SE_pred(A)":0.0397080772}
{"model":"qwen1.5-14b-chat","pass1":0.5751811594,"pass@count":0.8913043478,"win_rate":0.2102301696,"count":12.0,"SE(A)":0.0515359513,"SE_x(A)":0.0345331293,"SE_pred(A)":0.038254637}
{"model":"llama-3.2-3B-instruct","pass1":0.5652173913,"pass@count":0.5652173913,"win_rate":0.2013228438,"count":17.0,"SE(A)":0.0516832632,"SE_x(A)":0.0516832632,"SE_pred(A)":0.0}
{"model":"qwen2-math-7b-instruct","pass1":0.5543478261,"pass@count":0.8586956522,"win_rate":0.2049900637,"count":6.0,"SE(A)":0.0518197455,"SE_x(A)":0.0331980454,"SE_pred(A)":0.0397891418}
{"model":"qwen2-7b-instruct","pass1":0.5533596838,"pass@count":0.9239130435,"win_rate":0.1994445512,"count":11.0,"SE(A)":0.0518309065,"SE_x(A)":0.0306945746,"SE_pred(A)":0.0417646496}
{"model":"mistralai_mixtral_8x7b_instruct_v0.1","pass1":0.5389492754,"pass@count":0.9239130435,"win_rate":0.2073242458,"count":12.0,"SE(A)":0.0519701999,"SE_x(A)":0.032479115,"SE_pred(A)":0.0405710336}
{"model":"mistralai_ministral_8b_instruct_2410","pass1":0.5375494071,"pass@count":0.9565217391,"win_rate":0.196945408,"count":11.0,"SE(A)":0.0519813974,"SE_x(A)":0.0290504481,"SE_pred(A)":0.0431061148}
{"model":"qwen3-0.6b","pass1":0.4724080268,"pass@count":0.847826087,"win_rate":0.1715158571,"count":13.0,"SE(A)":0.0520491702,"SE_x(A)":0.0369535868,"SE_pred(A)":0.0366544478}
{"model":"mistralai_mistral_7b_instruct_v0.3","pass1":0.4584980237,"pass@count":0.8586956522,"win_rate":0.1659977105,"count":11.0,"SE(A)":0.0519487191,"SE_x(A)":0.0324198943,"SE_pred(A)":0.040590884}
{"model":"mistralai_mistral_7b_instruct_v0.2","pass1":0.427173913,"pass@count":0.7826086957,"win_rate":0.1563765709,"count":10.0,"SE(A)":0.0515726968,"SE_x(A)":0.0342786152,"SE_pred(A)":0.0385320593}
{"model":"qwen2.5-coder-3b-instruct","pass1":0.4221014493,"pass@count":0.8695652174,"win_rate":0.1422116757,"count":12.0,"SE(A)":0.0514920652,"SE_x(A)":0.0307191328,"SE_pred(A)":0.0413251456}
{"model":"deepseek_v2_lite_chat","pass1":0.4169960474,"pass@count":0.7826086957,"win_rate":0.1478228438,"count":11.0,"SE(A)":0.051405289,"SE_x(A)":0.0342830461,"SE_pred(A)":0.0383037399}
{"model":"qwen1.5-7b-chat","pass1":0.3686594203,"pass@count":0.902173913,"win_rate":0.1333629058,"count":12.0,"SE(A)":0.0502979873,"SE_x(A)":0.0287816827,"SE_pred(A)":0.0412492698}
{"model":"google_codegemma_1.1_7b_it","pass1":0.3620401338,"pass@count":0.7391304348,"win_rate":0.1203596955,"count":13.0,"SE(A)":0.0501050066,"SE_x(A)":0.0330107996,"SE_pred(A)":0.0376934848}
{"model":"google_gemma_7b_it","pass1":0.3570234114,"pass@count":0.6739130435,"win_rate":0.126844733,"count":13.0,"SE(A)":0.0499519002,"SE_x(A)":0.0362243447,"SE_pred(A)":0.0343946099}
{"model":"google_gemma_3_1b_it","pass1":0.347826087,"pass@count":0.7826086957,"win_rate":0.119243541,"count":13.0,"SE(A)":0.0496556731,"SE_x(A)":0.0322778198,"SE_pred(A)":0.0377336484}
{"model":"mistralai_mistral_7b_instruct_v0.1","pass1":0.3389328063,"pass@count":0.8369565217,"win_rate":0.1184086786,"count":11.0,"SE(A)":0.0493498336,"SE_x(A)":0.0284853594,"SE_pred(A)":0.040298764}
{"model":"google_gemma_2b_it","pass1":0.3327759197,"pass@count":0.4347826087,"win_rate":0.1080634321,"count":13.0,"SE(A)":0.0491267335,"SE_x(A)":0.0456755777,"SE_pred(A)":0.0180880499}
{"model":"qwen2-1.5b-instruct","pass1":0.2859531772,"pass@count":0.7282608696,"win_rate":0.1003746813,"count":13.0,"SE(A)":0.0471104112,"SE_x(A)":0.0268295934,"SE_pred(A)":0.0387242013}
{"model":"deepseek_r1_distill_qwen_1.5b","pass1":0.2826086956,"pass@count":0.8152173913,"win_rate":0.1021186745,"count":13.0,"SE(A)":0.046943655,"SE_x(A)":0.0230535452,"SE_pred(A)":0.040893041}
{"model":"qwen2.5-coder-1.5b-instruct","pass1":0.2826086956,"pass@count":0.8152173913,"win_rate":0.0977549476,"count":11.0,"SE(A)":0.046943655,"SE_x(A)":0.0235814945,"SE_pred(A)":0.040590884}
{"model":"llama-3.2-1B-instruct","pass1":0.25,"pass@count":0.25,"win_rate":0.0994788943,"count":12.0,"SE(A)":0.0451446949,"SE_x(A)":0.0451446949,"SE_pred(A)":0.0}
{"model":"qwen1.5-1.8b-chat","pass1":0.1936758893,"pass@count":0.7391304348,"win_rate":0.0720295544,"count":11.0,"SE(A)":0.0412001397,"SE_x(A)":0.0187235308,"SE_pred(A)":0.0366998761}
{"model":"qwen2-0.5b-instruct","pass1":0.1588628762,"pass@count":0.7608695652,"win_rate":0.0619989397,"count":13.0,"SE(A)":0.038111031,"SE_x(A)":0.0121772173,"SE_pred(A)":0.0361132394}
{"model":"qwen2.5-coder-0.5b-instruct","pass1":0.1337792642,"pass@count":0.8043478261,"win_rate":0.0535465655,"count":13.0,"SE(A)":0.0354907172,"SE_x(A)":0.005847214,"SE_pred(A)":0.0350057295}
{"model":"qwen2-math-1.5b-instruct","pass1":0.1222826087,"pass@count":0.2934782609,"win_rate":0.0440815914,"count":4.0,"SE(A)":0.0341559041,"SE_x(A)":0.0192127695,"SE_pred(A)":0.0282399588}
{"model":"qwen1.5-0.5b-chat","pass1":0.0994983278,"pass@count":0.597826087,"win_rate":0.0445598869,"count":13.0,"SE(A)":0.0312073031,"SE_x(A)":0.0078765362,"SE_pred(A)":0.0301969526}
