{"model":"qwen3-32b","pass1":0.8747228381,"pass@count":0.9756097561,"win_rate":0.4287971072,"count":11.0,"SE(A)":0.0258493283,"SE_x(A)":0.0188466885,"SE_pred(A)":0.0176915263}
{"model":"google_gemma_3_27b_it","pass1":0.8648373984,"pass@count":0.8963414634,"win_rate":0.4142383746,"count":12.0,"SE(A)":0.0266976834,"SE_x(A)":0.0254886747,"SE_pred(A)":0.0079431579}
{"model":"qwen3-14b","pass1":0.8597560976,"pass@count":0.9451219512,"win_rate":0.4173183867,"count":12.0,"SE(A)":0.027114881,"SE_x(A)":0.0226801371,"SE_pred(A)":0.0148602877}
{"model":"qwen2.5-coder-32b-instruct","pass1":0.832594235,"pass@count":0.9573170732,"win_rate":0.4097451842,"count":11.0,"SE(A)":0.0291527804,"SE_x(A)":0.0194778192,"SE_pred(A)":0.0216909926}
{"model":"google_gemma_3_12b_it","pass1":0.8281596452,"pass@count":0.8780487805,"win_rate":0.3868730017,"count":11.0,"SE(A)":0.0294576222,"SE_x(A)":0.0273637481,"SE_pred(A)":0.0109076487}
{"model":"qwen2.5-coder-14b-instruct","pass1":0.8231707317,"pass@count":0.9573170732,"win_rate":0.3991560173,"count":12.0,"SE(A)":0.029792031,"SE_x(A)":0.0190551372,"SE_pred(A)":0.0229012414}
{"model":"qwen3-4b","pass1":0.7891260163,"pass@count":0.9207317073,"win_rate":0.3665524535,"count":12.0,"SE(A)":0.0318539107,"SE_x(A)":0.02694367,"SE_pred(A)":0.0169914765}
{"model":"qwen3-8b","pass1":0.7764227642,"pass@count":0.9512195122,"win_rate":0.366305353,"count":12.0,"SE(A)":0.0325342648,"SE_x(A)":0.0249716265,"SE_pred(A)":0.0208541664}
{"model":"google_gemma_2_27b_it","pass1":0.7524390244,"pass@count":0.8231707317,"win_rate":0.3367367247,"count":10.0,"SE(A)":0.0337019341,"SE_x(A)":0.030699887,"SE_pred(A)":0.0139045784}
{"model":"mistralai_mixtral_8x22b_instruct_v0.1","pass1":0.7416851441,"pass@count":0.9024390244,"win_rate":0.3335321714,"count":11.0,"SE(A)":0.0341792523,"SE_x(A)":0.0264272233,"SE_pred(A)":0.0216754044}
{"model":"qwen2-math-72b-instruct","pass1":0.7062084257,"pass@count":0.9329268293,"win_rate":0.3178731891,"count":11.0,"SE(A)":0.0355683887,"SE_x(A)":0.0244656861,"SE_pred(A)":0.0258174452}
{"model":"deepseek_r1_distill_qwen_32b","pass1":0.7017738359,"pass@count":0.9756097561,"win_rate":0.3379817435,"count":11.0,"SE(A)":0.0357231324,"SE_x(A)":0.0184357234,"SE_pred(A)":0.0305984688}
{"model":"google_gemma_3_4b_it","pass1":0.6955909944,"pass@count":0.7743902439,"win_rate":0.3050109443,"count":13.0,"SE(A)":0.0359321993,"SE_x(A)":0.0330939818,"SE_pred(A)":0.0139968324}
{"model":"llama-3.1-8B-instruct","pass1":0.6585365854,"pass@count":0.6585365854,"win_rate":0.2803636364,"count":15.0,"SE(A)":0.037028841,"SE_x(A)":0.037028841,"SE_pred(A)":0.0}
{"model":"google_gemma_2_9b_it","pass1":0.6308203991,"pass@count":0.7134146341,"win_rate":0.2655530484,"count":11.0,"SE(A)":0.0376833747,"SE_x(A)":0.0351495019,"SE_pred(A)":0.0135848901}
{"model":"deepseek_r1_distill_qwen_14b","pass1":0.5864745011,"pass@count":0.9451219512,"win_rate":0.2772898247,"count":11.0,"SE(A)":0.0384550863,"SE_x(A)":0.0198615804,"SE_pred(A)":0.0329288823}
{"model":"qwen2-7b-instruct","pass1":0.5770509978,"pass@count":0.9268292683,"win_rate":0.2466481479,"count":11.0,"SE(A)":0.0385770644,"SE_x(A)":0.0233632097,"SE_pred(A)":0.0306977252}
{"model":"qwen3-1.7b","pass1":0.5711382114,"pass@count":0.8170731707,"win_rate":0.2392754397,"count":12.0,"SE(A)":0.0386462502,"SE_x(A)":0.0324673636,"SE_pred(A)":0.0209619406}
{"model":"google_codegemma_1.1_7b_it","pass1":0.5642589118,"pass@count":0.7865853659,"win_rate":0.2226297282,"count":13.0,"SE(A)":0.038719661,"SE_x(A)":0.0311465663,"SE_pred(A)":0.0230022511}
{"model":"qwen2-72b-instruct","pass1":0.5498891353,"pass@count":0.9512195122,"win_rate":0.2558795968,"count":11.0,"SE(A)":0.0388486019,"SE_x(A)":0.020481566,"SE_pred(A)":0.033010897}
{"model":"deepseek_r1_distill_llama_70b","pass1":0.544345898,"pass@count":0.9329268293,"win_rate":0.2514126094,"count":11.0,"SE(A)":0.0388895749,"SE_x(A)":0.0203609345,"SE_pred(A)":0.0331335386}
{"model":"qwen2.5-coder-7b-instruct","pass1":0.5396341463,"pass@count":0.9268292683,"win_rate":0.2351207843,"count":10.0,"SE(A)":0.0389205832,"SE_x(A)":0.0213154509,"SE_pred(A)":0.0325647562}
{"model":"qwen1.5-14b-chat","pass1":0.5213414634,"pass@count":0.7987804878,"win_rate":0.2031601936,"count":12.0,"SE(A)":0.039007859,"SE_x(A)":0.0305069242,"SE_pred(A)":0.0243092706}
{"model":"qwen2.5-coder-3b-instruct","pass1":0.5,"pass@count":0.9146341463,"win_rate":0.2134646015,"count":12.0,"SE(A)":0.0390434405,"SE_x(A)":0.021908063,"SE_pred(A)":0.0323175961}
{"model":"deepseek_v2_lite_chat","pass1":0.4883592018,"pass@count":0.8170731707,"win_rate":0.1919452408,"count":11.0,"SE(A)":0.0390328576,"SE_x(A)":0.0273282776,"SE_pred(A)":0.0278698622}
{"model":"qwen1.5-32b-chat","pass1":0.4822616408,"pass@count":0.737804878,"win_rate":0.1910013735,"count":11.0,"SE(A)":0.0390188627,"SE_x(A)":0.0316962485,"SE_pred(A)":0.0227556472}
{"model":"mistralai_ministral_8b_instruct_2410","pass1":0.4606430155,"pass@count":0.8780487805,"win_rate":0.1915855235,"count":11.0,"SE(A)":0.0389222981,"SE_x(A)":0.023255596,"SE_pred(A)":0.0312109363}
{"model":"llama-3.2-3B-instruct","pass1":0.4573170732,"pass@count":0.4573170732,"win_rate":0.1686788902,"count":17.0,"SE(A)":0.0389009192,"SE_x(A)":0.0389009192,"SE_pred(A)":0.0}
{"model":"mistralai_mathstral_7b_v0.1","pass1":0.439578714,"pass@count":0.8231707317,"win_rate":0.1711108299,"count":11.0,"SE(A)":0.0387573178,"SE_x(A)":0.0232271224,"SE_pred(A)":0.0310262867}
{"model":"qwen2.5-coder-1.5b-instruct","pass1":0.4312638581,"pass@count":0.8475609756,"win_rate":0.1660173197,"count":11.0,"SE(A)":0.038672747,"SE_x(A)":0.0232316796,"SE_pred(A)":0.0309171542}
{"model":"deepseek_r1_distill_llama_8b","pass1":0.425891182,"pass@count":0.8841463415,"win_rate":0.1812527016,"count":13.0,"SE(A)":0.0386121966,"SE_x(A)":0.0222409914,"SE_pred(A)":0.0315632702}
{"model":"qwen1.5-72b-chat","pass1":0.4257206208,"pass@count":0.7012195122,"win_rate":0.1657581172,"count":11.0,"SE(A)":0.0386101982,"SE_x(A)":0.0297383814,"SE_pred(A)":0.0246247046}
{"model":"mistralai_mistral_7b_instruct_v0.3","pass1":0.4157427938,"pass@count":0.6768292683,"win_rate":0.1479151833,"count":11.0,"SE(A)":0.0384850876,"SE_x(A)":0.0312754631,"SE_pred(A)":0.0224264884}
{"model":"google_gemma_3_1b_it","pass1":0.4132270169,"pass@count":0.5,"win_rate":0.1538362493,"count":13.0,"SE(A)":0.0384509863,"SE_x(A)":0.0357329959,"SE_pred(A)":0.0141996956}
{"model":"qwen1.5-7b-chat","pass1":0.3978658537,"pass@count":0.6951219512,"win_rate":0.1443037955,"count":12.0,"SE(A)":0.0382202073,"SE_x(A)":0.0293926618,"SE_pred(A)":0.0244306299}
{"model":"mistralai_mixtral_8x7b_instruct_v0.1","pass1":0.3866869919,"pass@count":0.6707317073,"win_rate":0.149010531,"count":12.0,"SE(A)":0.0380276025,"SE_x(A)":0.0280785054,"SE_pred(A)":0.0256455861}
{"model":"qwen2.5-coder-0.5b-instruct","pass1":0.3639774859,"pass@count":0.7743902439,"win_rate":0.1361422414,"count":13.0,"SE(A)":0.0375708985,"SE_x(A)":0.0225451327,"SE_pred(A)":0.0300547734}
{"model":"qwen2-math-7b-instruct","pass1":0.3221544715,"pass@count":0.6585365854,"win_rate":0.1240927545,"count":6.0,"SE(A)":0.0364901397,"SE_x(A)":0.0238087481,"SE_pred(A)":0.0276527359}
{"model":"deepseek_r1_distill_qwen_7b","pass1":0.3181818182,"pass@count":0.756097561,"win_rate":0.1298246929,"count":11.0,"SE(A)":0.0363705648,"SE_x(A)":0.0203036908,"SE_pred(A)":0.030175787}
{"model":"llama-3.2-1B-instruct","pass1":0.3048780488,"pass@count":0.3048780488,"win_rate":0.1057407755,"count":12.0,"SE(A)":0.0359477453,"SE_x(A)":0.0359477453,"SE_pred(A)":0.0}
{"model":"mistralai_mistral_7b_instruct_v0.1","pass1":0.2827050998,"pass@count":0.6341463415,"win_rate":0.0940066997,"count":11.0,"SE(A)":0.0351636355,"SE_x(A)":0.0241409375,"SE_pred(A)":0.025567487}
{"model":"google_gemma_7b_it","pass1":0.2631332083,"pass@count":0.4573170732,"win_rate":0.0857770525,"count":13.0,"SE(A)":0.0343843167,"SE_x(A)":0.0285107364,"SE_pred(A)":0.0192202796}
{"model":"qwen3-0.6b","pass1":0.2514071295,"pass@count":0.5609756098,"win_rate":0.0802547877,"count":13.0,"SE(A)":0.033875812,"SE_x(A)":0.02451184,"SE_pred(A)":0.0233824794}
{"model":"google_gemma_2b_it","pass1":0.1702626642,"pass@count":0.3170731707,"win_rate":0.0473373131,"count":13.0,"SE(A)":0.0293499999,"SE_x(A)":0.0243918847,"SE_pred(A)":0.0163235553}
{"model":"qwen2-1.5b-instruct","pass1":0.16369606,"pass@count":0.6524390244,"win_rate":0.06092398,"count":13.0,"SE(A)":0.0288921101,"SE_x(A)":0.0120816415,"SE_pred(A)":0.0262447702}
{"model":"qwen2-0.5b-instruct","pass1":0.1031894934,"pass@count":0.3536585366,"win_rate":0.0282375556,"count":13.0,"SE(A)":0.0237545146,"SE_x(A)":0.0143464707,"SE_pred(A)":0.0189329275}
{"model":"mistralai_mistral_7b_instruct_v0.2","pass1":0.0914634146,"pass@count":0.3353658537,"win_rate":0.0268578813,"count":10.0,"SE(A)":0.0225098702,"SE_x(A)":0.0122378045,"SE_pred(A)":0.0188926017}
{"model":"qwen1.5-1.8b-chat","pass1":0.0742793792,"pass@count":0.2804878049,"win_rate":0.0208646727,"count":11.0,"SE(A)":0.0204763327,"SE_x(A)":0.0112503437,"SE_pred(A)":0.0171087687}
{"model":"qwen2-math-1.5b-instruct","pass1":0.0396341463,"pass@count":0.0975609756,"win_rate":0.014484518,"count":4.0,"SE(A)":0.0152346045,"SE_x(A)":0.0097859896,"SE_pred(A)":0.0116759403}
{"model":"qwen1.5-0.5b-chat","pass1":0.0290806754,"pass@count":0.1280487805,"win_rate":0.0051689419,"count":13.0,"SE(A)":0.0131211498,"SE_x(A)":0.0075372716,"SE_pred(A)":0.010740303}
{"model":"deepseek_r1_distill_qwen_1.5b","pass1":0.0267354597,"pass@count":0.1951219512,"win_rate":0.0080427955,"count":13.0,"SE(A)":0.0125961364,"SE_x(A)":0.0038967962,"SE_pred(A)":0.0119782148}
