{"model":"deepseek_r1_distill_llama_70b","pass1":0.3745757576,"pass@count":0.8333333333,"win_rate":0.3150789945,"count":1100.0,"SE(A)":0.0883683125,"SE_x(A)":0.0674439488,"SE_pred(A)":0.0570987953}
{"model":"deepseek_r1_distill_qwen_32b","pass1":0.365969697,"pass@count":0.8,"win_rate":0.3068901897,"count":1100.0,"SE(A)":0.0879461725,"SE_x(A)":0.0676468475,"SE_pred(A)":0.0561999403}
{"model":"deepseek_r1_distill_qwen_14b","pass1":0.313969697,"pass@count":0.8,"win_rate":0.2602055142,"count":1100.0,"SE(A)":0.08473345,"SE_x(A)":0.062737253,"SE_pred(A)":0.0569543206}
{"model":"google_gemma_3_27b_it","pass1":0.2780183727,"pass@count":0.7333333333,"win_rate":0.2245181058,"count":1016.0,"SE(A)":0.0817973425,"SE_x(A)":0.0610044841,"SE_pred(A)":0.0544908997}
{"model":"deepseek_r1_distill_qwen_7b","pass1":0.2736969697,"pass@count":0.7666666667,"win_rate":0.2241367311,"count":1100.0,"SE(A)":0.0814016663,"SE_x(A)":0.0610737154,"SE_pred(A)":0.053816657}
{"model":"qwen3-32b","pass1":0.2411212121,"pass@count":0.8,"win_rate":0.1906076261,"count":1100.0,"SE(A)":0.0780986072,"SE_x(A)":0.0614578945,"SE_pred(A)":0.0481904518}
{"model":"qwen3-14b","pass1":0.236978684,"pass@count":0.7,"win_rate":0.1891103361,"count":1079.0,"SE(A)":0.0776358567,"SE_x(A)":0.0613305602,"SE_pred(A)":0.0476013512}
{"model":"qwen3-8b","pass1":0.2180606061,"pass@count":0.8333333333,"win_rate":0.1720550416,"count":1100.0,"SE(A)":0.075390136,"SE_x(A)":0.0579599652,"SE_pred(A)":0.0482111505}
{"model":"google_gemma_3_12b_it","pass1":0.2053636364,"pass@count":0.8333333333,"win_rate":0.1588514751,"count":1100.0,"SE(A)":0.0737539633,"SE_x(A)":0.057738859,"SE_pred(A)":0.045889773}
{"model":"deepseek_r1_distill_llama_8b","pass1":0.2037272727,"pass@count":0.7666666667,"win_rate":0.1637516622,"count":1100.0,"SE(A)":0.0735351324,"SE_x(A)":0.0505185231,"SE_pred(A)":0.0534349561}
{"model":"llama-3.1-70B-instruct","pass1":0.1923636364,"pass@count":0.6,"win_rate":0.1547023697,"count":1100.0,"SE(A)":0.0719629228,"SE_x(A)":0.0526310219,"SE_pred(A)":0.0490778747}
{"model":"qwen2-math-72b-instruct","pass1":0.167311828,"pass@count":0.6333333333,"win_rate":0.1287249363,"count":155.0,"SE(A)":0.0681465529,"SE_x(A)":0.0504423644,"SE_pred(A)":0.0458205254}
{"model":"qwen3-4b","pass1":0.1646969697,"pass@count":0.8,"win_rate":0.1234396518,"count":1100.0,"SE(A)":0.0677180128,"SE_x(A)":0.0521021793,"SE_pred(A)":0.043256123}
{"model":"deepseek_r1_distill_qwen_1.5b","pass1":0.1432727273,"pass@count":0.7333333333,"win_rate":0.1098465132,"count":1100.0,"SE(A)":0.0639650042,"SE_x(A)":0.0435321007,"SE_pred(A)":0.0468665976}
{"model":"qwen2.5-coder-32b-instruct","pass1":0.1332121212,"pass@count":0.8666666667,"win_rate":0.0977584611,"count":1100.0,"SE(A)":0.06203941,"SE_x(A)":0.0471963085,"SE_pred(A)":0.0402665725}
{"model":"qwen3-1.7b","pass1":0.0926363636,"pass@count":0.6666666667,"win_rate":0.067813999,"count":1100.0,"SE(A)":0.0529323051,"SE_x(A)":0.0375733691,"SE_pred(A)":0.0372836541}
{"model":"qwen2.5-coder-14b-instruct","pass1":0.0804139635,"pass@count":0.6,"win_rate":0.0588219814,"count":1079.0,"SE(A)":0.0496479466,"SE_x(A)":0.0304499316,"SE_pred(A)":0.0392137765}
{"model":"google_gemma_3_4b_it","pass1":0.0701818182,"pass@count":0.5,"win_rate":0.0511961129,"count":1100.0,"SE(A)":0.0466391576,"SE_x(A)":0.0274707029,"SE_pred(A)":0.0376904696}
{"model":"llama-3.2-3B-instruct","pass1":0.0575757576,"pass@count":0.6,"win_rate":0.0444555366,"count":1100.0,"SE(A)":0.0425287314,"SE_x(A)":0.0201722817,"SE_pred(A)":0.0374402463}
{"model":"qwen2-72b-instruct","pass1":0.0509090909,"pass@count":0.6333333333,"win_rate":0.0355532429,"count":1100.0,"SE(A)":0.0401320136,"SE_x(A)":0.024368336,"SE_pred(A)":0.0318867169}
{"model":"qwen2.5-coder-7b-instruct","pass1":0.0501515152,"pass@count":0.6,"win_rate":0.0356437449,"count":1100.0,"SE(A)":0.0398481872,"SE_x(A)":0.0209361289,"SE_pred(A)":0.0339051107}
{"model":"llama-3.1-8B-instruct","pass1":0.0484242424,"pass@count":0.6666666667,"win_rate":0.0358229131,"count":1100.0,"SE(A)":0.0391915532,"SE_x(A)":0.0195841748,"SE_pred(A)":0.0339475763}
{"model":"google_gemma_2_27b_it","pass1":0.0411818182,"pass@count":0.3666666667,"win_rate":0.0270597386,"count":1100.0,"SE(A)":0.0362794138,"SE_x(A)":0.0245123254,"SE_pred(A)":0.0267458739}
{"model":"mistralai_mathstral_7b_v0.1","pass1":0.0215757576,"pass@count":0.5333333333,"win_rate":0.0152209176,"count":1100.0,"SE(A)":0.0265268696,"SE_x(A)":0.0106256531,"SE_pred(A)":0.0243057669}
{"model":"qwen2-7b-instruct","pass1":0.018030303,"pass@count":0.5,"win_rate":0.0127830857,"count":1100.0,"SE(A)":0.024293491,"SE_x(A)":0.008364029,"SE_pred(A)":0.02280826}
{"model":"mistralai_ministral_8b_instruct_2410","pass1":0.0164848485,"pass@count":0.5666666667,"win_rate":0.0122305673,"count":1100.0,"SE(A)":0.0232472925,"SE_x(A)":0.0094438911,"SE_pred(A)":0.0212426347}
{"model":"qwen2.5-coder-3b-instruct","pass1":0.0161515152,"pass@count":0.6,"win_rate":0.0122467487,"count":1100.0,"SE(A)":0.0230149543,"SE_x(A)":0.0068153503,"SE_pred(A)":0.0219827006}
{"model":"mistralai_mixtral_8x22b_instruct_v0.1","pass1":0.0149090909,"pass@count":0.5666666667,"win_rate":0.0103249421,"count":1100.0,"SE(A)":0.0221260103,"SE_x(A)":0.0070933917,"SE_pred(A)":0.0209581517}
{"model":"qwen1.5-32b-chat","pass1":0.0141212121,"pass@count":0.6666666667,"win_rate":0.0101917794,"count":1100.0,"SE(A)":0.0215420546,"SE_x(A)":0.0064278829,"SE_pred(A)":0.0205607013}
{"model":"google_gemma_2_9b_it","pass1":0.0139090909,"pass@count":0.3,"win_rate":0.0092661782,"count":1100.0,"SE(A)":0.0213819457,"SE_x(A)":0.0106516771,"SE_pred(A)":0.0185399401}
{"model":"llama-3.2-1B-instruct","pass1":0.0136060606,"pass@count":0.6333333333,"win_rate":0.0110651242,"count":1100.0,"SE(A)":0.0211509934,"SE_x(A)":0.0061071696,"SE_pred(A)":0.0202501112}
{"model":"qwen3-0.6b","pass1":0.012030303,"pass@count":0.5333333333,"win_rate":0.0086407044,"count":1100.0,"SE(A)":0.0199044173,"SE_x(A)":0.0085085607,"SE_pred(A)":0.0179941719}
{"model":"qwen1.5-72b-chat","pass1":0.0104545455,"pass@count":0.5333333333,"win_rate":0.0082270985,"count":1100.0,"SE(A)":0.0185699111,"SE_x(A)":0.0041859283,"SE_pred(A)":0.0180919762}
{"model":"qwen1.5-14b-chat","pass1":0.0086060606,"pass@count":0.3333333333,"win_rate":0.007340795,"count":1100.0,"SE(A)":0.0168641596,"SE_x(A)":0.0037821863,"SE_pred(A)":0.0164345655}
{"model":"qwen1.5-7b-chat","pass1":0.0078484848,"pass@count":0.3666666667,"win_rate":0.0057203873,"count":1100.0,"SE(A)":0.016110955,"SE_x(A)":0.0056556907,"SE_pred(A)":0.0150856234}
{"model":"qwen2.5-coder-1.5b-instruct","pass1":0.0073333333,"pass@count":0.6,"win_rate":0.0058165848,"count":1100.0,"SE(A)":0.0155772864,"SE_x(A)":0.0033320428,"SE_pred(A)":0.0152167455}
{"model":"mistralai_mixtral_8x7b_instruct_v0.1","pass1":0.004,"pass@count":0.6666666667,"win_rate":0.0030895769,"count":1100.0,"SE(A)":0.0115238882,"SE_x(A)":0.0013255515,"SE_pred(A)":0.0114473977}
{"model":"google_codegemma_1.1_7b_it","pass1":0.0038181818,"pass@count":0.4,"win_rate":0.0032120266,"count":1100.0,"SE(A)":0.0112599634,"SE_x(A)":0.0013295508,"SE_pred(A)":0.0111811928}
{"model":"deepseek_v2_lite_chat","pass1":0.0037575758,"pass@count":0.4,"win_rate":0.0034793747,"count":1100.0,"SE(A)":0.011170581,"SE_x(A)":0.0019092211,"SE_pred(A)":0.0110062143}
{"model":"google_gemma_3_1b_it","pass1":0.0036969697,"pass@count":0.3333333333,"win_rate":0.0031049721,"count":1100.0,"SE(A)":0.0110804665,"SE_x(A)":0.0016366356,"SE_pred(A)":0.0109589306}
{"model":"mistralai_mistral_7b_instruct_v0.3","pass1":0.0020606061,"pass@count":0.4,"win_rate":0.0016907601,"count":1100.0,"SE(A)":0.0082792108,"SE_x(A)":0.000740132,"SE_pred(A)":0.0082460619}
{"model":"mistralai_mistral_7b_instruct_v0.2","pass1":0.0016060606,"pass@count":0.4,"win_rate":0.0013776331,"count":1100.0,"SE(A)":0.0073109078,"SE_x(A)":0.0006066786,"SE_pred(A)":0.0072856924}
{"model":"qwen2.5-coder-0.5b-instruct","pass1":0.0015454545,"pass@count":0.3333333333,"win_rate":0.0012131861,"count":1100.0,"SE(A)":0.0071718573,"SE_x(A)":0.0007890406,"SE_pred(A)":0.0071283204}
{"model":"mistralai_mistral_7b_instruct_v0.1","pass1":0.001,"pass@count":0.2333333333,"win_rate":0.0009531282,"count":1100.0,"SE(A)":0.0057706152,"SE_x(A)":0.0005521765,"SE_pred(A)":0.0057441362}
{"model":"qwen2-1.5b-instruct","pass1":0.0006666667,"pass@count":0.3666666667,"win_rate":0.0005835098,"count":1100.0,"SE(A)":0.0047124736,"SE_x(A)":0.0001711795,"SE_pred(A)":0.0047093635}
{"model":"google_gemma_7b_it","pass1":0.0005454545,"pass@count":0.2,"win_rate":0.000491995,"count":1100.0,"SE(A)":0.0042628512,"SE_x(A)":0.0002332816,"SE_pred(A)":0.0042564634}
{"model":"qwen2-0.5b-instruct","pass1":0.0003636364,"pass@count":0.2333333333,"win_rate":0.0003017312,"count":1100.0,"SE(A)":0.00348092,"SE_x(A)":0.0001014792,"SE_pred(A)":0.0034794405}
{"model":"qwen1.5-0.5b-chat","pass1":0.0002727273,"pass@count":0.1333333333,"win_rate":0.0002267077,"count":1100.0,"SE(A)":0.0030147023,"SE_x(A)":0.0001019225,"SE_pred(A)":0.0030129788}
{"model":"qwen1.5-1.8b-chat","pass1":0.0001818182,"pass@count":0.1666666667,"win_rate":0.0001532031,"count":1100.0,"SE(A)":0.002461606,"SE_x(A)":0.0000271347,"SE_pred(A)":0.0024614564}
{"model":"google_gemma_2b_it","pass1":0.0,"pass@count":0.0,"win_rate":0.0,"count":1100.0,"SE(A)":0.0,"SE_x(A)":0.0,"SE_pred(A)":0.0}
