{"model":"qwen3-32b","pass1":0.7849223947,"pass@count":0.8902439024,"win_rate":0.4036496273,"count":11.0,"SE(A)":0.0320840388,"SE_x(A)":0.0267529332,"SE_pred(A)":0.0177106213}
{"model":"qwen3-14b","pass1":0.7784552846,"pass@count":0.8658536585,"win_rate":0.3988920016,"count":12.0,"SE(A)":0.0324284066,"SE_x(A)":0.0294419255,"SE_pred(A)":0.0135931812}
{"model":"qwen2.5-coder-14b-instruct","pass1":0.7596544715,"pass@count":0.8841463415,"win_rate":0.3896646863,"count":12.0,"SE(A)":0.0333660002,"SE_x(A)":0.0266922388,"SE_pred(A)":0.0200203487}
{"model":"google_gemma_3_27b_it","pass1":0.756097561,"pass@count":0.8048780488,"win_rate":0.3772082504,"count":12.0,"SE(A)":0.0335332052,"SE_x(A)":0.03181779,"SE_pred(A)":0.0105879217}
{"model":"qwen2.5-coder-32b-instruct","pass1":0.7505543237,"pass@count":0.8658536585,"win_rate":0.3862163852,"count":11.0,"SE(A)":0.0337875835,"SE_x(A)":0.0270294291,"SE_pred(A)":0.0202734004}
{"model":"google_gemma_3_12b_it","pass1":0.7317073171,"pass@count":0.7743902439,"win_rate":0.3587190225,"count":11.0,"SE(A)":0.0345980155,"SE_x(A)":0.0326891697,"SE_pred(A)":0.0113331753}
{"model":"qwen3-8b","pass1":0.6986788618,"pass@count":0.8719512195,"win_rate":0.3538348826,"count":12.0,"SE(A)":0.0358287517,"SE_x(A)":0.0289256192,"SE_pred(A)":0.0211425638}
{"model":"qwen3-4b","pass1":0.6971544715,"pass@count":0.8292682927,"win_rate":0.339748748,"count":12.0,"SE(A)":0.0358800606,"SE_x(A)":0.031898907,"SE_pred(A)":0.0164267611}
{"model":"google_gemma_2_27b_it","pass1":0.6707317073,"pass@count":0.75,"win_rate":0.3172628973,"count":10.0,"SE(A)":0.0366967371,"SE_x(A)":0.0336181325,"SE_pred(A)":0.0147129765}
{"model":"mistralai_mixtral_8x22b_instruct_v0.1","pass1":0.6563192905,"pass@count":0.8353658537,"win_rate":0.3105742845,"count":11.0,"SE(A)":0.0370862771,"SE_x(A)":0.0303164787,"SE_pred(A)":0.0213612515}
{"model":"qwen2-math-72b-instruct","pass1":0.6208425721,"pass@count":0.8231707317,"win_rate":0.2946467653,"count":11.0,"SE(A)":0.0378859869,"SE_x(A)":0.0289324037,"SE_pred(A)":0.0244594362}
{"model":"deepseek_r1_distill_qwen_32b","pass1":0.6103104213,"pass@count":0.8780487805,"win_rate":0.3088922157,"count":11.0,"SE(A)":0.0380813964,"SE_x(A)":0.0233746588,"SE_pred(A)":0.0300635672}
{"model":"google_gemma_3_4b_it","pass1":0.6064727955,"pass@count":0.6890243902,"win_rate":0.2820324318,"count":13.0,"SE(A)":0.0381479424,"SE_x(A)":0.0357616786,"SE_pred(A)":0.0132803559}
{"model":"llama-3.1-8B-instruct","pass1":0.5731707317,"pass@count":0.5731707317,"win_rate":0.2603412644,"count":15.0,"SE(A)":0.0386231041,"SE_x(A)":0.0386231041,"SE_pred(A)":0.0}
{"model":"google_gemma_2_9b_it","pass1":0.5587583149,"pass@count":0.6585365854,"win_rate":0.2521490265,"count":11.0,"SE(A)":0.0387729051,"SE_x(A)":0.0359597102,"SE_pred(A)":0.014499566}
{"model":"deepseek_r1_distill_qwen_14b","pass1":0.5304878049,"pass@count":0.8597560976,"win_rate":0.2670242752,"count":11.0,"SE(A)":0.0389707906,"SE_x(A)":0.0229570002,"SE_pred(A)":0.0314912474}
{"model":"qwen3-1.7b","pass1":0.5040650407,"pass@count":0.737804878,"win_rate":0.2257383118,"count":12.0,"SE(A)":0.0390421501,"SE_x(A)":0.0332694557,"SE_pred(A)":0.0204311724}
{"model":"qwen2-7b-instruct","pass1":0.5,"pass@count":0.8475609756,"win_rate":0.2287287793,"count":11.0,"SE(A)":0.0390434405,"SE_x(A)":0.0248297431,"SE_pred(A)":0.0301309492}
{"model":"deepseek_r1_distill_llama_70b","pass1":0.4955654102,"pass@count":0.8597560976,"win_rate":0.2486846596,"count":11.0,"SE(A)":0.0390419048,"SE_x(A)":0.0217970475,"SE_pred(A)":0.0323907248}
{"model":"qwen2-72b-instruct","pass1":0.4817073171,"pass@count":0.8414634146,"win_rate":0.2327997315,"count":11.0,"SE(A)":0.0390173021,"SE_x(A)":0.0224864188,"SE_pred(A)":0.0318859033}
{"model":"qwen2.5-coder-7b-instruct","pass1":0.481097561,"pass@count":0.8658536585,"win_rate":0.2246483171,"count":10.0,"SE(A)":0.0390155299,"SE_x(A)":0.0215075441,"SE_pred(A)":0.0325520678}
{"model":"google_codegemma_1.1_7b_it","pass1":0.4742026266,"pass@count":0.7012195122,"win_rate":0.1999004513,"count":13.0,"SE(A)":0.0389914387,"SE_x(A)":0.0314004039,"SE_pred(A)":0.0231159452}
{"model":"qwen1.5-14b-chat","pass1":0.4512195122,"pass@count":0.756097561,"win_rate":0.1910876601,"count":12.0,"SE(A)":0.0388571857,"SE_x(A)":0.0299400527,"SE_pred(A)":0.0247684098}
{"model":"qwen2.5-coder-3b-instruct","pass1":0.4349593496,"pass@count":0.8170731707,"win_rate":0.1973756242,"count":12.0,"SE(A)":0.0387117013,"SE_x(A)":0.0230631594,"SE_pred(A)":0.0310915824}
{"model":"deepseek_v2_lite_chat","pass1":0.4279379157,"pass@count":0.737804878,"win_rate":0.1803571925,"count":11.0,"SE(A)":0.0386358118,"SE_x(A)":0.0280169448,"SE_pred(A)":0.0266040741}
{"model":"qwen1.5-32b-chat","pass1":0.4085365854,"pass@count":0.6524390244,"win_rate":0.1748210839,"count":11.0,"SE(A)":0.0383846422,"SE_x(A)":0.0314648615,"SE_pred(A)":0.0219850688}
{"model":"mistralai_ministral_8b_instruct_2410","pass1":0.3968957871,"pass@count":0.8048780488,"win_rate":0.1745294735,"count":11.0,"SE(A)":0.0382043224,"SE_x(A)":0.0225933591,"SE_pred(A)":0.0308076351}
{"model":"llama-3.2-3B-instruct","pass1":0.3780487805,"pass@count":0.3780487805,"win_rate":0.1469549719,"count":17.0,"SE(A)":0.0378643197,"SE_x(A)":0.0378643197,"SE_pred(A)":0.0}
{"model":"qwen1.5-72b-chat","pass1":0.376940133,"pass@count":0.6219512195,"win_rate":0.1556239023,"count":11.0,"SE(A)":0.0378424421,"SE_x(A)":0.0294548235,"SE_pred(A)":0.0237584468}
{"model":"deepseek_r1_distill_llama_8b","pass1":0.3700750469,"pass@count":0.7682926829,"win_rate":0.1658589372,"count":13.0,"SE(A)":0.03770226,"SE_x(A)":0.0238173886,"SE_pred(A)":0.0292265702}
{"model":"google_gemma_3_1b_it","pass1":0.3602251407,"pass@count":0.4329268293,"win_rate":0.1450739776,"count":13.0,"SE(A)":0.037486826,"SE_x(A)":0.035157422,"SE_pred(A)":0.0130083744}
{"model":"mistralai_mathstral_7b_v0.1","pass1":0.3586474501,"pass@count":0.743902439,"win_rate":0.1493206808,"count":11.0,"SE(A)":0.0374507365,"SE_x(A)":0.0232373808,"SE_pred(A)":0.0293697429}
{"model":"qwen2.5-coder-1.5b-instruct","pass1":0.3481152993,"pass@count":0.7682926829,"win_rate":0.1440777991,"count":11.0,"SE(A)":0.0371984656,"SE_x(A)":0.0230789107,"SE_pred(A)":0.029173442}
{"model":"qwen1.5-7b-chat","pass1":0.3414634146,"pass@count":0.6524390244,"win_rate":0.1342554911,"count":12.0,"SE(A)":0.037028841,"SE_x(A)":0.0287567629,"SE_pred(A)":0.0233277444}
{"model":"mistralai_mistral_7b_instruct_v0.3","pass1":0.3159645233,"pass@count":0.5914634146,"win_rate":0.1181895167,"count":11.0,"SE(A)":0.0363025014,"SE_x(A)":0.0283568008,"SE_pred(A)":0.0226663507}
{"model":"mistralai_mixtral_8x7b_instruct_v0.1","pass1":0.3155487805,"pass@count":0.5487804878,"win_rate":0.1298985635,"count":12.0,"SE(A)":0.0362896333,"SE_x(A)":0.0278440092,"SE_pred(A)":0.0232733461}
{"model":"qwen2.5-coder-0.5b-instruct","pass1":0.3095684803,"pass@count":0.6829268293,"win_rate":0.127205395,"count":13.0,"SE(A)":0.0361007935,"SE_x(A)":0.0225428914,"SE_pred(A)":0.0281972577}
{"model":"qwen2-math-7b-instruct","pass1":0.2957317073,"pass@count":0.5609756098,"win_rate":0.1199180714,"count":6.0,"SE(A)":0.0356365858,"SE_x(A)":0.0251337481,"SE_pred(A)":0.025263827}
{"model":"deepseek_r1_distill_qwen_7b","pass1":0.2688470066,"pass@count":0.7134146341,"win_rate":0.1189408605,"count":11.0,"SE(A)":0.0346206175,"SE_x(A)":0.0195787047,"SE_pred(A)":0.0285527841}
{"model":"llama-3.2-1B-instruct","pass1":0.256097561,"pass@count":0.256097561,"win_rate":0.0916952357,"count":12.0,"SE(A)":0.0340830764,"SE_x(A)":0.0340830764,"SE_pred(A)":0.0}
{"model":"mistralai_mistral_7b_instruct_v0.1","pass1":0.2239467849,"pass@count":0.5182926829,"win_rate":0.0769723445,"count":11.0,"SE(A)":0.0325533916,"SE_x(A)":0.0233223464,"SE_pred(A)":0.0227110428}
{"model":"qwen3-0.6b","pass1":0.2143527204,"pass@count":0.4817073171,"win_rate":0.0763694092,"count":13.0,"SE(A)":0.0320447126,"SE_x(A)":0.0227966416,"SE_pred(A)":0.0225205848}
{"model":"google_gemma_7b_it","pass1":0.2026266416,"pass@count":0.4024390244,"win_rate":0.0693349821,"count":13.0,"SE(A)":0.031387535,"SE_x(A)":0.0259322563,"SE_pred(A)":0.0176831964}
{"model":"qwen2-1.5b-instruct","pass1":0.1407129456,"pass@count":0.5792682927,"win_rate":0.0589604637,"count":13.0,"SE(A)":0.027152776,"SE_x(A)":0.011792055,"SE_pred(A)":0.0244585504}
{"model":"google_gemma_2b_it","pass1":0.1393058161,"pass@count":0.262195122,"win_rate":0.0402152536,"count":13.0,"SE(A)":0.0270387826,"SE_x(A)":0.02311344,"SE_pred(A)":0.0140308466}
{"model":"qwen2-0.5b-instruct","pass1":0.074108818,"pass@count":0.3048780488,"win_rate":0.022746053,"count":13.0,"SE(A)":0.0204546943,"SE_x(A)":0.0107568188,"SE_pred(A)":0.0173978553}
{"model":"mistralai_mistral_7b_instruct_v0.2","pass1":0.0695121951,"pass@count":0.2865853659,"win_rate":0.0249062098,"count":10.0,"SE(A)":0.0198592993,"SE_x(A)":0.0098463074,"SE_pred(A)":0.0172465069}
{"model":"qwen1.5-1.8b-chat","pass1":0.0498891353,"pass@count":0.2256097561,"win_rate":0.0151155461,"count":11.0,"SE(A)":0.0170007549,"SE_x(A)":0.0087997867,"SE_pred(A)":0.0145461136}
{"model":"qwen2-math-1.5b-instruct","pass1":0.0335365854,"pass@count":0.0853658537,"win_rate":0.0134826561,"count":4.0,"SE(A)":0.0140582158,"SE_x(A)":0.0089384548,"SE_pred(A)":0.0108506893}
{"model":"deepseek_r1_distill_qwen_1.5b","pass1":0.0257973734,"pass@count":0.1951219512,"win_rate":0.0079939396,"count":13.0,"SE(A)":0.0123791398,"SE_x(A)":0.0041703945,"SE_pred(A)":0.01165551}
{"model":"qwen1.5-0.5b-chat","pass1":0.0192307692,"pass@count":0.0792682927,"win_rate":0.0034904234,"count":13.0,"SE(A)":0.0107240745,"SE_x(A)":0.0077273584,"SE_pred(A)":0.0074359737}
