{"model":"deepseek_r1_distill_qwen_32b","pass1":0.3787878788,"pass@count":0.7333333333,"win_rate":0.3239795044,"count":11.0,"SE(A)":0.0885640299,"SE_x(A)":0.0637637014,"SE_pred(A)":0.0614636297}
{"model":"deepseek_r1_distill_llama_70b","pass1":0.3424242424,"pass@count":0.6666666667,"win_rate":0.2911466094,"count":11.0,"SE(A)":0.0866352278,"SE_x(A)":0.0633301918,"SE_pred(A)":0.0591180979}
{"model":"deepseek_r1_distill_qwen_7b","pass1":0.2848484848,"pass@count":0.4666666667,"win_rate":0.2376021077,"count":11.0,"SE(A)":0.0824034436,"SE_x(A)":0.0649974688,"SE_pred(A)":0.0506523106}
{"model":"deepseek_r1_distill_qwen_14b","pass1":0.2757575758,"pass@count":0.4333333333,"win_rate":0.2265762174,"count":11.0,"SE(A)":0.0815915304,"SE_x(A)":0.0689796197,"SE_pred(A)":0.0435774012}
{"model":"google_gemma_3_27b_it","pass1":0.2555555556,"pass@count":0.5666666667,"win_rate":0.2083232145,"count":12.0,"SE(A)":0.0796339361,"SE_x(A)":0.053762304,"SE_pred(A)":0.0587467314}
{"model":"qwen3-14b","pass1":0.2444444444,"pass@count":0.3333333333,"win_rate":0.1946851433,"count":12.0,"SE(A)":0.0784625936,"SE_x(A)":0.0672560924,"SE_pred(A)":0.040410353}
{"model":"qwen3-32b","pass1":0.2272727273,"pass@count":0.4333333333,"win_rate":0.1809317361,"count":11.0,"SE(A)":0.076511401,"SE_x(A)":0.0585640851,"SE_pred(A)":0.0492365964}
{"model":"llama-3.1-70B-instruct","pass1":0.2,"pass@count":0.2,"win_rate":0.1661295458,"count":13.0,"SE(A)":0.0730296743,"SE_x(A)":0.0730296743,"SE_pred(A)":0.0}
{"model":"deepseek_r1_distill_llama_8b","pass1":0.1897435897,"pass@count":0.4,"win_rate":0.1527692401,"count":13.0,"SE(A)":0.0715870007,"SE_x(A)":0.0524002031,"SE_pred(A)":0.0487741466}
{"model":"qwen2-math-72b-instruct","pass1":0.1833333333,"pass@count":0.3666666667,"win_rate":0.1436284921,"count":6.0,"SE(A)":0.0706451749,"SE_x(A)":0.0508265023,"SE_pred(A)":0.0490653381}
{"model":"google_gemma_3_12b_it","pass1":0.1818181818,"pass@count":0.4666666667,"win_rate":0.1458709888,"count":11.0,"SE(A)":0.070417879,"SE_x(A)":0.0521176884,"SE_pred(A)":0.0473542421}
{"model":"qwen3-8b","pass1":0.175,"pass@count":0.5,"win_rate":0.1416541928,"count":12.0,"SE(A)":0.0693721846,"SE_x(A)":0.0533968214,"SE_pred(A)":0.0442863349}
{"model":"qwen2.5-coder-32b-instruct","pass1":0.1303030303,"pass@count":0.3,"win_rate":0.096763584,"count":11.0,"SE(A)":0.0614611396,"SE_x(A)":0.0462721088,"SE_pred(A)":0.0404519917}
{"model":"qwen3-4b","pass1":0.1194444444,"pass@count":0.2333333333,"win_rate":0.0885811674,"count":12.0,"SE(A)":0.059210773,"SE_x(A)":0.044900074,"SE_pred(A)":0.0385992097}
{"model":"deepseek_r1_distill_qwen_1.5b","pass1":0.1153846154,"pass@count":0.2333333333,"win_rate":0.0877777602,"count":13.0,"SE(A)":0.0583298111,"SE_x(A)":0.0465086081,"SE_pred(A)":0.0352039236}
{"model":"qwen2.5-coder-14b-instruct","pass1":0.1083333333,"pass@count":0.2,"win_rate":0.0776848932,"count":12.0,"SE(A)":0.056744228,"SE_x(A)":0.0450892306,"SE_pred(A)":0.0344509606}
{"model":"llama-3.2-3B-instruct","pass1":0.1,"pass@count":0.1,"win_rate":0.0748136722,"count":18.0,"SE(A)":0.0547722558,"SE_x(A)":0.0547722558,"SE_pred(A)":0.0}
{"model":"qwen3-1.7b","pass1":0.0944444444,"pass@count":0.2333333333,"win_rate":0.0715832597,"count":12.0,"SE(A)":0.0533930992,"SE_x(A)":0.0385640595,"SE_pred(A)":0.0369274473}
{"model":"qwen2-math-7b-instruct","pass1":0.0833333333,"pass@count":0.1333333333,"win_rate":0.0637949124,"count":6.0,"SE(A)":0.0504608392,"SE_x(A)":0.0429254306,"SE_pred(A)":0.0265274142}
{"model":"qwen2-math-1.5b-instruct","pass1":0.075,"pass@count":0.1333333333,"win_rate":0.0505382063,"count":4.0,"SE(A)":0.0480884602,"SE_x(A)":0.0384599359,"SE_pred(A)":0.0288675135}
{"model":"google_gemma_3_4b_it","pass1":0.0717948718,"pass@count":0.2333333333,"win_rate":0.0556945485,"count":13.0,"SE(A)":0.0471311532,"SE_x(A)":0.0247317407,"SE_pred(A)":0.0401208999}
{"model":"qwen2-72b-instruct","pass1":0.0606060606,"pass@count":0.2,"win_rate":0.0419427512,"count":11.0,"SE(A)":0.0435633508,"SE_x(A)":0.0284052186,"SE_pred(A)":0.033028913}
{"model":"qwen2.5-coder-7b-instruct","pass1":0.0454545455,"pass@count":0.1,"win_rate":0.0318818031,"count":11.0,"SE(A)":0.0380300012,"SE_x(A)":0.02444987,"SE_pred(A)":0.0291287632}
{"model":"llama-3.1-8B-instruct","pass1":0.0333333333,"pass@count":0.0333333333,"win_rate":0.0236042171,"count":15.0,"SE(A)":0.0327730693,"SE_x(A)":0.0327730693,"SE_pred(A)":0.0}
{"model":"llama-3.2-1B-instruct","pass1":0.0333333333,"pass@count":0.0333333333,"win_rate":0.0229205326,"count":21.0,"SE(A)":0.0327730693,"SE_x(A)":0.0327730693,"SE_pred(A)":0.0}
{"model":"qwen2-7b-instruct","pass1":0.0303030303,"pass@count":0.0666666667,"win_rate":0.0223151022,"count":11.0,"SE(A)":0.0312968351,"SE_x(A)":0.0198401941,"SE_pred(A)":0.0242045158}
{"model":"google_gemma_2_27b_it","pass1":0.03,"pass@count":0.1333333333,"win_rate":0.0199880022,"count":10.0,"SE(A)":0.031144823,"SE_x(A)":0.0184491293,"SE_pred(A)":0.0250924218}
{"model":"qwen2.5-coder-3b-instruct","pass1":0.0222222222,"pass@count":0.0666666667,"win_rate":0.0153350371,"count":12.0,"SE(A)":0.0269124476,"SE_x(A)":0.0142265355,"SE_pred(A)":0.0228448139}
{"model":"google_gemma_3_1b_it","pass1":0.0153846154,"pass@count":0.0333333333,"win_rate":0.0104162755,"count":13.0,"SE(A)":0.022470669,"SE_x(A)":0.0143452315,"SE_pred(A)":0.0172958174}
{"model":"mistralai_mathstral_7b_v0.1","pass1":0.0151515152,"pass@count":0.0666666667,"win_rate":0.0111752448,"count":11.0,"SE(A)":0.0223024264,"SE_x(A)":0.0085531164,"SE_pred(A)":0.020597146}
{"model":"mistralai_mixtral_8x22b_instruct_v0.1","pass1":0.0151515152,"pass@count":0.1333333333,"win_rate":0.0105582979,"count":11.0,"SE(A)":0.0223024264,"SE_x(A)":0.0035425612,"SE_pred(A)":0.0220192753}
{"model":"deepseek_v2_lite_chat","pass1":0.0121212121,"pass@count":0.0333333333,"win_rate":0.0081834941,"count":11.0,"SE(A)":0.0199785621,"SE_x(A)":0.0107849275,"SE_pred(A)":0.0168174993}
{"model":"qwen3-0.6b","pass1":0.0102564103,"pass@count":0.0333333333,"win_rate":0.0069132443,"count":13.0,"SE(A)":0.0183949416,"SE_x(A)":0.0090533762,"SE_pred(A)":0.0160128154}
{"model":"qwen1.5-72b-chat","pass1":0.0060606061,"pass@count":0.0333333333,"win_rate":0.0036265853,"count":11.0,"SE(A)":0.0141702448,"SE_x(A)":0.0043563351,"SE_pred(A)":0.0134839972}
{"model":"qwen1.5-32b-chat","pass1":0.0060606061,"pass@count":0.0666666667,"win_rate":0.0038375597,"count":11.0,"SE(A)":0.0141702448,"SE_x(A)":0.0,"SE_pred(A)":0.0142133811}
{"model":"google_gemma_2_9b_it","pass1":0.003030303,"pass@count":0.0333333333,"win_rate":0.0020296687,"count":11.0,"SE(A)":0.0100351388,"SE_x(A)":0.0,"SE_pred(A)":0.0100503782}
{"model":"mistralai_mistral_7b_instruct_v0.2","pass1":0.003030303,"pass@count":0.0333333333,"win_rate":0.0025215129,"count":11.0,"SE(A)":0.0100351388,"SE_x(A)":0.0,"SE_pred(A)":0.0100503782}
{"model":"mistralai_mixtral_8x7b_instruct_v0.1","pass1":0.0027777778,"pass@count":0.0333333333,"win_rate":0.0023109742,"count":12.0,"SE(A)":0.0096091306,"SE_x(A)":0.0,"SE_pred(A)":0.0096225045}
{"model":"google_codegemma_1.1_7b_it","pass1":0.0,"pass@count":0.0,"win_rate":0.0,"count":13.0,"SE(A)":0.0,"SE_x(A)":0.0,"SE_pred(A)":0.0}
{"model":"google_gemma_2b_it","pass1":0.0,"pass@count":0.0,"win_rate":0.0,"count":13.0,"SE(A)":0.0,"SE_x(A)":0.0,"SE_pred(A)":0.0}
{"model":"google_gemma_7b_it","pass1":0.0,"pass@count":0.0,"win_rate":0.0,"count":13.0,"SE(A)":0.0,"SE_x(A)":0.0,"SE_pred(A)":0.0}
{"model":"qwen2-1.5b-instruct","pass1":0.0,"pass@count":0.0,"win_rate":0.0,"count":13.0,"SE(A)":0.0,"SE_x(A)":0.0,"SE_pred(A)":0.0}
{"model":"mistralai_mistral_7b_instruct_v0.3","pass1":0.0,"pass@count":0.0,"win_rate":0.0,"count":11.0,"SE(A)":0.0,"SE_x(A)":0.0,"SE_pred(A)":0.0}
{"model":"qwen1.5-0.5b-chat","pass1":0.0,"pass@count":0.0,"win_rate":0.0,"count":13.0,"SE(A)":0.0,"SE_x(A)":0.0,"SE_pred(A)":0.0}
{"model":"mistralai_mistral_7b_instruct_v0.1","pass1":0.0,"pass@count":0.0,"win_rate":0.0,"count":11.0,"SE(A)":0.0,"SE_x(A)":0.0,"SE_pred(A)":0.0}
{"model":"qwen2-0.5b-instruct","pass1":0.0,"pass@count":0.0,"win_rate":0.0,"count":13.0,"SE(A)":0.0,"SE_x(A)":0.0,"SE_pred(A)":0.0}
{"model":"qwen1.5-7b-chat","pass1":0.0,"pass@count":0.0,"win_rate":0.0,"count":12.0,"SE(A)":0.0,"SE_x(A)":0.0,"SE_pred(A)":0.0}
{"model":"qwen1.5-1.8b-chat","pass1":0.0,"pass@count":0.0,"win_rate":0.0,"count":11.0,"SE(A)":0.0,"SE_x(A)":0.0,"SE_pred(A)":0.0}
{"model":"qwen1.5-14b-chat","pass1":0.0,"pass@count":0.0,"win_rate":0.0,"count":12.0,"SE(A)":0.0,"SE_x(A)":0.0,"SE_pred(A)":0.0}
{"model":"mistralai_ministral_8b_instruct_2410","pass1":0.0,"pass@count":0.0,"win_rate":0.0,"count":11.0,"SE(A)":0.0,"SE_x(A)":0.0,"SE_pred(A)":0.0}
{"model":"qwen2.5-coder-0.5b-instruct","pass1":0.0,"pass@count":0.0,"win_rate":0.0,"count":13.0,"SE(A)":0.0,"SE_x(A)":0.0,"SE_pred(A)":0.0}
{"model":"qwen2.5-coder-1.5b-instruct","pass1":0.0,"pass@count":0.0,"win_rate":0.0,"count":11.0,"SE(A)":0.0,"SE_x(A)":0.0,"SE_pred(A)":0.0}
