{"model":"google_gemma_3_27b_it","pass1":0.8534545455,"pass@count":0.942,"win_rate":0.4215973655,"count":11.0,"SE(A)":0.0158158076,"SE_x(A)":0.012878232,"SE_pred(A)":0.0091810081}
{"model":"deepseek_r1_distill_qwen_32b","pass1":0.839,"pass@count":0.952,"win_rate":0.4134940001,"count":10.0,"SE(A)":0.0164364838,"SE_x(A)":0.011891276,"SE_pred(A)":0.0113470505}
{"model":"qwen3-32b","pass1":0.8191111111,"pass@count":0.928,"win_rate":0.3943547522,"count":9.0,"SE(A)":0.0172144183,"SE_x(A)":0.0134949611,"SE_pred(A)":0.0106874797}
{"model":"deepseek_r1_distill_llama_70b","pass1":0.8114,"pass@count":0.942,"win_rate":0.3936513795,"count":10.0,"SE(A)":0.0174945729,"SE_x(A)":0.0129569403,"SE_pred(A)":0.0117549044}
{"model":"qwen3-14b","pass1":0.8096363636,"pass@count":0.93,"win_rate":0.3870482177,"count":11.0,"SE(A)":0.0175570682,"SE_x(A)":0.0139699583,"SE_pred(A)":0.010634421}
{"model":"qwen2-math-72b-instruct","pass1":0.8037142857,"pass@count":0.914,"win_rate":0.3811060473,"count":7.0,"SE(A)":0.0177627494,"SE_x(A)":0.0145435644,"SE_pred(A)":0.010198039}
{"model":"qwen3-8b","pass1":0.7921818182,"pass@count":0.934,"win_rate":0.3729626821,"count":11.0,"SE(A)":0.018145511,"SE_x(A)":0.01463326,"SE_pred(A)":0.0107297378}
{"model":"google_gemma_3_12b_it","pass1":0.7895,"pass@count":0.92,"win_rate":0.3728290418,"count":12.0,"SE(A)":0.0182312781,"SE_x(A)":0.0149762676,"SE_pred(A)":0.0103966778}
{"model":"deepseek_r1_distill_qwen_14b","pass1":0.7885,"pass@count":0.942,"win_rate":0.3789171786,"count":12.0,"SE(A)":0.0182629543,"SE_x(A)":0.0131974188,"SE_pred(A)":0.0126239311}
{"model":"qwen3-4b","pass1":0.7723333333,"pass@count":0.922,"win_rate":0.36019772,"count":12.0,"SE(A)":0.0187528427,"SE_x(A)":0.015010179,"SE_pred(A)":0.0112411581}
{"model":"deepseek_r1_distill_qwen_7b","pass1":0.7706666667,"pass@count":0.93,"win_rate":0.3645260586,"count":12.0,"SE(A)":0.0188010402,"SE_x(A)":0.0139357951,"SE_pred(A)":0.0126203299}
{"model":"qwen2.5-coder-32b-instruct","pass1":0.7608,"pass@count":0.902,"win_rate":0.3496623623,"count":10.0,"SE(A)":0.0190779118,"SE_x(A)":0.0151367855,"SE_pred(A)":0.0116122541}
{"model":"qwen2.5-coder-14b-instruct","pass1":0.7134545455,"pass@count":0.89,"win_rate":0.3165735167,"count":11.0,"SE(A)":0.0202206408,"SE_x(A)":0.0161718539,"SE_pred(A)":0.0121385936}
{"model":"deepseek_r1_distill_llama_8b","pass1":0.7041666667,"pass@count":0.912,"win_rate":0.3268737623,"count":12.0,"SE(A)":0.020411564,"SE_x(A)":0.0145204706,"SE_pred(A)":0.0143453086}
{"model":"qwen2-math-1.5b-instruct","pass1":0.68,"pass@count":0.68,"win_rate":0.2969127242,"count":1.0,"SE(A)":0.0208614477,"SE_x(A)":null,"SE_pred(A)":null}
{"model":"deepseek_r1_distill_qwen_1.5b","pass1":0.6758333333,"pass@count":0.896,"win_rate":0.3005485461,"count":12.0,"SE(A)":0.0209323978,"SE_x(A)":0.0152955273,"SE_pred(A)":0.0142902807}
{"model":"qwen3-1.7b","pass1":0.6668333333,"pass@count":0.87,"win_rate":0.2909418971,"count":12.0,"SE(A)":0.0210792144,"SE_x(A)":0.0164501065,"SE_pred(A)":0.0131805642}
{"model":"llama-3.1-70B-instruct","pass1":0.6556666667,"pass@count":0.658,"win_rate":0.2867469292,"count":12.0,"SE(A)":0.0212493712,"SE_x(A)":0.0211407123,"SE_pred(A)":0.0021461735}
{"model":"google_gemma_3_4b_it","pass1":0.6508333333,"pass@count":0.848,"win_rate":0.2910577825,"count":12.0,"SE(A)":0.021318973,"SE_x(A)":0.0172164951,"SE_pred(A)":0.0125734207}
{"model":"qwen2-72b-instruct","pass1":0.6448,"pass@count":0.836,"win_rate":0.2723961423,"count":10.0,"SE(A)":0.0214024746,"SE_x(A)":0.0171483504,"SE_pred(A)":0.0128062485}
{"model":"qwen2.5-coder-7b-instruct","pass1":0.6103333333,"pass@count":0.868,"win_rate":0.2574985082,"count":12.0,"SE(A)":0.021809473,"SE_x(A)":0.0160534643,"SE_pred(A)":0.0147627705}
{"model":"google_gemma_2_27b_it","pass1":0.523,"pass@count":0.742,"win_rate":0.2038442685,"count":10.0,"SE(A)":0.0223370096,"SE_x(A)":0.0185702211,"SE_pred(A)":0.0124132546}
{"model":"qwen2-7b-instruct","pass1":0.5166666667,"pass@count":0.802,"win_rate":0.2014069739,"count":12.0,"SE(A)":0.0223482537,"SE_x(A)":0.0167654647,"SE_pred(A)":0.0147771322}
{"model":"mistralai_ministral_8b_instruct_2410","pass1":0.4935,"pass@count":0.792,"win_rate":0.1889308484,"count":12.0,"SE(A)":0.0223587902,"SE_x(A)":0.0167152729,"SE_pred(A)":0.0148497526}
{"model":"mistralai_mixtral_8x22b_instruct_v0.1","pass1":0.4813333333,"pass@count":0.778,"win_rate":0.1840168222,"count":9.0,"SE(A)":0.0223450914,"SE_x(A)":0.0165016093,"SE_pred(A)":0.0150665192}
{"model":"llama-3.1-8B-instruct","pass1":0.4764615385,"pass@count":0.586,"win_rate":0.1799020162,"count":13.0,"SE(A)":0.0223358878,"SE_x(A)":0.0196238888,"SE_pred(A)":0.0106674679}
{"model":"mistralai_mathstral_7b_v0.1","pass1":0.4705,"pass@count":0.766,"win_rate":0.1771047116,"count":12.0,"SE(A)":0.0223217271,"SE_x(A)":0.0167074321,"SE_pred(A)":0.0148027434}
{"model":"qwen2.5-coder-3b-instruct","pass1":0.4689090909,"pass@count":0.8,"win_rate":0.1837265455,"count":11.0,"SE(A)":0.0223174082,"SE_x(A)":0.0153727439,"SE_pred(A)":0.0161785492}
{"model":"google_gemma_2_9b_it","pass1":0.4638181818,"pass@count":0.678,"win_rate":0.1694957496,"count":11.0,"SE(A)":0.0223020571,"SE_x(A)":0.019004304,"SE_pred(A)":0.0116712545}
{"model":"llama-3.2-3B-instruct","pass1":0.4412222222,"pass@count":0.556,"win_rate":0.1662142766,"count":18.0,"SE(A)":0.0222056377,"SE_x(A)":0.0193253572,"SE_pred(A)":0.0109371347}
{"model":"qwen1.5-72b-chat","pass1":0.4034,"pass@count":0.73,"win_rate":0.1472132666,"count":10.0,"SE(A)":0.0219393911,"SE_x(A)":0.0159848259,"SE_pred(A)":0.0150273824}
{"model":"qwen1.5-32b-chat","pass1":0.3891111111,"pass@count":0.704,"win_rate":0.1405748904,"count":9.0,"SE(A)":0.021803837,"SE_x(A)":0.0158066996,"SE_pred(A)":0.0150185071}
{"model":"qwen3-0.6b","pass1":0.3369230769,"pass@count":0.714,"win_rate":0.1221232679,"count":13.0,"SE(A)":0.0211379241,"SE_x(A)":0.014852835,"SE_pred(A)":0.0150401173}
{"model":"qwen2.5-coder-1.5b-instruct","pass1":0.3276666667,"pass@count":0.664,"win_rate":0.1134266281,"count":12.0,"SE(A)":0.0209905323,"SE_x(A)":0.0148954749,"SE_pred(A)":0.0147894311}
{"model":"qwen1.5-14b-chat","pass1":0.3054545455,"pass@count":0.636,"win_rate":0.1030754118,"count":11.0,"SE(A)":0.0205986439,"SE_x(A)":0.0149864953,"SE_pred(A)":0.0141318467}
{"model":"llama-3.2-1B-instruct","pass1":0.2789090909,"pass@count":0.282,"win_rate":0.096670002,"count":11.0,"SE(A)":0.0200558625,"SE_x(A)":0.01992124,"SE_pred(A)":0.0023198746}
{"model":"mistralai_mixtral_8x7b_instruct_v0.1","pass1":0.2523636364,"pass@count":0.612,"win_rate":0.0843607258,"count":11.0,"SE(A)":0.0194255621,"SE_x(A)":0.0129378834,"SE_pred(A)":0.014490122}
{"model":"deepseek_v2_lite_chat","pass1":0.2253333333,"pass@count":0.558,"win_rate":0.0724561987,"count":9.0,"SE(A)":0.018684658,"SE_x(A)":0.0123425551,"SE_pred(A)":0.0140277503}
{"model":"google_codegemma_1.1_7b_it","pass1":0.2085,"pass@count":0.52,"win_rate":0.065508837,"count":12.0,"SE(A)":0.0181674296,"SE_x(A)":0.0128944803,"SE_pred(A)":0.0127979639}
{"model":"qwen1.5-7b-chat","pass1":0.1649090909,"pass@count":0.454,"win_rate":0.0519068812,"count":11.0,"SE(A)":0.0165960286,"SE_x(A)":0.0107640387,"SE_pred(A)":0.0126318501}
{"model":"google_gemma_3_1b_it","pass1":0.1451666667,"pass@count":0.432,"win_rate":0.0593038395,"count":12.0,"SE(A)":0.0157539395,"SE_x(A)":0.0103677211,"SE_pred(A)":0.0118615753}
{"model":"mistralai_mistral_7b_instruct_v0.3","pass1":0.1286666667,"pass@count":0.41,"win_rate":0.038619574,"count":12.0,"SE(A)":0.0149740813,"SE_x(A)":0.0094794238,"SE_pred(A)":0.011591533}
{"model":"mistralai_mistral_7b_instruct_v0.2","pass1":0.1008333333,"pass@count":0.376,"win_rate":0.0298181166,"count":12.0,"SE(A)":0.013465955,"SE_x(A)":0.008181565,"SE_pred(A)":0.0106955102}
{"model":"qwen2.5-coder-0.5b-instruct","pass1":0.0803076923,"pass@count":0.326,"win_rate":0.0247817221,"count":13.0,"SE(A)":0.0121538773,"SE_x(A)":0.0069521305,"SE_pred(A)":0.0099691833}
{"model":"qwen2-1.5b-instruct","pass1":0.068,"pass@count":0.316,"win_rate":0.019608794,"count":12.0,"SE(A)":0.0112584191,"SE_x(A)":0.0053166861,"SE_pred(A)":0.0099239533}
{"model":"mistralai_mistral_7b_instruct_v0.1","pass1":0.064,"pass@count":0.268,"win_rate":0.019937436,"count":12.0,"SE(A)":0.0109456841,"SE_x(A)":0.0057778679,"SE_pred(A)":0.009296464}
{"model":"google_gemma_7b_it","pass1":0.0576666667,"pass@count":0.214,"win_rate":0.0168257081,"count":12.0,"SE(A)":0.0104250873,"SE_x(A)":0.0068368889,"SE_pred(A)":0.0078701584}
{"model":"qwen2-0.5b-instruct","pass1":0.0290769231,"pass@count":0.194,"win_rate":0.0098521959,"count":13.0,"SE(A)":0.0075141807,"SE_x(A)":0.003072014,"SE_pred(A)":0.0068575244}
{"model":"qwen1.5-1.8b-chat","pass1":0.0141818182,"pass@count":0.106,"win_rate":0.0043449977,"count":11.0,"SE(A)":0.0052878529,"SE_x(A)":0.0016285876,"SE_pred(A)":0.0050308141}
{"model":"qwen1.5-0.5b-chat","pass1":0.008,"pass@count":0.066,"win_rate":0.0030726244,"count":12.0,"SE(A)":0.0039839679,"SE_x(A)":0.0010699193,"SE_pred(A)":0.0038376129}
{"model":"google_gemma_2b_it","pass1":0.0011666667,"pass@count":0.014,"win_rate":0.0004704957,"count":12.0,"SE(A)":0.0015266339,"SE_x(A)":0.0,"SE_pred(A)":0.0015275252}
