{"model":"qwen3-14b","pass1":0.7758030245,"pass@count":0.9146341463,"win_rate":0.3799897409,"count":1079.0,"SE(A)":0.0325663199,"SE_x(A)":0.0293549444,"SE_pred(A)":0.0141015045}
{"model":"qwen3-32b","pass1":0.7732893746,"pass@count":0.9329268293,"win_rate":0.3767004474,"count":1069.0,"SE(A)":0.032695278,"SE_x(A)":0.027376897,"SE_pred(A)":0.0178741914}
{"model":"google_gemma_3_27b_it","pass1":0.7569686411,"pass@count":0.7865853659,"win_rate":0.3617148268,"count":7.0,"SE(A)":0.0334925472,"SE_x(A)":0.0321713662,"SE_pred(A)":0.0093141782}
{"model":"qwen2.5-coder-32b-instruct","pass1":0.7497172949,"pass@count":0.9085365854,"win_rate":0.3684887867,"count":1100.0,"SE(A)":0.033825347,"SE_x(A)":0.0267680198,"SE_pred(A)":0.0206791494}
{"model":"qwen2.5-coder-14b-instruct","pass1":0.7484685459,"pass@count":0.9329268293,"win_rate":0.3658075808,"count":1079.0,"SE(A)":0.0338813732,"SE_x(A)":0.0263437232,"SE_pred(A)":0.0213062361}
{"model":"google_gemma_3_12b_it","pass1":0.7288096922,"pass@count":0.8536585366,"win_rate":0.3397382172,"count":1069.0,"SE(A)":0.0347154043,"SE_x(A)":0.0330628425,"SE_pred(A)":0.0105833711}
{"model":"llama-3.1-70B-instruct","pass1":0.7072172949,"pass@count":0.8841463415,"win_rate":0.3296169539,"count":1100.0,"SE(A)":0.0355326192,"SE_x(A)":0.0291775414,"SE_pred(A)":0.0202790065}
{"model":"qwen3-8b","pass1":0.7067184036,"pass@count":0.9207317073,"win_rate":0.3409254495,"count":1100.0,"SE(A)":0.0355503337,"SE_x(A)":0.028763139,"SE_pred(A)":0.0208927755}
{"model":"qwen3-4b","pass1":0.6967738359,"pass@count":0.8963414634,"win_rate":0.3230880942,"count":1100.0,"SE(A)":0.0358927992,"SE_x(A)":0.0318227622,"SE_pred(A)":0.0166013505}
{"model":"google_gemma_2_27b_it","pass1":0.6637416851,"pass@count":0.8658536585,"win_rate":0.2975526051,"count":1100.0,"SE(A)":0.0368904658,"SE_x(A)":0.0337992963,"SE_pred(A)":0.0147822204}
{"model":"mistralai_mixtral_8x22b_instruct_v0.1","pass1":0.6590435736,"pass@count":0.9268292683,"win_rate":0.2981337904,"count":890.0,"SE(A)":0.0370155819,"SE_x(A)":0.0293132939,"SE_pred(A)":0.0226027455}
{"model":"deepseek_r1_distill_qwen_32b","pass1":0.627172949,"pass@count":0.9268292683,"win_rate":0.305409453,"count":1100.0,"SE(A)":0.0377594311,"SE_x(A)":0.0233034398,"SE_pred(A)":0.0297106771}
{"model":"qwen2-math-72b-instruct","pass1":0.6168118467,"pass@count":0.8902439024,"win_rate":0.27799822,"count":140.0,"SE(A)":0.0379629949,"SE_x(A)":0.0284096311,"SE_pred(A)":0.0251809817}
{"model":"google_gemma_3_4b_it","pass1":0.609556541,"pass@count":0.7743902439,"win_rate":0.2706068413,"count":1100.0,"SE(A)":0.0380946642,"SE_x(A)":0.0355624133,"SE_pred(A)":0.0136571667}
{"model":"google_gemma_2_9b_it","pass1":0.552289357,"pass@count":0.7926829268,"win_rate":0.2336716281,"count":1100.0,"SE(A)":0.0388293502,"SE_x(A)":0.0360012079,"SE_pred(A)":0.0145475587}
{"model":"llama-3.1-8B-instruct","pass1":0.5484645233,"pass@count":0.9024390244,"win_rate":0.2286410715,"count":1100.0,"SE(A)":0.0388595964,"SE_x(A)":0.0307503144,"SE_pred(A)":0.0237589225}
{"model":"deepseek_r1_distill_qwen_14b","pass1":0.5191962306,"pass@count":0.9207317073,"win_rate":0.2467098795,"count":1100.0,"SE(A)":0.0390146552,"SE_x(A)":0.0232322501,"SE_pred(A)":0.0313433546}
{"model":"qwen2-7b-instruct","pass1":0.5009478936,"pass@count":0.9268292683,"win_rate":0.2161195228,"count":1100.0,"SE(A)":0.0390433703,"SE_x(A)":0.0248435248,"SE_pred(A)":0.030119496}
{"model":"deepseek_r1_distill_llama_70b","pass1":0.4994345898,"pass@count":0.9390243902,"win_rate":0.2364270168,"count":1100.0,"SE(A)":0.0390434155,"SE_x(A)":0.0225757178,"SE_pred(A)":0.0318547526}
{"model":"qwen3-1.7b","pass1":0.4984645233,"pass@count":0.8658536585,"win_rate":0.2117848708,"count":1100.0,"SE(A)":0.0390432564,"SE_x(A)":0.0332232451,"SE_pred(A)":0.0205083361}
{"model":"qwen2-72b-instruct","pass1":0.4840022173,"pass@count":0.9329268293,"win_rate":0.2207095254,"count":1100.0,"SE(A)":0.0390234507,"SE_x(A)":0.0236329337,"SE_pred(A)":0.0310534079}
{"model":"google_codegemma_1.1_7b_it","pass1":0.4741297117,"pass@count":0.8475609756,"win_rate":0.1874163347,"count":1100.0,"SE(A)":0.0389911441,"SE_x(A)":0.0316308562,"SE_pred(A)":0.0227990845}
{"model":"qwen2.5-coder-7b-instruct","pass1":0.4739966741,"pass@count":0.9390243902,"win_rate":0.2083544674,"count":1100.0,"SE(A)":0.0389906045,"SE_x(A)":0.0228195583,"SE_pred(A)":0.0316154234}
{"model":"qwen1.5-14b-chat","pass1":0.4468348115,"pass@count":0.8963414634,"win_rate":0.1769265015,"count":1100.0,"SE(A)":0.0388220976,"SE_x(A)":0.0303179522,"SE_pred(A)":0.0242482377}
{"model":"qwen2.5-coder-3b-instruct","pass1":0.4428769401,"pass@count":0.9207317073,"win_rate":0.192176441,"count":1100.0,"SE(A)":0.0387878026,"SE_x(A)":0.0227675723,"SE_pred(A)":0.0314027273}
{"model":"llama-3.2-3B-instruct","pass1":0.4418348115,"pass@count":0.8719512195,"win_rate":0.1742230153,"count":1100.0,"SE(A)":0.0387783579,"SE_x(A)":0.0293424266,"SE_pred(A)":0.0253531663}
{"model":"deepseek_v2_lite_chat","pass1":0.4221008869,"pass@count":0.9024390244,"win_rate":0.1681718406,"count":1100.0,"SE(A)":0.0385666772,"SE_x(A)":0.027689649,"SE_pred(A)":0.0268453334}
{"model":"qwen1.5-32b-chat","pass1":0.4157427938,"pass@count":0.8353658537,"win_rate":0.1656760171,"count":1100.0,"SE(A)":0.0384850876,"SE_x(A)":0.0316202458,"SE_pred(A)":0.0219376851}
{"model":"mistralai_ministral_8b_instruct_2410","pass1":0.4011031042,"pass@count":0.9207317073,"win_rate":0.1653122435,"count":1100.0,"SE(A)":0.0382720846,"SE_x(A)":0.0235581816,"SE_pred(A)":0.0301623034}
{"model":"deepseek_r1_distill_llama_8b","pass1":0.3790354767,"pass@count":0.9146341463,"win_rate":0.1609137611,"count":1100.0,"SE(A)":0.0378836138,"SE_x(A)":0.0234808475,"SE_pred(A)":0.0297290766}
{"model":"qwen1.5-72b-chat","pass1":0.3690798226,"pass@count":0.8475609756,"win_rate":0.1442871916,"count":1100.0,"SE(A)":0.0376812617,"SE_x(A)":0.0296055015,"SE_pred(A)":0.023310765}
{"model":"mistralai_mathstral_7b_v0.1","pass1":0.3625997783,"pass@count":0.9207317073,"win_rate":0.1399310702,"count":1100.0,"SE(A)":0.0375403181,"SE_x(A)":0.0239870784,"SE_pred(A)":0.0288772497}
{"model":"google_gemma_3_1b_it","pass1":0.3609090909,"pass@count":0.5609756098,"win_rate":0.1376415422,"count":1100.0,"SE(A)":0.0375023348,"SE_x(A)":0.0352146372,"SE_pred(A)":0.0128978466}
{"model":"qwen2.5-coder-1.5b-instruct","pass1":0.3530266075,"pass@count":0.9207317073,"win_rate":0.1384150538,"count":1100.0,"SE(A)":0.037318571,"SE_x(A)":0.022730759,"SE_pred(A)":0.0295971002}
{"model":"qwen1.5-7b-chat","pass1":0.3320288248,"pass@count":0.8414634146,"win_rate":0.1217824492,"count":1100.0,"SE(A)":0.0367743353,"SE_x(A)":0.0279588028,"SE_pred(A)":0.0238884298}
{"model":"mistralai_mistral_7b_instruct_v0.3","pass1":0.3173558758,"pass@count":0.8719512195,"win_rate":0.1099744616,"count":1100.0,"SE(A)":0.0363453224,"SE_x(A)":0.0284375815,"SE_pred(A)":0.0226337452}
{"model":"qwen2.5-coder-0.5b-instruct","pass1":0.3067073171,"pass@count":0.8841463415,"win_rate":0.1177703739,"count":1100.0,"SE(A)":0.0360079546,"SE_x(A)":0.0221952461,"SE_pred(A)":0.0283539035}
{"model":"deepseek_r1_distill_qwen_7b","pass1":0.2825720621,"pass@count":0.9207317073,"win_rate":0.117121314,"count":1100.0,"SE(A)":0.0351586207,"SE_x(A)":0.0197223486,"SE_pred(A)":0.0291059715}
{"model":"llama-3.2-1B-instruct","pass1":0.2589800443,"pass@count":0.7682926829,"win_rate":0.0871815046,"count":1100.0,"SE(A)":0.0342078814,"SE_x(A)":0.0252708148,"SE_pred(A)":0.0230556949}
{"model":"mistralai_mistral_7b_instruct_v0.1","pass1":0.2239966741,"pass@count":0.8414634146,"win_rate":0.0723908096,"count":1100.0,"SE(A)":0.0325559709,"SE_x(A)":0.0229148114,"SE_pred(A)":0.0231258008}
{"model":"qwen3-0.6b","pass1":0.2119955654,"pass@count":0.8231707317,"win_rate":0.0698970388,"count":1100.0,"SE(A)":0.0319158044,"SE_x(A)":0.0228419136,"SE_pred(A)":0.0222904812}
{"model":"google_gemma_7b_it","pass1":0.2039855876,"pass@count":0.6585365854,"win_rate":0.0649877297,"count":1100.0,"SE(A)":0.0314657642,"SE_x(A)":0.0259948479,"SE_pred(A)":0.0177302621}
{"model":"qwen2-1.5b-instruct","pass1":0.1396157038,"pass@count":0.8536585366,"win_rate":0.0519931612,"count":1058.0,"SE(A)":0.0270639666,"SE_x(A)":0.0120281586,"SE_pred(A)":0.0242442094}
{"model":"google_gemma_2b_it","pass1":0.1370011086,"pass@count":0.4573170732,"win_rate":0.0359677219,"count":1100.0,"SE(A)":0.0268500585,"SE_x(A)":0.0228323626,"SE_pred(A)":0.0141282998}
{"model":"mistralai_mistral_7b_instruct_v0.2","pass1":0.0776274945,"pass@count":0.6951219512,"win_rate":0.0245078846,"count":1100.0,"SE(A)":0.0208948393,"SE_x(A)":0.0115579732,"SE_pred(A)":0.0174071125}
{"model":"qwen2-0.5b-instruct","pass1":0.074113082,"pass@count":0.6768292683,"win_rate":0.021179091,"count":1100.0,"SE(A)":0.0204552356,"SE_x(A)":0.0107996723,"SE_pred(A)":0.017371924}
{"model":"qwen1.5-1.8b-chat","pass1":0.0542073171,"pass@count":0.6219512195,"win_rate":0.0151507744,"count":1100.0,"SE(A)":0.017680926,"SE_x(A)":0.0093047541,"SE_pred(A)":0.0150345169}
{"model":"deepseek_r1_distill_qwen_1.5b","pass1":0.0231818182,"pass@count":0.7134146341,"win_rate":0.0072244355,"count":1100.0,"SE(A)":0.0117505634,"SE_x(A)":0.0038013493,"SE_pred(A)":0.0111186997}
{"model":"qwen1.5-0.5b-chat","pass1":0.0179199571,"pass@count":0.237804878,"win_rate":0.0030172691,"count":1047.0,"SE(A)":0.0103590517,"SE_x(A)":0.0064441506,"SE_pred(A)":0.0081106643}
