{"model":"qwen3-32b","pass1":0.8214177407,"pass@count":0.9001137225,"win_rate":0.313040839,"count":10.0,"SE(A)":0.0037284971,"SE_x(A)":0.0032727052,"SE_pred(A)":0.0017863626}
{"model":"qwen3-14b","pass1":0.8120451099,"pass@count":0.8711144807,"win_rate":0.305987311,"count":10.0,"SE(A)":0.0038032029,"SE_x(A)":0.0034781071,"SE_pred(A)":0.0015385459}
{"model":"qwen3-8b","pass1":0.7887414708,"pass@count":0.8738627748,"win_rate":0.2910384265,"count":10.0,"SE(A)":0.0039738098,"SE_x(A)":0.0035176623,"SE_pred(A)":0.0018485713}
{"model":"llama-3.1-70B-instruct","pass1":0.7830742987,"pass@count":0.7830742987,"win_rate":0.290073261,"count":12.0,"SE(A)":0.0040122649,"SE_x(A)":0.0040122649,"SE_pred(A)":0.0}
{"model":"deepseek_r1_distill_llama_70b","pass1":0.7811473338,"pass@count":0.9025777104,"win_rate":0.2908359044,"count":9.0,"SE(A)":0.0040250845,"SE_x(A)":0.0032832079,"SE_pred(A)":0.0023284869}
{"model":"qwen2-72b-instruct","pass1":0.7651630023,"pass@count":0.8880780895,"win_rate":0.2772076754,"count":7.0,"SE(A)":0.0041266045,"SE_x(A)":0.0033531051,"SE_pred(A)":0.0024053173}
{"model":"google_gemma_3_27b_it","pass1":0.7510140258,"pass@count":0.8138741471,"win_rate":0.2649782158,"count":10.0,"SE(A)":0.0042096314,"SE_x(A)":0.0039322823,"SE_pred(A)":0.0015027151}
{"model":"deepseek_r1_distill_qwen_32b","pass1":0.7509855951,"pass@count":0.9110121304,"win_rate":0.2855461506,"count":10.0,"SE(A)":0.0042097921,"SE_x(A)":0.0030516601,"SE_pred(A)":0.0028999517}
{"model":"qwen2-math-72b-instruct","pass1":0.7485121304,"pass@count":0.8876042456,"win_rate":0.2661541651,"count":10.0,"SE(A)":0.0042236756,"SE_x(A)":0.0034741137,"SE_pred(A)":0.0024020761}
{"model":"qwen2.5-coder-32b-instruct","pass1":0.7472232752,"pass@count":0.8647649735,"win_rate":0.2630688359,"count":10.0,"SE(A)":0.0042308375,"SE_x(A)":0.0036933778,"SE_pred(A)":0.0020637215}
{"model":"deepseek_r1_distill_qwen_14b","pass1":0.7448193076,"pass@count":0.9086429113,"win_rate":0.2747386596,"count":12.0,"SE(A)":0.0042440645,"SE_x(A)":0.0031875609,"SE_pred(A)":0.0028020598}
{"model":"google_gemma_2_27b_it","pass1":0.7434609553,"pass@count":0.8648597422,"win_rate":0.2633513711,"count":9.0,"SE(A)":0.0042514632,"SE_x(A)":0.0036382154,"SE_pred(A)":0.00219962}
{"model":"google_gemma_3_12b_it","pass1":0.7421341926,"pass@count":0.8315011372,"win_rate":0.2590326483,"count":11.0,"SE(A)":0.0042586378,"SE_x(A)":0.0038629508,"SE_pred(A)":0.0017926538}
{"model":"qwen3-4b","pass1":0.7355635583,"pass@count":0.8269522365,"win_rate":0.2571320551,"count":12.0,"SE(A)":0.0042934199,"SE_x(A)":0.0038906091,"SE_pred(A)":0.0018156582}
{"model":"deepseek_r1_distill_qwen_7b","pass1":0.7326178292,"pass@count":0.9055155421,"win_rate":0.2645110471,"count":12.0,"SE(A)":0.0043086138,"SE_x(A)":0.0032730027,"SE_pred(A)":0.0028020719}
{"model":"qwen2.5-coder-14b-instruct","pass1":0.7092210008,"pass@count":0.8871304018,"win_rate":0.2424492349,"count":10.0,"SE(A)":0.0044208413,"SE_x(A)":0.0035145493,"SE_pred(A)":0.0026817495}
{"model":"google_gemma_2_9b_it","pass1":0.7061220622,"pass@count":0.8588893101,"win_rate":0.242495123,"count":11.0,"SE(A)":0.0044346157,"SE_x(A)":0.0037322457,"SE_pred(A)":0.0023950279}
{"model":"mistralai_mixtral_8x22b_instruct_v0.1","pass1":0.6962092494,"pass@count":0.9075056861,"win_rate":0.241757249,"count":10.0,"SE(A)":0.0044770276,"SE_x(A)":0.0032881583,"SE_pred(A)":0.0030383864}
{"model":"qwen1.5-32b-chat","pass1":0.6711050038,"pass@count":0.8892153146,"win_rate":0.227152815,"count":10.0,"SE(A)":0.0045735821,"SE_x(A)":0.0034398043,"SE_pred(A)":0.0030141996}
{"model":"qwen1.5-72b-chat","pass1":0.669053937,"pass@count":0.8559514784,"win_rate":0.2246144123,"count":7.0,"SE(A)":0.0045808048,"SE_x(A)":0.0035921803,"SE_pred(A)":0.0028425364}
{"model":"deepseek_r1_distill_llama_8b","pass1":0.6684356836,"pass@count":0.906368461,"win_rate":0.2353373494,"count":12.0,"SE(A)":0.0045829626,"SE_x(A)":0.0032008422,"SE_pred(A)":0.0032799627}
{"model":"qwen2-math-7b-instruct","pass1":0.6554602603,"pass@count":0.8688400303,"win_rate":0.2135947587,"count":12.0,"SE(A)":0.0046262112,"SE_x(A)":0.0036444684,"SE_pred(A)":0.0028495052}
{"model":"mistralai_ministral_8b_instruct_2410","pass1":0.6463660487,"pass@count":0.8756633813,"win_rate":0.2101346767,"count":11.0,"SE(A)":0.0046542409,"SE_x(A)":0.0035733305,"SE_pred(A)":0.0029821581}
{"model":"google_gemma_3_4b_it","pass1":0.6433341109,"pass@count":0.7921721001,"win_rate":0.2044830226,"count":13.0,"SE(A)":0.0046631747,"SE_x(A)":0.0040986467,"SE_pred(A)":0.0022240264}
{"model":"qwen2-7b-instruct","pass1":0.6415134572,"pass@count":0.8881728582,"win_rate":0.2091110169,"count":12.0,"SE(A)":0.0046684415,"SE_x(A)":0.0034941539,"SE_pred(A)":0.0030960031}
{"model":"llama-3.1-8B-instruct","pass1":0.6413949962,"pass@count":0.6413949962,"win_rate":0.2121622635,"count":16.0,"SE(A)":0.0046687817,"SE_x(A)":0.0046687817,"SE_pred(A)":0.0}
{"model":"qwen2-math-1.5b-instruct","pass1":0.6153177913,"pass@count":0.8423995451,"win_rate":0.1920102547,"count":12.0,"SE(A)":0.0047362367,"SE_x(A)":0.0037886448,"SE_pred(A)":0.0028422013}
{"model":"qwen2.5-coder-7b-instruct","pass1":0.6141568739,"pass@count":0.8815390447,"win_rate":0.1960962886,"count":12.0,"SE(A)":0.0047389012,"SE_x(A)":0.0034709837,"SE_pred(A)":0.003226369}
{"model":"qwen3-1.7b","pass1":0.587961208,"pass@count":0.7692380591,"win_rate":0.1843333312,"count":12.0,"SE(A)":0.0047915497,"SE_x(A)":0.0041049106,"SE_pred(A)":0.0024715699}
{"model":"mistralai_mathstral_7b_v0.1","pass1":0.5765494693,"pass@count":0.8727255497,"win_rate":0.177645892,"count":12.0,"SE(A)":0.0048100793,"SE_x(A)":0.0034787333,"SE_pred(A)":0.0033219388}
{"model":"deepseek_r1_distill_qwen_1.5b","pass1":0.5702473465,"pass@count":0.8825815011,"win_rate":0.191765845,"count":12.0,"SE(A)":0.0048191842,"SE_x(A)":0.0031839626,"SE_pred(A)":0.0036175847}
{"model":"qwen1.5-14b-chat","pass1":0.5572213798,"pass@count":0.8291319181,"win_rate":0.1694654132,"count":10.0,"SE(A)":0.0048354825,"SE_x(A)":0.0036595754,"SE_pred(A)":0.003160601}
{"model":"llama-3.2-3B-instruct","pass1":0.5526914329,"pass@count":0.5526914329,"win_rate":0.1690024483,"count":18.0,"SE(A)":0.0048403591,"SE_x(A)":0.0048403591,"SE_pred(A)":0.0}
{"model":"deepseek_v2_lite_chat","pass1":0.499308188,"pass@count":0.8056292646,"win_rate":0.145326826,"count":10.0,"SE(A)":0.0048674578,"SE_x(A)":0.0036173947,"SE_pred(A)":0.0032567778}
{"model":"qwen2.5-coder-3b-instruct","pass1":0.4929002401,"pass@count":0.8250568613,"win_rate":0.1409327607,"count":12.0,"SE(A)":0.0048669718,"SE_x(A)":0.0034974953,"SE_pred(A)":0.0033845148}
{"model":"mistralai_mixtral_8x7b_instruct_v0.1","pass1":0.4923521607,"pass@count":0.8308377559,"win_rate":0.1481612008,"count":10.0,"SE(A)":0.0048668931,"SE_x(A)":0.0033767106,"SE_pred(A)":0.0035049214}
{"model":"qwen1.5-7b-chat","pass1":0.4297418843,"pass@count":0.7913191812,"win_rate":0.1222675425,"count":11.0,"SE(A)":0.0048191693,"SE_x(A)":0.0033341964,"SE_pred(A)":0.0034795873}
{"model":"google_codegemma_1.1_7b_it","pass1":0.3745698956,"pass@count":0.7142721759,"win_rate":0.0958217965,"count":13.0,"SE(A)":0.0047118172,"SE_x(A)":0.0034567889,"SE_pred(A)":0.0032018482}
{"model":"mistralai_mistral_7b_instruct_v0.3","pass1":0.3626010867,"pass@count":0.7674374526,"win_rate":0.0999318855,"count":12.0,"SE(A)":0.0046800751,"SE_x(A)":0.0031384112,"SE_pred(A)":0.0034718119}
{"model":"qwen2.5-coder-1.5b-instruct","pass1":0.3488596159,"pass@count":0.7400492798,"win_rate":0.0895607028,"count":12.0,"SE(A)":0.0046397574,"SE_x(A)":0.0031665265,"SE_pred(A)":0.0033912326}
{"model":"qwen3-0.6b","pass1":0.3004898816,"pass@count":0.631444276,"win_rate":0.076727426,"count":13.0,"SE(A)":0.0044631814,"SE_x(A)":0.0032865116,"SE_pred(A)":0.0030197399}
{"model":"mistralai_mistral_7b_instruct_v0.2","pass1":0.2981267374,"pass@count":0.6811978772,"win_rate":0.0821358511,"count":12.0,"SE(A)":0.0044530997,"SE_x(A)":0.0030451707,"SE_pred(A)":0.0032491588}
{"model":"google_gemma_3_1b_it","pass1":0.2958285949,"pass@count":0.5653904473,"win_rate":0.0724884176,"count":12.0,"SE(A)":0.0044431592,"SE_x(A)":0.0035140049,"SE_pred(A)":0.0027190869}
{"model":"qwen2-1.5b-instruct","pass1":0.2582448825,"pass@count":0.7040371494,"win_rate":0.0655908689,"count":12.0,"SE(A)":0.0042606783,"SE_x(A)":0.0025636355,"SE_pred(A)":0.0034031092}
{"model":"llama-3.2-1B-instruct","pass1":0.2427028052,"pass@count":0.2427028052,"win_rate":0.0575200197,"count":22.0,"SE(A)":0.0041735266,"SE_x(A)":0.0041735266,"SE_pred(A)":0.0}
{"model":"mistralai_mistral_7b_instruct_v0.1","pass1":0.2371114481,"pass@count":0.6400682335,"win_rate":0.0577445729,"count":12.0,"SE(A)":0.0041403725,"SE_x(A)":0.0026437462,"SE_pred(A)":0.0031864229}
{"model":"google_gemma_7b_it","pass1":0.1914723907,"pass@count":0.4791508719,"win_rate":0.0454433436,"count":12.0,"SE(A)":0.0038303029,"SE_x(A)":0.0027615722,"SE_pred(A)":0.0026542304}
{"model":"qwen1.5-1.8b-chat","pass1":0.15093218,"pass@count":0.5357278241,"win_rate":0.0430104453,"count":11.0,"SE(A)":0.0034849353,"SE_x(A)":0.0018521986,"SE_pred(A)":0.0029519713}
{"model":"qwen2-0.5b-instruct","pass1":0.1161391205,"pass@count":0.4861637604,"win_rate":0.0276410255,"count":12.0,"SE(A)":0.0031189889,"SE_x(A)":0.0016185823,"SE_pred(A)":0.0026661364}
{"model":"qwen2.5-coder-0.5b-instruct","pass1":0.0903875313,"pass@count":0.4177407127,"win_rate":0.0219020967,"count":13.0,"SE(A)":0.002791355,"SE_x(A)":0.0014696294,"SE_pred(A)":0.0023731524}
{"model":"google_gemma_2b_it","pass1":0.0625552818,"pass@count":0.2388172858,"win_rate":0.0144701784,"count":12.0,"SE(A)":0.0023574226,"SE_x(A)":0.0015131446,"SE_pred(A)":0.0018077153}
{"model":"qwen1.5-0.5b-chat","pass1":0.0429375401,"pass@count":0.2786201668,"win_rate":0.0121629489,"count":13.0,"SE(A)":0.0019734282,"SE_x(A)":0.0008309406,"SE_pred(A)":0.0017899599}
