{"model":"qwen3-32b","pass1":0.6999205822,"pass@count":0.85546875,"win_rate":0.3644840352,"count":9.0,"SE(A)":0.0041780494,"SE_x(A)":0.0034925236,"SE_pred(A)":0.002293115}
{"model":"qwen3-14b","pass1":0.673337766,"pass@count":0.8161569149,"win_rate":0.3439100412,"count":10.0,"SE(A)":0.0042755997,"SE_x(A)":0.003703565,"SE_pred(A)":0.002136436}
{"model":"llama-3.1-70B-instruct","pass1":0.6387134309,"pass@count":0.6387134309,"win_rate":0.322780625,"count":12.0,"SE(A)":0.0043793537,"SE_x(A)":0.0043793537,"SE_pred(A)":0.0}
{"model":"qwen3-8b","pass1":0.6261552527,"pass@count":0.7821642287,"win_rate":0.3101914538,"count":10.0,"SE(A)":0.0044108038,"SE_x(A)":0.0038088864,"SE_pred(A)":0.0022243144}
{"model":"qwen2-72b-instruct","pass1":0.6222365359,"pass@count":0.7810006649,"win_rate":0.3097648655,"count":4.0,"SE(A)":0.0044199649,"SE_x(A)":0.0034903326,"SE_pred(A)":0.0027117647}
{"model":"qwen2.5-coder-32b-instruct","pass1":0.604907746,"pass@count":0.7998670213,"win_rate":0.2947271603,"count":8.0,"SE(A)":0.0044568182,"SE_x(A)":0.0036252822,"SE_pred(A)":0.0025924038}
{"model":"google_gemma_3_12b_it","pass1":0.5867761726,"pass@count":0.775681516,"win_rate":0.2817761395,"count":11.0,"SE(A)":0.0044891075,"SE_x(A)":0.0038298319,"SE_pred(A)":0.0023418952}
{"model":"deepseek_r1_distill_llama_70b","pass1":0.5770260786,"pass@count":0.7582280585,"win_rate":0.2755850972,"count":9.0,"SE(A)":0.0045038675,"SE_x(A)":0.003742209,"SE_pred(A)":0.0025061314}
{"model":"qwen3-4b","pass1":0.5757676499,"pass@count":0.7437666223,"win_rate":0.2777852217,"count":11.0,"SE(A)":0.0045056413,"SE_x(A)":0.0039029854,"SE_pred(A)":0.0022511126}
{"model":"mistralai_mixtral_8x22b_instruct_v0.1","pass1":0.5180075355,"pass@count":0.8218085106,"win_rate":0.2397649209,"count":9.0,"SE(A)":0.0045553238,"SE_x(A)":0.0033547051,"SE_pred(A)":0.0030817087}
{"model":"deepseek_r1_distill_qwen_32b","pass1":0.5151574967,"pass@count":0.7346243351,"win_rate":0.2364498708,"count":8.0,"SE(A)":0.004556186,"SE_x(A)":0.0036186022,"SE_pred(A)":0.0027684922}
{"model":"qwen2-math-72b-instruct","pass1":0.5106486868,"pass@count":0.7984541223,"win_rate":0.2380365856,"count":8.0,"SE(A)":0.0045572471,"SE_x(A)":0.003359065,"SE_pred(A)":0.0030798025}
{"model":"deepseek_r1_distill_qwen_14b","pass1":0.4814453125,"pass@count":0.754737367,"win_rate":0.2148656473,"count":12.0,"SE(A)":0.0045551413,"SE_x(A)":0.003518972,"SE_pred(A)":0.0028924294}
{"model":"qwen1.5-72b-chat","pass1":0.4748171543,"pass@count":0.4748171543,"win_rate":0.2172657293,"count":1.0,"SE(A)":0.0045524958,"SE_x(A)":null,"SE_pred(A)":null}
{"model":"qwen2.5-coder-14b-instruct","pass1":0.4737616356,"pass@count":0.8193982713,"win_rate":0.2127727364,"count":10.0,"SE(A)":0.0045520003,"SE_x(A)":0.0032760198,"SE_pred(A)":0.0031604432}
{"model":"qwen1.5-32b-chat","pass1":0.4605115525,"pass@count":0.7495844415,"win_rate":0.2063851276,"count":8.0,"SE(A)":0.004544043,"SE_x(A)":0.0033515238,"SE_pred(A)":0.0030684873}
{"model":"llama-3.1-8B-instruct","pass1":0.4496343085,"pass@count":0.4496343085,"win_rate":0.1994744207,"count":15.0,"SE(A)":0.004535096,"SE_x(A)":0.004535096,"SE_pred(A)":0.0}
{"model":"qwen2-7b-instruct","pass1":0.442237367,"pass@count":0.78125,"win_rate":0.1954327006,"count":12.0,"SE(A)":0.0045277612,"SE_x(A)":0.0033557308,"SE_pred(A)":0.0030396862}
{"model":"qwen3-1.7b","pass1":0.4293481272,"pass@count":0.6801030585,"win_rate":0.191877567,"count":12.0,"SE(A)":0.0045125445,"SE_x(A)":0.0036728577,"SE_pred(A)":0.0026216739}
{"model":"mistralai_mixtral_8x7b_instruct_v0.1","pass1":0.4228474069,"pass@count":0.758643617,"win_rate":0.1878379522,"count":10.0,"SE(A)":0.0045036875,"SE_x(A)":0.003297336,"SE_pred(A)":0.0030676988}
{"model":"google_gemma_3_4b_it","pass1":0.4153667144,"pass@count":0.6620678191,"win_rate":0.1789997414,"count":13.0,"SE(A)":0.0044925064,"SE_x(A)":0.0037642982,"SE_pred(A)":0.0024520751}
{"model":"mistralai_ministral_8b_instruct_2410","pass1":0.3920907882,"pass@count":0.7844082447,"win_rate":0.166824696,"count":11.0,"SE(A)":0.0044508583,"SE_x(A)":0.0031318328,"SE_pred(A)":0.0031625564}
{"model":"qwen1.5-14b-chat","pass1":0.3791805186,"pass@count":0.696143617,"win_rate":0.1603945408,"count":10.0,"SE(A)":0.0044232019,"SE_x(A)":0.0033097272,"SE_pred(A)":0.0029343519}
{"model":"qwen2.5-coder-7b-instruct","pass1":0.3767661237,"pass@count":0.7835771277,"win_rate":0.1598209308,"count":12.0,"SE(A)":0.0044176625,"SE_x(A)":0.0030352353,"SE_pred(A)":0.0032098426}
{"model":"mistralai_mathstral_7b_v0.1","pass1":0.3665433843,"pass@count":0.792137633,"win_rate":0.1565568544,"count":12.0,"SE(A)":0.0043929091,"SE_x(A)":0.0029362838,"SE_pred(A)":0.0032673977}
{"model":"deepseek_r1_distill_llama_8b","pass1":0.3612103834,"pass@count":0.6850066489,"win_rate":0.1470263364,"count":12.0,"SE(A)":0.0043791531,"SE_x(A)":0.0032096041,"SE_pred(A)":0.0029791649}
{"model":"deepseek_r1_distill_qwen_7b","pass1":0.3610787899,"pass@count":0.6812666223,"win_rate":0.1510333151,"count":12.0,"SE(A)":0.0043788063,"SE_x(A)":0.0032745785,"SE_pred(A)":0.0029070741}
{"model":"llama-3.2-3B-instruct","pass1":0.3503158245,"pass@count":0.3503158245,"win_rate":0.1496119491,"count":19.0,"SE(A)":0.0043492274,"SE_x(A)":0.0043492274,"SE_pred(A)":0.0}
{"model":"mistralai_mistral_7b_instruct_v0.3","pass1":0.3368932846,"pass@count":0.7176695479,"win_rate":0.1425148622,"count":12.0,"SE(A)":0.0043089254,"SE_x(A)":0.0031208015,"SE_pred(A)":0.0029711002}
{"model":"qwen2-math-7b-instruct","pass1":0.3306114251,"pass@count":0.7242353723,"win_rate":0.1459527413,"count":12.0,"SE(A)":0.0042887345,"SE_x(A)":0.0029627655,"SE_pred(A)":0.003100849}
{"model":"qwen2.5-coder-3b-instruct","pass1":0.294374723,"pass@count":0.7464261968,"win_rate":0.1205636499,"count":12.0,"SE(A)":0.004154975,"SE_x(A)":0.0027153045,"SE_pred(A)":0.0031449863}
{"model":"mistralai_mistral_7b_instruct_v0.2","pass1":0.2906762522,"pass@count":0.6536735372,"win_rate":0.1194484609,"count":12.0,"SE(A)":0.0041395975,"SE_x(A)":0.0030277556,"SE_pred(A)":0.0028229353}
{"model":"deepseek_v2_lite_chat","pass1":0.2896193484,"pass@count":0.6745345745,"win_rate":0.1185460335,"count":10.0,"SE(A)":0.0041351421,"SE_x(A)":0.0028351354,"SE_pred(A)":0.0030102172}
{"model":"qwen1.5-7b-chat","pass1":0.2509391622,"pass@count":0.6245013298,"win_rate":0.101373962,"count":10.0,"SE(A)":0.003952518,"SE_x(A)":0.0025659936,"SE_pred(A)":0.0030063392}
{"model":"qwen2-math-1.5b-instruct","pass1":0.2509211547,"pass@count":0.6931515957,"win_rate":0.115005799,"count":12.0,"SE(A)":0.0039524237,"SE_x(A)":0.0024774456,"SE_pred(A)":0.0030795967}
{"model":"qwen3-0.6b","pass1":0.2382045315,"pass@count":0.5403922872,"win_rate":0.1068304322,"count":13.0,"SE(A)":0.0038835177,"SE_x(A)":0.0029187783,"SE_pred(A)":0.0025617266}
{"model":"mistralai_mistral_7b_instruct_v0.1","pass1":0.2381427305,"pass@count":0.6716256649,"win_rate":0.0995761319,"count":12.0,"SE(A)":0.0038831714,"SE_x(A)":0.0025008399,"SE_pred(A)":0.0029706599}
{"model":"llama-3.2-1B-instruct","pass1":0.2151761968,"pass@count":0.2151761968,"win_rate":0.0944463624,"count":21.0,"SE(A)":0.0037464012,"SE_x(A)":0.0037464012,"SE_pred(A)":0.0}
{"model":"deepseek_r1_distill_qwen_1.5b","pass1":0.2047733821,"pass@count":0.5943317819,"win_rate":0.0821368922,"count":12.0,"SE(A)":0.0036788603,"SE_x(A)":0.0022234513,"SE_pred(A)":0.0029309175}
{"model":"qwen2.5-coder-1.5b-instruct","pass1":0.2029518506,"pass@count":0.6477726064,"win_rate":0.0848223037,"count":12.0,"SE(A)":0.0036666535,"SE_x(A)":0.0021871163,"SE_pred(A)":0.0029429357}
{"model":"qwen2-1.5b-instruct","pass1":0.172165891,"pass@count":0.6300698138,"win_rate":0.0759430724,"count":12.0,"SE(A)":0.0034417261,"SE_x(A)":0.0018393762,"SE_pred(A)":0.0029089815}
{"model":"qwen1.5-1.8b-chat","pass1":0.1242769282,"pass@count":0.4563663564,"win_rate":0.0570516677,"count":10.0,"SE(A)":0.0030075284,"SE_x(A)":0.0016164267,"SE_pred(A)":0.0025362161}
{"model":"qwen2-0.5b-instruct","pass1":0.1172706117,"pass@count":0.5521941489,"win_rate":0.0615754104,"count":13.0,"SE(A)":0.0029331851,"SE_x(A)":0.0013352512,"SE_pred(A)":0.002611643}
{"model":"qwen2.5-coder-0.5b-instruct","pass1":0.1043691182,"pass@count":0.53125,"win_rate":0.0579049898,"count":13.0,"SE(A)":0.0027872866,"SE_x(A)":0.0010458012,"SE_pred(A)":0.0025836537}
{"model":"qwen1.5-0.5b-chat","pass1":0.1034612827,"pass@count":0.5700631649,"win_rate":0.0583325308,"count":13.0,"SE(A)":0.0027765439,"SE_x(A)":0.000983216,"SE_pred(A)":0.002596629}
