{"model":"deepseek_r1_distill_qwen_32b","pass1":0.2659393939,"pass@count":0.5666666667,"win_rate":0.2202334973,"count":1100.0,"SE(A)":0.0806671417,"SE_x(A)":0.0680525792,"SE_pred(A)":0.0433132108}
{"model":"google_gemma_3_27b_it","pass1":0.2511017838,"pass@count":0.8,"win_rate":0.2048451518,"count":953.0,"SE(A)":0.079172739,"SE_x(A)":0.0672188901,"SE_pred(A)":0.041832325}
{"model":"deepseek_r1_distill_llama_70b","pass1":0.2504848485,"pass@count":0.6333333333,"win_rate":0.2058418941,"count":1100.0,"SE(A)":0.079107983,"SE_x(A)":0.066994959,"SE_pred(A)":0.0420683781}
{"model":"deepseek_r1_distill_qwen_14b","pass1":0.2401515152,"pass@count":0.5666666667,"win_rate":0.1952002288,"count":1100.0,"SE(A)":0.0779911886,"SE_x(A)":0.0668227855,"SE_pred(A)":0.0402161763}
{"model":"deepseek_r1_distill_qwen_7b","pass1":0.2253939394,"pass@count":0.5666666667,"win_rate":0.1814182479,"count":1100.0,"SE(A)":0.07628707,"SE_x(A)":0.0661435646,"SE_pred(A)":0.0380098134}
{"model":"qwen3-14b","pass1":0.2002162496,"pass@count":0.7666666667,"win_rate":0.1614343599,"count":1079.0,"SE(A)":0.0730592689,"SE_x(A)":0.0576575553,"SE_pred(A)":0.0448694003}
{"model":"qwen3-8b","pass1":0.1862121212,"pass@count":0.7333333333,"win_rate":0.1483752693,"count":1100.0,"SE(A)":0.0710720684,"SE_x(A)":0.0551923689,"SE_pred(A)":0.0447776877}
{"model":"qwen3-32b","pass1":0.1848929664,"pass@count":0.8,"win_rate":0.1501036405,"count":1090.0,"SE(A)":0.0708772548,"SE_x(A)":0.0534349954,"SE_pred(A)":0.0465648635}
{"model":"deepseek_r1_distill_llama_8b","pass1":0.1834242424,"pass@count":0.5666666667,"win_rate":0.1445184323,"count":1100.0,"SE(A)":0.070658755,"SE_x(A)":0.058151795,"SE_pred(A)":0.0401376182}
{"model":"google_gemma_3_12b_it","pass1":0.1728484848,"pass@count":0.7333333333,"win_rate":0.1352837772,"count":1100.0,"SE(A)":0.0690342635,"SE_x(A)":0.060474995,"SE_pred(A)":0.0332942115}
{"model":"qwen3-4b","pass1":0.1713333333,"pass@count":0.7333333333,"win_rate":0.1364176647,"count":1100.0,"SE(A)":0.0687939489,"SE_x(A)":0.0527087273,"SE_pred(A)":0.0442085679}
{"model":"deepseek_r1_distill_qwen_1.5b","pass1":0.1504545455,"pass@count":0.5,"win_rate":0.1170670944,"count":1100.0,"SE(A)":0.0652732654,"SE_x(A)":0.0502603355,"SE_pred(A)":0.041647303}
{"model":"qwen2-math-72b-instruct","pass1":0.1152380952,"pass@count":0.7,"win_rate":0.0923343352,"count":140.0,"SE(A)":0.0582975919,"SE_x(A)":0.0386015402,"SE_pred(A)":0.0436867293}
{"model":"qwen2.5-coder-32b-instruct","pass1":0.1139090909,"pass@count":0.7333333333,"win_rate":0.089948986,"count":1100.0,"SE(A)":0.0580039682,"SE_x(A)":0.0405732025,"SE_pred(A)":0.0414520876}
{"model":"google_gemma_3_4b_it","pass1":0.108030303,"pass@count":0.5666666667,"win_rate":0.0825071624,"count":1100.0,"SE(A)":0.0566744377,"SE_x(A)":0.0448698707,"SE_pred(A)":0.0346220536}
{"model":"qwen3-1.7b","pass1":0.0854545455,"pass@count":0.5666666667,"win_rate":0.0674660639,"count":1100.0,"SE(A)":0.0510398753,"SE_x(A)":0.0329265021,"SE_pred(A)":0.0389989016}
{"model":"qwen2.5-coder-14b-instruct","pass1":0.0795489651,"pass@count":0.7,"win_rate":0.0612379148,"count":1079.0,"SE(A)":0.0494034166,"SE_x(A)":0.0294759381,"SE_pred(A)":0.0396467735}
{"model":"qwen2.5-coder-7b-instruct","pass1":0.0314242424,"pass@count":0.6666666667,"win_rate":0.0242313684,"count":1100.0,"SE(A)":0.0318521372,"SE_x(A)":0.0110089261,"SE_pred(A)":0.0298891651}
{"model":"qwen2-72b-instruct","pass1":0.0261818182,"pass@count":0.6666666667,"win_rate":0.0213291751,"count":1100.0,"SE(A)":0.0291526617,"SE_x(A)":0.0089361822,"SE_pred(A)":0.0277492763}
{"model":"llama-3.1-70B-instruct","pass1":0.024969697,"pass@count":0.5666666667,"win_rate":0.0198879861,"count":1100.0,"SE(A)":0.0284875477,"SE_x(A)":0.0111361396,"SE_pred(A)":0.0262207317}
{"model":"qwen2.5-coder-3b-instruct","pass1":0.0139090909,"pass@count":0.6333333333,"win_rate":0.0113870545,"count":1100.0,"SE(A)":0.0213819457,"SE_x(A)":0.005269694,"SE_pred(A)":0.0207224016}
{"model":"mistralai_ministral_8b_instruct_2410","pass1":0.0116666667,"pass@count":0.5333333333,"win_rate":0.0095493936,"count":1100.0,"SE(A)":0.0196048936,"SE_x(A)":0.0045254035,"SE_pred(A)":0.0190754443}
{"model":"qwen3-0.6b","pass1":0.0108181818,"pass@count":0.4666666667,"win_rate":0.0086990412,"count":1100.0,"SE(A)":0.0188866344,"SE_x(A)":0.0045416127,"SE_pred(A)":0.0183324497}
{"model":"mistralai_mixtral_8x22b_instruct_v0.1","pass1":0.0101818182,"pass@count":0.6666666667,"win_rate":0.0086118016,"count":1100.0,"SE(A)":0.0183286195,"SE_x(A)":0.0036724939,"SE_pred(A)":0.0179569229}
{"model":"google_gemma_2_27b_it","pass1":0.0083636364,"pass@count":0.4333333333,"win_rate":0.0068206716,"count":1100.0,"SE(A)":0.016626972,"SE_x(A)":0.004091659,"SE_pred(A)":0.0161156608}
{"model":"llama-3.1-8B-instruct","pass1":0.0080606061,"pass@count":0.5333333333,"win_rate":0.0069677677,"count":1100.0,"SE(A)":0.0163254736,"SE_x(A)":0.0023626916,"SE_pred(A)":0.0161535995}
{"model":"llama-3.2-3B-instruct","pass1":0.0076060606,"pass@count":0.4666666667,"win_rate":0.0064192571,"count":1100.0,"SE(A)":0.0158621231,"SE_x(A)":0.0035104561,"SE_pred(A)":0.0154687959}
{"model":"mistralai_mathstral_7b_v0.1","pass1":0.0067575758,"pass@count":0.6,"win_rate":0.0058482374,"count":1100.0,"SE(A)":0.0149576189,"SE_x(A)":0.0019106506,"SE_pred(A)":0.0148350861}
{"model":"qwen2-7b-instruct","pass1":0.0055151515,"pass@count":0.5,"win_rate":0.0046536945,"count":1100.0,"SE(A)":0.0135212606,"SE_x(A)":0.0021808877,"SE_pred(A)":0.0133442203}
{"model":"qwen1.5-72b-chat","pass1":0.0041818182,"pass@count":0.6,"win_rate":0.0038077,"count":1100.0,"SE(A)":0.0117818088,"SE_x(A)":0.0011096853,"SE_pred(A)":0.0117294338}
{"model":"qwen1.5-32b-chat","pass1":0.0040909091,"pass@count":0.5333333333,"win_rate":0.0036228675,"count":1100.0,"SE(A)":0.0116535739,"SE_x(A)":0.0011627802,"SE_pred(A)":0.0115954184}
{"model":"google_gemma_2_9b_it","pass1":0.0030606061,"pass@count":0.3,"win_rate":0.0024457679,"count":1100.0,"SE(A)":0.0100850364,"SE_x(A)":0.0018431715,"SE_pred(A)":0.0099151741}
{"model":"mistralai_mixtral_8x7b_instruct_v0.1","pass1":0.003060606,"pass@count":0.5333333333,"win_rate":0.0027770517,"count":1100.0,"SE(A)":0.0100850363,"SE_x(A)":0.0009206911,"SE_pred(A)":0.0100429222}
{"model":"google_gemma_3_1b_it","pass1":0.0027575758,"pass@count":0.3333333333,"win_rate":0.0020749177,"count":1100.0,"SE(A)":0.0095742215,"SE_x(A)":0.0013552049,"SE_pred(A)":0.0094778235}
{"model":"qwen2.5-coder-1.5b-instruct","pass1":0.0025454545,"pass@count":0.4,"win_rate":0.0022888208,"count":1100.0,"SE(A)":0.0091995928,"SE_x(A)":0.0006731519,"SE_pred(A)":0.0091749318}
{"model":"qwen1.5-14b-chat","pass1":0.0020606061,"pass@count":0.5,"win_rate":0.0018471896,"count":1100.0,"SE(A)":0.0082792108,"SE_x(A)":0.0005909888,"SE_pred(A)":0.0082580908}
{"model":"deepseek_v2_lite_chat","pass1":0.0019393939,"pass@count":0.5333333333,"win_rate":0.0017488118,"count":1100.0,"SE(A)":0.0080325021,"SE_x(A)":0.0004174382,"SE_pred(A)":0.0080216479}
{"model":"google_codegemma_1.1_7b_it","pass1":0.0016060606,"pass@count":0.3666666667,"win_rate":0.0015143886,"count":1100.0,"SE(A)":0.0073109078,"SE_x(A)":0.000609701,"SE_pred(A)":0.0072854401}
{"model":"qwen2.5-coder-0.5b-instruct","pass1":0.0016060606,"pass@count":0.3333333333,"win_rate":0.0015332653,"count":1100.0,"SE(A)":0.0073109078,"SE_x(A)":0.0008491153,"SE_pred(A)":0.0072614307}
{"model":"mistralai_mistral_7b_instruct_v0.3","pass1":0.0015454545,"pass@count":0.3333333333,"win_rate":0.0013002214,"count":1100.0,"SE(A)":0.0071718573,"SE_x(A)":0.0006325825,"SE_pred(A)":0.0071439048}
{"model":"qwen1.5-7b-chat","pass1":0.0013030303,"pass@count":0.4666666667,"win_rate":0.0011537489,"count":1100.0,"SE(A)":0.0065861785,"SE_x(A)":0.0003214479,"SE_pred(A)":0.0065783294}
{"model":"google_gemma_7b_it","pass1":0.0012424242,"pass@count":0.2666666667,"win_rate":0.0010379327,"count":1100.0,"SE(A)":0.006431383,"SE_x(A)":0.0004244545,"SE_pred(A)":0.0064173613}
{"model":"llama-3.2-1B-instruct","pass1":0.0012424242,"pass@count":0.4,"win_rate":0.0011164115,"count":1100.0,"SE(A)":0.006431383,"SE_x(A)":0.0002643265,"SE_pred(A)":0.0064259489}
{"model":"mistralai_mistral_7b_instruct_v0.2","pass1":0.000969697,"pass@count":0.3,"win_rate":0.0009126739,"count":1100.0,"SE(A)":0.0056825952,"SE_x(A)":0.0003928312,"SE_pred(A)":0.005669001}
{"model":"qwen1.5-0.5b-chat","pass1":0.0007272727,"pass@count":0.2666666667,"win_rate":0.0006651267,"count":1100.0,"SE(A)":0.0049218689,"SE_x(A)":0.0003162517,"SE_pred(A)":0.0049116981}
{"model":"qwen2-0.5b-instruct","pass1":0.0005454545,"pass@count":0.2666666667,"win_rate":0.0005262954,"count":1100.0,"SE(A)":0.0042628512,"SE_x(A)":0.0001747096,"SE_pred(A)":0.0042592696}
{"model":"qwen1.5-1.8b-chat","pass1":0.0004545454,"pass@count":0.2333333333,"win_rate":0.0004430438,"count":1100.0,"SE(A)":0.0038916099,"SE_x(A)":0.0001560854,"SE_pred(A)":0.0038884785}
{"model":"qwen2-1.5b-instruct","pass1":0.0003333333,"pass@count":0.2666666667,"win_rate":0.0002916086,"count":1100.0,"SE(A)":0.0033327777,"SE_x(A)":0.0000604084,"SE_pred(A)":0.0033322302}
{"model":"mistralai_mistral_7b_instruct_v0.1","pass1":0.0003030303,"pass@count":0.2,"win_rate":0.0002933629,"count":1100.0,"SE(A)":0.003177727,"SE_x(A)":0.0000990284,"SE_pred(A)":0.0031761836}
{"model":"google_gemma_2b_it","pass1":0.0000909091,"pass@count":0.1,"win_rate":0.0000805778,"count":1100.0,"SE(A)":0.0017406974,"SE_x(A)":0.0,"SE_pred(A)":0.0017407766}
