{"model":"llama-3.1-70B-instruct","pass1":0.8336661035,"pass@count":0.8336661035,"win_rate":0.3695927049,"count":12.0,"SE(A)":0.0046149033,"SE_x(A)":0.0046149033,"SE_pred(A)":0.0}
{"model":"qwen3-14b","pass1":0.8251909356,"pass@count":0.9038550146,"win_rate":0.3618226613,"count":11.0,"SE(A)":0.0047069041,"SE_x(A)":0.0041779436,"SE_pred(A)":0.0021678868}
{"model":"google_gemma_3_12b_it","pass1":0.8184994624,"pass@count":0.9258178467,"win_rate":0.3604416766,"count":12.0,"SE(A)":0.0047766598,"SE_x(A)":0.0040046721,"SE_pred(A)":0.0026036667}
{"model":"qwen2-72b-instruct","pass1":0.790078329,"pass@count":0.9242819843,"win_rate":0.3424845195,"count":10.0,"SE(A)":0.0050470762,"SE_x(A)":0.003950585,"SE_pred(A)":0.0031409961}
{"model":"qwen2-math-72b-instruct","pass1":0.7851942866,"pass@count":0.925971433,"win_rate":0.3396277748,"count":10.0,"SE(A)":0.0050896466,"SE_x(A)":0.0039810004,"SE_pred(A)":0.0031711416}
{"model":"qwen2.5-coder-32b-instruct","pass1":0.7825679619,"pass@count":0.9338043311,"win_rate":0.3387152983,"count":10.0,"SE(A)":0.0051120954,"SE_x(A)":0.0039120241,"SE_pred(A)":0.0032908338}
{"model":"mistralai_mixtral_8x22b_instruct_v0.1","pass1":0.774211164,"pass@count":0.9161419137,"win_rate":0.3299065582,"count":9.0,"SE(A)":0.0051815191,"SE_x(A)":0.0040445471,"SE_pred(A)":0.0032387927}
{"model":"qwen3-32b","pass1":0.7624191539,"pass@count":0.905390877,"win_rate":0.3311277513,"count":9.0,"SE(A)":0.0052744692,"SE_x(A)":0.0042062568,"SE_pred(A)":0.0031823622}
{"model":"qwen3-4b","pass1":0.7515742589,"pass@count":0.8596221779,"win_rate":0.3187974989,"count":12.0,"SE(A)":0.0053550114,"SE_x(A)":0.0047341972,"SE_pred(A)":0.0025027032}
{"model":"qwen3-8b","pass1":0.7294787842,"pass@count":0.857779143,"win_rate":0.3056595146,"count":11.0,"SE(A)":0.0055053274,"SE_x(A)":0.0048234285,"SE_pred(A)":0.0026538965}
{"model":"qwen2.5-coder-14b-instruct","pass1":0.7135477025,"pass@count":0.9219781908,"win_rate":0.2991340484,"count":11.0,"SE(A)":0.0056029122,"SE_x(A)":0.0040979682,"SE_pred(A)":0.0038209006}
{"model":"qwen1.5-72b-chat","pass1":0.6829519275,"pass@count":0.8818921825,"win_rate":0.2817294548,"count":10.0,"SE(A)":0.0057667852,"SE_x(A)":0.0044539594,"SE_pred(A)":0.0036630667}
{"model":"llama-3.1-8B-instruct","pass1":0.6628782061,"pass@count":0.6628782061,"win_rate":0.2710496722,"count":15.0,"SE(A)":0.0058585,"SE_x(A)":0.0058585,"SE_pred(A)":0.0}
{"model":"qwen1.5-32b-chat","pass1":0.6617348419,"pass@count":0.8748272155,"win_rate":0.2682802727,"count":9.0,"SE(A)":0.0058633631,"SE_x(A)":0.004386825,"SE_pred(A)":0.0038903461}
{"model":"google_gemma_3_4b_it","pass1":0.6440150235,"pass@count":0.8431884503,"win_rate":0.2707488144,"count":11.0,"SE(A)":0.0059338968,"SE_x(A)":0.004892984,"SE_pred(A)":0.003357058}
{"model":"mistralai_mathstral_7b_v0.1","pass1":0.6221138586,"pass@count":0.9090769467,"win_rate":0.2482275861,"count":12.0,"SE(A)":0.0060088529,"SE_x(A)":0.0041980954,"SE_pred(A)":0.0042991055}
{"model":"mistralai_mixtral_8x7b_instruct_v0.1","pass1":0.6187849932,"pass@count":0.8381201044,"win_rate":0.2459421209,"count":11.0,"SE(A)":0.0060190927,"SE_x(A)":0.0047802516,"SE_pred(A)":0.0036576866}
{"model":"mistralai_ministral_8b_instruct_2410","pass1":0.6015972969,"pass@count":0.9017048072,"win_rate":0.2384814412,"count":11.0,"SE(A)":0.0060672268,"SE_x(A)":0.0040558975,"SE_pred(A)":0.0045123094}
{"model":"qwen2.5-coder-7b-instruct","pass1":0.598794348,"pass@count":0.9078482568,"win_rate":0.2363429695,"count":12.0,"SE(A)":0.0060743319,"SE_x(A)":0.0041265565,"SE_pred(A)":0.0044574701}
{"model":"qwen2-math-7b-instruct","pass1":0.5691766381,"pass@count":0.8312087237,"win_rate":0.2214546907,"count":11.0,"SE(A)":0.0061369037,"SE_x(A)":0.0047474422,"SE_pred(A)":0.003888879}
{"model":"llama-3.2-3B-instruct","pass1":0.5676547381,"pass@count":0.5676547381,"win_rate":0.2200546964,"count":18.0,"SE(A)":0.006139509,"SE_x(A)":0.006139509,"SE_pred(A)":0.0}
{"model":"qwen2.5-coder-3b-instruct","pass1":0.5424921927,"pass@count":0.8688373522,"win_rate":0.2089631005,"count":12.0,"SE(A)":0.0061740786,"SE_x(A)":0.0043314518,"SE_pred(A)":0.0043997468}
{"model":"mistralai_mistral_7b_instruct_v0.3","pass1":0.5128756463,"pass@count":0.8175395485,"win_rate":0.1909117941,"count":12.0,"SE(A)":0.0061944409,"SE_x(A)":0.0046744037,"SE_pred(A)":0.0040646092}
{"model":"qwen3-1.7b","pass1":0.4930118261,"pass@count":0.6975886961,"win_rate":0.1826107777,"count":12.0,"SE(A)":0.0061958905,"SE_x(A)":0.0053191577,"SE_pred(A)":0.0031773606}
{"model":"deepseek_v2_lite_chat","pass1":0.4612194747,"pass@count":0.769620642,"win_rate":0.1754861477,"count":9.0,"SE(A)":0.0061778295,"SE_x(A)":0.0044276252,"SE_pred(A)":0.0043083305}
{"model":"qwen1.5-14b-chat","pass1":0.4225017802,"pass@count":0.8196897558,"win_rate":0.1589968318,"count":11.0,"SE(A)":0.0061216113,"SE_x(A)":0.0038692805,"SE_pred(A)":0.0047437109}
{"model":"mistralai_mistral_7b_instruct_v0.1","pass1":0.4217222137,"pass@count":0.8047918906,"win_rate":0.1538834159,"count":12.0,"SE(A)":0.0061200877,"SE_x(A)":0.0041909114,"SE_pred(A)":0.0044600151}
{"model":"qwen2-math-1.5b-instruct","pass1":0.4150412123,"pass@count":0.7660881585,"win_rate":0.1554854965,"count":12.0,"SE(A)":0.006106388,"SE_x(A)":0.0044190274,"SE_pred(A)":0.0042142818}
{"model":"mistralai_mistral_7b_instruct_v0.2","pass1":0.405211693,"pass@count":0.7565658117,"win_rate":0.1429117217,"count":12.0,"SE(A)":0.0060841281,"SE_x(A)":0.0044885517,"SE_pred(A)":0.0041072519}
{"model":"qwen2-7b-instruct","pass1":0.3847591256,"pass@count":0.8049454769,"win_rate":0.1393425463,"count":12.0,"SE(A)":0.0060296653,"SE_x(A)":0.0038446578,"SE_pred(A)":0.0046449403}
{"model":"qwen2.5-coder-1.5b-instruct","pass1":0.3589694363,"pass@count":0.7905083705,"win_rate":0.135592772,"count":12.0,"SE(A)":0.0059448959,"SE_x(A)":0.0036485445,"SE_pred(A)":0.0046936032}
{"model":"deepseek_r1_distill_llama_70b","pass1":0.3559617058,"pass@count":0.7103363539,"win_rate":0.1288242107,"count":9.0,"SE(A)":0.00593381,"SE_x(A)":0.0037311218,"SE_pred(A)":0.0046139821}
{"model":"llama-3.2-1B-instruct","pass1":0.3486407618,"pass@count":0.3486407618,"win_rate":0.1327314389,"count":21.0,"SE(A)":0.0059057563,"SE_x(A)":0.0059057563,"SE_pred(A)":0.0}
{"model":"deepseek_r1_distill_qwen_7b","pass1":0.3312599191,"pass@count":0.7381354631,"win_rate":0.1151613117,"count":12.0,"SE(A)":0.0058329637,"SE_x(A)":0.003652554,"SE_pred(A)":0.0045477813}
{"model":"qwen3-0.6b","pass1":0.3311319306,"pass@count":0.6806942098,"win_rate":0.1215762248,"count":12.0,"SE(A)":0.0058323948,"SE_x(A)":0.0042198437,"SE_pred(A)":0.0040261332}
{"model":"deepseek_r1_distill_llama_8b","pass1":0.2906875544,"pass@count":0.7367531869,"win_rate":0.1007295212,"count":12.0,"SE(A)":0.0056274049,"SE_x(A)":0.0031038145,"SE_pred(A)":0.004694041}
{"model":"qwen2.5-coder-0.5b-instruct","pass1":0.269710234,"pass@count":0.6954384887,"win_rate":0.104572074,"count":12.0,"SE(A)":0.0055001239,"SE_x(A)":0.0034082379,"SE_pred(A)":0.0043168597}
{"model":"qwen1.5-7b-chat","pass1":0.2663743874,"pass@count":0.7092612502,"win_rate":0.0936735001,"count":11.0,"SE(A)":0.0054784743,"SE_x(A)":0.0031401154,"SE_pred(A)":0.0044892489}
{"model":"deepseek_r1_distill_qwen_14b","pass1":0.2335918702,"pass@count":0.6214099217,"win_rate":0.0778661682,"count":12.0,"SE(A)":0.0052436665,"SE_x(A)":0.0030478923,"SE_pred(A)":0.0042668948}
{"model":"qwen2-1.5b-instruct","pass1":0.2235191727,"pass@count":0.7275380126,"win_rate":0.0829412879,"count":12.0,"SE(A)":0.0051629618,"SE_x(A)":0.0025201035,"SE_pred(A)":0.004506135}
{"model":"qwen1.5-0.5b-chat","pass1":0.2165437977,"pass@count":0.6390723391,"win_rate":0.0847405849,"count":12.0,"SE(A)":0.0051045374,"SE_x(A)":0.002870512,"SE_pred(A)":0.0042209552}
{"model":"deepseek_r1_distill_qwen_32b","pass1":0.2153498475,"pass@count":0.5842420519,"win_rate":0.0725640743,"count":7.0,"SE(A)":0.0050943229,"SE_x(A)":0.0026014893,"SE_pred(A)":0.0043799976}
{"model":"qwen1.5-1.8b-chat","pass1":0.1982658717,"pass@count":0.5888496391,"win_rate":0.0707949053,"count":11.0,"SE(A)":0.0049410053,"SE_x(A)":0.0029152067,"SE_pred(A)":0.0039893738}
{"model":"qwen2-0.5b-instruct","pass1":0.1820124917,"pass@count":0.6191061281,"win_rate":0.070142784,"count":12.0,"SE(A)":0.0047818955,"SE_x(A)":0.0024515408,"SE_pred(A)":0.0041056635}
{"model":"deepseek_r1_distill_qwen_1.5b","pass1":0.1397506783,"pass@count":0.5307940409,"win_rate":0.0481090094,"count":12.0,"SE(A)":0.0042969993,"SE_x(A)":0.0020820017,"SE_pred(A)":0.003758919}
