{"model":"google_gemma_3_27b_it","pass1":0.941691364,"pass@count":0.9651250948,"win_rate":0.280337352,"count":11.0,"SE(A)":0.0064520591,"SE_x(A)":0.0058604513,"SE_pred(A)":0.0026989215}
{"model":"qwen3-32b","pass1":0.9390033772,"pass@count":0.9772554966,"win_rate":0.2800613443,"count":11.0,"SE(A)":0.0065896762,"SE_x(A)":0.0053804772,"SE_pred(A)":0.0038045101}
{"model":"llama-3.1-70B-instruct","pass1":0.9378316907,"pass@count":0.9378316907,"win_rate":0.2800850528,"count":11.0,"SE(A)":0.0066485139,"SE_x(A)":0.0066485139,"SE_pred(A)":0.0}
{"model":"qwen3-14b","pass1":0.934592322,"pass@count":0.9651250948,"win_rate":0.2773041539,"count":11.0,"SE(A)":0.0068077417,"SE_x(A)":0.0060225259,"SE_pred(A)":0.0031740399}
{"model":"deepseek_r1_distill_llama_70b","pass1":0.926459439,"pass@count":0.9757391964,"win_rate":0.2764973543,"count":10.0,"SE(A)":0.0071871098,"SE_x(A)":0.005344221,"SE_pred(A)":0.004805606}
{"model":"qwen2-math-72b-instruct","pass1":0.9247746609,"pass@count":0.9840788476,"win_rate":0.273451153,"count":9.0,"SE(A)":0.0072623578,"SE_x(A)":0.0049462397,"SE_pred(A)":0.0053175702}
{"model":"qwen3-8b","pass1":0.9217037701,"pass@count":0.9613343442,"win_rate":0.2679319386,"count":11.0,"SE(A)":0.0073967972,"SE_x(A)":0.0063515997,"SE_pred(A)":0.0037907506}
{"model":"qwen2.5-coder-32b-instruct","pass1":0.9214051049,"pass@count":0.9749810462,"win_rate":0.2700520001,"count":9.0,"SE(A)":0.0074096907,"SE_x(A)":0.0056065015,"SE_pred(A)":0.0048446524}
{"model":"google_gemma_3_12b_it","pass1":0.9213419257,"pass@count":0.9620924943,"win_rate":0.2694920381,"count":12.0,"SE(A)":0.0074124141,"SE_x(A)":0.0064603978,"SE_pred(A)":0.0036341633}
{"model":"qwen2-72b-instruct","pass1":0.9164518575,"pass@count":0.9810462472,"win_rate":0.2678263343,"count":10.0,"SE(A)":0.0076190501,"SE_x(A)":0.0054702336,"SE_pred(A)":0.0053034393}
{"model":"google_gemma_2_27b_it","pass1":0.9020301575,"pass@count":0.9575435936,"win_rate":0.2560761017,"count":9.0,"SE(A)":0.008185295,"SE_x(A)":0.0068691128,"SE_pred(A)":0.0044513305}
{"model":"qwen3-4b","pass1":0.8920669929,"pass@count":0.9340409401,"win_rate":0.2529443074,"count":11.0,"SE(A)":0.0085438473,"SE_x(A)":0.0078149225,"SE_pred(A)":0.0034531598}
{"model":"google_gemma_2_9b_it","pass1":0.8758701496,"pass@count":0.9514783927,"win_rate":0.2427521995,"count":11.0,"SE(A)":0.0090789491,"SE_x(A)":0.0074369082,"SE_pred(A)":0.005207659}
{"model":"qwen2.5-coder-14b-instruct","pass1":0.8642911296,"pass@count":0.9795299469,"win_rate":0.2403172405,"count":11.0,"SE(A)":0.0094300012,"SE_x(A)":0.0063348513,"SE_pred(A)":0.0069853118}
{"model":"deepseek_r1_distill_qwen_7b","pass1":0.8641532842,"pass@count":0.973464746,"win_rate":0.2424517941,"count":11.0,"SE(A)":0.0094340368,"SE_x(A)":0.0065188331,"SE_pred(A)":0.0068195209}
{"model":"deepseek_r1_distill_qwen_32b","pass1":0.8592873389,"pass@count":0.9795299469,"win_rate":0.2507372389,"count":10.0,"SE(A)":0.0095744403,"SE_x(A)":0.0057414742,"SE_pred(A)":0.0076619437}
{"model":"mistralai_mixtral_8x22b_instruct_v0.1","pass1":0.8480253636,"pass@count":0.9810462472,"win_rate":0.2332146047,"count":11.0,"SE(A)":0.009884793,"SE_x(A)":0.0066664554,"SE_pred(A)":0.0072984591}
{"model":"deepseek_r1_distill_qwen_14b","pass1":0.8474050589,"pass@count":0.959818044,"win_rate":0.2405515694,"count":11.0,"SE(A)":0.0099013223,"SE_x(A)":0.0069888821,"SE_pred(A)":0.0070136802}
{"model":"qwen1.5-72b-chat","pass1":0.8422289613,"pass@count":0.966641395,"win_rate":0.2299741439,"count":10.0,"SE(A)":0.0100370556,"SE_x(A)":0.0072592213,"SE_pred(A)":0.006931536}
{"model":"google_gemma_3_4b_it","pass1":0.8344899983,"pass@count":0.9340409401,"win_rate":0.2254175559,"count":13.0,"SE(A)":0.0102329368,"SE_x(A)":0.0085562839,"SE_pred(A)":0.0056127534}
{"model":"qwen1.5-32b-chat","pass1":0.831897443,"pass@count":0.9689158453,"win_rate":0.2255405138,"count":11.0,"SE(A)":0.0102967379,"SE_x(A)":0.0071318805,"SE_pred(A)":0.0074269167}
{"model":"qwen2-math-7b-instruct","pass1":0.83031222,"pass@count":0.9605761941,"win_rate":0.2219391267,"count":11.0,"SE(A)":0.0103353122,"SE_x(A)":0.007588489,"SE_pred(A)":0.0070166597}
{"model":"llama-3.1-8B-instruct","pass1":0.8102602982,"pass@count":0.8104624716,"win_rate":0.2132277104,"count":15.0,"SE(A)":0.010796152,"SE_x(A)":0.010790573,"SE_pred(A)":0.0003470339}
{"model":"qwen2-math-1.5b-instruct","pass1":0.8041905024,"pass@count":0.9507202426,"win_rate":0.2112409505,"count":11.0,"SE(A)":0.0109263208,"SE_x(A)":0.0080938766,"SE_pred(A)":0.0073398671}
{"model":"mistralai_ministral_8b_instruct_2410","pass1":0.7975739196,"pass@count":0.9605761941,"win_rate":0.2076568745,"count":12.0,"SE(A)":0.011063596,"SE_x(A)":0.0077624733,"SE_pred(A)":0.0078833473}
{"model":"deepseek_r1_distill_llama_8b","pass1":0.7970684862,"pass@count":0.9636087945,"win_rate":0.2144778378,"count":12.0,"SE(A)":0.0110738891,"SE_x(A)":0.0071809384,"SE_pred(A)":0.0084300144}
{"model":"qwen2-7b-instruct","pass1":0.7796942128,"pass@count":0.9681576952,"win_rate":0.2040955505,"count":12.0,"SE(A)":0.0114117621,"SE_x(A)":0.0073906324,"SE_pred(A)":0.0086952209}
{"model":"qwen2.5-coder-7b-instruct","pass1":0.7712282032,"pass@count":0.9673995451,"win_rate":0.1998856925,"count":12.0,"SE(A)":0.0115656567,"SE_x(A)":0.007665407,"SE_pred(A)":0.0086605975}
{"model":"mistralai_mathstral_7b_v0.1","pass1":0.7485698532,"pass@count":0.9590598939,"win_rate":0.1886611179,"count":11.0,"SE(A)":0.011945448,"SE_x(A)":0.0078162794,"SE_pred(A)":0.0090332444}
{"model":"llama-3.2-3B-instruct","pass1":0.7338893101,"pass@count":0.7338893101,"win_rate":0.1804322838,"count":17.0,"SE(A)":0.0121681357,"SE_x(A)":0.0121681357,"SE_pred(A)":0.0}
{"model":"qwen3-1.7b","pass1":0.7337514646,"pass@count":0.873388931,"win_rate":0.1806752336,"count":11.0,"SE(A)":0.0121701437,"SE_x(A)":0.010329378,"SE_pred(A)":0.0064355534}
{"model":"qwen1.5-14b-chat","pass1":0.7277482942,"pass@count":0.9279757392,"win_rate":0.1788258502,"count":10.0,"SE(A)":0.0122561341,"SE_x(A)":0.0090340821,"SE_pred(A)":0.008282402}
{"model":"deepseek_r1_distill_qwen_1.5b","pass1":0.6857467779,"pass@count":0.9423805914,"win_rate":0.1725286121,"count":12.0,"SE(A)":0.0127820161,"SE_x(A)":0.0078606536,"SE_pred(A)":0.0100791894}
{"model":"deepseek_v2_lite_chat","pass1":0.6768902061,"pass@count":0.9226686884,"win_rate":0.1601748941,"count":11.0,"SE(A)":0.0128769135,"SE_x(A)":0.0091996427,"SE_pred(A)":0.0090100763}
{"model":"qwen2.5-coder-3b-instruct","pass1":0.6623475084,"pass@count":0.9249431387,"win_rate":0.1551962797,"count":11.0,"SE(A)":0.0130213357,"SE_x(A)":0.0088834551,"SE_pred(A)":0.0095204732}
{"model":"mistralai_mixtral_8x7b_instruct_v0.1","pass1":0.6566269212,"pass@count":0.9393479909,"win_rate":0.1573585394,"count":11.0,"SE(A)":0.0130743488,"SE_x(A)":0.008670543,"SE_pred(A)":0.0097857182}
{"model":"qwen1.5-7b-chat","pass1":0.5905357594,"pass@count":0.8885519333,"win_rate":0.1318010965,"count":12.0,"SE(A)":0.0135396891,"SE_x(A)":0.0092734114,"SE_pred(A)":0.0098654459}
{"model":"google_codegemma_1.1_7b_it","pass1":0.5483174899,"pass@count":0.8749052312,"win_rate":0.1183149769,"count":13.0,"SE(A)":0.0137028306,"SE_x(A)":0.0096555211,"SE_pred(A)":0.00972309}
{"model":"qwen2.5-coder-1.5b-instruct","pass1":0.4985870839,"pass@count":0.8483699773,"win_rate":0.1029416401,"count":11.0,"SE(A)":0.013767208,"SE_x(A)":0.0092995219,"SE_pred(A)":0.0101515963}
{"model":"mistralai_mistral_7b_instruct_v0.3","pass1":0.4832862361,"pass@count":0.8483699773,"win_rate":0.1003610739,"count":11.0,"SE(A)":0.013759569,"SE_x(A)":0.0093308675,"SE_pred(A)":0.0101124008}
{"model":"google_gemma_3_1b_it","pass1":0.446929492,"pass@count":0.706595906,"win_rate":0.0881986153,"count":12.0,"SE(A)":0.0136894929,"SE_x(A)":0.0111504215,"SE_pred(A)":0.0079416821}
{"model":"qwen3-0.6b","pass1":0.415874497,"pass@count":0.7422289613,"win_rate":0.0792113065,"count":13.0,"SE(A)":0.0135709994,"SE_x(A)":0.010033201,"SE_pred(A)":0.0091382111}
{"model":"mistralai_mistral_7b_instruct_v0.2","pass1":0.3998208009,"pass@count":0.7566338135,"win_rate":0.0786702258,"count":11.0,"SE(A)":0.0134880996,"SE_x(A)":0.0093779091,"SE_pred(A)":0.0096945166}
{"model":"llama-3.2-1B-instruct","pass1":0.379833207,"pass@count":0.379833207,"win_rate":0.0706185061,"count":12.0,"SE(A)":0.0133637494,"SE_x(A)":0.0133637494,"SE_pred(A)":0.0}
{"model":"qwen2-1.5b-instruct","pass1":0.3793507478,"pass@count":0.8119787718,"win_rate":0.0742455901,"count":11.0,"SE(A)":0.0133604533,"SE_x(A)":0.0078452086,"SE_pred(A)":0.0108145464}
{"model":"mistralai_mistral_7b_instruct_v0.1","pass1":0.3432352333,"pass@count":0.7482941622,"win_rate":0.0636520231,"count":11.0,"SE(A)":0.0130730962,"SE_x(A)":0.008419745,"SE_pred(A)":0.0100006869}
{"model":"google_gemma_7b_it","pass1":0.2887920142,"pass@count":0.6004548901,"win_rate":0.0511350252,"count":12.0,"SE(A)":0.0124786759,"SE_x(A)":0.0091615348,"SE_pred(A)":0.0084725222}
{"model":"qwen2-0.5b-instruct","pass1":0.1951944947,"pass@count":0.626990144,"win_rate":0.0312101909,"count":13.0,"SE(A)":0.0109133191,"SE_x(A)":0.0061563112,"SE_pred(A)":0.0090111246}
{"model":"qwen1.5-1.8b-chat","pass1":0.1581374779,"pass@count":0.5708870356,"win_rate":0.0252656095,"count":12.0,"SE(A)":0.0100465186,"SE_x(A)":0.0050307307,"SE_pred(A)":0.0086962225}
{"model":"qwen2.5-coder-0.5b-instruct","pass1":0.1396162594,"pass@count":0.5140257771,"win_rate":0.0203587155,"count":13.0,"SE(A)":0.0095431489,"SE_x(A)":0.0055057489,"SE_pred(A)":0.0077947687}
{"model":"google_gemma_2b_it","pass1":0.099001769,"pass@count":0.3062926459,"win_rate":0.0146325548,"count":12.0,"SE(A)":0.0082235824,"SE_x(A)":0.0058023016,"SE_pred(A)":0.0058275727}
{"model":"qwen1.5-0.5b-chat","pass1":0.0686417449,"pass@count":0.3472327521,"win_rate":0.0102158519,"count":13.0,"SE(A)":0.0069619379,"SE_x(A)":0.0034005898,"SE_pred(A)":0.006074913}
