{"model":"qwen3-32b","pass1":0.8007444169,"pass@count":0.9454094293,"win_rate":0.337416268,"count":10.0,"SE(A)":0.0198975537,"SE_x(A)":0.0153721142,"SE_pred(A)":0.0126337148}
{"model":"qwen3-14b","pass1":0.7701330927,"pass@count":0.9230769231,"win_rate":0.3157313189,"count":11.0,"SE(A)":0.0209589053,"SE_x(A)":0.0172722219,"SE_pred(A)":0.0118720707}
{"model":"llama-3.1-70B-instruct","pass1":0.7518610422,"pass@count":0.7518610422,"win_rate":0.3024760482,"count":13.0,"SE(A)":0.0215161095,"SE_x(A)":0.0215161095,"SE_pred(A)":0.0}
{"model":"deepseek_r1_distill_llama_70b","pass1":0.7491039427,"pass@count":0.8957816377,"win_rate":0.2990368899,"count":9.0,"SE(A)":0.0215956081,"SE_x(A)":0.0182566239,"SE_pred(A)":0.0115354224}
{"model":"qwen2-72b-instruct","pass1":0.7360421836,"pass@count":0.8114143921,"win_rate":0.2935850136,"count":8.0,"SE(A)":0.0219566507,"SE_x(A)":0.0203748358,"SE_pred(A)":0.0081829441}
{"model":"qwen3-8b","pass1":0.7223099481,"pass@count":0.9255583127,"win_rate":0.2888895556,"count":11.0,"SE(A)":0.0223094794,"SE_x(A)":0.0173045658,"SE_pred(A)":0.014080656}
{"model":"deepseek_r1_distill_qwen_32b","pass1":0.7132616487,"pass@count":0.8833746898,"win_rate":0.2802596892,"count":9.0,"SE(A)":0.0225275937,"SE_x(A)":0.0179182391,"SE_pred(A)":0.0136539074}
{"model":"qwen2.5-coder-32b-instruct","pass1":0.7042183623,"pass@count":0.905707196,"win_rate":0.2733015728,"count":10.0,"SE(A)":0.0227345705,"SE_x(A)":0.0179486314,"SE_pred(A)":0.0139537567}
{"model":"qwen3-4b","pass1":0.699751861,"pass@count":0.8982630273,"win_rate":0.2785130392,"count":12.0,"SE(A)":0.0228328262,"SE_x(A)":0.018047269,"SE_pred(A)":0.0139869236}
{"model":"qwen1.5-72b-chat","pass1":0.6749379653,"pass@count":0.6749379653,"win_rate":0.2520940675,"count":1.0,"SE(A)":0.0233325675,"SE_x(A)":null,"SE_pred(A)":null}
{"model":"google_gemma_3_12b_it","pass1":0.670471464,"pass@count":0.8387096774,"win_rate":0.2546783997,"count":10.0,"SE(A)":0.0234144597,"SE_x(A)":0.0197049258,"SE_pred(A)":0.0126472456}
{"model":"mistralai_mixtral_8x22b_instruct_v0.1","pass1":0.66839612,"pass@count":0.841191067,"win_rate":0.250188236,"count":11.0,"SE(A)":0.0234516951,"SE_x(A)":0.0201303157,"SE_pred(A)":0.0120313087}
{"model":"deepseek_r1_distill_qwen_14b","pass1":0.6670426348,"pass@count":0.8759305211,"win_rate":0.2546554358,"count":11.0,"SE(A)":0.0234757019,"SE_x(A)":0.0181722218,"SE_pred(A)":0.0148619964}
{"model":"qwen2-math-72b-instruct","pass1":0.6667848281,"pass@count":0.8312655087,"win_rate":0.2510250964,"count":7.0,"SE(A)":0.0234802499,"SE_x(A)":0.0194205734,"SE_pred(A)":0.0131971007}
{"model":"qwen1.5-32b-chat","pass1":0.6650124069,"pass@count":0.6650124069,"win_rate":0.245442026,"count":1.0,"SE(A)":0.0235113037,"SE_x(A)":null,"SE_pred(A)":null}
{"model":"qwen2.5-coder-14b-instruct","pass1":0.6316264381,"pass@count":0.8635235732,"win_rate":0.2279473551,"count":11.0,"SE(A)":0.0240282341,"SE_x(A)":0.0202205359,"SE_pred(A)":0.0129802142}
{"model":"qwen2-7b-instruct","pass1":0.5833333333,"pass@count":0.7717121588,"win_rate":0.2062349326,"count":12.0,"SE(A)":0.0245584104,"SE_x(A)":0.0212807005,"SE_pred(A)":0.0122575409}
{"model":"qwen1.5-14b-chat","pass1":0.5750056395,"pass@count":0.7320099256,"win_rate":0.199838837,"count":11.0,"SE(A)":0.0246249361,"SE_x(A)":0.0220615518,"SE_pred(A)":0.0109396257}
{"model":"mistralai_mixtral_8x7b_instruct_v0.1","pass1":0.5738777352,"pass@count":0.7196029777,"win_rate":0.2000347119,"count":11.0,"SE(A)":0.0246333954,"SE_x(A)":0.0225526786,"SE_pred(A)":0.0099086253}
{"model":"qwen2.5-coder-7b-instruct","pass1":0.561827957,"pass@count":0.8039702233,"win_rate":0.1967892449,"count":12.0,"SE(A)":0.0247156185,"SE_x(A)":0.0206262764,"SE_pred(A)":0.0136168469}
{"model":"llama-3.1-8B-instruct","pass1":0.5359801489,"pass@count":0.5359801489,"win_rate":0.1820014541,"count":15.0,"SE(A)":0.0248422032,"SE_x(A)":0.0248422032,"SE_pred(A)":0.0}
{"model":"qwen3-1.7b","pass1":0.511579818,"pass@count":0.7915632754,"win_rate":0.1817330454,"count":12.0,"SE(A)":0.0249000936,"SE_x(A)":0.0192230262,"SE_pred(A)":0.0158268735}
{"model":"mistralai_ministral_8b_instruct_2410","pass1":0.5099255583,"pass@count":0.8734491315,"win_rate":0.1741883007,"count":12.0,"SE(A)":0.0249018661,"SE_x(A)":0.0181880015,"SE_pred(A)":0.0170088076}
{"model":"qwen1.5-7b-chat","pass1":0.5053763441,"pass@count":0.7071960298,"win_rate":0.1678199865,"count":12.0,"SE(A)":0.0249053342,"SE_x(A)":0.0217091641,"SE_pred(A)":0.0122060584}
{"model":"mistralai_mathstral_7b_v0.1","pass1":0.4944732687,"pass@count":0.8089330025,"win_rate":0.1672967206,"count":11.0,"SE(A)":0.0249052525,"SE_x(A)":0.0188073586,"SE_pred(A)":0.0163265081}
{"model":"google_gemma_3_4b_it","pass1":0.4863523573,"pass@count":0.7667493797,"win_rate":0.1698628157,"count":13.0,"SE(A)":0.0248974942,"SE_x(A)":0.0198006396,"SE_pred(A)":0.0150937035}
{"model":"mistralai_mistral_7b_instruct_v0.3","pass1":0.4795849312,"pass@count":0.6476426799,"win_rate":0.1519539589,"count":11.0,"SE(A)":0.0248860044,"SE_x(A)":0.0220322503,"SE_pred(A)":0.0115712213}
{"model":"mistralai_mistral_7b_instruct_v0.2","pass1":0.4757500564,"pass@count":0.5384615385,"win_rate":0.1611192409,"count":11.0,"SE(A)":0.0248774635,"SE_x(A)":0.0238152665,"SE_pred(A)":0.0071917502}
{"model":"deepseek_v2_lite_chat","pass1":0.474622152,"pass@count":0.7642679901,"win_rate":0.1639196828,"count":11.0,"SE(A)":0.0248746717,"SE_x(A)":0.0198961112,"SE_pred(A)":0.0149296367}
{"model":"llama-3.2-3B-instruct","pass1":0.4689826303,"pass@count":0.4689826303,"win_rate":0.159908459,"count":15.0,"SE(A)":0.0248588034,"SE_x(A)":0.0248588034,"SE_pred(A)":0.0}
{"model":"deepseek_r1_distill_llama_8b","pass1":0.4509925558,"pass@count":0.8436724566,"win_rate":0.1542589748,"count":12.0,"SE(A)":0.0247868467,"SE_x(A)":0.0161394947,"SE_pred(A)":0.0188123491}
{"model":"deepseek_r1_distill_qwen_7b","pass1":0.4410106023,"pass@count":0.7841191067,"win_rate":0.1531105746,"count":11.0,"SE(A)":0.024732828,"SE_x(A)":0.0167867138,"SE_pred(A)":0.0181636731}
{"model":"qwen2-math-7b-instruct","pass1":0.4375516956,"pass@count":0.7642679901,"win_rate":0.1531175828,"count":12.0,"SE(A)":0.0247117481,"SE_x(A)":0.0194037813,"SE_pred(A)":0.0153024105}
{"model":"qwen2.5-coder-3b-instruct","pass1":0.4019851117,"pass@count":0.8933002481,"win_rate":0.1400790222,"count":12.0,"SE(A)":0.0244235314,"SE_x(A)":0.0151368314,"SE_pred(A)":0.0191672957}
{"model":"mistralai_mistral_7b_instruct_v0.1","pass1":0.3740130837,"pass@count":0.8362282878,"win_rate":0.1269985889,"count":11.0,"SE(A)":0.0241031333,"SE_x(A)":0.0154616713,"SE_pred(A)":0.0184904775}
{"model":"qwen3-0.6b","pass1":0.3704905516,"pass@count":0.6451612903,"win_rate":0.1370675587,"count":13.0,"SE(A)":0.024056762,"SE_x(A)":0.0196538262,"SE_pred(A)":0.0138728121}
{"model":"qwen2-1.5b-instruct","pass1":0.3463606286,"pass@count":0.7543424318,"win_rate":0.1166155785,"count":12.0,"SE(A)":0.0237017732,"SE_x(A)":0.0166796881,"SE_pred(A)":0.016839301}
{"model":"qwen1.5-1.8b-chat","pass1":0.2760545906,"pass@count":0.7344913151,"win_rate":0.1034550003,"count":12.0,"SE(A)":0.0222688543,"SE_x(A)":0.0138362686,"SE_pred(A)":0.0174487691}
{"model":"qwen2.5-coder-1.5b-instruct","pass1":0.2739867659,"pass@count":0.888337469,"win_rate":0.1031887636,"count":12.0,"SE(A)":0.0222169552,"SE_x(A)":0.0105202044,"SE_pred(A)":0.0195683008}
{"model":"llama-3.2-1B-instruct","pass1":0.2630272953,"pass@count":0.2630272953,"win_rate":0.0964580817,"count":11.0,"SE(A)":0.0219317652,"SE_x(A)":0.0219317652,"SE_pred(A)":0.0}
{"model":"qwen2-math-1.5b-instruct","pass1":0.2431761787,"pass@count":0.7717121588,"win_rate":0.1047814439,"count":11.0,"SE(A)":0.0213700426,"SE_x(A)":0.0109768145,"SE_pred(A)":0.0183354374}
{"model":"deepseek_r1_distill_qwen_1.5b","pass1":0.2419354839,"pass@count":0.746898263,"win_rate":0.092139549,"count":12.0,"SE(A)":0.021332922,"SE_x(A)":0.009836549,"SE_pred(A)":0.0189297614}
{"model":"qwen2-0.5b-instruct","pass1":0.2267608322,"pass@count":0.8610421836,"win_rate":0.1001160982,"count":13.0,"SE(A)":0.0208587568,"SE_x(A)":0.0084225316,"SE_pred(A)":0.0190826805}
{"model":"qwen1.5-0.5b-chat","pass1":0.2166443978,"pass@count":0.8610421836,"win_rate":0.0962335341,"count":13.0,"SE(A)":0.0205211023,"SE_x(A)":0.0069599898,"SE_pred(A)":0.0193047709}
{"model":"qwen2.5-coder-0.5b-instruct","pass1":0.2046192021,"pass@count":0.9131513648,"win_rate":0.0933454203,"count":13.0,"SE(A)":0.0200959352,"SE_x(A)":0.003251027,"SE_pred(A)":0.0198312237}
