{"model":"google_gemma_3_27b_it","pass1":0.86525,"pass@count":0.956,"win_rate":0.4327175386,"count":8.0,"SE(A)":0.0152703921,"SE_x(A)":0.0123218396,"SE_pred(A)":0.0090198194}
{"model":"deepseek_r1_distill_qwen_32b","pass1":0.8484825688,"pass@count":0.982,"win_rate":0.4211227764,"count":1090.0,"SE(A)":0.0160349555,"SE_x(A)":0.0120790641,"SE_pred(A)":0.0105459002}
{"model":"qwen3-32b","pass1":0.8234057143,"pass@count":0.98,"win_rate":0.3987942105,"count":700.0,"SE(A)":0.0170533717,"SE_x(A)":0.0131807449,"SE_pred(A)":0.0108206032}
{"model":"qwen3-14b","pass1":0.8229323448,"pass@count":0.978,"win_rate":0.3980491977,"count":1079.0,"SE(A)":0.0170713034,"SE_x(A)":0.0135970306,"SE_pred(A)":0.0103223136}
{"model":"deepseek_r1_distill_llama_70b","pass1":0.8206296296,"pass@count":0.978,"win_rate":0.4012547528,"count":1080.0,"SE(A)":0.0171578927,"SE_x(A)":0.0125884581,"SE_pred(A)":0.011658645}
{"model":"qwen2-math-72b-instruct","pass1":0.8112571429,"pass@count":0.946,"win_rate":0.3872144865,"count":35.0,"SE(A)":0.0174996566,"SE_x(A)":0.0145120987,"SE_pred(A)":0.0097794159}
{"model":"google_gemma_3_12b_it","pass1":0.8023517306,"pass@count":0.984,"win_rate":0.3826336876,"count":1069.0,"SE(A)":0.0178091792,"SE_x(A)":0.0147175676,"SE_pred(A)":0.0100279642}
{"model":"qwen3-8b","pass1":0.7971545455,"pass@count":0.982,"win_rate":0.3781068923,"count":1100.0,"SE(A)":0.0179832798,"SE_x(A)":0.0144773149,"SE_pred(A)":0.0106679756}
{"model":"qwen3-4b","pass1":0.7819309091,"pass@count":0.978,"win_rate":0.3683005424,"count":1100.0,"SE(A)":0.0184669956,"SE_x(A)":0.0146946375,"SE_pred(A)":0.0111847017}
{"model":"deepseek_r1_distill_qwen_7b","pass1":0.7810054545,"pass@count":0.982,"win_rate":0.3723186999,"count":1100.0,"SE(A)":0.018495185,"SE_x(A)":0.0137704454,"SE_pred(A)":0.0123469309}
{"model":"qwen2.5-coder-32b-instruct","pass1":0.7697972603,"pass@count":0.972,"win_rate":0.3566752578,"count":1095.0,"SE(A)":0.018826016,"SE_x(A)":0.0151217784,"SE_pred(A)":0.0112138617}
{"model":"qwen2.5-coder-14b-instruct","pass1":0.726012975,"pass@count":0.98,"win_rate":0.3266618306,"count":1079.0,"SE(A)":0.0199458334,"SE_x(A)":0.0157012664,"SE_pred(A)":0.0123006709}
{"model":"deepseek_r1_distill_llama_8b","pass1":0.7054929577,"pass@count":0.952,"win_rate":0.3284087607,"count":71.0,"SE(A)":0.020384928,"SE_x(A)":0.0145259366,"SE_pred(A)":0.014301834}
{"model":"deepseek_r1_distill_qwen_1.5b","pass1":0.6866818182,"pass@count":0.98,"win_rate":0.3074555826,"count":1100.0,"SE(A)":0.0207436689,"SE_x(A)":0.0152551003,"SE_pred(A)":0.0140563762}
{"model":"qwen3-1.7b","pass1":0.6792581818,"pass@count":0.976,"win_rate":0.2992774339,"count":1100.0,"SE(A)":0.0208742188,"SE_x(A)":0.0163953915,"SE_pred(A)":0.0129199128}
{"model":"llama-3.1-70B-instruct","pass1":0.6642602871,"pass@count":0.886,"win_rate":0.291498549,"count":1045.0,"SE(A)":0.0211195908,"SE_x(A)":0.0169232209,"SE_pred(A)":0.01263494}
{"model":"google_gemma_3_4b_it","pass1":0.6568654545,"pass@count":0.972,"win_rate":0.294196925,"count":1100.0,"SE(A)":0.0212317323,"SE_x(A)":0.0170730674,"SE_pred(A)":0.0126212847}
{"model":"qwen2-72b-instruct","pass1":0.6531069767,"pass@count":0.964,"win_rate":0.2794232536,"count":215.0,"SE(A)":0.0212865335,"SE_x(A)":0.0167390773,"SE_pred(A)":0.0131498973}
{"model":"qwen2.5-coder-7b-instruct","pass1":0.6242527273,"pass@count":0.968,"win_rate":0.2648394105,"count":1100.0,"SE(A)":0.0216592364,"SE_x(A)":0.0160284275,"SE_pred(A)":0.0145674991}
{"model":"google_gemma_2_27b_it","pass1":0.5305628415,"pass@count":0.938,"win_rate":0.2078387507,"count":1098.0,"SE(A)":0.022318867,"SE_x(A)":0.0187575285,"SE_pred(A)":0.0120949142}
{"model":"qwen2-7b-instruct","pass1":0.5248927273,"pass@count":0.968,"win_rate":0.2053516404,"count":1100.0,"SE(A)":0.0223329511,"SE_x(A)":0.0168987658,"SE_pred(A)":0.0146011102}
{"model":"mistralai_ministral_8b_instruct_2410","pass1":0.4931163636,"pass@count":0.974,"win_rate":0.1898097506,"count":1100.0,"SE(A)":0.0223585606,"SE_x(A)":0.0164997217,"SE_pred(A)":0.0150885525}
{"model":"mistralai_mathstral_7b_v0.1","pass1":0.4852363636,"pass@count":0.978,"win_rate":0.186374609,"count":1100.0,"SE(A)":0.02235093,"SE_x(A)":0.0163041015,"SE_pred(A)":0.0152885691}
{"model":"llama-3.1-8B-instruct","pass1":0.4841509091,"pass@count":0.938,"win_rate":0.1872187089,"count":1100.0,"SE(A)":0.0223494432,"SE_x(A)":0.0168155728,"SE_pred(A)":0.0147218927}
{"model":"mistralai_mixtral_8x22b_instruct_v0.1","pass1":0.47788,"pass@count":0.918,"win_rate":0.1842964911,"count":100.0,"SE(A)":0.0223387871,"SE_x(A)":0.0163093082,"SE_pred(A)":0.0152652507}
{"model":"qwen2.5-coder-3b-instruct","pass1":0.4743054545,"pass@count":0.972,"win_rate":0.1872122798,"count":1100.0,"SE(A)":0.0223311348,"SE_x(A)":0.014937578,"SE_pred(A)":0.0165996489}
{"model":"google_gemma_2_9b_it","pass1":0.4740745455,"pass@count":0.906,"win_rate":0.1757886672,"count":1100.0,"SE(A)":0.022330601,"SE_x(A)":0.0187964595,"SE_pred(A)":0.0120560712}
{"model":"llama-3.2-3B-instruct","pass1":0.4446963636,"pass@count":0.918,"win_rate":0.1667068161,"count":1100.0,"SE(A)":0.0222234789,"SE_x(A)":0.0169048549,"SE_pred(A)":0.0144259799}
{"model":"qwen1.5-72b-chat","pass1":0.4057179487,"pass@count":0.934,"win_rate":0.1486334165,"count":390.0,"SE(A)":0.0219595489,"SE_x(A)":0.0160084316,"SE_pred(A)":0.0150316968}
{"model":"qwen1.5-32b-chat","pass1":0.399,"pass@count":0.822,"win_rate":0.1461870705,"count":30.0,"SE(A)":0.021899726,"SE_x(A)":0.0158440046,"SE_pred(A)":0.0151183834}
{"model":"qwen3-0.6b","pass1":0.3344472727,"pass@count":0.95,"win_rate":0.1214333511,"count":1100.0,"SE(A)":0.0210993978,"SE_x(A)":0.014327527,"SE_pred(A)":0.0154889173}
{"model":"qwen2.5-coder-1.5b-instruct","pass1":0.3310745455,"pass@count":0.956,"win_rate":0.1166638888,"count":1100.0,"SE(A)":0.0210458638,"SE_x(A)":0.0142255163,"SE_pred(A)":0.0155100957}
{"model":"qwen1.5-14b-chat","pass1":0.3087363636,"pass@count":0.952,"win_rate":0.1039670761,"count":1100.0,"SE(A)":0.0206600204,"SE_x(A)":0.01469023,"SE_pred(A)":0.014526995}
{"model":"llama-3.2-1B-instruct","pass1":0.2662054545,"pass@count":0.878,"win_rate":0.0892108861,"count":1100.0,"SE(A)":0.0197656323,"SE_x(A)":0.0136826423,"SE_pred(A)":0.0142641341}
{"model":"google_codegemma_1.1_7b_it","pass1":0.2194854545,"pass@count":0.912,"win_rate":0.070418465,"count":1100.0,"SE(A)":0.0185100832,"SE_x(A)":0.0131862693,"SE_pred(A)":0.0129902071}
{"model":"deepseek_v2_lite_chat","pass1":0.2142363636,"pass@count":0.938,"win_rate":0.0697684324,"count":1100.0,"SE(A)":0.0183487953,"SE_x(A)":0.011478496,"SE_pred(A)":0.0143151116}
{"model":"qwen1.5-7b-chat","pass1":0.1702236364,"pass@count":0.922,"win_rate":0.0527021776,"count":1100.0,"SE(A)":0.0168075905,"SE_x(A)":0.0107137107,"SE_pred(A)":0.0129503476}
{"model":"google_gemma_3_1b_it","pass1":0.1447254545,"pass@count":0.866,"win_rate":0.0600539432,"count":1100.0,"SE(A)":0.0157340394,"SE_x(A)":0.0100482068,"SE_pred(A)":0.0121075817}
{"model":"mistralai_mistral_7b_instruct_v0.3","pass1":0.1393836364,"pass@count":0.914,"win_rate":0.0433616915,"count":1100.0,"SE(A)":0.0154890825,"SE_x(A)":0.009695017,"SE_pred(A)":0.0120796657}
{"model":"mistralai_mistral_7b_instruct_v0.2","pass1":0.1044087432,"pass@count":0.86,"win_rate":0.032158251,"count":915.0,"SE(A)":0.013675347,"SE_x(A)":0.0078844596,"SE_pred(A)":0.0111736481}
{"model":"qwen2.5-coder-0.5b-instruct","pass1":0.0865945455,"pass@count":0.874,"win_rate":0.0277105308,"count":1100.0,"SE(A)":0.0125774346,"SE_x(A)":0.006803558,"SE_pred(A)":0.0105784431}
{"model":"qwen2-1.5b-instruct","pass1":0.0718223062,"pass@count":0.894,"win_rate":0.0216764779,"count":1058.0,"SE(A)":0.0115467625,"SE_x(A)":0.0051362831,"SE_pred(A)":0.0103414854}
{"model":"mistralai_mistral_7b_instruct_v0.1","pass1":0.0604018182,"pass@count":0.836,"win_rate":0.0187636998,"count":1100.0,"SE(A)":0.0106539606,"SE_x(A)":0.0048496941,"SE_pred(A)":0.0094861659}
{"model":"google_gemma_7b_it","pass1":0.0577509091,"pass@count":0.648,"win_rate":0.0171540381,"count":1100.0,"SE(A)":0.0104322329,"SE_x(A)":0.0067698528,"SE_pred(A)":0.0079372902}
{"model":"qwen2-0.5b-instruct","pass1":0.0333709091,"pass@count":0.832,"win_rate":0.0113338407,"count":1100.0,"SE(A)":0.0080320971,"SE_x(A)":0.0027557629,"SE_pred(A)":0.0075445579}
{"model":"qwen1.5-1.8b-chat","pass1":0.0153436364,"pass@count":0.738,"win_rate":0.0053894265,"count":1100.0,"SE(A)":0.0054969463,"SE_x(A)":0.0013935104,"SE_pred(A)":0.0053173816}
{"model":"qwen1.5-0.5b-chat","pass1":0.0091991001,"pass@count":0.648,"win_rate":0.0038605039,"count":889.0,"SE(A)":0.0042695378,"SE_x(A)":0.0008874929,"SE_pred(A)":0.0041762794}
{"model":"google_gemma_2b_it","pass1":0.00196,"pass@count":0.266,"win_rate":0.0006826603,"count":1100.0,"SE(A)":0.0019779577,"SE_x(A)":0.0004141989,"SE_pred(A)":0.0019341034}
