{"model":"qwen2-math-72b-instruct","pass1":0.783030303,"pass@count":0.92,"win_rate":0.4365575023,"count":11.0,"SE(A)":0.0475946562,"SE_x(A)":0.0373165383,"SE_pred(A)":0.0295419578}
{"model":"qwen2-72b-instruct","pass1":0.7733333333,"pass@count":0.8266666667,"win_rate":0.4315767719,"count":11.0,"SE(A)":0.0483444432,"SE_x(A)":0.0455916585,"SE_pred(A)":0.016080605}
{"model":"google_gemma_3_27b_it","pass1":0.7444444444,"pass@count":0.8933333333,"win_rate":0.4216257679,"count":12.0,"SE(A)":0.0503649235,"SE_x(A)":0.039311318,"SE_pred(A)":0.0314840561}
{"model":"qwen3-8b","pass1":0.72,"pass@count":0.9066666667,"win_rate":0.3916615488,"count":12.0,"SE(A)":0.0518459256,"SE_x(A)":0.0432647476,"SE_pred(A)":0.0285685424}
{"model":"qwen3-4b","pass1":0.7188888889,"pass@count":0.8933333333,"win_rate":0.3972479125,"count":12.0,"SE(A)":0.0519085933,"SE_x(A)":0.0410578598,"SE_pred(A)":0.0317608912}
{"model":"qwen3-14b","pass1":0.7111111111,"pass@count":0.8133333333,"win_rate":0.3869457783,"count":12.0,"SE(A)":0.0523363607,"SE_x(A)":0.0458041178,"SE_pred(A)":0.0253195071}
{"model":"qwen3-32b","pass1":0.6787878788,"pass@count":0.8933333333,"win_rate":0.3758583711,"count":11.0,"SE(A)":0.0539178257,"SE_x(A)":0.0411705078,"SE_pred(A)":0.0348155312}
{"model":"qwen2.5-coder-32b-instruct","pass1":0.6242424242,"pass@count":0.8666666667,"win_rate":0.3322084679,"count":11.0,"SE(A)":0.055924213,"SE_x(A)":0.0427187469,"SE_pred(A)":0.0360919141}
{"model":"qwen1.5-72b-chat","pass1":0.6012121212,"pass@count":0.76,"win_rate":0.3234570093,"count":11.0,"SE(A)":0.0565397921,"SE_x(A)":0.0507238573,"SE_pred(A)":0.0249767569}
{"model":"google_gemma_2_27b_it","pass1":0.592,"pass@count":0.8533333333,"win_rate":0.3231707599,"count":10.0,"SE(A)":0.0567492731,"SE_x(A)":0.036389335,"SE_pred(A)":0.0435464843}
{"model":"qwen1.5-32b-chat","pass1":0.5842424242,"pass@count":0.7066666667,"win_rate":0.2988067868,"count":11.0,"SE(A)":0.056909661,"SE_x(A)":0.051072561,"SE_pred(A)":0.0251058366}
{"model":"mistralai_mixtral_8x22b_instruct_v0.1","pass1":0.5806060606,"pass@count":0.8,"win_rate":0.2983902833,"count":11.0,"SE(A)":0.0569798401,"SE_x(A)":0.0449268221,"SE_pred(A)":0.0350468662}
{"model":"qwen2.5-coder-14b-instruct","pass1":0.5622222222,"pass@count":0.7866666667,"win_rate":0.2857247734,"count":12.0,"SE(A)":0.0572862281,"SE_x(A)":0.046949663,"SE_pred(A)":0.0328243976}
{"model":"qwen2.5-coder-7b-instruct","pass1":0.552,"pass@count":0.7866666667,"win_rate":0.2853025672,"count":10.0,"SE(A)":0.057421947,"SE_x(A)":0.0428433379,"SE_pred(A)":0.0382325567}
{"model":"google_gemma_3_12b_it","pass1":0.543030303,"pass@count":0.7866666667,"win_rate":0.2858413619,"count":11.0,"SE(A)":0.0575208244,"SE_x(A)":0.043950831,"SE_pred(A)":0.0371075423}
{"model":"qwen1.5-14b-chat","pass1":0.5266666667,"pass@count":0.7733333333,"win_rate":0.2653905387,"count":12.0,"SE(A)":0.0576528564,"SE_x(A)":0.0459535705,"SE_pred(A)":0.0348155312}
{"model":"qwen2-math-7b-instruct","pass1":0.5177777778,"pass@count":0.6666666667,"win_rate":0.2677873297,"count":6.0,"SE(A)":0.0576985211,"SE_x(A)":0.0470012696,"SE_pred(A)":0.0334664011}
{"model":"qwen3-1.7b","pass1":0.4822222222,"pass@count":0.7733333333,"win_rate":0.2420086584,"count":12.0,"SE(A)":0.0576985211,"SE_x(A)":0.0432029291,"SE_pred(A)":0.0382442971}
{"model":"qwen1.5-7b-chat","pass1":0.4633333333,"pass@count":0.72,"win_rate":0.2410155944,"count":12.0,"SE(A)":0.0575795746,"SE_x(A)":0.0478309312,"SE_pred(A)":0.0320563477}
{"model":"qwen2-7b-instruct","pass1":0.4606060606,"pass@count":0.72,"win_rate":0.2315759638,"count":11.0,"SE(A)":0.057555552,"SE_x(A)":0.0458666838,"SE_pred(A)":0.0347690795}
{"model":"mistralai_mathstral_7b_v0.1","pass1":0.4557575758,"pass@count":0.8133333333,"win_rate":0.2328225867,"count":11.0,"SE(A)":0.0575085626,"SE_x(A)":0.0412403943,"SE_pred(A)":0.0400807266}
{"model":"mistralai_mistral_7b_instruct_v0.3","pass1":0.4096969697,"pass@count":0.6,"win_rate":0.1950114855,"count":11.0,"SE(A)":0.0567856041,"SE_x(A)":0.0492894849,"SE_pred(A)":0.0281984311}
{"model":"qwen3-0.6b","pass1":0.3979487179,"pass@count":0.68,"win_rate":0.1992268603,"count":13.0,"SE(A)":0.0565196763,"SE_x(A)":0.0452300327,"SE_pred(A)":0.0338927418}
{"model":"deepseek_v2_lite_chat","pass1":0.3915151515,"pass@count":0.7466666667,"win_rate":0.1989292477,"count":11.0,"SE(A)":0.0563596827,"SE_x(A)":0.0366122143,"SE_pred(A)":0.0428480991}
{"model":"google_gemma_2_9b_it","pass1":0.3878787879,"pass@count":0.7333333333,"win_rate":0.2002633015,"count":11.0,"SE(A)":0.0562647117,"SE_x(A)":0.0380703841,"SE_pred(A)":0.0414290193}
{"model":"mistralai_ministral_8b_instruct_2410","pass1":0.383030303,"pass@count":0.84,"win_rate":0.1890838794,"count":11.0,"SE(A)":0.0561329481,"SE_x(A)":0.0330441821,"SE_pred(A)":0.0453760939}
{"model":"qwen2-math-1.5b-instruct","pass1":0.3733333333,"pass@count":0.6,"win_rate":0.196386892,"count":4.0,"SE(A)":0.0558516554,"SE_x(A)":0.0401035696,"SE_pred(A)":0.0388730126}
{"model":"llama-3.2-3B-instruct","pass1":0.3733333333,"pass@count":0.3733333333,"win_rate":0.1747007925,"count":17.0,"SE(A)":0.0558516554,"SE_x(A)":0.0558516554,"SE_pred(A)":0.0}
{"model":"llama-3.1-8B-instruct","pass1":0.3733333333,"pass@count":0.3733333333,"win_rate":0.1794570629,"count":15.0,"SE(A)":0.0558516554,"SE_x(A)":0.0558516554,"SE_pred(A)":0.0}
{"model":"google_gemma_3_4b_it","pass1":0.3702564103,"pass@count":0.6133333333,"win_rate":0.1923690252,"count":13.0,"SE(A)":0.0557574032,"SE_x(A)":0.0439638446,"SE_pred(A)":0.0342938533}
{"model":"qwen2.5-coder-3b-instruct","pass1":0.3533333333,"pass@count":0.8666666667,"win_rate":0.1761271147,"count":12.0,"SE(A)":0.0551952762,"SE_x(A)":0.0318252806,"SE_pred(A)":0.0450962308}
{"model":"mistralai_mixtral_8x7b_instruct_v0.1","pass1":0.2977777778,"pass@count":0.6933333333,"win_rate":0.1521656851,"count":12.0,"SE(A)":0.0528022945,"SE_x(A)":0.0342333272,"SE_pred(A)":0.0402015126}
{"model":"qwen2.5-coder-1.5b-instruct","pass1":0.2557575757,"pass@count":0.92,"win_rate":0.137619651,"count":11.0,"SE(A)":0.0503779897,"SE_x(A)":0.0126076145,"SE_pred(A)":0.04877489}
{"model":"google_gemma_7b_it","pass1":0.2441025641,"pass@count":0.4133333333,"win_rate":0.1128138641,"count":13.0,"SE(A)":0.0496006051,"SE_x(A)":0.0406898221,"SE_pred(A)":0.0283647388}
{"model":"qwen2-1.5b-instruct","pass1":0.2430769231,"pass@count":0.76,"win_rate":0.1232327344,"count":13.0,"SE(A)":0.0495298607,"SE_x(A)":0.0290139809,"SE_pred(A)":0.0401421974}
{"model":"mistralai_mistral_7b_instruct_v0.2","pass1":0.24,"pass@count":0.4666666667,"win_rate":0.1077150552,"count":10.0,"SE(A)":0.049315312,"SE_x(A)":0.0355333264,"SE_pred(A)":0.0341962383}
{"model":"google_codegemma_1.1_7b_it","pass1":0.2338461538,"pass@count":0.6,"win_rate":0.1142390724,"count":13.0,"SE(A)":0.0488756422,"SE_x(A)":0.0332511281,"SE_pred(A)":0.0358216538}
{"model":"mistralai_mistral_7b_instruct_v0.1","pass1":0.2,"pass@count":0.76,"win_rate":0.1022119291,"count":11.0,"SE(A)":0.0461880215,"SE_x(A)":0.022598829,"SE_pred(A)":0.0402818354}
{"model":"qwen1.5-0.5b-chat","pass1":0.1948717949,"pass@count":0.8266666667,"win_rate":0.1144968717,"count":13.0,"SE(A)":0.0457379169,"SE_x(A)":0.0116791049,"SE_pred(A)":0.0442216639}
{"model":"llama-3.2-1B-instruct","pass1":0.1866666667,"pass@count":0.1866666667,"win_rate":0.0957178555,"count":12.0,"SE(A)":0.0449921804,"SE_x(A)":0.0449921804,"SE_pred(A)":0.0}
{"model":"qwen1.5-1.8b-chat","pass1":0.183030303,"pass@count":0.7733333333,"win_rate":0.0978632789,"count":11.0,"SE(A)":0.0446512727,"SE_x(A)":0.0153411965,"SE_pred(A)":0.0419330876}
{"model":"qwen2.5-coder-0.5b-instruct","pass1":0.1825641026,"pass@count":0.84,"win_rate":0.1114488626,"count":13.0,"SE(A)":0.0446070923,"SE_x(A)":0.0107837103,"SE_pred(A)":0.0432839956}
{"model":"deepseek_r1_distill_llama_70b","pass1":0.1818181818,"pass@count":0.5333333333,"win_rate":0.0864376775,"count":11.0,"SE(A)":0.0445361771,"SE_x(A)":0.0249532161,"SE_pred(A)":0.0368891323}
{"model":"qwen2-0.5b-instruct","pass1":0.1815384615,"pass@count":0.8,"win_rate":0.1060287992,"count":13.0,"SE(A)":0.0445095119,"SE_x(A)":0.0120024835,"SE_pred(A)":0.04286067}
{"model":"google_gemma_3_1b_it","pass1":0.1794871795,"pass@count":0.48,"win_rate":0.1123017297,"count":13.0,"SE(A)":0.0443127569,"SE_x(A)":0.0319379192,"SE_pred(A)":0.0307179059}
{"model":"google_gemma_2b_it","pass1":0.1712820513,"pass@count":0.4533333333,"win_rate":0.0937801566,"count":13.0,"SE(A)":0.0435039477,"SE_x(A)":0.0321100453,"SE_pred(A)":0.0293519754}
{"model":"deepseek_r1_distill_qwen_14b","pass1":0.1466666667,"pass@count":0.5333333333,"win_rate":0.0668938702,"count":11.0,"SE(A)":0.0408502233,"SE_x(A)":0.019049466,"SE_pred(A)":0.0361366654}
{"model":"deepseek_r1_distill_qwen_32b","pass1":0.1284848485,"pass@count":0.52,"win_rate":0.0597891474,"count":11.0,"SE(A)":0.0386396156,"SE_x(A)":0.0155599102,"SE_pred(A)":0.0353681932}
{"model":"deepseek_r1_distill_llama_8b","pass1":0.1128205128,"pass@count":0.4666666667,"win_rate":0.0476681752,"count":13.0,"SE(A)":0.0365316383,"SE_x(A)":0.0183997065,"SE_pred(A)":0.0315596482}
{"model":"deepseek_r1_distill_qwen_7b","pass1":0.0678787879,"pass@count":0.3866666667,"win_rate":0.0308876457,"count":11.0,"SE(A)":0.0290450817,"SE_x(A)":0.0089882418,"SE_pred(A)":0.0276193462}
{"model":"deepseek_r1_distill_qwen_1.5b","pass1":0.04,"pass@count":0.3466666667,"win_rate":0.0169663499,"count":13.0,"SE(A)":0.022627417,"SE_x(A)":0.0049245012,"SE_pred(A)":0.0220850467}
