{"model":"qwen3-32b","pass1":0.4941152597,"pass@count":0.8303571429,"win_rate":0.319232841,"count":11.0,"SE(A)":0.0236211434,"SE_x(A)":0.0170052125,"SE_pred(A)":0.0163945467}
{"model":"llama-3.1-70B-instruct","pass1":0.4375,"pass@count":0.4375,"win_rate":0.2758703075,"count":13.0,"SE(A)":0.0234375,"SE_x(A)":0.0234375,"SE_pred(A)":0.0}
{"model":"google_gemma_3_27b_it","pass1":0.4339923469,"pass@count":0.7098214286,"win_rate":0.2748065736,"count":7.0,"SE(A)":0.0234160254,"SE_x(A)":0.0172351913,"SE_pred(A)":0.0158511333}
{"model":"qwen3-14b","pass1":0.4168019481,"pass@count":0.671875,"win_rate":0.2574145922,"count":11.0,"SE(A)":0.0232934542,"SE_x(A)":0.0185670672,"SE_pred(A)":0.0140658816}
{"model":"qwen2-math-72b-instruct","pass1":0.4064732143,"pass@count":0.8035714286,"win_rate":0.2542201685,"count":10.0,"SE(A)":0.0232058308,"SE_x(A)":0.0152453714,"SE_pred(A)":0.017495406}
{"model":"qwen3-8b","pass1":0.4013798701,"pass@count":0.6629464286,"win_rate":0.24780063,"count":11.0,"SE(A)":0.0231587143,"SE_x(A)":0.018360778,"SE_pred(A)":0.0141141021}
{"model":"qwen2.5-coder-32b-instruct","pass1":0.3950892857,"pass@count":0.7232142857,"win_rate":0.2431341263,"count":10.0,"SE(A)":0.02309693,"SE_x(A)":0.0168592516,"SE_pred(A)":0.0157871406}
{"model":"qwen2-72b-instruct","pass1":0.3899553571,"pass@count":0.7008928571,"win_rate":0.2353204377,"count":10.0,"SE(A)":0.0230435427,"SE_x(A)":0.0175076348,"SE_pred(A)":0.0149829096}
{"model":"google_gemma_3_12b_it","pass1":0.3738839286,"pass@count":0.7678571429,"win_rate":0.233650838,"count":12.0,"SE(A)":0.0228589784,"SE_x(A)":0.0159542109,"SE_pred(A)":0.0163705848}
{"model":"qwen3-4b","pass1":0.3673735119,"pass@count":0.6316964286,"win_rate":0.2245298556,"count":12.0,"SE(A)":0.0227765845,"SE_x(A)":0.0183911407,"SE_pred(A)":0.0134364707}
{"model":"qwen2.5-coder-14b-instruct","pass1":0.3461850649,"pass@count":0.8214285714,"win_rate":0.2134271679,"count":11.0,"SE(A)":0.0224772189,"SE_x(A)":0.0131678935,"SE_pred(A)":0.0182162551}
{"model":"qwen1.5-32b-chat","pass1":0.3252840909,"pass@count":0.7589285714,"win_rate":0.1974193469,"count":11.0,"SE(A)":0.0221336419,"SE_x(A)":0.0143143737,"SE_pred(A)":0.0168818486}
{"model":"mistralai_mixtral_8x22b_instruct_v0.1","pass1":0.3222402597,"pass@count":0.7455357143,"win_rate":0.1955398342,"count":11.0,"SE(A)":0.0220794767,"SE_x(A)":0.0139605481,"SE_pred(A)":0.0171057414}
{"model":"llama-3.1-8B-instruct","pass1":0.3169642857,"pass@count":0.3169642857,"win_rate":0.1992764393,"count":15.0,"SE(A)":0.0219830458,"SE_x(A)":0.0219830458,"SE_pred(A)":0.0}
{"model":"qwen3-1.7b","pass1":0.31640625,"pass@count":0.6674107143,"win_rate":0.2015413601,"count":12.0,"SE(A)":0.0219726562,"SE_x(A)":0.0158239053,"SE_pred(A)":0.0152447251}
{"model":"qwen1.5-72b-chat","pass1":0.3154017857,"pass@count":0.6741071429,"win_rate":0.1899015704,"count":10.0,"SE(A)":0.0219538629,"SE_x(A)":0.0150934154,"SE_pred(A)":0.0159424248}
{"model":"deepseek_r1_distill_qwen_32b","pass1":0.3145089286,"pass@count":0.5357142857,"win_rate":0.1900274767,"count":10.0,"SE(A)":0.021937058,"SE_x(A)":0.0177665614,"SE_pred(A)":0.0128679374}
{"model":"qwen2.5-coder-7b-instruct","pass1":0.3090503247,"pass@count":0.8816964286,"win_rate":0.1948629915,"count":11.0,"SE(A)":0.0218322655,"SE_x(A)":0.0106480543,"SE_pred(A)":0.0190595581}
{"model":"mistralai_mathstral_7b_v0.1","pass1":0.3084415584,"pass@count":0.8125,"win_rate":0.1903427886,"count":11.0,"SE(A)":0.0218203585,"SE_x(A)":0.0118261224,"SE_pred(A)":0.0183376899}
{"model":"mistralai_ministral_8b_instruct_2410","pass1":0.3082217262,"pass@count":0.8147321429,"win_rate":0.1925966431,"count":12.0,"SE(A)":0.0218160478,"SE_x(A)":0.0115137,"SE_pred(A)":0.0185303711}
{"model":"google_gemma_2_9b_it","pass1":0.306547619,"pass@count":0.65625,"win_rate":0.1805579407,"count":12.0,"SE(A)":0.0217830299,"SE_x(A)":0.016179134,"SE_pred(A)":0.0145854728}
{"model":"google_gemma_3_4b_it","pass1":0.3044299451,"pass@count":0.7276785714,"win_rate":0.1921392313,"count":13.0,"SE(A)":0.0217407797,"SE_x(A)":0.0147468652,"SE_pred(A)":0.0159747134}
{"model":"mistralai_mixtral_8x7b_instruct_v0.1","pass1":0.2976866883,"pass@count":0.7410714286,"win_rate":0.1804240909,"count":11.0,"SE(A)":0.0216026063,"SE_x(A)":0.013333432,"SE_pred(A)":0.0169968288}
{"model":"llama-3.2-3B-instruct","pass1":0.296875,"pass@count":0.296875,"win_rate":0.1871233603,"count":17.0,"SE(A)":0.0215855976,"SE_x(A)":0.0215855976,"SE_pred(A)":0.0}
{"model":"deepseek_r1_distill_qwen_14b","pass1":0.2863230519,"pass@count":0.515625,"win_rate":0.1696515754,"count":11.0,"SE(A)":0.0213569871,"SE_x(A)":0.0169861415,"SE_pred(A)":0.0129457288}
{"model":"qwen1.5-14b-chat","pass1":0.2863230519,"pass@count":0.6875,"win_rate":0.1760173182,"count":11.0,"SE(A)":0.0213569871,"SE_x(A)":0.013900651,"SE_pred(A)":0.0162139693}
{"model":"qwen2-math-7b-instruct","pass1":0.2845982143,"pass@count":0.8169642857,"win_rate":0.1830923693,"count":12.0,"SE(A)":0.0213182764,"SE_x(A)":0.0105562288,"SE_pred(A)":0.0185212025}
{"model":"google_codegemma_1.1_7b_it","pass1":0.2807348901,"pass@count":0.765625,"win_rate":0.1823169522,"count":13.0,"SE(A)":0.0212301804,"SE_x(A)":0.0121578139,"SE_pred(A)":0.0174042559}
{"model":"qwen2-7b-instruct","pass1":0.2767857143,"pass@count":0.7633928571,"win_rate":0.1706633459,"count":11.0,"SE(A)":0.0211381184,"SE_x(A)":0.0121304421,"SE_pred(A)":0.0173110492}
{"model":"mistralai_mistral_7b_instruct_v0.3","pass1":0.2737418831,"pass@count":0.7366071429,"win_rate":0.167775523,"count":11.0,"SE(A)":0.0210657593,"SE_x(A)":0.0126649133,"SE_pred(A)":0.0168334841}
{"model":"qwen2.5-coder-3b-instruct","pass1":0.26953125,"pass@count":0.890625,"win_rate":0.1774968063,"count":12.0,"SE(A)":0.0209636247,"SE_x(A)":0.0075143654,"SE_pred(A)":0.0195705869}
{"model":"google_gemma_7b_it","pass1":0.2671130952,"pass@count":0.609375,"win_rate":0.1709131943,"count":12.0,"SE(A)":0.0209038877,"SE_x(A)":0.0147740532,"SE_pred(A)":0.0147885048}
{"model":"deepseek_r1_distill_llama_70b","pass1":0.2631696429,"pass@count":0.4486607143,"win_rate":0.1533499522,"count":10.0,"SE(A)":0.0208047568,"SE_x(A)":0.0173085163,"SE_pred(A)":0.0115435337}
{"model":"mistralai_mistral_7b_instruct_v0.1","pass1":0.2518262987,"pass@count":0.8370535714,"win_rate":0.1666779706,"count":11.0,"SE(A)":0.0205075016,"SE_x(A)":0.0085046364,"SE_pred(A)":0.018660889}
{"model":"google_gemma_3_1b_it","pass1":0.2455357143,"pass@count":0.7366071429,"win_rate":0.1674882752,"count":12.0,"SE(A)":0.0203346954,"SE_x(A)":0.0104278119,"SE_pred(A)":0.0174573932}
{"model":"deepseek_v2_lite_chat","pass1":0.244724026,"pass@count":0.78125,"win_rate":0.1579675943,"count":11.0,"SE(A)":0.020311974,"SE_x(A)":0.0098606164,"SE_pred(A)":0.0177579428}
{"model":"deepseek_r1_distill_qwen_7b","pass1":0.2435064935,"pass@count":0.5022321429,"win_rate":0.1426927852,"count":11.0,"SE(A)":0.0202777082,"SE_x(A)":0.0152760224,"SE_pred(A)":0.0133352386}
{"model":"mistralai_mistral_7b_instruct_v0.2","pass1":0.2408685065,"pass@count":0.7075892857,"win_rate":0.1477857951,"count":11.0,"SE(A)":0.0202027043,"SE_x(A)":0.0115062006,"SE_pred(A)":0.0166059209}
{"model":"deepseek_r1_distill_llama_8b","pass1":0.2380952381,"pass@count":0.5245535714,"win_rate":0.1364203958,"count":12.0,"SE(A)":0.0201227204,"SE_x(A)":0.0148288018,"SE_pred(A)":0.0136025921}
{"model":"qwen3-0.6b","pass1":0.236092033,"pass@count":0.6830357143,"win_rate":0.1558586568,"count":13.0,"SE(A)":0.0200642153,"SE_x(A)":0.0120979015,"SE_pred(A)":0.0160066709}
{"model":"qwen2-math-1.5b-instruct","pass1":0.2309253247,"pass@count":0.8147321429,"win_rate":0.1527283246,"count":11.0,"SE(A)":0.0199104479,"SE_x(A)":0.0078546227,"SE_pred(A)":0.0182956508}
{"model":"qwen1.5-7b-chat","pass1":0.2038690476,"pass@count":0.7254464286,"win_rate":0.1291791697,"count":12.0,"SE(A)":0.0190339487,"SE_x(A)":0.0085266927,"SE_pred(A)":0.0170172476}
{"model":"qwen2.5-coder-1.5b-instruct","pass1":0.19140625,"pass@count":0.8883928571,"win_rate":0.130874185,"count":12.0,"SE(A)":0.0185867835,"SE_x(A)":0.0034398132,"SE_pred(A)":0.0182657112}
{"model":"qwen2.5-coder-0.5b-instruct","pass1":0.1804601648,"pass@count":0.8816964286,"win_rate":0.1299356532,"count":13.0,"SE(A)":0.0181692375,"SE_x(A)":0.0036100161,"SE_pred(A)":0.0178069923}
{"model":"llama-3.2-1B-instruct","pass1":0.1540178571,"pass@count":0.1540178571,"win_rate":0.1081758115,"count":12.0,"SE(A)":0.0170540342,"SE_x(A)":0.0170540342,"SE_pred(A)":0.0}
{"model":"qwen2-1.5b-instruct","pass1":0.1248139881,"pass@count":0.703125,"win_rate":0.0835331446,"count":12.0,"SE(A)":0.0156150294,"SE_x(A)":0.0035652296,"SE_pred(A)":0.0152025748}
{"model":"google_gemma_2b_it","pass1":0.1205357143,"pass@count":0.5,"win_rate":0.0833763288,"count":12.0,"SE(A)":0.0153825371,"SE_x(A)":0.0071722348,"SE_pred(A)":0.0136081407}
{"model":"qwen2-0.5b-instruct","pass1":0.1195054945,"pass@count":0.75,"win_rate":0.0848173526,"count":13.0,"SE(A)":0.0153256271,"SE_x(A)":0.0027795132,"SE_pred(A)":0.0150714682}
{"model":"deepseek_r1_distill_qwen_1.5b","pass1":0.1091889881,"pass@count":0.4017857143,"win_rate":0.0607623163,"count":12.0,"SE(A)":0.0147347646,"SE_x(A)":0.0080877949,"SE_pred(A)":0.0123166904}
{"model":"qwen1.5-0.5b-chat","pass1":0.0860233516,"pass@count":0.6339285714,"win_rate":0.061819341,"count":13.0,"SE(A)":0.0132475852,"SE_x(A)":0.0019806498,"SE_pred(A)":0.0130986847}
{"model":"qwen1.5-1.8b-chat","pass1":0.0792410714,"pass@count":0.5446428571,"win_rate":0.0537440589,"count":12.0,"SE(A)":0.0127617177,"SE_x(A)":0.0028730019,"SE_pred(A)":0.0124341183}
