{"benchmark_id":"gsm8k_plus_cot","size":10552,"models":52,"total_pairs":1342,"close_pairs":176,"no_solve":82,"tau-":589,"SE(A)":{"count":49.0,"mean":0.0044008666,"std":0.0003853441,"min":0.0031189889,"25%":0.0042097921,"50%":0.0044530997,"75%":0.0046800751,"max":0.0048674578},"SE_x(A)":{"count":49.0,"mean":0.0034437067,"std":0.0005561632,"min":0.0016185823,"25%":0.0032727052,"50%":0.0034781071,"75%":0.0036595754,"max":0.0048403591},"SE_pred(A)":{"count":49.0,"mean":0.0025463711,"std":0.000940756,"min":0.0,"25%":0.0022240264,"50%":0.0028422013,"75%":0.0032018482,"max":0.0036175847},"SE(A-B)":{"count":176.0,"mean":0.0044441209,"std":0.000579672,"min":0.0029093132,"25%":0.0039776496,"50%":0.0045014737,"75%":0.0048580433,"max":0.0055163284},"SE_x(A-B)":{"count":176.0,"mean":0.0025469739,"std":0.0006074949,"min":0.001182499,"25%":0.0021034162,"50%":0.0025146532,"75%":0.0028758819,"max":0.0041311939},"SE_pred(A-B)":{"count":176.0,"mean":0.003566867,"std":0.0007145999,"min":0.0018485713,"25%":0.0030760445,"50%":0.0035607955,"75%":0.0041286374,"max":0.0049114352},"SE_signtest":{"count":176.0,"mean":0.00444543,"std":0.0005797155,"min":0.0029107436,"25%":0.0039786406,"50%":0.0045020639,"75%":0.0048586321,"max":0.0055166696},"corr(A,B)":{"count":176.0,"mean":0.7074946282,"std":0.088705208,"min":0.4924000953,"25%":0.6511458855,"50%":0.699427152,"75%":0.7647973055,"max":0.8714847333},"sum(A!=B)":{"count":176.0,"mean":2237.5840840329,"std":562.1972822584,"min":943.36,"25%":1762.5456349162,"50%":2256.8254545397,"75%":2628.4374999762,"max":3388.6249999638}}
