{"benchmark_id":"gsm8k_cot","size":1319,"models":52,"total_pairs":1306,"close_pairs":471,"no_solve":2,"tau-":13,"SE(A)":{"count":50.0,"mean":0.0106229768,"std":0.0023399886,"min":0.0064520591,"25%":0.0091667121,"50%":0.0108547355,"75%":0.0128531892,"max":0.013767208},"SE_x(A)":{"count":50.0,"mean":0.0077889686,"std":0.0018787648,"min":0.0049462397,"25%":0.0063787992,"50%":0.007626948,"75%":0.0091296716,"max":0.0133637494},"SE_pred(A)":{"count":50.0,"mean":0.0067634427,"std":0.0029176313,"min":0.0,"25%":0.004935404,"50%":0.0073833919,"75%":0.0090108625,"max":0.0108145464},"SE(A-B)":{"count":471.0,"mean":0.0113600001,"std":0.0025951809,"min":0.0062133359,"25%":0.0086866733,"50%":0.0116792524,"75%":0.0131970449,"max":0.0161735284},"SE_x(A-B)":{"count":471.0,"mean":0.0063717784,"std":0.0018635408,"min":0.0033334539,"25%":0.0050874568,"50%":0.0059296376,"75%":0.0071332129,"max":0.0124577092},"SE_pred(A-B)":{"count":471.0,"mean":0.0091760638,"std":0.0027424655,"min":0.0026989215,"25%":0.0068873869,"50%":0.0094978093,"75%":0.0111187634,"max":0.0147298388},"SE_signtest":{"count":471.0,"mean":0.0113961522,"std":0.0026094328,"min":0.0062145229,"25%":0.0087254515,"50%":0.01175818,"75%":0.0132368894,"max":0.0161833465},"corr(A,B)":{"count":471.0,"mean":0.6064230579,"std":0.0886754838,"min":0.3223344419,"25%":0.5493048313,"50%":0.6231927331,"75%":0.6705861976,"max":0.7881330173},"sum(A!=B)":{"count":471.0,"mean":237.7678599277,"std":101.8567761298,"min":67.1900826427,"25%":132.4545454518,"50%":240.5303030228,"75%":304.8333333279,"max":455.6446280899}}
