{"benchmark_id":"aime2025_cot","size":30,"models":52,"total_pairs":2266,"close_pairs":182,"no_solve":8,"tau-":1,"SE(A)":{"count":14.0,"mean":0.0716024538,"std":0.0068161739,"min":0.0582785391,"25%":0.0689935654,"50%":0.0715355708,"75%":0.0776105005,"max":0.0798395238},"SE_x(A)":{"count":14.0,"mean":0.0586532163,"std":0.0109564158,"min":0.0370969824,"25%":0.0531800218,"50%":0.0624332708,"75%":0.0664517833,"max":0.0736619235},"SE_pred(A)":{"count":14.0,"mean":0.0395370373,"std":0.0077106488,"min":0.0269679945,"25%":0.0339456672,"50%":0.042860183,"75%":0.0451892457,"max":0.0481999204},"SE(A-B)":{"count":182.0,"mean":0.0671075618,"std":0.0068534792,"min":0.041708856,"25%":0.0624757418,"50%":0.0673498941,"75%":0.0717040165,"max":0.0814973883},"SE_x(A-B)":{"count":182.0,"mean":0.0341273426,"std":0.0123290212,"min":0.0,"25%":0.0278839122,"50%":0.0339900586,"75%":0.0424491121,"max":0.0611928489},"SE_pred(A-B)":{"count":182.0,"mean":0.0564811649,"std":0.0068487326,"min":0.0383146639,"25%":0.0524484381,"50%":0.0564821714,"75%":0.0622694397,"max":0.0664770029},"SE_signtest":{"count":182.0,"mean":0.0680776819,"std":0.0074572012,"min":0.0418121005,"25%":0.0628945896,"50%":0.0679794978,"75%":0.0726783882,"max":0.0852802865},"corr(A,B)":{"count":182.0,"mean":0.8037041893,"std":0.0957247984,"min":0.5253476897,"25%":0.7433917497,"50%":0.8336347382,"75%":0.868046162,"max":0.9559689093},"sum(A!=B)":{"count":182.0,"mean":4.2208875697,"std":0.9099100358,"min":1.5734265733,"25%":3.5601689976,"50%":4.159090909,"75%":4.7539335664,"max":6.5454545455}}
