{"benchmark_id":"aime2025_cot","size":30,"models":50,"total_pairs":2122,"close_pairs":210,"no_solve":1,"tau-":4,"SE(A)":{"count":15.0,"mean":0.0703313963,"std":0.0079169094,"min":0.0566744377,"25%":0.0670336071,"50%":0.0708772548,"75%":0.0771391293,"max":0.0806671417},"SE_x(A)":{"count":15.0,"mean":0.0564772109,"std":0.009825656,"min":0.0386015402,"25%":0.0514845314,"50%":0.0576575553,"75%":0.0664831751,"max":0.0680525792},"SE_pred(A)":{"count":15.0,"mean":0.0413800284,"std":0.0037198616,"min":0.0332942115,"25%":0.0401768972,"50%":0.041832325,"75%":0.0439476486,"max":0.0465648635},"SE(A-B)":{"count":210.0,"mean":0.0684021182,"std":0.0060642622,"min":0.0558241563,"25%":0.0629143541,"50%":0.0685984942,"75%":0.0728574044,"max":0.0822505292},"SE_x(A-B)":{"count":210.0,"mean":0.0334628734,"std":0.0120804685,"min":0.0073654382,"25%":0.0245558132,"50%":0.0340763674,"75%":0.0418647065,"max":0.0545947342},"SE_pred(A-B)":{"count":210.0,"mean":0.058643062,"std":0.0033895798,"min":0.0480332292,"25%":0.056384783,"50%":0.0591047804,"75%":0.0611517715,"max":0.0646649024},"SE_signtest":{"count":210.0,"mean":0.0696097341,"std":0.0069323735,"min":0.0558891399,"25%":0.0638311449,"50%":0.0692060846,"75%":0.0739328487,"max":0.086730496},"corr(A,B)":{"count":210.0,"mean":0.8186574744,"std":0.1097919785,"min":0.5846941571,"25%":0.7552911535,"50%":0.8158955377,"75%":0.9195520995,"max":0.9936024762},"sum(A!=B)":{"count":210.0,"mean":4.4040096305,"std":0.8832420282,"min":2.8112363637,"25%":3.6669735538,"50%":4.3105339268,"75%":4.9194595041,"max":6.7699610387}}
