{"benchmark_id":"aime2024_cot","size":30,"models":52,"total_pairs":2188,"close_pairs":222,"no_solve":6,"tau-":0,"SE(A)":{"count":16.0,"mean":0.0727875017,"std":0.0100199061,"min":0.056744228,"25%":0.0673944234,"50%":0.0723083375,"75%":0.0801233347,"max":0.0885640299},"SE_x(A)":{"count":16.0,"mean":0.0565746484,"std":0.0092194851,"min":0.044900074,"25%":0.0497470287,"50%":0.0535795627,"75%":0.0640721432,"max":0.0730296743},"SE_pred(A)":{"count":16.0,"mean":0.0438369542,"std":0.0142394063,"min":0.0,"25%":0.0399575672,"50%":0.0458202885,"75%":0.0495905249,"max":0.0614636297},"SE(A-B)":{"count":222.0,"mean":0.0807759491,"std":0.0087691757,"min":0.0560716766,"25%":0.0746805529,"50%":0.081435172,"75%":0.0873412753,"max":0.0987044948},"SE_x(A-B)":{"count":222.0,"mean":0.0473101556,"std":0.0145552393,"min":0.0150286828,"25%":0.0382969846,"50%":0.0454583294,"75%":0.0566061261,"max":0.0869190672},"SE_pred(A-B)":{"count":222.0,"mean":0.0635756307,"std":0.0105258128,"min":0.0344509606,"25%":0.0586068066,"50%":0.0635906522,"75%":0.0706094014,"max":0.0852802865},"SE_signtest":{"count":222.0,"mean":0.0825962284,"std":0.0094053021,"min":0.0561083608,"25%":0.0765780486,"50%":0.0818555844,"75%":0.0886803182,"max":0.10396192},"corr(A,B)":{"count":222.0,"mean":0.6106028586,"std":0.1567793726,"min":0.0785120998,"25%":0.5103088408,"50%":0.6252122865,"75%":0.7249621677,"max":0.9368576133},"sum(A!=B)":{"count":222.0,"mean":6.2191783695,"std":1.3898410029,"min":2.8333333334,"25%":5.2777777779,"50%":6.0303030302,"75%":7.0777972026,"max":9.7272727272}}
