{"benchmark_id":"mgsm_cot","size":2750,"models":52,"total_pairs":1012,"close_pairs":216,"no_solve":10,"tau-":43,"SE(A)":{"count":47.0,"mean":0.0083015565,"std":0.0010810464,"min":0.0058599488,"25%":0.0075575528,"50%":0.0085797501,"75%":0.0093440132,"max":0.0095335401},"SE_x(A)":{"count":27.0,"mean":0.0066492208,"std":0.0011153078,"min":0.0041765457,"25%":0.0060489985,"50%":0.006692489,"75%":0.0070592033,"max":0.009532505},"SE_pred(A)":{"count":27.0,"mean":0.0045595691,"std":0.0021642157,"min":0.0,"25%":0.0040127292,"50%":0.0052870519,"75%":0.0060096508,"max":0.0067304934},"SE(A-B)":{"count":216.0,"mean":0.0090096447,"std":0.0012728272,"min":0.0058303061,"25%":0.0078908092,"50%":0.009386066,"75%":0.0099734389,"max":0.0108045411},"SE_x(A-B)":{"count":216.0,"mean":0.005346783,"std":0.0012645137,"min":0.0033352823,"25%":0.0045126576,"50%":0.0050744372,"75%":0.0057969637,"max":0.0089110542},"SE_pred(A-B)":{"count":216.0,"mean":0.0070681791,"std":0.0016308782,"min":0.0028315421,"25%":0.0058912841,"50%":0.0073828401,"75%":0.0084347094,"max":0.0094618147},"SE_signtest":{"count":216.0,"mean":0.0090248005,"std":0.0012762561,"min":0.0058304524,"25%":0.0078975577,"50%":0.0093950196,"75%":0.0100045643,"max":0.0108227351},"corr(A,B)":{"count":216.0,"mean":0.6359522219,"std":0.1127867376,"min":0.3276372533,"25%":0.5611708252,"50%":0.655904932,"75%":0.7245653824,"max":0.8352524177},"sum(A!=B)":{"count":216.0,"mean":628.2041159605,"std":167.5942149373,"min":257.0809523804,"25%":471.69125,"50%":667.5189102455,"75%":756.9431397159,"max":885.8076922943}}
