{"benchmark_id":"mmlu_pro_cot","size":12032,"models":45,"total_pairs":1166,"close_pairs":134,"no_solve":8,"tau-":1623,"SE(A)":{"count":45.0,"mean":0.0041687166,"std":0.0004957312,"min":0.0027765439,"25%":0.003952518,"50%":0.0043793537,"75%":0.0045038675,"max":0.0045572471},"SE_x(A)":{"count":44.0,"mean":0.0031006256,"std":0.0008196792,"min":0.000983216,"25%":0.0028051777,"50%":0.0032866779,"75%":0.0036371761,"max":0.004535096},"SE_pred(A)":{"count":44.0,"mean":0.0025574874,"std":0.0008682083,"min":0.0,"25%":0.0025286949,"50%":0.0028997518,"75%":0.0030466894,"max":0.0032673977},"SE(A-B)":{"count":134.0,"mean":0.004861473,"std":0.0003336269,"min":0.0037691001,"25%":0.0047643779,"50%":0.0049452263,"75%":0.0050225067,"max":0.0053884515},"SE_x(A-B)":{"count":128.0,"mean":0.002895599,"std":0.0008040825,"min":0.0008879003,"25%":0.0023601301,"50%":0.0028541883,"75%":0.0035005109,"max":0.004522358},"SE_pred(A-B)":{"count":128.0,"mean":0.003787703,"std":0.0005891851,"min":0.0022243144,"25%":0.0033433814,"50%":0.0040099371,"75%":0.0042867899,"max":0.0045802814},"SE_signtest":{"count":134.0,"mean":0.0048630252,"std":0.0003336703,"min":0.0037691092,"25%":0.0047682037,"50%":0.0049479392,"75%":0.0050234259,"max":0.0053887559},"corr(A,B)":{"count":134.0,"mean":0.5402827015,"std":0.1244536956,"min":0.2396533508,"25%":0.453512775,"50%":0.5502611083,"75%":0.652633293,"max":0.7329188971},"sum(A!=B)":{"count":134.0,"mean":3439.6423476893,"std":443.1798204496,"min":2056.6153844425,"25%":3291.4365384265,"50%":3544.2499999782,"75%":3653.2187499537,"max":4203.9027777461}}
