{"benchmark_id":"gpqa_cot","size":448,"models":51,"total_pairs":2232,"close_pairs":1763,"no_solve":0,"tau-":96,"SE(A)":{"count":49.0,"mean":0.0209060991,"std":0.0021952993,"min":0.0147347646,"25%":0.0202777082,"50%":0.0213569871,"75%":0.0220794767,"max":0.0236211434},"SE_x(A)":{"count":49.0,"mean":0.0133283893,"std":0.0048052096,"min":0.0027795132,"25%":0.0105562288,"50%":0.0139605481,"75%":0.0169861415,"max":0.0234375},"SE_pred(A)":{"count":49.0,"mean":0.014783371,"std":0.0048426994,"min":0.0,"25%":0.0140658816,"50%":0.0160066709,"75%":0.0174573932,"max":0.0195705869},"SE(A-B)":{"count":1763.0,"mean":0.0275475104,"std":0.0014895427,"min":0.0185951295,"25%":0.0270577506,"50%":0.0277881578,"75%":0.0284181986,"max":0.0304793205},"SE_x(A-B)":{"count":1763.0,"mean":0.0157206059,"std":0.0041837588,"min":0.0033701197,"25%":0.0128996422,"50%":0.0156845408,"75%":0.0181578347,"max":0.0285409882},"SE_pred(A-B)":{"count":1763.0,"mean":0.0220323286,"std":0.0033217774,"min":0.0,"25%":0.0208068642,"50%":0.0227677908,"75%":0.024309347,"max":0.0273180275},"SE_signtest":{"count":1763.0,"mean":0.027768947,"std":0.0015309434,"min":0.0187526571,"25%":0.0272620104,"50%":0.0279323322,"75%":0.0286372891,"max":0.0313662966},"corr(A,B)":{"count":1763.0,"mean":0.3078145571,"std":0.163971533,"min":-0.1250432269,"25%":0.2061945512,"50%":0.302977495,"75%":0.4215722992,"max":0.8964780652},"sum(A!=B)":{"count":1763.0,"mean":155.2358885057,"std":16.056706969,"min":70.58,"25%":149.1666666662,"50%":156.5923076894,"75%":164.5962121189,"max":197.4615384616}}
