{"benchmark_id":"cruxeval_output_cot","size":800,"models":52,"total_pairs":1244,"close_pairs":578,"no_solve":20,"tau-":9,"SE(A)":{"count":49.0,"mean":0.016010948,"std":0.0017134731,"min":0.0112255811,"25%":0.0151086272,"50%":0.0162829604,"75%":0.0173800419,"max":0.0176776312},"SE_x(A)":{"count":49.0,"mean":0.012012493,"std":0.0024896527,"min":0.0059334828,"25%":0.0104868449,"50%":0.012744511,"75%":0.0134780817,"max":0.0175603869},"SE_pred(A)":{"count":49.0,"mean":0.0099024057,"std":0.0033199348,"min":0.0,"25%":0.0095522769,"50%":0.010941765,"75%":0.0115033551,"max":0.0144845137},"SE(A-B)":{"count":578.0,"mean":0.0179901798,"std":0.0014286653,"min":0.0128078222,"25%":0.0172752269,"50%":0.0178758882,"75%":0.0184966636,"max":0.0212911863},"SE_x(A-B)":{"count":578.0,"mean":0.0092983607,"std":0.0021565979,"min":0.0045294857,"25%":0.0077702869,"50%":0.0090183735,"75%":0.0102571523,"max":0.0153044601},"SE_pred(A-B)":{"count":578.0,"mean":0.0151455064,"std":0.0022813117,"min":0.007730823,"25%":0.0143586166,"50%":0.0156628968,"75%":0.016469741,"max":0.0197117405},"SE_signtest":{"count":578.0,"mean":0.0180933051,"std":0.0014424454,"min":0.0129763407,"25%":0.0173756915,"50%":0.0179930606,"75%":0.0186245805,"max":0.0214658459},"corr(A,B)":{"count":578.0,"mean":0.6454794157,"std":0.1111753317,"min":0.28366367,"25%":0.5818095549,"50%":0.6496557562,"75%":0.7212792798,"max":0.8611973885},"sum(A!=B)":{"count":578.0,"mean":210.844632012,"std":33.3734186666,"min":107.7666666657,"25%":193.2253787858,"50%":207.2003030285,"75%":222.0,"max":294.9008264405}}
