| model |
pass1 |
pass@count |
win_rate |
count |
SE(A) |
SE_x(A) |
SE_pred(A) |
| gpt-4-0613+cot |
77.2 |
90.8 |
42 |
10 |
1.5 |
1.2 |
0.86 |
| gpt-4-0613 |
68 |
75.1 |
33.8 |
10 |
1.6 |
1.6 |
0.53 |
| gpt-3.5-turbo-0613+cot |
56.5 |
82.2 |
25.7 |
10 |
1.8 |
1.3 |
1.1 |
| deepseek-instruct-33b |
48.4 |
66.1 |
18.3 |
10 |
1.8 |
1.5 |
0.88 |
| gpt-3.5-turbo-0613 |
47.5 |
63.6 |
18 |
10 |
1.8 |
1.6 |
0.82 |
| deepseek-base-33b |
45.5 |
67.1 |
16.1 |
10 |
1.8 |
1.5 |
0.96 |
| codetulu-2-34b |
43.8 |
64 |
14.9 |
10 |
1.8 |
1.5 |
0.92 |
| codellama-34b+cot |
42.8 |
77 |
16.5 |
10 |
1.7 |
1.2 |
1.3 |
| magicoder-ds-7b |
41.7 |
63.1 |
13.8 |
10 |
1.7 |
1.5 |
0.94 |
| wizard-34b |
41.4 |
58.5 |
13.3 |
10 |
1.7 |
1.5 |
0.84 |
| deepseek-instruct-6.7b |
40.5 |
57 |
12.9 |
10 |
1.7 |
1.5 |
0.83 |
| codellama-python-34b |
39.7 |
57.9 |
12.3 |
10 |
1.7 |
1.5 |
0.87 |
| codellama-34b |
39.3 |
61.6 |
12.3 |
10 |
1.7 |
1.4 |
0.97 |
| phind |
38.9 |
57.4 |
12.2 |
10 |
1.7 |
1.5 |
0.89 |
| deepseek-base-6.7b |
38.3 |
60.5 |
11.8 |
10 |
1.7 |
1.4 |
0.97 |
| wizard-13b |
36.9 |
58 |
10.9 |
10 |
1.7 |
1.4 |
0.93 |
| codellama-python-13b |
36.4 |
58.1 |
10.7 |
10 |
1.7 |
1.4 |
0.96 |
| mixtral-8x7b |
36.3 |
60.9 |
10.8 |
10 |
1.7 |
1.4 |
0.99 |
| codellama-13b |
36.1 |
61 |
10.5 |
10 |
1.7 |
1.4 |
0.98 |
| codellama-13b+cot |
34.9 |
70.1 |
12.1 |
10 |
1.7 |
1.1 |
1.3 |
| codellama-python-7b |
32.4 |
55.1 |
8.91 |
10 |
1.7 |
1.4 |
0.95 |
| codellama-7b |
30.9 |
55.2 |
8.22 |
10 |
1.6 |
1.3 |
0.98 |
| starcoderbase-16b |
30.7 |
54 |
8.17 |
10 |
1.6 |
1.3 |
0.94 |
| mistral-7b |
30.1 |
55.5 |
8.19 |
10 |
1.6 |
1.3 |
1 |
| phi-2 |
29.7 |
52.9 |
8.18 |
10 |
1.6 |
1.3 |
0.96 |
| codellama-7b+cot |
29.1 |
64.6 |
8.99 |
10 |
1.6 |
1.1 |
1.2 |
| starcoderbase-7b |
28.9 |
51.2 |
7.3 |
10 |
1.6 |
1.3 |
0.93 |
| deepseek-instruct-1.3b |
27.4 |
45.1 |
7.41 |
10 |
1.6 |
1.3 |
0.83 |
| deepseek-base-1.3b |
25.9 |
49.4 |
6.94 |
10 |
1.5 |
1.2 |
0.99 |
| phi-1.5 |
21.7 |
45.9 |
6.39 |
10 |
1.5 |
1.1 |
0.96 |
| phi-1 |
19.3 |
37.4 |
5.43 |
10 |
1.4 |
1.1 |
0.82 |