| llama-3.1-70B-instruct |
83.4 |
83.4 |
37 |
12 |
0.46 |
0.46 |
0 |
| qwen3-14b |
82.5 |
90.4 |
36.2 |
11 |
0.47 |
0.42 |
0.22 |
| google_gemma_3_12b_it |
81.8 |
92.6 |
36 |
12 |
0.48 |
0.4 |
0.26 |
| qwen2-72b-instruct |
79 |
92.4 |
34.2 |
10 |
0.5 |
0.4 |
0.31 |
| qwen2-math-72b-instruct |
78.5 |
92.6 |
34 |
10 |
0.51 |
0.4 |
0.32 |
| qwen2.5-coder-32b-instruct |
78.3 |
93.4 |
33.9 |
10 |
0.51 |
0.39 |
0.33 |
| mistralai_mixtral_8x22b_instruct_v0.1 |
77.4 |
91.6 |
33 |
9 |
0.52 |
0.4 |
0.32 |
| qwen3-32b |
76.2 |
90.5 |
33.1 |
9 |
0.53 |
0.42 |
0.32 |
| qwen3-4b |
75.2 |
86 |
31.9 |
12 |
0.54 |
0.47 |
0.25 |
| qwen3-8b |
72.9 |
85.8 |
30.6 |
11 |
0.55 |
0.48 |
0.27 |
| qwen2.5-coder-14b-instruct |
71.4 |
92.2 |
29.9 |
11 |
0.56 |
0.41 |
0.38 |
| qwen1.5-72b-chat |
68.3 |
88.2 |
28.2 |
10 |
0.58 |
0.45 |
0.37 |
| llama-3.1-8B-instruct |
66.3 |
66.3 |
27.1 |
15 |
0.59 |
0.59 |
0 |
| qwen1.5-32b-chat |
66.2 |
87.5 |
26.8 |
9 |
0.59 |
0.44 |
0.39 |
| google_gemma_3_4b_it |
64.4 |
84.3 |
27.1 |
11 |
0.59 |
0.49 |
0.34 |
| mistralai_mathstral_7b_v0.1 |
62.2 |
90.9 |
24.8 |
12 |
0.6 |
0.42 |
0.43 |
| mistralai_mixtral_8x7b_instruct_v0.1 |
61.9 |
83.8 |
24.6 |
11 |
0.6 |
0.48 |
0.37 |
| mistralai_ministral_8b_instruct_2410 |
60.2 |
90.2 |
23.8 |
11 |
0.61 |
0.41 |
0.45 |
| qwen2.5-coder-7b-instruct |
59.9 |
90.8 |
23.6 |
12 |
0.61 |
0.41 |
0.45 |
| qwen2-math-7b-instruct |
56.9 |
83.1 |
22.1 |
11 |
0.61 |
0.47 |
0.39 |
| llama-3.2-3B-instruct |
56.8 |
56.8 |
22 |
18 |
0.61 |
0.61 |
0 |
| qwen2.5-coder-3b-instruct |
54.2 |
86.9 |
20.9 |
12 |
0.62 |
0.43 |
0.44 |
| mistralai_mistral_7b_instruct_v0.3 |
51.3 |
81.8 |
19.1 |
12 |
0.62 |
0.47 |
0.41 |
| qwen3-1.7b |
49.3 |
69.8 |
18.3 |
12 |
0.62 |
0.53 |
0.32 |
| deepseek_v2_lite_chat |
46.1 |
77 |
17.5 |
9 |
0.62 |
0.44 |
0.43 |
| qwen1.5-14b-chat |
42.3 |
82 |
15.9 |
11 |
0.61 |
0.39 |
0.47 |
| mistralai_mistral_7b_instruct_v0.1 |
42.2 |
80.5 |
15.4 |
12 |
0.61 |
0.42 |
0.45 |
| qwen2-math-1.5b-instruct |
41.5 |
76.6 |
15.5 |
12 |
0.61 |
0.44 |
0.42 |
| mistralai_mistral_7b_instruct_v0.2 |
40.5 |
75.7 |
14.3 |
12 |
0.61 |
0.45 |
0.41 |
| qwen2-7b-instruct |
38.5 |
80.5 |
13.9 |
12 |
0.6 |
0.38 |
0.46 |
| qwen2.5-coder-1.5b-instruct |
35.9 |
79.1 |
13.6 |
12 |
0.59 |
0.36 |
0.47 |
| deepseek_r1_distill_llama_70b |
35.6 |
71 |
12.9 |
9 |
0.59 |
0.37 |
0.46 |
| llama-3.2-1B-instruct |
34.9 |
34.9 |
13.3 |
21 |
0.59 |
0.59 |
0 |
| deepseek_r1_distill_qwen_7b |
33.1 |
73.8 |
11.5 |
12 |
0.58 |
0.37 |
0.45 |
| qwen3-0.6b |
33.1 |
68.1 |
12.2 |
12 |
0.58 |
0.42 |
0.4 |
| deepseek_r1_distill_llama_8b |
29.1 |
73.7 |
10.1 |
12 |
0.56 |
0.31 |
0.47 |
| qwen2.5-coder-0.5b-instruct |
27 |
69.5 |
10.5 |
12 |
0.55 |
0.34 |
0.43 |
| qwen1.5-7b-chat |
26.6 |
70.9 |
9.37 |
11 |
0.55 |
0.31 |
0.45 |
| deepseek_r1_distill_qwen_14b |
23.4 |
62.1 |
7.79 |
12 |
0.52 |
0.3 |
0.43 |
| qwen2-1.5b-instruct |
22.4 |
72.8 |
8.29 |
12 |
0.52 |
0.25 |
0.45 |
| qwen1.5-0.5b-chat |
21.7 |
63.9 |
8.47 |
12 |
0.51 |
0.29 |
0.42 |
| deepseek_r1_distill_qwen_32b |
21.5 |
58.4 |
7.26 |
7 |
0.51 |
0.26 |
0.44 |
| qwen1.5-1.8b-chat |
19.8 |
58.9 |
7.08 |
11 |
0.49 |
0.29 |
0.4 |
| qwen2-0.5b-instruct |
18.2 |
61.9 |
7.01 |
12 |
0.48 |
0.25 |
0.41 |
| deepseek_r1_distill_qwen_1.5b |
14 |
53.1 |
4.81 |
12 |
0.43 |
0.21 |
0.38 |