Skip to content

Commit 938cf72

Browse files
authored
Faster AVX2 matrix multiplications for MoE models (#428)
1 parent 00e4f72 commit 938cf72

File tree

7 files changed

+216
-97
lines changed

7 files changed

+216
-97
lines changed

llama.cpp/ggml.c

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -10642,6 +10642,20 @@ static void ggml_compute_forward_mul_mat_id(
10642 10642
const int64_t nr0 = ne01; // src0 rows
10643 10643
const int64_t nr1 = cne1; // src1 rows
10644 10644

10645+
if ((vec_dot_type == GGML_TYPE_Q8_K || vec_dot_type == GGML_TYPE_Q8_0 ||
10646+
vec_dot_type == GGML_TYPE_Q8_1) && dst->type == GGML_TYPE_F32) {
10647+
if (ne13 == 1) {
10648+
if (!llamafile_mixmul_iqk(nr0, nr1, ne00, ne11, src0->type,
10649+
(const char *)src0_cur,
10650+
(const char *)wdata,
10651+
(float *)dst->data, nb1, nb2,
10652+
matrix_rows + cur_a*ne12,
10653+
ith, nth)) goto IQK_MulMat_Not_Available;
10654+
continue;
10655+
}
10656+
}
10657+
IQK_MulMat_Not_Available:;
10658+
10645 10659
// distribute the thread work across the inner or outer loop based on which one is larger
10646 10660

10647 10661
const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows

0 commit comments

Comments
 (0)