Performance improvements on Arm for legacy and k-quants (#453)
ikawrakow committed May 30, 2024
1 parent 73088c3 commit 293a528
Showing 10 changed files with 1,264 additions and 87 deletions.
15 changes: 14 additions & 1 deletion llama.cpp/ggml-common.h
@@ -203,6 +203,18 @@ typedef struct {
} block_q8_1;
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");

// [kawrakow] Need these two for performance on Arm
typedef struct {
ggml_half d[8];
int8_t qs[4*QK8_1];
} block_q8_1_x4;
static_assert(sizeof(block_q8_1_x4) == 4*sizeof(block_q8_1), "wrong q8_1_x4 block size/padding");
typedef struct {
ggml_half d[4];
int8_t qs[4*QK8_0];
} block_q8_0_x4;
static_assert(sizeof(block_q8_0_x4) == 4*sizeof(block_q8_0), "wrong q8_0_x4 block size/padding");

//
// Super-block quantization structures
//
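The two structs added above interleave four consecutive quantization blocks: the fp16 scales of all four blocks are stored first, followed by the four 32-byte groups of quants. A minimal sketch of that mapping (illustration only, not part of the commit; it assumes the ggml-common.h types shown above are in scope):

```cpp
#include <string.h>

// Hypothetical helper: repack four consecutive block_q8_0 blocks into one
// block_q8_0_x4. The four fp16 deltas end up in one contiguous 8-byte run, so
// a kernel can fetch the scales of four blocks with a single load before
// streaming the 4*32 quants.
static void repack_q8_0_x4(const block_q8_0 * src, block_q8_0_x4 * dst) {
    for (int ir = 0; ir < 4; ++ir) {
        dst->d[ir] = src[ir].d;                         // scales grouped up front
        memcpy(dst->qs + ir*QK8_0, src[ir].qs, QK8_0);  // quants of block ir start at byte 32*ir
    }
}
```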
@@ -313,10 +325,11 @@ typedef struct {
static_assert(sizeof(block_q6_K) == sizeof(ggml_half) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");

// This is only used for intermediate quantization and dot products
// [kawrakow] Note: I have switched the order of bsums and qs. This results in some performance gain on Arm
typedef struct {
float d; // delta
int8_t qs[QK_K]; // quants
int16_t bsums[QK_K/16]; // sum of quants in groups of 16
int8_t qs[QK_K]; // quants
} block_q8_K;
static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
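With bsums moved ahead of qs, the 4-byte delta and the 32 bytes of group sums now sit at the front of the 292-byte block, so the metadata a k-quant kernel reads first lives in the leading cache lines, ahead of the 256 quants; that is presumably where the Arm gain comes from. For reference, bsums is derived from qs at quantization time, roughly like this (sketch mirroring the reference quantize_row_q8_K loop, not code from this commit):

```cpp
// Sketch: each bsums entry is the sum of 16 consecutive int8 quants. K-quant
// dot products use these sums to fold per-group offsets/minimums into the
// result without re-reading qs.
static void q8_K_fill_bsums(block_q8_K * y) {
    for (int j = 0; j < QK_K/16; ++j) {
        int sum = 0;
        for (int l = 0; l < 16; ++l) sum += y->qs[16*j + l];
        y->bsums[j] = (int16_t) sum;
    }
}
```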

61 changes: 50 additions & 11 deletions llama.cpp/ggml-quants.inc
@@ -873,7 +873,11 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
block_q8_0 * restrict y = vy;

#if defined(__ARM_NEON)
// [kawrakow] When running on Arm, we change how the data is laid out for performance reasons
block_q8_0_x4 * y4 = (block_q8_0_x4 *)vy;
int nb4 = 4*(nb/4);
for (int i = 0; i < nb; i++) {
int i4 = i/4, ir = i%4;
float32x4_t srcv [8];
float32x4_t asrcv[8];
float32x4_t amaxv[8];
@@ -890,16 +894,29 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f/d : 0.0f;

y[i].d = GGML_FP32_TO_FP16(d);
// [kawrakow] When running on Arm, we change how the data is laid out for performance reasons
if (i < nb4) {
y4[i4].d[ir] = GGML_FP32_TO_FP16(d);
} else {
y[i].d = GGML_FP32_TO_FP16(d);
}

for (int j = 0; j < 8; j++) {
const float32x4_t v = vmulq_n_f32(srcv[j], id);
const int32x4_t vi = vcvtnq_s32_f32(v);

y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
// [kawrakow] When running on Arm, we change how the data is laid out for performance reasons
if (i < nb4) {
y4[i4].qs[32*ir + 4*j + 0] = vgetq_lane_s32(vi, 0);
y4[i4].qs[32*ir + 4*j + 1] = vgetq_lane_s32(vi, 1);
y4[i4].qs[32*ir + 4*j + 2] = vgetq_lane_s32(vi, 2);
y4[i4].qs[32*ir + 4*j + 3] = vgetq_lane_s32(vi, 3);
} else {
y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
}
}
}
#elif defined(__wasm_simd128__)
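In plain terms, the Arm path above writes block i into slot ir = i%4 of group i4 = i/4: the scale goes to y4[i4].d[ir] and the 32 quants to y4[i4].qs[32*ir ...], while blocks past nb4 = 4*(nb/4) keep the plain block_q8_0 layout. A scalar sketch of that store logic (hypothetical helper, assuming the ggml-common.h types; the addressing works out because sizeof(block_q8_0_x4) == 4*sizeof(block_q8_0)):

```cpp
#include <string.h>

// Hypothetical helper: where block i of a quantized q8_0 row lands when the
// Arm-specific x4 layout is in use.
static void store_q8_0_block(void * vy, int i, int nb, ggml_half d, const int8_t q[QK8_0]) {
    const int nb4 = 4*(nb/4);                      // blocks covered by complete x4 groups
    if (i < nb4) {
        block_q8_0_x4 * y4 = (block_q8_0_x4 *) vy;
        const int i4 = i/4, ir = i%4;              // group index, slot within the group
        y4[i4].d[ir] = d;
        memcpy(y4[i4].qs + QK8_0*ir, q, QK8_0);    // quants of slot ir start at byte 32*ir
    } else {
        block_q8_0 * y = (block_q8_0 *) vy;        // tail blocks keep the original layout
        y[i].d = d;
        memcpy(y[i].qs, q, QK8_0);
    }
}
```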
@@ -1192,7 +1209,11 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
block_q8_1 * restrict y = vy;

#if defined(__ARM_NEON)
// [kawrakow] When running on Arm, we change how the data is laid out for performance reasons
block_q8_1_x4 * restrict y4 = vy;
int nb4 = 4*(nb/4);
for (int i = 0; i < nb; i++) {
int i4 = i/4, ir = i%4;
float32x4_t srcv [8];
float32x4_t asrcv[8];
float32x4_t amaxv[8];
@@ -1209,23 +1230,41 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f/d : 0.0f;

y[i].d = GGML_FP32_TO_FP16(d);
// [kawrakow] When running on Arm, we change how the data is laid out for performance reasons
if (i < nb4) {
y4[i4].d[ir] = GGML_FP32_TO_FP16(d);
} else {
y[i].d = GGML_FP32_TO_FP16(d);
}

int32x4_t accv = vdupq_n_s32(0);

for (int j = 0; j < 8; j++) {
const float32x4_t v = vmulq_n_f32(srcv[j], id);
const int32x4_t vi = vcvtnq_s32_f32(v);

y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
// [kawrakow] When running on Arm, we change how the data is laid out for performance reasons
if (i < nb4) {
y4[i4].qs[QK8_1*ir + 4*j + 0] = vgetq_lane_s32(vi, 0);
y4[i4].qs[QK8_1*ir + 4*j + 1] = vgetq_lane_s32(vi, 1);
y4[i4].qs[QK8_1*ir + 4*j + 2] = vgetq_lane_s32(vi, 2);
y4[i4].qs[QK8_1*ir + 4*j + 3] = vgetq_lane_s32(vi, 3);
} else {
y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
}

accv = vaddq_s32(accv, vi);
}

y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
// [kawrakow] When running on Arm, we change how the data is laid out for performance reasons
if (i < nb4) {
y4[i4].d[ir+4] = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
} else {
y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
}
}
#elif defined(__wasm_simd128__)
for (int i = 0; i < nb; i++) {
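For q8_1 the eight fp16 values in block_q8_1_x4.d carry two things: slots 0..3 hold the deltas of the four blocks and slots 4..7 hold the matching sums written above as d[ir+4], i.e. d times the sum of the quants. A consumer can therefore pull all the metadata of four blocks with a single 16-byte load; a sketch of the unpacking (hypothetical helper, assuming ggml's GGML_FP16_TO_FP32 conversion):

```cpp
// Hypothetical helper: unpack the scales of one block_q8_1_x4 group.
// d[0..3] are the deltas of blocks 0..3, d[4..7] the corresponding sums.
static void q8_1_x4_scales(const block_q8_1_x4 * y4, float d[4], float s[4]) {
    for (int ir = 0; ir < 4; ++ir) {
        d[ir] = GGML_FP16_TO_FP32(y4->d[ir]);       // delta of block ir
        s[ir] = GGML_FP16_TO_FP32(y4->d[ir + 4]);   // d * sum(qs) of block ir
    }
}
```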
7 changes: 6 additions & 1 deletion llama.cpp/quantize/quantize.cpp
@@ -65,10 +65,12 @@ static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix
static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS = "quantize.imatrix.chunks_count";

static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
std::string ftype_str;
std::string ftype_str; ftype_str.reserve(ftype_str_in.size());

bool is_number = true;
for (auto ch : ftype_str_in) {
ftype_str.push_back(std::toupper(ch));
if (!std::isdigit(ftype_str.back())) is_number = false;
}
for (auto & it : QUANT_OPTIONS) {
if (it.name == ftype_str) {
@@ -77,6 +79,9 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
return true;
}
}
// On my system (OS Ventura 13.2.1) calling std::stoi with invalid input leads to a crash (Segmentation fault 11)
// Hence the check above and the early return
if (!is_number) return false;
try {
int ftype_int = std::stoi(ftype_str);
for (auto & it : QUANT_OPTIONS) {
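The quantize.cpp change is defensive: the input is upper-cased character by character while is_number records whether every character is a digit, and std::stoi is only reached for purely numeric strings. std::stoi is specified to throw std::invalid_argument on non-numeric input, but the comment in the diff reports a crash on at least one system, hence the explicit guard. A standalone sketch of the same pattern (not the function above):

```cpp
#include <cctype>
#include <string>

// Sketch of the guarded integer parse: reject non-numeric input before
// std::stoi ever sees it.
static bool parse_index(const std::string & s, int & out) {
    if (s.empty()) return false;
    for (unsigned char ch : s) {
        if (!std::isdigit(ch)) return false;   // early return instead of relying on an exception
    }
    out = std::stoi(s);
    return true;
}
```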
1 change: 1 addition & 0 deletions llamafile/BUILD.mk
@@ -91,6 +91,7 @@ o/$(MODE)/llamafile: \
o/$(MODE)/llamafile/sgemm.o: private CXXFLAGS += -Os
o/$(MODE)/llamafile/iqk_mul_mat_amd_avx2.o: private TARGET_ARCH += -Xx86_64-mtune=skylake -Xx86_64-mavx2 -Xx86_64-mfma -Xx86_64-mf16c
o/$(MODE)/llamafile/iqk_mul_mat_amd_zen4.o: private TARGET_ARCH += -Xx86_64-mtune=skylake -Xx86_64-mavx2 -Xx86_64-mfma -Xx86_64-mf16c -Xx86_64-mavx512f -Xx86_64-mavx512vl -Xx86_64-mavx512vnni -Xx86_64-mavx512bw -Xx86_64-mavx512dq
o/$(MODE)/llamafile/iqk_mul_mat_arm82.o: private TARGET_ARCH += -Xaarch64-march=armv8.2-a+dotprod+fp16
o/$(MODE)/llamafile/tinyblas_cpu_sgemm_amd_avx.o: private TARGET_ARCH += -Xx86_64-mtune=sandybridge -Xx86_64-mf16c
o/$(MODE)/llamafile/tinyblas_cpu_mixmul_amd_avx.o: private TARGET_ARCH += -Xx86_64-mtune=sandybridge -Xx86_64-mf16c
o/$(MODE)/llamafile/tinyblas_cpu_sgemm_amd_fma.o: private TARGET_ARCH += -Xx86_64-mtune=bdver2 -Xx86_64-mf16c -Xx86_64-mfma
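The new BUILD.mk rule compiles the Arm kernels for armv8.2-a with the dotprod and fp16 extensions. The +dotprod part exposes the SDOT instruction (vdotq_s32), which performs 16 int8 by int8 multiply-accumulates per call; a small sketch of the kind of inner loop this enables (an assumption about how such kernels use it, not code from this commit):

```cpp
#include <arm_neon.h>
#include <stdint.h>

// Sketch: dot product of two 32-element int8 vectors using SDOT, available
// only when compiling with +dotprod as in the TARGET_ARCH line above.
static int32_t dot_q8_32(const int8_t * a, const int8_t * b) {
    int32x4_t acc = vdupq_n_s32(0);
    for (int j = 0; j < 32; j += 16) {
        acc = vdotq_s32(acc, vld1q_s8(a + j), vld1q_s8(b + j));  // 4 lanes x 4 int8 MACs each
    }
    return vaddvq_s32(acc);                                      // horizontal sum of the 4 lanes
}
```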
(Diffs for the remaining changed files were not loaded.)
