Commit dfa12b7

ikawrakow and Iwan Kawrakow authored

IQ4_KS_R4 (#150)

* iq4_ks_r4: Zen4
* iq4_ks_r4: AVX2
* iq4_ks_r4: WIP
* iq4_ks_r4: slightly better Zen4
* iq4_ks_r4: slightly better Zen4
* iq4_ks_r4: NEON
* Minor

---------

Co-authored-by: Iwan Kawrakow <[email protected]>

1 parent 59d742b · commit dfa12b7

File tree: 10 files changed, +364 −2 lines changed


examples/quantize/quantize.cpp
Lines changed: 1 addition & 0 deletions

@@ -51,6 +51,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q8_0_R4",  LLAMA_FTYPE_MOSTLY_Q8_0_R4,  " 8.50 bpw quantization", },
     { "IQ4_XS",   LLAMA_FTYPE_MOSTLY_IQ4_XS,   " 4.25 bpw non-linear quantization", },
     { "IQ4_KS",   LLAMA_FTYPE_MOSTLY_IQ4_KS,   " 4.25 bpw non-linear quantization", },
+    { "IQ4_KS_R4",LLAMA_FTYPE_MOSTLY_IQ4_KS_R4,"IQ4_KS repacked", },
     { "IQ4_KSS",  LLAMA_FTYPE_MOSTLY_IQ4_KSS,  " 4.0 bpw non-linear quantization", },
     { "IQ2_K",    LLAMA_FTYPE_MOSTLY_IQ2_K,    " 2.375 bpw non-linear quantization",},
     { "IQ2_K_R4", LLAMA_FTYPE_MOSTLY_IQ2_K_R4, "IQ2_K repacked",},

ggml/include/ggml.h
Lines changed: 2 additions & 0 deletions

@@ -427,6 +427,7 @@ extern "C" {
         GGML_TYPE_IQ3_K_R4 = 338,
         GGML_TYPE_IQ4_K_R4 = 339,
         GGML_TYPE_IQ5_K_R4 = 340,
+        GGML_TYPE_IQ4_KS_R4 = 344,
         GGML_TYPE_Q8_K_R8 = 399,
         GGML_TYPE_COUNT,
     };

@@ -504,6 +505,7 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ3_K_R4 = 331, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ4_K_R4 = 332, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ5_K_R4 = 333, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ4_KS_R4 = 337, // except 1d tensors
         GGML_FTYPE_MOSTLY_Q8_K_R8 = 399, // except 1d tensors
     };
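
Both additions follow the existing *_R4 pattern and use explicitly assigned enum values. A quick runtime check of what the new id maps to, assuming the ggml_internal_get_type_traits() accessor declared in ggml.h is available in this tree:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        // Inspect the traits registered for the new repacked type (see the ggml.c hunk below).
        ggml_type_traits_t t = ggml_internal_get_type_traits(GGML_TYPE_IQ4_KS_R4);
        printf("name=%s blck_size=%d type_size=%zu quantized=%d\n",
               t.type_name, (int) t.blck_size, t.type_size, (int) t.is_quantized);
        return 0;
    }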

ggml/src/ggml-common.h
Lines changed: 6 additions & 0 deletions

@@ -508,6 +508,12 @@ typedef struct {
 } block_iq4_ks;
 static_assert(sizeof(block_iq4_ks) == QK_K/32 + QK_K/2, "wrong iq4_ks block size/padding");

+typedef struct {
+    uint8_t scales[QK_K/8];
+    uint8_t qs[QK_K*2];
+} block_iq4_ks_r4;
+static_assert(sizeof(block_iq4_ks_r4) == 4*sizeof(block_iq4_ks), "wrong iq4_ks_r4 block size/padding");
+
 typedef struct {
     uint32_t qs[QK_K/8];
 } block_iq4_kss;
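
A block_iq4_ks_r4 carries the data of four block_iq4_ks blocks, i.e. the same 256-weight super-block taken from four consecutive rows, rearranged so the SIMD kernels can work on four rows at a time; the per-weight cost stays at 4.25 bpw. A size-arithmetic sketch under the usual QK_K = 256 (the exact nibble interleaving inside qs[] is kernel-defined and not assumed here):

    #include <assert.h>
    #include <stdint.h>

    #define QK_K 256

    typedef struct {
        uint8_t scales[QK_K/8]; // 4 rows x (QK_K/32) scale bytes
        uint8_t qs[QK_K*2];     // 4 rows x (QK_K/2) bytes of 4-bit quants
    } block_iq4_ks_r4;

    int main(void) {
        // 4 x 136 = 544 bytes per repacked super-block,
        // i.e. 544*8 / (4*256) = 4.25 bits per weight.
        assert(sizeof(block_iq4_ks_r4) == 4*(QK_K/32 + QK_K/2));
        return 0;
    }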

ggml/src/ggml-quants.c
Lines changed: 1 addition & 0 deletions

@@ -15211,6 +15211,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
         case GGML_TYPE_IQ3_K_R4: break;
         case GGML_TYPE_IQ4_K_R4: break;
         case GGML_TYPE_IQ5_K_R4: break;
+        case GGML_TYPE_IQ4_KS_R4: break;
         case GGML_TYPE_Q8_K_R8: break;
         case GGML_TYPE_BF16_R16: break;
         case GGML_TYPE_Q4_0_4_4:

ggml/src/ggml.c
Lines changed: 26 additions & 0 deletions

@@ -1165,6 +1165,23 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .nrows = 1,
         .row_meta_size = 4,
     },
+    [GGML_TYPE_IQ4_KS_R4] = {
+        .type_name = "iq4_ks_r4",
+        .blck_size = QK_K,
+        .type_size = sizeof(block_iq4_ks),
+        .is_quantized = true,
+        .to_float = (ggml_to_float_t) dequantize_row_iq4_ks_r4,
+        .from_float = quantize_row_iq4_ks_r4,
+        .from_float_ref = (ggml_from_float_t)quantize_row_iq4_ks_r4_ref,
+        .vec_dot = vec_dot_iq4_ks_r4_q8_k,
+#if defined __AVX2__
+        .vec_dot_type = GGML_TYPE_Q8_K32,
+#else
+        .vec_dot_type = GGML_TYPE_Q8_K,
+#endif
+        .nrows = 1,
+        .row_meta_size = 4,
+    },
     [GGML_TYPE_IQ4_KSS] = {
         .type_name = "iq4_kss",
         .blck_size = QK_K,

@@ -4197,6 +4214,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
         case GGML_FTYPE_MOSTLY_Q8_0_R4: wtype = GGML_TYPE_Q8_0_R4; break;
         case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
         case GGML_FTYPE_MOSTLY_IQ4_KS: wtype = GGML_TYPE_IQ4_KS; break;
+        case GGML_FTYPE_MOSTLY_IQ4_KS_R4: wtype = GGML_TYPE_IQ4_KS_R4;break;
         case GGML_FTYPE_MOSTLY_IQ4_KSS: wtype = GGML_TYPE_IQ4_KSS; break;
         case GGML_FTYPE_MOSTLY_IQ2_K: wtype = GGML_TYPE_IQ2_K; break;
         case GGML_FTYPE_MOSTLY_IQ2_K_R4: wtype = GGML_TYPE_IQ2_K_R4; break;

@@ -10737,6 +10755,7 @@ static void ggml_compute_forward_add(
         case GGML_TYPE_Q8_0_R4:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_KS:
+        case GGML_TYPE_IQ4_KS_R4:
         case GGML_TYPE_IQ4_KSS:
         case GGML_TYPE_IQ2_K:
         case GGML_TYPE_IQ2_K_R4:

@@ -11196,6 +11215,7 @@ static void ggml_compute_forward_add1(
         case GGML_TYPE_Q8_0_R4:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_KS:
+        case GGML_TYPE_IQ4_KS_R4:
         case GGML_TYPE_IQ4_KSS:
         case GGML_TYPE_IQ2_K:
         case GGML_TYPE_IQ2_K_R4:

@@ -11352,6 +11372,7 @@ static void ggml_compute_forward_acc(
         case GGML_TYPE_Q8_0_R4:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_KS:
+        case GGML_TYPE_IQ4_KS_R4:
         case GGML_TYPE_IQ4_KSS:
         case GGML_TYPE_IQ2_K:
         case GGML_TYPE_IQ2_K_R4:

@@ -14554,6 +14575,7 @@ static void ggml_compute_forward_out_prod(
         case GGML_TYPE_Q8_0_R4:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_KS:
+        case GGML_TYPE_IQ4_KS_R4:
         case GGML_TYPE_IQ4_KSS:
         case GGML_TYPE_IQ2_K:
         case GGML_TYPE_IQ2_K_R4:

@@ -14950,6 +14972,7 @@ static void ggml_compute_forward_set(
         case GGML_TYPE_Q8_0_R4:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_KS:
+        case GGML_TYPE_IQ4_KS_R4:
         case GGML_TYPE_IQ4_KSS:
         case GGML_TYPE_IQ2_K:
         case GGML_TYPE_IQ2_K_R4:

@@ -15240,6 +15263,7 @@ static void ggml_compute_forward_get_rows(
         case GGML_TYPE_Q8_0_R4:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_KS:
+        case GGML_TYPE_IQ4_KS_R4:
         case GGML_TYPE_IQ4_KSS:
         case GGML_TYPE_IQ2_K:
         case GGML_TYPE_IQ2_K_R4:

@@ -15859,6 +15883,7 @@ static void ggml_compute_forward_clamp(
         case GGML_TYPE_Q8_0_R4:
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ4_KS:
+        case GGML_TYPE_IQ4_KS_R4:
         case GGML_TYPE_IQ4_KSS:
         case GGML_TYPE_IQ2_K:
         case GGML_TYPE_IQ2_K_R4:

@@ -22706,6 +22731,7 @@ size_t ggml_quantize_chunk(
         case GGML_TYPE_Q8_0_R4: result = quantize_q8_0_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_KS: result = quantize_iq4_ks (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_IQ4_KS_R4:result = quantize_iq4_ks_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_KSS: result = quantize_iq4_kss(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ2_K: result = quantize_iq2_k (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ2_K_R4:result = quantize_iq2_k_r4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
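
With the traits entry and the ggml_quantize_chunk() dispatch in place, data can be quantized to the new type through the generic ggml API rather than only via the quantize example. A minimal sketch, assuming n_per_row is a multiple of QK_K, nrows is a multiple of 4 (the repacking interleaves groups of four rows), and no importance matrix:

    #include <stdint.h>
    #include <stdlib.h>
    #include "ggml.h"

    // Quantize nrows x n_per_row floats to IQ4_KS_R4 with the generic chunk API.
    static void * quantize_to_iq4_ks_r4(const float * src, int64_t nrows, int64_t n_per_row) {
        const size_t row_size = ggml_row_size(GGML_TYPE_IQ4_KS_R4, n_per_row);
        void * dst = malloc(nrows * row_size);
        if (!dst) return NULL;
        ggml_quantize_chunk(GGML_TYPE_IQ4_KS_R4, src, dst,
                            /*start =*/ 0, nrows, n_per_row, /*imatrix =*/ NULL);
        return dst;
    }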
