| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| #ifndef GGUF_FORMAT_H |
| #define GGUF_FORMAT_H |
|
|
| #include <stdint.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <math.h> |
|
|
| |
| |
| |
|
|
/*
 * File-level constants.
 *
 * GGUF_MAGIC             - 32-bit tag whose bytes, least-significant first,
 *                          are 'G','G','U','F' (0x47 0x47 0x55 0x46).
 * GGUF_VERSION           - version number emitted by gguf_write_header().
 * GGUF_DEFAULT_ALIGNMENT - default byte alignment for padded sections.
 */
#define GGUF_MAGIC              0x46554747
#define GGUF_VERSION            3
#define GGUF_DEFAULT_ALIGNMENT  32
|
|
| |
| |
| |
|
|
/*
 * Tensor element / quantization type identifiers.
 *
 * The numeric values are written verbatim into tensor-info records (see
 * gguf_write_tensor_info), so every enumerator keeps an explicit value.
 * Values 4 and 5 are intentionally unassigned.
 */
typedef enum {
    /* plain floating point */
    GGML_TYPE_F32     = 0,
    GGML_TYPE_F16     = 1,

    /* legacy 32-element block quantizations */
    GGML_TYPE_Q4_0    = 2,
    GGML_TYPE_Q4_1    = 3,
    GGML_TYPE_Q5_0    = 6,
    GGML_TYPE_Q5_1    = 7,
    GGML_TYPE_Q8_0    = 8,
    GGML_TYPE_Q8_1    = 9,

    /* K-quantizations */
    GGML_TYPE_Q2_K    = 10,
    GGML_TYPE_Q3_K    = 11,
    GGML_TYPE_Q4_K    = 12,
    GGML_TYPE_Q5_K    = 13,
    GGML_TYPE_Q6_K    = 14,
    GGML_TYPE_Q8_K    = 15,

    /* IQ family */
    GGML_TYPE_IQ2_XXS = 16,
    GGML_TYPE_IQ2_XS  = 17,
    GGML_TYPE_IQ3_XXS = 18,
    GGML_TYPE_IQ1_S   = 19,
    GGML_TYPE_IQ4_NL  = 20,
    GGML_TYPE_IQ3_S   = 21,
    GGML_TYPE_IQ2_S   = 22,
    GGML_TYPE_IQ4_XS  = 23,

    /* integers and wide floats */
    GGML_TYPE_I8      = 24,
    GGML_TYPE_I16     = 25,
    GGML_TYPE_I32     = 26,
    GGML_TYPE_I64     = 27,
    GGML_TYPE_F64     = 28,

    GGML_TYPE_IQ1_M   = 29,
    GGML_TYPE_BF16    = 30,

    /* one past the highest assigned value (31) */
    GGML_TYPE_COUNT
} GGMLType;
|
|
| |
| |
| |
|
|
/*
 * Type tags for metadata key/value entries.
 *
 * The gguf_write_kv_* helpers emit these as a 32-bit value right after the
 * key string; for GGUF_TYPE_ARRAY a second tag names the element type.
 */
typedef enum {
    /* 8/16/32-bit integers */
    GGUF_TYPE_UINT8   = 0,
    GGUF_TYPE_INT8    = 1,
    GGUF_TYPE_UINT16  = 2,
    GGUF_TYPE_INT16   = 3,
    GGUF_TYPE_UINT32  = 4,
    GGUF_TYPE_INT32   = 5,

    GGUF_TYPE_FLOAT32 = 6,
    GGUF_TYPE_BOOL    = 7,   /* written as a single byte, 0 or 1 */
    GGUF_TYPE_STRING  = 8,   /* 64-bit length followed by the bytes */
    GGUF_TYPE_ARRAY   = 9,   /* element tag, 64-bit count, elements */

    /* 64-bit scalars */
    GGUF_TYPE_UINT64  = 10,
    GGUF_TYPE_INT64   = 11,
    GGUF_TYPE_FLOAT64 = 12
} GGUFValueType;
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
/* Number of values packed into one Q8_0 block. */
#define QK8_0 32

/*
 * One Q8_0 block: QK8_0 signed 8-bit quants sharing a single scale.
 * A value is reconstructed as qs[k] * fp16_to_fp32(d)
 * (see gguf_dequantize_q8_0_block).
 */
typedef struct {
    uint16_t d;            /* block scale, stored as IEEE half-precision bits */
    int8_t   qs[QK8_0];    /* quantized values */
} BlockQ8_0;
|
|
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
/* Number of values represented by one Q4_0 block. */
#define QK4_0 32

/*
 * One Q4_0 block: a 16-bit scale plus QK4_0/2 payload bytes.
 * NOTE(review): nothing in this header reads or writes the payload; the
 * QK4_0/2 size suggests two 4-bit quants per byte - confirm against the
 * consumer before relying on a particular nibble order.
 */
typedef struct {
    uint16_t d;                /* block scale (16-bit storage) */
    uint8_t  qs[QK4_0 / 2];    /* packed quants */
} BlockQ4_0;
|
|
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
/* Number of values in one K-quant super-block. */
#define QK_K 256

/*
 * One Q2_K super-block (QK_K values, split into QK_K/16 groups of 16).
 *
 * scales[g] : low nibble  = 4-bit scale index of group g,
 *             high nibble = 4-bit min index of group g.
 * qs        : 2-bit quants, four per byte.
 * d, dmin   : half-precision multipliers applied to the scale / min indices.
 *
 * A value decodes as d*scale*q - dmin*min (see gguf_dequantize_q2_k_block).
 */
typedef struct {
    uint8_t  scales[QK_K / 16];
    uint8_t  qs[QK_K / 4];
    uint16_t d;
    uint16_t dmin;
} BlockQ2K;
|
|
| |
|
|
| |
| |
| |
| |
| |
| |
|
|
/*
 * Convert a 32-bit float to IEEE-754 half-precision bits.
 *
 * Rounding is round-to-nearest, ties-to-even, for both normal and subnormal
 * results. Special cases:
 *   - +/-Inf stays +/-Inf;
 *   - NaN stays NaN (a payload that would shift out to zero is replaced by a
 *     quiet-NaN bit so the result cannot turn into Inf);
 *   - finite values too large for half precision become +/-Inf;
 *   - values below half of the smallest subnormal become +/-0.
 *
 * @param f  value to convert
 * @return   the 16 half-precision bits (sign | exponent | mantissa)
 */
static inline uint16_t gguf_fp32_to_fp16(float f)
{
    /* Type-pun through a union to inspect the raw bits. */
    union { float f; uint32_t u; } pun;
    pun.f = f;
    const uint32_t bits = pun.u;

    const uint32_t sign   = (bits >> 16) & 0x8000u;
    const uint32_t exp32  = (bits >> 23) & 0xFFu;
    const uint32_t mant32 = bits & 0x007FFFFFu;

    /* Inf / NaN: decided by the source exponent field, not by range. */
    if (exp32 == 0xFFu) {
        uint32_t payload = 0;
        if (mant32 != 0) {
            payload = mant32 >> 13;
            if (payload == 0) {
                payload = 0x0200u;   /* keep it a NaN */
            }
        }
        return (uint16_t)(sign | 0x7C00u | payload);
    }

    /* Re-bias the exponent from 127 (float) to 15 (half). */
    const int32_t exp16 = (int32_t)exp32 - 127 + 15;

    /* Finite but out of range: overflow to infinity. */
    if (exp16 >= 0x1F) {
        return (uint16_t)(sign | 0x7C00u);
    }

    if (exp16 <= 0) {
        /* Subnormal half (or zero). Below exp16 == -10 even the rounded
         * result is zero; this also covers float zeros and subnormals. */
        if (exp16 < -10) {
            return (uint16_t)sign;
        }
        const uint32_t full  = mant32 | 0x00800000u;       /* implicit 1 */
        const uint32_t shift = (uint32_t)(14 - exp16);     /* 14..24 */
        const uint32_t half  = 1u << (shift - 1);
        const uint32_t rest  = full & ((1u << shift) - 1u);
        uint32_t q = full >> shift;
        if (rest > half || (rest == half && (q & 1u))) {
            q++;   /* may carry into the smallest normal, which is correct */
        }
        return (uint16_t)(sign | q);
    }

    /* Normal half: drop 13 mantissa bits with round-to-nearest-even. A
     * carry out of the mantissa bumps the exponent (up to Inf) naturally. */
    uint32_t h = ((uint32_t)exp16 << 10) | (mant32 >> 13);
    const uint32_t rest = mant32 & 0x1FFFu;
    if (rest > 0x1000u || (rest == 0x1000u && (h & 1u))) {
        h++;
    }
    return (uint16_t)(sign | h);
}
|
|
/*
 * Expand IEEE-754 half-precision bits to a 32-bit float.
 *
 * Handles zeros, subnormals (renormalised into a float normal), normals,
 * infinities and NaNs (mantissa payload is shifted into the float mantissa).
 *
 * @param h  half-precision bit pattern
 * @return   the equivalent float
 */
static inline float gguf_fp16_to_fp32(uint16_t h)
{
    const uint32_t sign_bit  = ((uint32_t)h & 0x8000u) << 16;
    const uint32_t exp_field = ((uint32_t)h >> 10) & 0x1Fu;
    uint32_t frac = (uint32_t)h & 0x03FFu;
    uint32_t bits;

    if (exp_field == 0x1Fu) {
        /* Inf or NaN */
        bits = sign_bit | 0x7F800000u | (frac << 13);
    } else if (exp_field != 0) {
        /* normal number: re-bias exponent from 15 to 127 */
        bits = sign_bit | ((exp_field + 127u - 15u) << 23) | (frac << 13);
    } else if (frac == 0) {
        /* signed zero */
        bits = sign_bit;
    } else {
        /* subnormal: shift until the implicit leading 1 appears */
        int32_t e = 1;
        while ((frac & 0x0400u) == 0) {
            frac <<= 1;
            e--;
        }
        frac &= 0x03FFu;
        bits = sign_bit | ((uint32_t)(e + 127 - 15) << 23) | (frac << 13);
    }

    float out;
    memcpy(&out, &bits, sizeof out);
    return out;
}
|
|
| |
/*
 * Expand a bfloat16 bit pattern to a 32-bit float by placing the 16 bits in
 * the upper half of the float and zero-filling the lower half.
 */
static inline float gguf_bf16_to_fp32(uint16_t bf)
{
    const uint32_t widened = (uint32_t)bf << 16;
    float out;

    memcpy(&out, &widened, sizeof out);
    return out;
}
|
|
| |
| |
| |
|
|
/*
 * Write a length-prefixed string: a 64-bit byte count followed by the bytes
 * themselves (no terminating NUL).
 *
 * The length is written in the host's native byte order. A NULL string is
 * written as an empty string (length 0), matching how
 * gguf_write_kv_string_array treats NULL entries, instead of invoking
 * strlen(NULL).
 *
 * @param fp  open output stream
 * @param s   NUL-terminated string, or NULL for an empty string
 */
static inline void gguf_write_string(FILE *fp, const char *s)
{
    const char *text = (s != NULL) ? s : "";
    const uint64_t len = (uint64_t)strlen(text);

    fwrite(&len, sizeof len, 1, fp);
    if (len > 0) {
        fwrite(text, 1, (size_t)len, fp);
    }
}
|
|
| |
| |
| |
| |
| |
|
|
| static inline void gguf_write_kv_string(FILE *fp, const char *key, const char *val) |
| { |
| gguf_write_string(fp, key); |
| uint32_t vtype = GGUF_TYPE_STRING; |
| fwrite(&vtype, sizeof(uint32_t), 1, fp); |
| gguf_write_string(fp, val); |
| } |
|
|
| static inline void gguf_write_kv_uint32(FILE *fp, const char *key, uint32_t val) |
| { |
| gguf_write_string(fp, key); |
| uint32_t vtype = GGUF_TYPE_UINT32; |
| fwrite(&vtype, sizeof(uint32_t), 1, fp); |
| fwrite(&val, sizeof(uint32_t), 1, fp); |
| } |
|
|
| static inline void gguf_write_kv_int32(FILE *fp, const char *key, int32_t val) |
| { |
| gguf_write_string(fp, key); |
| uint32_t vtype = GGUF_TYPE_INT32; |
| fwrite(&vtype, sizeof(uint32_t), 1, fp); |
| fwrite(&val, sizeof(int32_t), 1, fp); |
| } |
|
|
| static inline void gguf_write_kv_uint64(FILE *fp, const char *key, uint64_t val) |
| { |
| gguf_write_string(fp, key); |
| uint32_t vtype = GGUF_TYPE_UINT64; |
| fwrite(&vtype, sizeof(uint32_t), 1, fp); |
| fwrite(&val, sizeof(uint64_t), 1, fp); |
| } |
|
|
| static inline void gguf_write_kv_float32(FILE *fp, const char *key, float val) |
| { |
| gguf_write_string(fp, key); |
| uint32_t vtype = GGUF_TYPE_FLOAT32; |
| fwrite(&vtype, sizeof(uint32_t), 1, fp); |
| fwrite(&val, sizeof(float), 1, fp); |
| } |
|
|
| static inline void gguf_write_kv_bool(FILE *fp, const char *key, int val) |
| { |
| gguf_write_string(fp, key); |
| uint32_t vtype = GGUF_TYPE_BOOL; |
| fwrite(&vtype, sizeof(uint32_t), 1, fp); |
| uint8_t b = val ? 1 : 0; |
| fwrite(&b, sizeof(uint8_t), 1, fp); |
| } |
|
|
| |
| static inline void gguf_write_kv_float32_array(FILE *fp, const char *key, |
| const float *vals, uint64_t count) |
| { |
| gguf_write_string(fp, key); |
| uint32_t vtype = GGUF_TYPE_ARRAY; |
| fwrite(&vtype, sizeof(uint32_t), 1, fp); |
| uint32_t subtype = GGUF_TYPE_FLOAT32; |
| fwrite(&subtype, sizeof(uint32_t), 1, fp); |
| fwrite(&count, sizeof(uint64_t), 1, fp); |
| fwrite(vals, sizeof(float), count, fp); |
| } |
|
|
| |
| static inline void gguf_write_kv_int32_array(FILE *fp, const char *key, |
| const int32_t *vals, uint64_t count) |
| { |
| gguf_write_string(fp, key); |
| uint32_t vtype = GGUF_TYPE_ARRAY; |
| fwrite(&vtype, sizeof(uint32_t), 1, fp); |
| uint32_t subtype = GGUF_TYPE_INT32; |
| fwrite(&subtype, sizeof(uint32_t), 1, fp); |
| fwrite(&count, sizeof(uint64_t), 1, fp); |
| fwrite(vals, sizeof(int32_t), count, fp); |
| } |
|
|
| |
| static inline void gguf_write_kv_string_array(FILE *fp, const char *key, |
| const char **vals, uint64_t count) |
| { |
| gguf_write_string(fp, key); |
| uint32_t vtype = GGUF_TYPE_ARRAY; |
| fwrite(&vtype, sizeof(uint32_t), 1, fp); |
| uint32_t subtype = GGUF_TYPE_STRING; |
| fwrite(&subtype, sizeof(uint32_t), 1, fp); |
| fwrite(&count, sizeof(uint64_t), 1, fp); |
| for (uint64_t i = 0; i < count; i++) { |
| gguf_write_string(fp, vals[i] ? vals[i] : ""); |
| } |
| } |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| static inline void gguf_write_tensor_info(FILE *fp, const char *name, |
| uint32_t n_dims, const uint64_t *dims, |
| GGMLType type, uint64_t offset) |
| { |
| gguf_write_string(fp, name); |
| fwrite(&n_dims, sizeof(uint32_t), 1, fp); |
| for (uint32_t i = 0; i < n_dims; i++) { |
| fwrite(&dims[i], sizeof(uint64_t), 1, fp); |
| } |
| uint32_t t = (uint32_t)type; |
| fwrite(&t, sizeof(uint32_t), 1, fp); |
| fwrite(&offset, sizeof(uint64_t), 1, fp); |
| } |
|
|
| |
| |
| |
|
|
| static inline void gguf_write_header(FILE *fp, uint64_t tensor_count, |
| uint64_t metadata_kv_count) |
| { |
| uint32_t magic = GGUF_MAGIC; |
| uint32_t version = GGUF_VERSION; |
| fwrite(&magic, sizeof(uint32_t), 1, fp); |
| fwrite(&version, sizeof(uint32_t), 1, fp); |
| fwrite(&tensor_count, sizeof(uint64_t), 1, fp); |
| fwrite(&metadata_kv_count, sizeof(uint64_t), 1, fp); |
| } |
|
|
| |
| |
| |
|
|
/*
 * Append zero bytes until the stream position is a multiple of `alignment`.
 *
 * Does nothing when the position is already aligned, when `alignment` is 0
 * (previously a division by zero), or when ftell() fails (previously the -1
 * error value was fed into the modulo and produced bogus padding).
 *
 * NOTE: the position comes from ftell(), which returns a `long`; on
 * platforms where `long` is 32 bits this cannot represent offsets past
 * 2 GiB.
 *
 * @param fp         open output stream
 * @param alignment  required alignment in bytes (0 = no-op)
 */
static inline void gguf_write_padding(FILE *fp, uint32_t alignment)
{
    static const uint8_t zeros[64] = {0};

    if (alignment == 0) {
        return;
    }

    const long pos = ftell(fp);
    if (pos < 0) {
        return;   /* position unknown; do not guess */
    }

    /* Work in unsigned arithmetic so the modulo is well-defined everywhere. */
    const unsigned long rem = (unsigned long)pos % alignment;
    unsigned long pad = (rem != 0) ? (unsigned long)alignment - rem : 0;

    while (pad > 0) {
        const size_t chunk = (pad > sizeof zeros) ? sizeof zeros : (size_t)pad;
        if (fwrite(zeros, 1, chunk, fp) != chunk) {
            break;   /* stream error: stop writing */
        }
        pad -= chunk;
    }
}
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| static inline void gguf_quantize_q8_0_reference(const float *x, |
| BlockQ8_0 *y, |
| int64_t n_elements) |
| { |
| int64_t n_blocks = n_elements / QK8_0; |
|
|
| for (int64_t i = 0; i < n_blocks; i++) { |
| float amax = 0.0f; |
| for (int j = 0; j < QK8_0; j++) { |
| float v = fabsf(x[i * QK8_0 + j]); |
| if (v > amax) amax = v; |
| } |
|
|
| float d = amax / 127.0f; |
| float id = (d != 0.0f) ? 1.0f / d : 0.0f; |
|
|
| y[i].d = gguf_fp32_to_fp16(d); |
|
|
| for (int j = 0; j < QK8_0; j++) { |
| float v = x[i * QK8_0 + j] * id; |
| y[i].qs[j] = (int8_t)roundf(v); |
| } |
| } |
| } |
|
|
| |
| static inline void gguf_dequantize_q8_0_block(const BlockQ8_0 *block, |
| float *out) |
| { |
| float d = gguf_fp16_to_fp32(block->d); |
| for (int j = 0; j < QK8_0; j++) { |
| out[j] = (float)block->qs[j] * d; |
| } |
| } |
|
|
| |
| static inline float gguf_q8_0_block_error(const float *original, |
| const BlockQ8_0 *block) |
| { |
| float deq[QK8_0]; |
| gguf_dequantize_q8_0_block(block, deq); |
| float err = 0.0f; |
| for (int j = 0; j < QK8_0; j++) { |
| float diff = original[j] - deq[j]; |
| err += diff * diff; |
| } |
| return err; |
| } |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
/*
 * Round a float to the nearest integer without a library call.
 *
 * Adding 12582912.0f (1.5 * 2^23) pushes the value into the range where one
 * float ulp equals 1, so the FPU's own rounding (ties-to-even under the
 * default rounding mode) leaves the integer in the low mantissa bits; these
 * are extracted and the 0x00400000 bias removed.
 * Only meaningful for |fval| well below 2^22.
 */
static inline int gguf_nearest_int(float fval)
{
    const float biased = fval + 12582912.f;
    int raw;

    memcpy(&raw, &biased, sizeof raw);
    return (raw & 0x007fffff) - 0x00400000;
}
|
|
| |
| |
| |
/*
 * Fit an affine quantization  x[i] ~= scale * L[i] - (*the_min)  to n values,
 * with L[i] in [0, nmax].
 *
 * The offset is constrained so that *the_min >= 0 (the fitted minimum is
 * never allowed above zero). Up to five refinement passes re-derive the
 * levels, the least-squares scale and a damped minimum; iteration stops
 * early once a pass leaves every level unchanged.
 *
 * Fixes relative to the previous version:
 *   - the "minimum may not exceed 0" clamp is applied BEFORE the
 *     constant-input shortcut, so a group of identical positive values is
 *     quantized normally (L = nmax, scale = value / nmax) instead of
 *     returning scale 0 with a negative *the_min;
 *   - the first pass no longer compares against the caller's (possibly
 *     uninitialised) L[], which made the iteration count nondeterministic;
 *   - n <= 0 returns immediately instead of reading x[0] / dividing by n.
 *
 * @param n        number of values in x and L
 * @param nmax     largest level (e.g. 3 for 2-bit)
 * @param x        input values
 * @param L        output levels, n entries
 * @param the_min  receives the non-negative offset to subtract on decode
 * @return         the scale (0 when all inputs are equal and <= 0)
 */
static inline float gguf_make_qkx_quants(int n, int nmax,
                                         const float *x, uint8_t *L,
                                         float *the_min)
{
    if (n <= 0) {
        *the_min = 0.0f;
        return 0.0f;
    }

    float min_val = x[0];
    float max_val = x[0];
    for (int i = 1; i < n; i++) {
        if (x[i] < min_val) min_val = x[i];
        if (x[i] > max_val) max_val = x[i];
    }

    /* The decoded offset is subtracted, so the minimum must be <= 0. */
    if (min_val > 0.0f) min_val = 0.0f;

    /* No spread left: everything is one non-positive constant. */
    if (max_val == min_val) {
        for (int i = 0; i < n; i++) L[i] = 0;
        *the_min = -min_val;
        return 0.0f;
    }

    float iscale = nmax / (max_val - min_val);
    float scale = 1.0f / iscale;

    for (int itry = 0; itry < 5; itry++) {
        float sumlx = 0;
        int suml2 = 0;
        int did_change = 0;

        for (int i = 0; i < n; i++) {
            int l = gguf_nearest_int(iscale * (x[i] - min_val));
            if (l < 0) l = 0;
            if (l > nmax) l = nmax;
            /* On the first pass L[] holds no meaningful data yet. */
            if (itry == 0 || l != (int)L[i]) {
                L[i] = (uint8_t)l;
                did_change = 1;
            }
            sumlx += (x[i] - min_val) * l;
            suml2 += l * l;
        }

        /* least-squares scale for the current levels */
        if (suml2 > 0) scale = sumlx / suml2;

        /* damped update of the minimum from the mean residual */
        float sum = 0;
        for (int i = 0; i < n; i++) {
            sum += x[i] - scale * L[i];
        }
        min_val = 0.7f * min_val + 0.3f * sum / n;
        if (min_val > 0) min_val = 0;

        if (scale > 1e-15f) iscale = 1.0f / scale;
        if (!did_change) break;
    }

    *the_min = -min_val;
    return scale;
}
|
|
| static inline void gguf_quantize_q2_k_reference(const float *x, |
| BlockQ2K *y, |
| int64_t n_elements) |
| { |
| int64_t n_blocks = n_elements / QK_K; |
| const float q4scale = 15.0f; |
|
|
| for (int64_t i = 0; i < n_blocks; i++) { |
| const float *block_x = x + i * QK_K; |
| uint8_t L[QK_K]; |
| float mins[QK_K / 16]; |
| float scales[QK_K / 16]; |
|
|
| float max_scale = 0.0f; |
| float max_min = 0.0f; |
|
|
| |
| for (int j = 0; j < QK_K / 16; j++) { |
| scales[j] = gguf_make_qkx_quants(16, 3, |
| block_x + 16 * j, |
| L + 16 * j, &mins[j]); |
| if (scales[j] > max_scale) max_scale = scales[j]; |
| if (mins[j] > max_min) max_min = mins[j]; |
| } |
|
|
| |
| if (max_scale > 0) { |
| float iscale = q4scale / max_scale; |
| for (int j = 0; j < QK_K / 16; j++) { |
| int l = gguf_nearest_int(iscale * scales[j]); |
| if (l < 0) l = 0; |
| if (l > 15) l = 15; |
| y[i].scales[j] = (uint8_t)l; |
| } |
| y[i].d = gguf_fp32_to_fp16(max_scale / q4scale); |
| } else { |
| for (int j = 0; j < QK_K / 16; j++) y[i].scales[j] = 0; |
| y[i].d = gguf_fp32_to_fp16(0.0f); |
| } |
|
|
| |
| if (max_min > 0) { |
| float iscale = q4scale / max_min; |
| for (int j = 0; j < QK_K / 16; j++) { |
| int l = gguf_nearest_int(iscale * mins[j]); |
| if (l < 0) l = 0; |
| if (l > 15) l = 15; |
| y[i].scales[j] |= ((uint8_t)l << 4); |
| } |
| y[i].dmin = gguf_fp32_to_fp16(max_min / q4scale); |
| } else { |
| y[i].dmin = gguf_fp32_to_fp16(0.0f); |
| } |
|
|
| |
| for (int j = 0; j < QK_K / 16; j++) { |
| float d = gguf_fp16_to_fp32(y[i].d) * (y[i].scales[j] & 0xF); |
| if (d < 1e-15f) { |
| for (int ii = 0; ii < 16; ii++) L[16 * j + ii] = 0; |
| continue; |
| } |
| float dm = gguf_fp16_to_fp32(y[i].dmin) * (y[i].scales[j] >> 4); |
| for (int ii = 0; ii < 16; ii++) { |
| int l = gguf_nearest_int((block_x[16 * j + ii] + dm) / d); |
| if (l < 0) l = 0; |
| if (l > 3) l = 3; |
| L[16 * j + ii] = (uint8_t)l; |
| } |
| } |
|
|
| |
| |
| for (int j = 0; j < QK_K; j += 128) { |
| for (int l = 0; l < 32; l++) { |
| y[i].qs[j / 4 + l] = L[j + l] |
| | (L[j + l + 32] << 2) |
| | (L[j + l + 64] << 4) |
| | (L[j + l + 96] << 6); |
| } |
| } |
| } |
| } |
|
|
| |
| static inline void gguf_dequantize_q2_k_block(const BlockQ2K *block, |
| float *out) |
| { |
| float d = gguf_fp16_to_fp32(block->d); |
| float dmin = gguf_fp16_to_fp32(block->dmin); |
|
|
| const uint8_t *q = block->qs; |
| int is = 0; |
|
|
| for (int n = 0; n < QK_K; n += 128) { |
| int shift = 0; |
| for (int j = 0; j < 4; j++) { |
| uint8_t sc = block->scales[is++]; |
| float dl = d * (sc & 0xF); |
| float ml = dmin * (sc >> 4); |
| for (int l = 0; l < 16; l++) { |
| *out++ = dl * ((float)((q[l] >> shift) & 3)) - ml; |
| } |
|
|
| sc = block->scales[is++]; |
| dl = d * (sc & 0xF); |
| ml = dmin * (sc >> 4); |
| for (int l = 0; l < 16; l++) { |
| *out++ = dl * ((float)((q[l + 16] >> shift) & 3)) - ml; |
| } |
|
|
| shift += 2; |
| } |
| q += 32; |
| } |
| } |
|
|
| |
| static inline float gguf_q2_k_block_error(const float *original, |
| const BlockQ2K *block) |
| { |
| float deq[QK_K]; |
| gguf_dequantize_q2_k_block(block, deq); |
| float err = 0.0f; |
| for (int j = 0; j < QK_K; j++) { |
| float diff = original[j] - deq[j]; |
| err += diff * diff; |
| } |
| return err; |
| } |
|
|
| |
| |
| |
|
|
| |
| static inline int64_t ggml_type_block_size(GGMLType type) |
| { |
| switch (type) { |
| case GGML_TYPE_F32: return 1; |
| case GGML_TYPE_F16: return 1; |
| case GGML_TYPE_Q8_0: return QK8_0; |
| case GGML_TYPE_Q2_K: return QK_K; |
| case GGML_TYPE_Q4_0: return 32; |
| case GGML_TYPE_Q4_1: return 32; |
| case GGML_TYPE_Q5_0: return 32; |
| case GGML_TYPE_Q5_1: return 32; |
| case GGML_TYPE_Q4_K: return 256; |
| case GGML_TYPE_Q5_K: return 256; |
| case GGML_TYPE_Q6_K: return 256; |
| default: return 1; |
| } |
| } |
|
|
| |
| static inline int64_t ggml_type_bytes_per_block(GGMLType type) |
| { |
| switch (type) { |
| case GGML_TYPE_F32: return 4; |
| case GGML_TYPE_F16: return 2; |
| case GGML_TYPE_Q8_0: return sizeof(BlockQ8_0); |
| case GGML_TYPE_Q2_K: return sizeof(BlockQ2K); |
| case GGML_TYPE_Q4_0: return 18; |
| case GGML_TYPE_Q4_1: return 20; |
| default: return 4; |
| } |
| } |
|
|
| |
| static inline int64_t ggml_type_size(GGMLType type, int64_t n_elements) |
| { |
| int64_t block_size = ggml_type_block_size(type); |
| int64_t bytes_per_block = ggml_type_bytes_per_block(type); |
| int64_t n_blocks = (n_elements + block_size - 1) / block_size; |
| return n_blocks * bytes_per_block; |
| } |
|
|
| #endif |
|
|