It's only calibrated for Gemma, atm.

07b428c verified about 1 month ago

28.6 kB

	/*
	* gguf_format.h — GGUF v3 Binary Format Writer
	*
	* ╔═══════════════════════════════════════════════════════════════╗
	* ║ HExState GGUF Output Module ║
	* ║ Implements the GGUF v3 binary specification for writing ║
	* ║ quantized LLM weight files compatible with llama.cpp ║
	* ╚═══════════════════════════════════════════════════════════════╝
	*
	* File Layout:
	* 1. Header: magic(4) + version(4) + tensor_count(8) + kv_count(8)
	* 2. Metadata: Key-Value pairs (variable length)
	* 3. Tensor Info: Per-tensor descriptors (name, dims, type, offset)
	* 4. Padding: Align to GGUF_DEFAULT_ALIGNMENT bytes
	* 5. Tensor Data: Raw quantized weight data
	*
	* All values are little-endian.
	*/

	#ifndef GGUF_FORMAT_H
	#define GGUF_FORMAT_H

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <math.h>

	/* ═══════════════════════════════════════════════════════════════════════
	* GGUF CONSTANTS
	* ═══════════════════════════════════════════════════════════════════════ */

	#define GGUF_MAGIC 0x46554747 /* "GGUF" in little-endian */
	#define GGUF_VERSION 3
	#define GGUF_DEFAULT_ALIGNMENT 32

	/* ═══════════════════════════════════════════════════════════════════════
	* GGML TENSOR TYPES
	* ═══════════════════════════════════════════════════════════════════════ */

	typedef enum {
	GGML_TYPE_F32 = 0,
	GGML_TYPE_F16 = 1,
	GGML_TYPE_Q4_0 = 2,
	GGML_TYPE_Q4_1 = 3,
	GGML_TYPE_Q5_0 = 6,
	GGML_TYPE_Q5_1 = 7,
	GGML_TYPE_Q8_0 = 8,
	GGML_TYPE_Q8_1 = 9,
	GGML_TYPE_Q2_K = 10,
	GGML_TYPE_Q3_K = 11,
	GGML_TYPE_Q4_K = 12,
	GGML_TYPE_Q5_K = 13,
	GGML_TYPE_Q6_K = 14,
	GGML_TYPE_Q8_K = 15,
	GGML_TYPE_IQ2_XXS = 16,
	GGML_TYPE_IQ2_XS = 17,
	GGML_TYPE_IQ3_XXS = 18,
	GGML_TYPE_IQ1_S = 19,
	GGML_TYPE_IQ4_NL = 20,
	GGML_TYPE_IQ3_S = 21,
	GGML_TYPE_IQ2_S = 22,
	GGML_TYPE_IQ4_XS = 23,
	GGML_TYPE_I8 = 24,
	GGML_TYPE_I16 = 25,
	GGML_TYPE_I32 = 26,
	GGML_TYPE_I64 = 27,
	GGML_TYPE_F64 = 28,
	GGML_TYPE_IQ1_M = 29,
	GGML_TYPE_BF16 = 30,
	GGML_TYPE_COUNT
	} GGMLType;

	/* ═══════════════════════════════════════════════════════════════════════
	* GGUF METADATA VALUE TYPES
	* ═══════════════════════════════════════════════════════════════════════ */

	typedef enum {
	GGUF_TYPE_UINT8 = 0,
	GGUF_TYPE_INT8 = 1,
	GGUF_TYPE_UINT16 = 2,
	GGUF_TYPE_INT16 = 3,
	GGUF_TYPE_UINT32 = 4,
	GGUF_TYPE_INT32 = 5,
	GGUF_TYPE_FLOAT32 = 6,
	GGUF_TYPE_BOOL = 7,
	GGUF_TYPE_STRING = 8,
	GGUF_TYPE_ARRAY = 9,
	GGUF_TYPE_UINT64 = 10,
	GGUF_TYPE_INT64 = 11,
	GGUF_TYPE_FLOAT64 = 12
	} GGUFValueType;

	/* ═══════════════════════════════════════════════════════════════════════
	* Q8_0 BLOCK STRUCTURE
	*
	* The fundamental quantized unit: 32 weights + 1 fp16 scale.
	* Total: 34 bytes per block = 8.5 bits per weight.
	*
	* Dequantization: w_i = qs[i] * d
	* ═══════════════════════════════════════════════════════════════════════ */

	#define QK8_0 32 /* Block size for Q8_0 */

	typedef struct {
	uint16_t d; /* fp16 scale (delta) */
	int8_t qs[QK8_0]; /* quantized values [-127, 127] */
	} BlockQ8_0;

	/* Verify: sizeof(BlockQ8_0) should be 34 bytes (2 + 32) */

	/* ═══════════════════════════════════════════════════════════════════════
	* Q4_0 BLOCK STRUCTURE
	*
	* 32 weights per block with 4-bit quantization.
	* Layout: 1 fp16 scale + 16 bytes packed quants (2 weights per byte)
	* Total: 18 bytes per block = 4.5 bits per weight.
	*
	* Dequantization: w_i = (q_i - 8) * d
	* where q_i in {0..15}, stored as nibbles
	* ═══════════════════════════════════════════════════════════════════════ */

	#define QK4_0 32 /* Block size for Q4_0 */

	typedef struct {
	uint16_t d; /* fp16 scale (delta) */
	uint8_t qs[QK4_0/2]; /* 16 bytes: packed 4-bit quants (2 per byte) */
	} BlockQ4_0;

	/* sizeof(BlockQ4_0) = 2 + 16 = 18 bytes for 32 weights */

	/* ═══════════════════════════════════════════════════════════════════════
	* Q2_K BLOCK STRUCTURE (K-Quant, 2-bit)
	*
	* 256-weight superblock divided into 16 sub-blocks of 16 weights.
	*
	* Layout (must match ggml block_q2_K):
	* d: fp16 super-block scale for scales
	* dmin: fp16 super-block scale for mins
	* scales[16]: Per-sub-block scale (low 4 bits) + min (high 4 bits)
	* qs[64]: Packed 2-bit quants (4 weights per byte)
	*
	* Dequantization: w_i = d * scale_j * q_i - dmin * min_j
	* where j = sub-block index, q_i in {0, 1, 2, 3}
	*
	* Effective: 2.625 bits per weight (84 bytes / 256 weights)
	* ═══════════════════════════════════════════════════════════════════════ */

	#define QK_K 256 /* K-quant superblock size */

	typedef struct {
	uint8_t scales[QK_K/16]; /* 16 bytes: scale(4bit) \| min(4bit) */
	uint8_t qs[QK_K/4]; /* 64 bytes: packed 2-bit quants */
	uint16_t d; /* fp16 super-block scale */
	uint16_t dmin; /* fp16 super-block min scale */
	} BlockQ2K;

	/* sizeof(BlockQ2K) = 2 + 2 + 16 + 64 = 84 bytes for 256 weights */

	/* ═══════════════════════════════════════════════════════════════════════
	* FP16 ←→ FP32 CONVERSION
	*
	* IEEE 754 half-precision (binary16):
	* 1 sign bit, 5 exponent bits, 10 mantissa bits
	* ═══════════════════════════════════════════════════════════════════════ */

	static inline uint16_t gguf_fp32_to_fp16(float f)
	{
	/* Use the union approach for bit manipulation */
	union { float f; uint32_t u; } fu;
	fu.f = f;
	uint32_t x = fu.u;

	uint16_t sign = (x >> 16) & 0x8000;
	int32_t exponent = ((x >> 23) & 0xFF) - 127 + 15;
	uint32_t mantissa = x & 0x7FFFFF;

	if (exponent <= 0) {
	/* Subnormal or zero */
	if (exponent < -10) return sign; /* too small → ±0 */
	mantissa = (mantissa \| 0x800000) >> (1 - exponent);
	return sign \| (uint16_t)(mantissa >> 13);
	} else if (exponent >= 0x1F) {
	/* Infinity or NaN */
	return sign \| 0x7C00 \| (uint16_t)(mantissa ? (mantissa >> 13) : 0);
	}

	/* Normalized */
	return sign \| (uint16_t)(exponent << 10) \| (uint16_t)(mantissa >> 13);
	}

	static inline float gguf_fp16_to_fp32(uint16_t h)
	{
	uint32_t sign = (uint32_t)(h & 0x8000) << 16;
	int32_t exponent = (h >> 10) & 0x1F;
	uint32_t mantissa = h & 0x03FF;

	uint32_t result;

	if (exponent == 0) {
	if (mantissa == 0) {
	result = sign; /* ±0 */
	} else {
	/* Subnormal → normalize */
	exponent = 1;
	while (!(mantissa & 0x0400)) {
	mantissa <<= 1;
	exponent--;
	}
	mantissa &= 0x03FF;
	result = sign \| ((uint32_t)(exponent + 127 - 15) << 23) \| (mantissa << 13);
	}
	} else if (exponent == 0x1F) {
	result = sign \| 0x7F800000 \| (mantissa << 13); /* Inf/NaN */
	} else {
	result = sign \| ((uint32_t)(exponent + 127 - 15) << 23) \| (mantissa << 13);
	}

	union { uint32_t u; float f; } uf;
	uf.u = result;
	return uf.f;
	}

	/* BFloat16 → Float32 (just shift left by 16, it IS the top 16 bits of fp32) */
	static inline float gguf_bf16_to_fp32(uint16_t bf)
	{
	union { uint32_t u; float f; } uf;
	uf.u = (uint32_t)bf << 16;
	return uf.f;
	}

	/* ═══════════════════════════════════════════════════════════════════════
	* GGUF STRING — Length-prefixed UTF-8 (no null terminator in file)
	* ═══════════════════════════════════════════════════════════════════════ */

	static inline void gguf_write_string(FILE fp, const char s)
	{
	uint64_t len = strlen(s);
	fwrite(&len, sizeof(uint64_t), 1, fp);
	fwrite(s, 1, len, fp);
	}

	/* ═══════════════════════════════════════════════════════════════════════
	* GGUF METADATA KEY-VALUE WRITERS
	*
	* Each KV entry: key_string + value_type(u32) + value_data
	* ═══════════════════════════════════════════════════════════════════════ */

	static inline void gguf_write_kv_string(FILE fp, const char key, const char *val)
	{
	gguf_write_string(fp, key);
	uint32_t vtype = GGUF_TYPE_STRING;
	fwrite(&vtype, sizeof(uint32_t), 1, fp);
	gguf_write_string(fp, val);
	}

	static inline void gguf_write_kv_uint32(FILE fp, const char key, uint32_t val)
	{
	gguf_write_string(fp, key);
	uint32_t vtype = GGUF_TYPE_UINT32;
	fwrite(&vtype, sizeof(uint32_t), 1, fp);
	fwrite(&val, sizeof(uint32_t), 1, fp);
	}

	static inline void gguf_write_kv_int32(FILE fp, const char key, int32_t val)
	{
	gguf_write_string(fp, key);
	uint32_t vtype = GGUF_TYPE_INT32;
	fwrite(&vtype, sizeof(uint32_t), 1, fp);
	fwrite(&val, sizeof(int32_t), 1, fp);
	}

	static inline void gguf_write_kv_uint64(FILE fp, const char key, uint64_t val)
	{
	gguf_write_string(fp, key);
	uint32_t vtype = GGUF_TYPE_UINT64;
	fwrite(&vtype, sizeof(uint32_t), 1, fp);
	fwrite(&val, sizeof(uint64_t), 1, fp);
	}

	static inline void gguf_write_kv_float32(FILE fp, const char key, float val)
	{
	gguf_write_string(fp, key);
	uint32_t vtype = GGUF_TYPE_FLOAT32;
	fwrite(&vtype, sizeof(uint32_t), 1, fp);
	fwrite(&val, sizeof(float), 1, fp);
	}

	static inline void gguf_write_kv_bool(FILE fp, const char key, int val)
	{
	gguf_write_string(fp, key);
	uint32_t vtype = GGUF_TYPE_BOOL;
	fwrite(&vtype, sizeof(uint32_t), 1, fp);
	uint8_t b = val ? 1 : 0;
	fwrite(&b, sizeof(uint8_t), 1, fp);
	}

	/* Write an array of float32 values */
	static inline void gguf_write_kv_float32_array(FILE fp, const char key,
	const float *vals, uint64_t count)
	{
	gguf_write_string(fp, key);
	uint32_t vtype = GGUF_TYPE_ARRAY;
	fwrite(&vtype, sizeof(uint32_t), 1, fp);
	uint32_t subtype = GGUF_TYPE_FLOAT32;
	fwrite(&subtype, sizeof(uint32_t), 1, fp);
	fwrite(&count, sizeof(uint64_t), 1, fp);
	fwrite(vals, sizeof(float), count, fp);
	}

	/* Write an array of int32 values */
	static inline void gguf_write_kv_int32_array(FILE fp, const char key,
	const int32_t *vals, uint64_t count)
	{
	gguf_write_string(fp, key);
	uint32_t vtype = GGUF_TYPE_ARRAY;
	fwrite(&vtype, sizeof(uint32_t), 1, fp);
	uint32_t subtype = GGUF_TYPE_INT32;
	fwrite(&subtype, sizeof(uint32_t), 1, fp);
	fwrite(&count, sizeof(uint64_t), 1, fp);
	fwrite(vals, sizeof(int32_t), count, fp);
	}

	/* Write an array of string values */
	static inline void gguf_write_kv_string_array(FILE fp, const char key,
	const char **vals, uint64_t count)
	{
	gguf_write_string(fp, key);
	uint32_t vtype = GGUF_TYPE_ARRAY;
	fwrite(&vtype, sizeof(uint32_t), 1, fp);
	uint32_t subtype = GGUF_TYPE_STRING;
	fwrite(&subtype, sizeof(uint32_t), 1, fp);
	fwrite(&count, sizeof(uint64_t), 1, fp);
	for (uint64_t i = 0; i < count; i++) {
	gguf_write_string(fp, vals[i] ? vals[i] : "");
	}
	}
	/* ═══════════════════════════════════════════════════════════════════════
	* GGUF TENSOR INFO WRITER
	*
	* Per-tensor descriptor in the file:
	* name_string + n_dims(u32) + dims[n_dims](u64 each) +
	* type(u32) + offset(u64)
	*
	* Offset is relative to the start of the tensor data section.
	* ═══════════════════════════════════════════════════════════════════════ */

	static inline void gguf_write_tensor_info(FILE fp, const char name,
	uint32_t n_dims, const uint64_t *dims,
	GGMLType type, uint64_t offset)
	{
	gguf_write_string(fp, name);
	fwrite(&n_dims, sizeof(uint32_t), 1, fp);
	for (uint32_t i = 0; i < n_dims; i++) {
	fwrite(&dims[i], sizeof(uint64_t), 1, fp);
	}
	uint32_t t = (uint32_t)type;
	fwrite(&t, sizeof(uint32_t), 1, fp);
	fwrite(&offset, sizeof(uint64_t), 1, fp);
	}

	/* ═══════════════════════════════════════════════════════════════════════
	* GGUF HEADER WRITER
	* ═══════════════════════════════════════════════════════════════════════ */

	static inline void gguf_write_header(FILE *fp, uint64_t tensor_count,
	uint64_t metadata_kv_count)
	{
	uint32_t magic = GGUF_MAGIC;
	uint32_t version = GGUF_VERSION;
	fwrite(&magic, sizeof(uint32_t), 1, fp);
	fwrite(&version, sizeof(uint32_t), 1, fp);
	fwrite(&tensor_count, sizeof(uint64_t), 1, fp);
	fwrite(&metadata_kv_count, sizeof(uint64_t), 1, fp);
	}

	/* ═══════════════════════════════════════════════════════════════════════
	* ALIGNMENT PADDING
	* ═══════════════════════════════════════════════════════════════════════ */

	static inline void gguf_write_padding(FILE *fp, uint32_t alignment)
	{
	long pos = ftell(fp);
	long pad = (alignment - (pos % alignment)) % alignment;
	if (pad > 0) {
	uint8_t zeros[64] = {0};
	while (pad > 0) {
	long write_n = (pad > 64) ? 64 : pad;
	fwrite(zeros, 1, write_n, fp);
	pad -= write_n;
	}
	}
	}

	/* ═══════════════════════════════════════════════════════════════════════
	* Q8_0 QUANTIZATION — Reference Implementation
	*
	* For each block of 32 floats:
	* 1. Find amax = max(\|x_i\|)
	* 2. Scale d = amax / 127.0
	* 3. Quantize: qs[i] = round(x_i / d)
	*
	* This is the STANDARD brute-force approach.
	* The HExState MCMC optimizer replaces step 2 with intelligent
	* search for the optimal d that minimizes weighted error.
	* ═══════════════════════════════════════════════════════════════════════ */

	static inline void gguf_quantize_q8_0_reference(const float *x,
	BlockQ8_0 *y,
	int64_t n_elements)
	{
	int64_t n_blocks = n_elements / QK8_0;

	for (int64_t i = 0; i < n_blocks; i++) {
	float amax = 0.0f;
	for (int j = 0; j < QK8_0; j++) {
	float v = fabsf(x[i * QK8_0 + j]);
	if (v > amax) amax = v;
	}

	float d = amax / 127.0f;
	float id = (d != 0.0f) ? 1.0f / d : 0.0f;

	y[i].d = gguf_fp32_to_fp16(d);

	for (int j = 0; j < QK8_0; j++) {
	float v = x[i * QK8_0 + j] * id;
	y[i].qs[j] = (int8_t)roundf(v);
	}
	}
	}

	/* Dequantize a single Q8_0 block back to float (for error measurement) */
	static inline void gguf_dequantize_q8_0_block(const BlockQ8_0 *block,
	float *out)
	{
	float d = gguf_fp16_to_fp32(block->d);
	for (int j = 0; j < QK8_0; j++) {
	out[j] = (float)block->qs[j] * d;
	}
	}

	/* Compute L2 reconstruction error for a Q8_0 quantized block */
	static inline float gguf_q8_0_block_error(const float *original,
	const BlockQ8_0 *block)
	{
	float deq[QK8_0];
	gguf_dequantize_q8_0_block(block, deq);
	float err = 0.0f;
	for (int j = 0; j < QK8_0; j++) {
	float diff = original[j] - deq[j];
	err += diff * diff;
	}
	return err;
	}

	/* ═══════════════════════════════════════════════════════════════════════
	* Q2_K QUANTIZATION — Reference Implementation
	*
	* For each superblock of 256 floats:
	* 1. Divide into 16 sub-blocks of 16 weights
	* 2. For each sub-block: find optimal (scale, min) → w ≈ min + scale * q
	* 3. Quantize sub-block scales/mins to 4 bits each
	* 4. Re-quantize weights to 2 bits using final scales
	* 5. Pack 4 quants per byte
	*
	* The HExState MCMC optimizer replaces step 2's brute-force grid search
	* with intelligent Boltzmann-guided exploration.
	* ═══════════════════════════════════════════════════════════════════════ */

	/* Helper: find nearest integer (ggml-compatible) */
	static inline int gguf_nearest_int(float fval)
	{
	float val = fval + 12582912.f; /* 2^23 + 2^22 */
	int i;
	memcpy(&i, &val, sizeof(int));
	return (i & 0x007fffff) - 0x00400000;
	}

	/* Quantize a sub-block of 16 floats with scale+min scheme.
	* Returns scale; stores abs(min) in *the_min.
	* Outputs L[i] ∈ {0, 1, 2, 3} (nmax = 3). */
	static inline float gguf_make_qkx_quants(int n, int nmax,
	const float x, uint8_t L,
	float *the_min)
	{
	float min_val = x[0];
	float max_val = x[0];
	for (int i = 1; i < n; i++) {
	if (x[i] < min_val) min_val = x[i];
	if (x[i] > max_val) max_val = x[i];
	}
	if (max_val == min_val) {
	for (int i = 0; i < n; i++) L[i] = 0;
	*the_min = -min_val;
	return 0.0f;
	}
	if (min_val > 0) min_val = 0;

	float iscale = nmax / (max_val - min_val);
	float scale = 1.0f / iscale;

	/* Iterative refinement (matches ggml's make_qkx1_quants) */
	for (int itry = 0; itry < 5; itry++) {
	float sumlx = 0;
	int suml2 = 0;
	int did_change = 0;
	for (int i = 0; i < n; i++) {
	int l = gguf_nearest_int(iscale * (x[i] - min_val));
	if (l < 0) l = 0;
	if (l > nmax) l = nmax;
	if (l != (int)L[i]) { L[i] = l; did_change = 1; }
	sumlx += (x[i] - min_val) * l;
	suml2 += l * l;
	}
	if (suml2 > 0) scale = sumlx / suml2;
	float sum = 0;
	for (int i = 0; i < n; i++) {
	sum += x[i] - scale * L[i];
	}
	min_val = 0.7f * min_val + 0.3f * sum / n;
	if (min_val > 0) min_val = 0;
	if (scale > 1e-15f) iscale = 1.0f / scale;
	if (!did_change) break;
	}

	*the_min = -min_val;
	return scale;
	}

	static inline void gguf_quantize_q2_k_reference(const float *x,
	BlockQ2K *y,
	int64_t n_elements)
	{
	int64_t n_blocks = n_elements / QK_K;
	const float q4scale = 15.0f;

	for (int64_t i = 0; i < n_blocks; i++) {
	const float block_x = x + i QK_K;
	uint8_t L[QK_K];
	float mins[QK_K / 16];
	float scales[QK_K / 16];

	float max_scale = 0.0f;
	float max_min = 0.0f;

	/* Step 1: Find scale and min for each of 16 sub-blocks */
	for (int j = 0; j < QK_K / 16; j++) {
	scales[j] = gguf_make_qkx_quants(16, 3,
	block_x + 16 * j,
	L + 16 * j, &mins[j]);
	if (scales[j] > max_scale) max_scale = scales[j];
	if (mins[j] > max_min) max_min = mins[j];
	}

	/* Step 2: Quantize the 16 sub-block scales to 4 bits */
	if (max_scale > 0) {
	float iscale = q4scale / max_scale;
	for (int j = 0; j < QK_K / 16; j++) {
	int l = gguf_nearest_int(iscale * scales[j]);
	if (l < 0) l = 0;
	if (l > 15) l = 15;
	y[i].scales[j] = (uint8_t)l;
	}
	y[i].d = gguf_fp32_to_fp16(max_scale / q4scale);
	} else {
	for (int j = 0; j < QK_K / 16; j++) y[i].scales[j] = 0;
	y[i].d = gguf_fp32_to_fp16(0.0f);
	}

	/* Step 3: Quantize the 16 sub-block mins to 4 bits (packed in high nibble) */
	if (max_min > 0) {
	float iscale = q4scale / max_min;
	for (int j = 0; j < QK_K / 16; j++) {
	int l = gguf_nearest_int(iscale * mins[j]);
	if (l < 0) l = 0;
	if (l > 15) l = 15;
	y[i].scales[j] \|= ((uint8_t)l << 4);
	}
	y[i].dmin = gguf_fp32_to_fp16(max_min / q4scale);
	} else {
	y[i].dmin = gguf_fp32_to_fp16(0.0f);
	}

	/* Step 4: Re-quantize weights to 2 bits using final rounded scales */
	for (int j = 0; j < QK_K / 16; j++) {
	float d = gguf_fp16_to_fp32(y[i].d) * (y[i].scales[j] & 0xF);
	if (d < 1e-15f) {
	for (int ii = 0; ii < 16; ii++) L[16 * j + ii] = 0;
	continue;
	}
	float dm = gguf_fp16_to_fp32(y[i].dmin) * (y[i].scales[j] >> 4);
	for (int ii = 0; ii < 16; ii++) {
	int l = gguf_nearest_int((block_x[16 * j + ii] + dm) / d);
	if (l < 0) l = 0;
	if (l > 3) l = 3;
	L[16 * j + ii] = (uint8_t)l;
	}
	}

	/* Step 5: Pack 4 quants per byte (2 bits each)
	* Layout: 2 groups of 128, each packed as 32 bytes holding 4×32 quants */
	for (int j = 0; j < QK_K; j += 128) {
	for (int l = 0; l < 32; l++) {
	y[i].qs[j / 4 + l] = L[j + l]
	\| (L[j + l + 32] << 2)
	\| (L[j + l + 64] << 4)
	\| (L[j + l + 96] << 6);
	}
	}
	}
	}

	/* Dequantize a single Q2_K superblock to float (for error measurement) */
	static inline void gguf_dequantize_q2_k_block(const BlockQ2K *block,
	float *out)
	{
	float d = gguf_fp16_to_fp32(block->d);
	float dmin = gguf_fp16_to_fp32(block->dmin);

	const uint8_t *q = block->qs;
	int is = 0;

	for (int n = 0; n < QK_K; n += 128) {
	int shift = 0;
	for (int j = 0; j < 4; j++) {
	uint8_t sc = block->scales[is++];
	float dl = d * (sc & 0xF);
	float ml = dmin * (sc >> 4);
	for (int l = 0; l < 16; l++) {
	out++ = dl ((float)((q[l] >> shift) & 3)) - ml;
	}

	sc = block->scales[is++];
	dl = d * (sc & 0xF);
	ml = dmin * (sc >> 4);
	for (int l = 0; l < 16; l++) {
	out++ = dl ((float)((q[l + 16] >> shift) & 3)) - ml;
	}

	shift += 2;
	}
	q += 32;
	}
	}

	/* Compute L2 error for a Q2_K quantized superblock */
	static inline float gguf_q2_k_block_error(const float *original,
	const BlockQ2K *block)
	{
	float deq[QK_K];
	gguf_dequantize_q2_k_block(block, deq);
	float err = 0.0f;
	for (int j = 0; j < QK_K; j++) {
	float diff = original[j] - deq[j];
	err += diff * diff;
	}
	return err;
	}

	/* ═══════════════════════════════════════════════════════════════════════
	* GGML TYPE METADATA — Size calculations
	* ═══════════════════════════════════════════════════════════════════════ */

	/* Block size for a given type */
	static inline int64_t ggml_type_block_size(GGMLType type)
	{
	switch (type) {
	case GGML_TYPE_F32: return 1;
	case GGML_TYPE_F16: return 1;
	case GGML_TYPE_Q8_0: return QK8_0;
	case GGML_TYPE_Q2_K: return QK_K;
	case GGML_TYPE_Q4_0: return 32;
	case GGML_TYPE_Q4_1: return 32;
	case GGML_TYPE_Q5_0: return 32;
	case GGML_TYPE_Q5_1: return 32;
	case GGML_TYPE_Q4_K: return 256;
	case GGML_TYPE_Q5_K: return 256;
	case GGML_TYPE_Q6_K: return 256;
	default: return 1;
	}
	}

	/* Bytes per block for a given type */
	static inline int64_t ggml_type_bytes_per_block(GGMLType type)
	{
	switch (type) {
	case GGML_TYPE_F32: return 4;
	case GGML_TYPE_F16: return 2;
	case GGML_TYPE_Q8_0: return sizeof(BlockQ8_0); /* 34 */
	case GGML_TYPE_Q2_K: return sizeof(BlockQ2K); /* 84 */
	case GGML_TYPE_Q4_0: return 18; /* 2 + 16 */
	case GGML_TYPE_Q4_1: return 20; /* 2 + 2 + 16 */
	default: return 4;
	}
	}

	/* Total bytes for n_elements of a given type */
	static inline int64_t ggml_type_size(GGMLType type, int64_t n_elements)
	{
	int64_t block_size = ggml_type_block_size(type);
	int64_t bytes_per_block = ggml_type_bytes_per_block(type);
	int64_t n_blocks = (n_elements + block_size - 1) / block_size;
	return n_blocks * bytes_per_block;
	}

	#endif /* GGUF_FORMAT_H */