HPC-Quantize / tokenizer_reader.h
CompressedGemma's picture
It's only calibrated for Gemma, atm.
07b428c verified
/*
* tokenizer_reader.h — HuggingFace tokenizer.json Parser
*
* Extracts vocabulary, merge rules, and special token IDs from
* HuggingFace tokenizer.json files for embedding into GGUF.
*
* Supports: LLaMA/Mistral BPE tokenizers (sentencepiece-derived)
*/
#ifndef TOKENIZER_READER_H
#define TOKENIZER_READER_H
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h> /* strcasecmp */
/* Compile-time limits for the tokenizer reader. */
/* NOTE(review): TOK_MAX_TOKENS is not referenced elsewhere in this header;
 * tok_parse_vocab sizes its arrays from the largest ID it finds instead. */
#define TOK_MAX_TOKENS 256000 /* Max supported vocab size */
/* tok_parse_merges stops reading once this many rules have been stored. */
#define TOK_MAX_MERGES 512000 /* Max supported merge rules */
/* Size in bytes of the scratch buffers that hold one unescaped token
 * (including the NUL terminator). */
#define TOK_MAX_TOKEN_LEN 512 /* Max length of a single token */
/* Token types matching GGUF tokenizer.ggml.token_type */
typedef enum {
TOK_TYPE_NORMAL = 1,
TOK_TYPE_UNKNOWN = 2,
TOK_TYPE_CONTROL = 3,
TOK_TYPE_USER_DEF = 4,
TOK_TYPE_UNUSED = 5,
TOK_TYPE_BYTE = 6
} TokenType;
/* Parsed tokenizer state. Instances are created by tok_load() and released
 * with tok_free(). The tokens, scores and token_types arrays each hold
 * vocab_size entries. */
typedef struct {
char **tokens; /* Heap-allocated token strings indexed by ID; "" for IDs the vocab does not define */
float *scores; /* -(float)id for tokens read from the vocab, 0.0f otherwise */
int32_t *token_types; /* TokenType value per token (TOK_TYPE_NORMAL by default) */
int32_t vocab_size; /* Largest vocab ID + 1 */
char **merges; /* Heap-allocated BPE merge rule strings, in file order */
int32_t n_merges; /* Number of entries stored in merges */
int32_t bos_id; /* Beginning of sequence token ID (tok_load default: 1) */
int32_t eos_id; /* End of sequence token ID (tok_load default: 2) */
int32_t unk_id; /* Unknown token ID (tok_load default: 0) */
int32_t pad_id; /* Padding token ID (initialized to -1 by tok_load) */
char model_type[32]; /* "llama" when the JSON model type is "BPE", otherwise the type string as found */
} TokenizerData;
/* ═══════════════════════════════════════════════════════════════════
* JSON HELPERS — Minimal extraction utilities
*
* These helpers are NOT a general JSON parser — they target the specific
* structure of HuggingFace tokenizer.json files.
* ═══════════════════════════════════════════════════════════════════ */
/* Advance past any run of JSON whitespace (space, tab, LF, CR) and return
 * a pointer to the first character that is not whitespace. The terminating
 * NUL is never skipped. */
static inline const char *tok_skip_ws(const char *p) {
    for (;;) {
        switch (*p) {
        case ' ':
        case '\t':
        case '\n':
        case '\r':
            p++;
            continue;
        default:
            return p;
        }
    }
}
/* Parse exactly four hexadecimal digits starting at s into *out.
 * Returns 1 on success and 0 if any of the four characters is not a hex
 * digit. A NUL terminator counts as "not a hex digit", so this never reads
 * past the end of the string. */
static int tok_hex4(const char *s, unsigned int *out)
{
    unsigned int value = 0;
    for (int k = 0; k < 4; k++) {
        char c = s[k];
        unsigned int digit;
        if (c >= '0' && c <= '9') digit = (unsigned int)(c - '0');
        else if (c >= 'a' && c <= 'f') digit = (unsigned int)(c - 'a') + 10u;
        else if (c >= 'A' && c <= 'F') digit = (unsigned int)(c - 'A') + 10u;
        else return 0;
        value = (value << 4) | digit;
    }
    *out = value;
    return 1;
}

/* Extract a JSON string value starting at the opening quote.
 *
 * p      - must point at the opening '"'; otherwise NULL is returned.
 * buf    - receives the unescaped, NUL-terminated string (may be NULL to
 *          just skip over the string).
 * buflen - size of buf in bytes, including room for the NUL.
 *
 * Returns a pointer just past the closing quote (or to the terminating NUL
 * of the input if the string is unterminated).
 *
 * The whole string literal is always consumed, even when it does not fit
 * in buf: output is then truncated at a whole-character boundary but the
 * returned pointer still lands after the closing quote, so callers stay in
 * sync with the JSON stream.
 *
 * Escapes handled: \" \\ \/ \b \f \n \r \t and \uXXXX. A UTF-16 surrogate
 * pair (\uD83D\uDE00) is combined into one 4-byte UTF-8 sequence. A \u that
 * is not followed by four hex digits is dropped. Note that \u0000 stores a
 * NUL byte, which ends the C string early. */
static const char *tok_extract_string(const char *p, char *buf, int buflen)
{
    if (!p || *p != '"') return NULL;
    p++; /* skip opening quote */

    /* Payload capacity: one byte is reserved for the terminator. */
    int room = (buf && buflen > 0) ? buflen - 1 : 0;
    int len = 0;
    int truncated = 0;

    while (*p && *p != '"') {
        unsigned char out[4]; /* UTF-8 bytes produced by this input unit */
        int n = 0;

        if (*p == '\\' && p[1]) {
            p++;
            switch (*p) {
            case 'b': out[n++] = '\b'; break;
            case 'f': out[n++] = '\f'; break;
            case 'n': out[n++] = '\n'; break;
            case 'r': out[n++] = '\r'; break;
            case 't': out[n++] = '\t'; break;
            case 'u': {
                unsigned int cp;
                if (!tok_hex4(p + 1, &cp)) break; /* malformed: drop it */
                p += 4; /* p now sits on the last hex digit */

                /* High surrogate followed by "\uDC00".."\uDFFF": combine. */
                if (cp >= 0xD800u && cp <= 0xDBFFu && p[1] == '\\' && p[2] == 'u') {
                    unsigned int low;
                    if (tok_hex4(p + 3, &low) && low >= 0xDC00u && low <= 0xDFFFu) {
                        cp = 0x10000u + ((cp - 0xD800u) << 10) + (low - 0xDC00u);
                        p += 6;
                    }
                }

                if (cp < 0x80u) {
                    out[n++] = (unsigned char)cp;
                } else if (cp < 0x800u) {
                    out[n++] = (unsigned char)(0xC0u | (cp >> 6));
                    out[n++] = (unsigned char)(0x80u | (cp & 0x3Fu));
                } else if (cp < 0x10000u) {
                    out[n++] = (unsigned char)(0xE0u | (cp >> 12));
                    out[n++] = (unsigned char)(0x80u | ((cp >> 6) & 0x3Fu));
                    out[n++] = (unsigned char)(0x80u | (cp & 0x3Fu));
                } else {
                    out[n++] = (unsigned char)(0xF0u | (cp >> 18));
                    out[n++] = (unsigned char)(0x80u | ((cp >> 12) & 0x3Fu));
                    out[n++] = (unsigned char)(0x80u | ((cp >> 6) & 0x3Fu));
                    out[n++] = (unsigned char)(0x80u | (cp & 0x3Fu));
                }
                break;
            }
            default: /* '"', '\\', '/' and unknown escapes map to themselves */
                out[n++] = (unsigned char)*p;
                break;
            }
        } else {
            out[n++] = (unsigned char)*p;
        }

        /* Once one unit did not fit, stop writing entirely so the output is
         * a clean prefix of the string; keep scanning to the closing quote. */
        if (!truncated && len + n <= room) {
            for (int k = 0; k < n; k++) buf[len++] = (char)out[k];
        } else if (n > 0) {
            truncated = 1;
        }
        p++;
    }

    if (buf && buflen > 0) buf[len] = '\0';
    if (*p == '"') p++; /* skip closing quote */
    return p;
}
/* Find the first occurrence of "key" that is followed by a ':' and return
 * a pointer to the first non-whitespace character of its value.
 *
 * Occurrences of the quoted text that are not followed by a colon (string
 * values such as `"content": "id"`, or array elements) are skipped.
 * Returns NULL when no such key exists or when either argument is NULL.
 *
 * This is a textual search, not a structural one: it does not track
 * nesting or escaped quotes, so it returns the first textual match from
 * `json` onwards regardless of which object that match belongs to. */
static const char *tok_find_key(const char *json, const char *key)
{
    static const char ws[] = " \t\n\r";

    if (!json || !key) return NULL;
    size_t key_len = strlen(key);

    for (const char *q = strchr(json, '"'); q; q = strchr(q + 1, '"')) {
        /* strncmp stops at a NUL, so q[1 + key_len] is only read when the
         * key_len preceding characters exist. */
        if (strncmp(q + 1, key, key_len) != 0 || q[1 + key_len] != '"')
            continue;

        const char *value = q + key_len + 2;
        value += strspn(value, ws);
        if (*value != ':')
            continue; /* quoted text used as a value, not as a key */
        value++;
        value += strspn(value, ws);
        return value;
    }
    return NULL;
}
/* ═══════════════════════════════════════════════════════════════════
* VOCAB PARSER — Extract "model": { "vocab": { ... } }
* ═══════════════════════════════════════════════════════════════════ */
/* Free the vocabulary arrays held by td and reset the vocabulary to empty. */
static void tok_vocab_discard(TokenizerData *td)
{
    if (td->tokens) {
        for (int32_t i = 0; i < td->vocab_size; i++)
            free(td->tokens[i]);
    }
    free(td->tokens);
    free(td->scores);
    free(td->token_types);
    td->tokens = NULL;
    td->scores = NULL;
    td->token_types = NULL;
    td->vocab_size = 0;
}

/* Read the next `"token": id` entry of the vocab object.
 *
 * *cursor is advanced past the entry on success. Returns 1 when an entry
 * was read (token text in key_buf, numeric ID in *id_out) and 0 at the end
 * of the object or on malformed input (including a missing integer). */
static int tok_vocab_next_entry(const char **cursor, char *key_buf,
                                int key_buflen, long *id_out)
{
    const char *scan = tok_skip_ws(*cursor);
    while (*scan == ',')
        scan = tok_skip_ws(scan + 1);
    if (*scan != '"') return 0; /* '}', end of input, or garbage */

    scan = tok_extract_string(scan, key_buf, key_buflen);
    if (!scan) return 0;
    scan = tok_skip_ws(scan);
    if (*scan == ':') scan++;
    scan = tok_skip_ws(scan);

    char *end = NULL;
    long id = strtol(scan, &end, 10);
    if (end == scan) return 0; /* no digits: do not invent ID 0 */

    *cursor = end;
    *id_out = id;
    return 1;
}

/* Parse "model": { "type": ..., "vocab": { "token": id, ... } } into td.
 *
 * Sets td->model_type ("llama" for a "BPE" model, otherwise the type string
 * as found), td->vocab_size (largest ID + 1) and allocates td->tokens,
 * td->scores and td->token_types. IDs that the vocab does not define keep
 * an empty token string, score 0 and TOK_TYPE_NORMAL.
 *
 * Returns 0 on success. Returns -1 when the vocab is missing, empty or
 * malformed, or when an allocation fails; in that case no vocabulary
 * arrays are left allocated in td.
 *
 * NOTE(review): TOK_MAX_TOKENS is not enforced here; the arrays are sized
 * from the largest ID present in the file. */
static int tok_parse_vocab(const char *json, TokenizerData *td)
{
    if (!json || !td) return -1;

    /* Find "vocab" key inside "model" object */
    const char *model_p = tok_find_key(json, "model");
    if (!model_p) return -1;

    /* Extract model type; left untouched when "type" is not a string. */
    const char *type_p = tok_find_key(model_p, "type");
    if (type_p) {
        char type_buf[64] = "";
        if (tok_extract_string(type_p, type_buf, (int)sizeof(type_buf))) {
            const char *name =
                (strcasecmp(type_buf, "BPE") == 0) ? "llama" : type_buf;
            /* snprintf always NUL-terminates, unlike strncpy. */
            snprintf(td->model_type, sizeof(td->model_type), "%s", name);
        }
    }

    /* Find "vocab": { */
    const char *vocab_p = tok_find_key(model_p, "vocab");
    if (!vocab_p || *vocab_p != '{') return -1;
    vocab_p++; /* skip '{' */

    char token_buf[TOK_MAX_TOKEN_LEN];
    long id = 0;

    /* First pass: count entries and find the largest ID. */
    const char *scan = vocab_p;
    long max_id = -1;
    int count = 0;
    while (tok_vocab_next_entry(&scan, token_buf, (int)sizeof(token_buf), &id)) {
        if (id >= INT32_MAX) return -1; /* max_id + 1 must fit in int32_t */
        if (id > max_id) max_id = id;
        count++;
    }
    if (count == 0 || max_id < 0) return -1;

    /* Allocate arrays */
    size_t n = (size_t)max_id + 1;
    td->tokens = (char **)calloc(n, sizeof(*td->tokens));
    td->scores = (float *)calloc(n, sizeof(*td->scores));
    td->token_types = (int32_t *)calloc(n, sizeof(*td->token_types));
    td->vocab_size = (int32_t)n;
    if (!td->tokens || !td->scores || !td->token_types) {
        tok_vocab_discard(td);
        return -1;
    }

    /* Initialize with defaults */
    for (int32_t i = 0; i < td->vocab_size; i++) {
        td->tokens[i] = strdup("");
        if (!td->tokens[i]) {
            tok_vocab_discard(td);
            return -1;
        }
        td->scores[i] = 0.0f;
        td->token_types[i] = TOK_TYPE_NORMAL;
    }

    /* Second pass: fill in tokens */
    scan = vocab_p;
    while (tok_vocab_next_entry(&scan, token_buf, (int)sizeof(token_buf), &id)) {
        if (id < 0 || id >= td->vocab_size) continue;
        char *copy = strdup(token_buf);
        if (!copy) {
            tok_vocab_discard(td);
            return -1;
        }
        free(td->tokens[id]);
        td->tokens[id] = copy;
        /* Score = negative index for BPE ordering (higher ID = lower priority) */
        td->scores[id] = -(float)id;
    }
    return 0;
}
/* ═══════════════════════════════════════════════════════════════════
* MERGES PARSER — Extract "model": { "merges": [ ... ] }
* ═══════════════════════════════════════════════════════════════════ */
/* Parse one merge written as a two-element array, `["left", "right"]`,
 * and store it in buf as "left right".
 *
 * scan must point at the opening '['. Returns a pointer just past the
 * closing ']', or NULL if the element is malformed or does not fit. */
static const char *tok_parse_merge_pair(const char *scan, char *buf, int buflen)
{
    scan = tok_skip_ws(scan + 1);
    scan = tok_extract_string(scan, buf, buflen);
    if (!scan) return NULL;

    size_t used = strlen(buf);
    if (used + 2 > (size_t)buflen) return NULL; /* no room for ' ' + NUL */
    buf[used] = ' ';
    buf[used + 1] = '\0';

    scan = tok_skip_ws(scan);
    if (*scan == ',') scan = tok_skip_ws(scan + 1);
    scan = tok_extract_string(scan, buf + used + 1, buflen - (int)used - 1);
    if (!scan) return NULL;

    scan = tok_skip_ws(scan);
    if (*scan != ']') return NULL;
    return scan + 1;
}

/* Parse "model": { "merges": [ ... ] } into td->merges / td->n_merges.
 *
 * Accepts both layouts used by tokenizer.json files: a list of strings
 * (["a b", ...]) and a list of two-element arrays ([["a", "b"], ...]);
 * pairs are stored joined by a single space.
 * NOTE(review): pair elements that themselves contain spaces are joined
 * verbatim; confirm how the GGUF consumer expects those to be encoded.
 *
 * At most TOK_MAX_MERGES rules are stored; a warning is printed to stderr
 * if the file holds more.
 *
 * Returns 0 on success and -1 if there is no "merges" array or an
 * allocation fails. On allocation failure the rules read so far remain in
 * td and are released by tok_free(). */
static int tok_parse_merges(const char *json, TokenizerData *td)
{
    if (!json || !td) return -1;

    const char *model_p = tok_find_key(json, "model");
    if (!model_p) return -1;

    /* The vocab precedes the merges and may itself contain a token spelled
     * "merges" (whose value is a number), so keep looking until the key is
     * followed by an array. */
    const char *merges_p = tok_find_key(model_p, "merges");
    while (merges_p && *merges_p != '[')
        merges_p = tok_find_key(merges_p, "merges");
    if (!merges_p) return -1;
    merges_p++; /* skip '[' */

    /* Allocate with growth pattern — start with 64k slots */
    int capacity = 65536;
    td->merges = (char **)calloc((size_t)capacity, sizeof(*td->merges));
    td->n_merges = 0;
    if (!td->merges) return -1;

    /* Extract merge strings */
    const char *scan = merges_p;
    char merge_buf[TOK_MAX_TOKEN_LEN * 2];
    while (td->n_merges < TOK_MAX_MERGES) {
        scan = tok_skip_ws(scan);
        if (*scan == ',') { scan++; continue; }
        /* Test for the end only after skipping whitespace, so that a
         * pretty-printed closing bracket is not stepped over. */
        if (*scan == '\0' || *scan == ']') break;

        if (*scan == '[') {
            scan = tok_parse_merge_pair(scan, merge_buf, (int)sizeof(merge_buf));
        } else if (*scan == '"') {
            scan = tok_extract_string(scan, merge_buf, (int)sizeof(merge_buf));
        } else {
            scan++; /* unexpected character: skip it */
            continue;
        }
        if (!scan) break;

        /* Grow if needed; keep the old block if realloc fails. */
        if (td->n_merges >= capacity) {
            int new_capacity = capacity * 2;
            char **grown = (char **)realloc(td->merges,
                                            (size_t)new_capacity * sizeof(*grown));
            if (!grown) return -1;
            td->merges = grown;
            capacity = new_capacity;
        }

        char *copy = strdup(merge_buf);
        if (!copy) return -1;
        td->merges[td->n_merges] = copy;
        td->n_merges++;
    }

    /* Report truncation instead of silently dropping rules. */
    if (scan && td->n_merges >= TOK_MAX_MERGES) {
        scan = tok_skip_ws(scan);
        while (*scan == ',')
            scan = tok_skip_ws(scan + 1);
        if (*scan && *scan != ']')
            fprintf(stderr, " WARNING: merge list truncated at %d rules\n",
                    TOK_MAX_MERGES);
    }
    return 0;
}
/* ═══════════════════════════════════════════════════════════════════
* SPECIAL TOKENS — Extract from "added_tokens" array
* ═══════════════════════════════════════════════════════════════════ */
/* Given obj pointing at '{', return a pointer just past the matching '}'.
 * Braces inside string literals are ignored. If the object is unterminated
 * the returned pointer is the end of the input. */
static const char *tok_skip_object(const char *obj)
{
    int depth = 0;
    const char *p = obj;
    while (*p) {
        if (*p == '"') {
            /* Skip the string literal, honoring backslash escapes. */
            p++;
            while (*p && *p != '"') {
                if (*p == '\\' && p[1]) p++;
                p++;
            }
            if (*p == '"') p++;
            continue;
        }
        if (*p == '{') {
            depth++;
        } else if (*p == '}') {
            depth--;
            if (depth == 0) return p + 1;
        }
        p++;
    }
    return p;
}

/* Apply the top-level "added_tokens" array to an already parsed vocab.
 *
 * For every object in the array whose "id" lies inside the vocabulary:
 *   - "special": true marks the token as TOK_TYPE_CONTROL;
 *   - "content" fills in the token string if the vocab left it empty.
 * Key lookups are confined to the object being processed.
 *
 * NOTE(review): entries whose id is >= td->vocab_size are ignored, so
 * tokenizers that append added tokens beyond the base vocab lose them. */
static void tok_parse_added_tokens(const char *json, TokenizerData *td)
{
    if (!json || !td || !td->tokens || !td->token_types) return;

    const char *cursor = tok_find_key(json, "added_tokens");
    if (!cursor || *cursor != '[') return;
    cursor++;

    /* Scan through the array of objects */
    for (;;) {
        cursor = tok_skip_ws(cursor);
        /* Test for the end only after skipping whitespace, so that a
         * pretty-printed closing bracket is not stepped over. */
        if (*cursor == '\0' || *cursor == ']') break;
        if (*cursor != '{') { cursor++; continue; } /* ',' or stray text */

        const char *obj_start = cursor;
        const char *obj_end = tok_skip_object(obj_start);
        cursor = obj_end;

        /* Extract content, id and the special flag from this object. */
        char content[TOK_MAX_TOKEN_LEN] = "";
        long id = -1;
        int is_special = 0;

        const char *id_p = tok_find_key(obj_start, "id");
        if (id_p && id_p < obj_end) {
            char *end = NULL;
            long value = strtol(id_p, &end, 10);
            if (end != id_p) id = value; /* ignore non-numeric ids */
        }

        const char *content_p = tok_find_key(obj_start, "content");
        if (content_p && content_p < obj_end && *content_p == '"')
            tok_extract_string(content_p, content, (int)sizeof(content));

        const char *special_p = tok_find_key(obj_start, "special");
        if (special_p && special_p < obj_end)
            is_special = (strncmp(special_p, "true", 4) == 0);

        if (id < 0 || id >= td->vocab_size) continue;

        /* Mark special tokens */
        if (is_special)
            td->token_types[id] = TOK_TYPE_CONTROL;

        /* Update token string only if the vocab did not provide one. */
        if (content[0] && (!td->tokens[id] || !td->tokens[id][0])) {
            char *copy = strdup(content);
            if (copy) {
                free(td->tokens[id]);
                td->tokens[id] = copy;
            }
        }
    }
}
/* ═══════════════════════════════════════════════════════════════════
* SPECIAL TOKEN IDs — Extract from tokenizer_config.json
* ═══════════════════════════════════════════════════════════════════ */
static void tok_parse_config(const char *config_json, TokenizerData *td)
{
/* Look for bos_token, eos_token, unk_token content strings */
/* Then find their IDs in the vocab */
/* Search for token content in the config */
struct { const char *key; int32_t *id_ptr; const char *default_content; } specials[] = {
{"bos_token", &td->bos_id, "<s>"},
{"eos_token", &td->eos_id, "</s>"},
{"unk_token", &td->unk_id, "<unk>"},
{NULL, NULL, NULL}
};
for (int s = 0; specials[s].key; s++) {
const char *p = tok_find_key(config_json, specials[s].key);
if (!p) {
/* Try to find in vocab by default content */
for (int i = 0; i < td->vocab_size; i++) {
if (td->tokens[i] && strcmp(td->tokens[i], specials[s].default_content) == 0) {
*specials[s].id_ptr = i;
break;
}
}
continue;
}
/* The value might be a string directly or an object with "content" */
if (*p == '"') {
char content[TOK_MAX_TOKEN_LEN];
tok_extract_string(p, content, sizeof(content));
/* Find this content in vocab */
for (int i = 0; i < td->vocab_size; i++) {
if (td->tokens[i] && strcmp(td->tokens[i], content) == 0) {
*specials[s].id_ptr = i;
break;
}
}
} else if (*p == '{') {
/* Object with "content" field */
const char *cp = tok_find_key(p, "content");
if (cp && *cp == '"') {
char content[TOK_MAX_TOKEN_LEN];
tok_extract_string(cp, content, sizeof(content));
for (int i = 0; i < td->vocab_size; i++) {
if (td->tokens[i] && strcmp(td->tokens[i], content) == 0) {
*specials[s].id_ptr = i;
break;
}
}
}
}
}
}
/* ═══════════════════════════════════════════════════════════════════
* MAIN API — Load tokenizer from JSON files
* ═══════════════════════════════════════════════════════════════════ */
/* Read an entire file into a freshly allocated, NUL-terminated buffer.
 *
 * Returns NULL if path is NULL, the file cannot be opened, its size cannot
 * be determined, memory cannot be allocated, or a read error occurs.
 * The caller owns the returned buffer and releases it with free(). */
static char *tok_read_file(const char *path)
{
    if (!path) return NULL;

    FILE *f = fopen(path, "rb");
    if (!f) return NULL;

    char *buf = NULL;
    if (fseek(f, 0, SEEK_END) == 0) {
        long size = ftell(f); /* -1 on failure (e.g. unseekable stream) */
        if (size >= 0 && fseek(f, 0, SEEK_SET) == 0) {
            buf = (char *)malloc((size_t)size + 1);
            if (buf) {
                size_t got = fread(buf, 1, (size_t)size, f);
                if (ferror(f)) {
                    free(buf);
                    buf = NULL;
                } else {
                    /* Terminate after the bytes actually read, so a short
                     * read never exposes uninitialized memory. */
                    buf[got] = '\0';
                }
            }
        }
    }

    fclose(f);
    return buf;
}
static void tok_free(TokenizerData *td); /* defined further down in this header */

/* Load a tokenizer from a HuggingFace tokenizer.json file.
 *
 * tokenizer_json_path - path to tokenizer.json (required).
 * config_json_path    - path to tokenizer_config.json, or NULL. If it is
 *                       given but cannot be read it is ignored.
 *
 * Special token IDs start at bos=1, eos=2, unk=0, pad=-1 and are then
 * refined from the config file when one is available. A missing "merges"
 * array is not an error.
 *
 * Returns a TokenizerData that the caller releases with tok_free(), or
 * NULL if the tokenizer file cannot be read or its vocab cannot be parsed. */
static TokenizerData *tok_load(const char *tokenizer_json_path,
                               const char *config_json_path)
{
    if (!tokenizer_json_path) return NULL;

    TokenizerData *td = (TokenizerData *)calloc(1, sizeof(TokenizerData));
    if (!td) return NULL;
    td->bos_id = 1;
    td->eos_id = 2;
    td->unk_id = 0;
    td->pad_id = -1;
    strcpy(td->model_type, "llama");

    /* Read tokenizer.json */
    char *json = tok_read_file(tokenizer_json_path);
    if (!json) {
        fprintf(stderr, " WARNING: Could not read '%s'\n", tokenizer_json_path);
        tok_free(td);
        return NULL;
    }

    /* Parse vocab */
    if (tok_parse_vocab(json, td) != 0) {
        fprintf(stderr, " WARNING: Failed to parse vocab from tokenizer.json\n");
        free(json);
        /* tok_free also releases anything the parser may have allocated. */
        tok_free(td);
        return NULL;
    }

    /* Parse merges (optional; result intentionally not treated as fatal) */
    tok_parse_merges(json, td);

    /* Parse added tokens (special tokens) */
    tok_parse_added_tokens(json, td);

    /* Detect byte tokens: <0x00> through <0xFF>. Both characters after
     * "<0x" must be hex digits, so look-alikes such as "<0xZZ>" stay as is. */
    for (int i = 0; i < td->vocab_size; i++) {
        const char *tok = td->tokens[i];
        if (tok && strlen(tok) == 6 &&
            strncmp(tok, "<0x", 3) == 0 && tok[5] == '>' &&
            strspn(tok + 3, "0123456789abcdefABCDEF") == 2) {
            td->token_types[i] = TOK_TYPE_BYTE;
        }
    }
    free(json);

    /* Read config if available */
    if (config_json_path) {
        char *config = tok_read_file(config_json_path);
        if (config) {
            tok_parse_config(config, td);
            free(config);
        }
    }
    return td;
}
/* Release a TokenizerData and everything it owns: each token string, each
 * merge string, the arrays that hold them, the score and type arrays, and
 * finally the struct itself. Passing NULL is a no-op. */
static void tok_free(TokenizerData *td)
{
    if (td == NULL) {
        return;
    }

    /* Flat arrays carry no nested ownership. */
    free(td->scores);
    free(td->token_types);

    if (td->merges != NULL) {
        int32_t m = 0;
        while (m < td->n_merges) {
            free(td->merges[m]);
            m++;
        }
        free(td->merges);
    }

    if (td->tokens != NULL) {
        int32_t t = 0;
        while (t < td->vocab_size) {
            free(td->tokens[t]);
            t++;
        }
        free(td->tokens);
    }

    free(td);
}
/* Return the token string for id, or "" when id is out of range or the
 * slot holds no string. */
static const char *tok_summary_token(const TokenizerData *td, int32_t id)
{
    if (!td->tokens || id < 0 || id >= td->vocab_size || !td->tokens[id])
        return "";
    return td->tokens[id];
}

/* Print one horizontal border of the summary box: left corner, `width`
 * horizontal bars, right corner, then `tail`. */
static void tok_summary_rule(const char *left, const char *right,
                             int width, const char *tail)
{
    printf(" %s", left);
    for (int i = 0; i < width; i++)
        fputs("═", stdout);
    printf("%s%s", right, tail);
}

/* Print a boxed summary of the tokenizer to stdout: model type, vocabulary
 * size, merge count and the BOS/EOS/UNK token IDs with their strings.
 * Does nothing when td is NULL.
 *
 * Every row is laid out as 2 + 12 (label) + 40 (value) + 2 columns, which
 * matches the 56-column borders. Values wider than their field push the
 * right-hand border out rather than being cut. */
static void tok_print_summary(const TokenizerData *td)
{
    enum { INNER_WIDTH = 56 };

    if (!td) return;

    tok_summary_rule("╔", "╗", INNER_WIDTH, "\n");
    printf(" ║  %-52s  ║\n", "Tokenizer");
    tok_summary_rule("╠", "╣", INNER_WIDTH, "\n");
    printf(" ║  %-12s%-40s  ║\n", "Model:", td->model_type);
    printf(" ║  %-12s%-40d  ║\n", "Vocab size:", (int)td->vocab_size);
    printf(" ║  %-12s%-40d  ║\n", "Merges:", (int)td->n_merges);
    printf(" ║  %-12s%-3d %-36s  ║\n", "BOS token:", (int)td->bos_id,
           tok_summary_token(td, td->bos_id));
    printf(" ║  %-12s%-3d %-36s  ║\n", "EOS token:", (int)td->eos_id,
           tok_summary_token(td, td->eos_id));
    printf(" ║  %-12s%-3d %-36s  ║\n", "UNK token:", (int)td->unk_id,
           tok_summary_token(td, td->unk_id));
    tok_summary_rule("╚", "╝", INNER_WIDTH, "\n\n");
}
#endif /* TOKENIZER_READER_H */