| |
| |
| |
| |
| |
| |
| |
| |
|
|
| #ifndef TOKENIZER_READER_H |
| #define TOKENIZER_READER_H |
|
|
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>   /* strcasecmp() (POSIX) */
|
|
/* Sizing limits used by the tokenizer reader. */

/* Vocabulary-size limit (not referenced by the functions visible in this header). */
#define TOK_MAX_TOKENS     256000

/* tok_parse_merges() stops collecting once this many merges have been read. */
#define TOK_MAX_MERGES     512000

/* Size in bytes, terminator included, of the scratch buffers that hold one decoded token. */
#define TOK_MAX_TOKEN_LEN  512
|
|
| |
/*
 * Classification stored per token in TokenizerData.token_types.
 *
 * The loader in this header only ever assigns NORMAL (the default), CONTROL
 * (added tokens flagged "special") and BYTE (tokens spelled "<0xNN>").
 * NOTE(review): the numbering looks like it mirrors the GGUF / llama.cpp
 * token-type values - confirm before renumbering.
 */
typedef enum {
    TOK_TYPE_NORMAL   = 1,
    TOK_TYPE_UNKNOWN  = 2,
    TOK_TYPE_CONTROL  = 3,
    TOK_TYPE_USER_DEF = 4,
    TOK_TYPE_UNUSED   = 5,
    TOK_TYPE_BYTE     = 6
} TokenType;
|
|
/*
 * Everything tok_load() extracts from tokenizer.json (and, optionally, a
 * tokenizer config file). Created by tok_load(), released by tok_free().
 */
typedef struct {
    /* Vocabulary: three parallel arrays of vocab_size entries, indexed by id. */
    char    **tokens;       /* heap-allocated token text; "" for ids the vocab does not define */
    float    *scores;       /* tok_parse_vocab() stores -(float)id for every defined token */
    int32_t  *token_types;  /* TokenType values; TOK_TYPE_NORMAL unless reclassified */
    int32_t   vocab_size;   /* highest id found in the vocab, plus one */

    /* Merge rules in file order; each entry is a heap-allocated string. */
    char    **merges;
    int32_t   n_merges;

    /* Special-token ids. tok_load() seeds them with 1 / 2 / 0 / -1. */
    int32_t   bos_id;
    int32_t   eos_id;
    int32_t   unk_id;
    int32_t   pad_id;       /* nothing in this header changes it after the -1 default */

    /* Model family name; "llama" by default and for "BPE" models. */
    char      model_type[32];
} TokenizerData;
|
|
| |
| |
| |
| |
| |
| |
|
|
| |
/*
 * Return the first position at or after p that is not JSON whitespace
 * (space, tab, line feed, carriage return). Stops at the terminating NUL.
 */
static inline const char *tok_skip_ws(const char *p) {
    for (;; p++) {
        switch (*p) {
        case ' ':
        case '\t':
        case '\n':
        case '\r':
            continue;   /* whitespace: advance via the loop increment */
        default:
            return p;
        }
    }
}
|
|
| |
| |
| |
/*
 * Parse exactly four hexadecimal digits starting at s.
 * Returns the value (0..0xFFFF), or -1 if any of the four characters is not
 * a hex digit. Never reads past a NUL terminator.
 */
static int tok_hex4(const char *s)
{
    int value = 0;
    for (int k = 0; k < 4; k++) {
        const int c = (unsigned char)s[k];
        int digit;
        if (c >= '0' && c <= '9') {
            digit = c - '0';
        } else if (c >= 'a' && c <= 'f') {
            digit = c - 'a' + 10;
        } else if (c >= 'A' && c <= 'F') {
            digit = c - 'A' + 10;
        } else {
            return -1;   /* also taken for the NUL terminator */
        }
        value = value * 16 + digit;
    }
    return value;
}

/* Encode code point cp as UTF-8 into out; returns the number of bytes (1-4). */
static int tok_utf8_encode(unsigned long cp, char out[4])
{
    if (cp < 0x80) {
        out[0] = (char)cp;
        return 1;
    }
    if (cp < 0x800) {
        out[0] = (char)(0xC0 | (cp >> 6));
        out[1] = (char)(0x80 | (cp & 0x3F));
        return 2;
    }
    if (cp < 0x10000) {
        out[0] = (char)(0xE0 | (cp >> 12));
        out[1] = (char)(0x80 | ((cp >> 6) & 0x3F));
        out[2] = (char)(0x80 | (cp & 0x3F));
        return 3;
    }
    out[0] = (char)(0xF0 | (cp >> 18));
    out[1] = (char)(0x80 | ((cp >> 12) & 0x3F));
    out[2] = (char)(0x80 | ((cp >> 6) & 0x3F));
    out[3] = (char)(0x80 | (cp & 0x3F));
    return 4;
}

/*
 * Decode the JSON string literal that starts at p into buf.
 *
 * p       must point at the opening '"'; otherwise NULL is returned and buf
 *         is left untouched.
 * buf     receives the decoded bytes, always NUL-terminated when buflen > 0.
 * buflen  capacity of buf in bytes, including the terminator.
 *
 * Escapes \" \\ \/ \b \f \n \r \t and \uXXXX are decoded; \u escapes are
 * written as UTF-8 and a high/low surrogate pair is combined into a single
 * 4-byte sequence. An unpaired surrogate is still written as a 3-byte
 * sequence, and a \u not followed by four hex digits is dropped.
 *
 * If the decoded text does not fit, it is cut at the last complete character
 * that fits. The literal is nevertheless consumed to its end, so the return
 * value always points just past the closing '"' (or at the NUL terminator if
 * the literal is unterminated) and the caller stays in sync with the input.
 */
static const char *tok_extract_string(const char *p, char *buf, int buflen)
{
    if (!p || *p != '"') return NULL;
    p++;

    const int cap = (buflen > 0) ? buflen - 1 : 0;   /* payload bytes available */
    int len = 0;
    int truncated = 0;

    while (*p && *p != '"') {
        char out[4];
        int n = 0;

        if (*p == '\\' && p[1]) {
            p++;
            switch (*p) {
            case 'b': out[n++] = '\b'; break;
            case 'f': out[n++] = '\f'; break;
            case 'n': out[n++] = '\n'; break;
            case 'r': out[n++] = '\r'; break;
            case 't': out[n++] = '\t'; break;
            case 'u': {
                const int first = tok_hex4(p + 1);
                if (first < 0) break;            /* malformed escape: emit nothing */
                unsigned long cp = (unsigned long)first;
                p += 4;                          /* now on the last hex digit */

                /* A high surrogate followed by "\uDC00".."\uDFFF" forms one
                 * supplementary-plane code point. */
                if (cp >= 0xD800 && cp <= 0xDBFF && p[1] == '\\' && p[2] == 'u') {
                    const int second = tok_hex4(p + 3);
                    if (second >= 0xDC00 && second <= 0xDFFF) {
                        cp = 0x10000 + ((cp - 0xD800) << 10)
                                     + ((unsigned long)second - 0xDC00);
                        p += 6;
                    }
                }
                n = tok_utf8_encode(cp, out);
                break;
            }
            default:
                /* \" \\ \/ and any unknown escape stand for the character itself. */
                out[n++] = *p;
                break;
            }
        } else {
            out[n++] = *p;
        }
        p++;

        if (!truncated && len + n <= cap) {
            memcpy(buf + len, out, (size_t)n);
            len += n;
        } else if (n > 0) {
            truncated = 1;   /* keep scanning, but store nothing further */
        }
    }

    if (buflen > 0) buf[len] = '\0';

    if (*p == '"') p++;
    return p;
}
|
|
| |
/*
 * Locate the first occurrence of `"key"` in json that is used as an object
 * key, i.e. is followed (after optional whitespace) by ':'.
 *
 * Returns a pointer to the first non-whitespace character of the value, or
 * NULL if there is no such key (or either argument is NULL).
 *
 * This is a flat text search, not a JSON parser: nesting is ignored and the
 * search runs to the end of json, so callers narrow the scope by passing a
 * pointer into the region of interest. Occurrences of `"key"` that are plain
 * string values (no ':' after them) are skipped.
 */
static const char *tok_find_key(const char *json, const char *key)
{
    static const char ws[] = " \t\n\r";

    if (!json || !key) return NULL;
    const size_t key_len = strlen(key);

    for (const char *hit = strchr(json, '"'); hit; hit = strchr(hit + 1, '"')) {
        /* strncmp stops at a NUL, so hit[1 + key_len] is only read after
         * key_len matching (non-NUL) bytes. */
        if (strncmp(hit + 1, key, key_len) != 0 || hit[1 + key_len] != '"') {
            continue;
        }

        const char *after = hit + key_len + 2;
        after += strspn(after, ws);
        if (*after != ':') {
            continue;            /* a string value, not a key */
        }

        after++;
        return after + strspn(after, ws);
    }
    return NULL;
}
|
|
| |
| |
| |
|
|
| static int tok_parse_vocab(const char *json, TokenizerData *td) |
| { |
| |
| const char *model_p = tok_find_key(json, "model"); |
| if (!model_p) return -1; |
|
|
| |
| const char *type_p = tok_find_key(model_p, "type"); |
| if (type_p) { |
| char type_buf[64]; |
| tok_extract_string(type_p, type_buf, sizeof(type_buf)); |
| if (strcasecmp(type_buf, "BPE") == 0) { |
| strcpy(td->model_type, "llama"); |
| } else { |
| strncpy(td->model_type, type_buf, sizeof(td->model_type) - 1); |
| } |
| } |
|
|
| |
| const char *vocab_p = tok_find_key(model_p, "vocab"); |
| if (!vocab_p || *vocab_p != '{') return -1; |
| vocab_p++; |
|
|
| |
| char token_buf[TOK_MAX_TOKEN_LEN]; |
| int max_id = -1; |
|
|
| |
| const char *scan = vocab_p; |
| int count = 0; |
| while (*scan && *scan != '}') { |
| scan = tok_skip_ws(scan); |
| if (*scan == ',') { scan++; continue; } |
| if (*scan != '"') break; |
|
|
| |
| char dummy[TOK_MAX_TOKEN_LEN]; |
| scan = tok_extract_string(scan, dummy, sizeof(dummy)); |
| if (!scan) break; |
| scan = tok_skip_ws(scan); |
| if (*scan == ':') scan++; |
| scan = tok_skip_ws(scan); |
|
|
| |
| int id = (int)strtol(scan, (char **)&scan, 10); |
| if (id > max_id) max_id = id; |
| count++; |
| } |
|
|
| if (count == 0 || max_id < 0) return -1; |
|
|
| td->vocab_size = max_id + 1; |
|
|
| |
| td->tokens = (char **)calloc(td->vocab_size, sizeof(char *)); |
| td->scores = (float *)calloc(td->vocab_size, sizeof(float)); |
| td->token_types = (int32_t *)calloc(td->vocab_size, sizeof(int32_t)); |
|
|
| |
| for (int i = 0; i < td->vocab_size; i++) { |
| td->tokens[i] = strdup(""); |
| td->scores[i] = 0.0f; |
| td->token_types[i] = TOK_TYPE_NORMAL; |
| } |
|
|
| |
| scan = vocab_p; |
| while (*scan && *scan != '}') { |
| scan = tok_skip_ws(scan); |
| if (*scan == ',') { scan++; continue; } |
| if (*scan != '"') break; |
|
|
| scan = tok_extract_string(scan, token_buf, sizeof(token_buf)); |
| if (!scan) break; |
| scan = tok_skip_ws(scan); |
| if (*scan == ':') scan++; |
| scan = tok_skip_ws(scan); |
|
|
| int id = (int)strtol(scan, (char **)&scan, 10); |
|
|
| if (id >= 0 && id < td->vocab_size) { |
| free(td->tokens[id]); |
| td->tokens[id] = strdup(token_buf); |
| |
| td->scores[id] = -(float)id; |
| } |
| } |
|
|
| return 0; |
| } |
|
|
| |
| |
| |
|
|
| static int tok_parse_merges(const char *json, TokenizerData *td) |
| { |
| const char *model_p = tok_find_key(json, "model"); |
| if (!model_p) return -1; |
|
|
| const char *merges_p = tok_find_key(model_p, "merges"); |
| if (!merges_p || *merges_p != '[') return -1; |
| merges_p++; |
|
|
| |
| int capacity = 65536; |
| td->merges = (char **)calloc(capacity, sizeof(char *)); |
| td->n_merges = 0; |
|
|
| |
| const char *scan = merges_p; |
| char merge_buf[TOK_MAX_TOKEN_LEN * 2]; |
| while (*scan && *scan != ']' && td->n_merges < TOK_MAX_MERGES) { |
| scan = tok_skip_ws(scan); |
| if (*scan == ',') { scan++; continue; } |
| if (*scan != '"') { scan++; continue; } |
|
|
| scan = tok_extract_string(scan, merge_buf, sizeof(merge_buf)); |
| if (!scan) break; |
|
|
| |
| if (td->n_merges >= capacity) { |
| capacity *= 2; |
| td->merges = (char **)realloc(td->merges, capacity * sizeof(char *)); |
| } |
|
|
| td->merges[td->n_merges] = strdup(merge_buf); |
| td->n_merges++; |
| } |
|
|
| return 0; |
| } |
|
|
| |
| |
| |
|
|
| static void tok_parse_added_tokens(const char *json, TokenizerData *td) |
| { |
| const char *added_p = tok_find_key(json, "added_tokens"); |
| if (!added_p || *added_p != '[') return; |
| added_p++; |
|
|
| |
| while (*added_p && *added_p != ']') { |
| added_p = tok_skip_ws(added_p); |
| if (*added_p == ',') { added_p++; continue; } |
| if (*added_p != '{') { added_p++; continue; } |
|
|
| |
| const char *obj_start = added_p; |
| int depth = 1; |
| added_p++; |
| while (*added_p && depth > 0) { |
| if (*added_p == '{') depth++; |
| if (*added_p == '}') depth--; |
| added_p++; |
| } |
|
|
| |
| char content[TOK_MAX_TOKEN_LEN] = ""; |
| int id = -1; |
| int is_special = 0; |
|
|
| const char *id_p = tok_find_key(obj_start, "id"); |
| if (id_p) id = (int)strtol(id_p, NULL, 10); |
|
|
| const char *content_p = tok_find_key(obj_start, "content"); |
| if (content_p && *content_p == '"') |
| tok_extract_string(content_p, content, sizeof(content)); |
|
|
| const char *special_p = tok_find_key(obj_start, "special"); |
| if (special_p) { |
| is_special = (strncmp(special_p, "true", 4) == 0); |
| } |
|
|
| |
| if (id >= 0 && id < td->vocab_size) { |
| if (is_special) { |
| td->token_types[id] = TOK_TYPE_CONTROL; |
| } |
| |
| if (content[0] && (!td->tokens[id] || !td->tokens[id][0])) { |
| free(td->tokens[id]); |
| td->tokens[id] = strdup(content); |
| } |
| } |
| } |
| } |
|
|
| |
| |
| |
|
|
| static void tok_parse_config(const char *config_json, TokenizerData *td) |
| { |
| |
| |
|
|
| |
| struct { const char *key; int32_t *id_ptr; const char *default_content; } specials[] = { |
| {"bos_token", &td->bos_id, "<s>"}, |
| {"eos_token", &td->eos_id, "</s>"}, |
| {"unk_token", &td->unk_id, "<unk>"}, |
| {NULL, NULL, NULL} |
| }; |
|
|
| for (int s = 0; specials[s].key; s++) { |
| const char *p = tok_find_key(config_json, specials[s].key); |
| if (!p) { |
| |
| for (int i = 0; i < td->vocab_size; i++) { |
| if (td->tokens[i] && strcmp(td->tokens[i], specials[s].default_content) == 0) { |
| *specials[s].id_ptr = i; |
| break; |
| } |
| } |
| continue; |
| } |
|
|
| |
| if (*p == '"') { |
| char content[TOK_MAX_TOKEN_LEN]; |
| tok_extract_string(p, content, sizeof(content)); |
| |
| for (int i = 0; i < td->vocab_size; i++) { |
| if (td->tokens[i] && strcmp(td->tokens[i], content) == 0) { |
| *specials[s].id_ptr = i; |
| break; |
| } |
| } |
| } else if (*p == '{') { |
| |
| const char *cp = tok_find_key(p, "content"); |
| if (cp && *cp == '"') { |
| char content[TOK_MAX_TOKEN_LEN]; |
| tok_extract_string(cp, content, sizeof(content)); |
| for (int i = 0; i < td->vocab_size; i++) { |
| if (td->tokens[i] && strcmp(td->tokens[i], content) == 0) { |
| *specials[s].id_ptr = i; |
| break; |
| } |
| } |
| } |
| } |
| } |
| } |
|
|
| |
| |
| |
|
|
/*
 * Read a whole file into a freshly allocated, NUL-terminated buffer.
 *
 * path  file to read (opened in binary mode).
 *
 * Returns the buffer, which the caller releases with free(), or NULL if the
 * file cannot be opened, its size cannot be determined, memory runs out or a
 * read error occurs. If fewer bytes than the reported size arrive without an
 * error, the buffer is terminated after the bytes actually read.
 */
static char *tok_read_file(const char *path)
{
    FILE *f = fopen(path, "rb");
    if (!f) return NULL;

    char *buf = NULL;
    long size;
    size_t got;

    if (fseek(f, 0, SEEK_END) != 0) goto done;
    size = ftell(f);
    if (size < 0) goto done;                 /* ftell() reports failure as -1 */
    if (fseek(f, 0, SEEK_SET) != 0) goto done;

    buf = (char *)malloc((size_t)size + 1);
    if (!buf) goto done;

    got = fread(buf, 1, (size_t)size, f);
    if (got != (size_t)size && ferror(f)) {
        free(buf);
        buf = NULL;
        goto done;
    }
    buf[got] = '\0';                         /* never expose unread bytes */

done:
    fclose(f);
    return buf;
}
|
|
| static TokenizerData *tok_load(const char *tokenizer_json_path, |
| const char *config_json_path) |
| { |
| TokenizerData *td = (TokenizerData *)calloc(1, sizeof(TokenizerData)); |
| if (!td) return NULL; |
|
|
| td->bos_id = 1; |
| td->eos_id = 2; |
| td->unk_id = 0; |
| td->pad_id = -1; |
| strcpy(td->model_type, "llama"); |
|
|
| |
| char *json = tok_read_file(tokenizer_json_path); |
| if (!json) { |
| fprintf(stderr, " WARNING: Could not read '%s'\n", tokenizer_json_path); |
| free(td); |
| return NULL; |
| } |
|
|
| |
| if (tok_parse_vocab(json, td) != 0) { |
| fprintf(stderr, " WARNING: Failed to parse vocab from tokenizer.json\n"); |
| free(json); |
| free(td); |
| return NULL; |
| } |
|
|
| |
| tok_parse_merges(json, td); |
|
|
| |
| tok_parse_added_tokens(json, td); |
|
|
| |
| for (int i = 0; i < td->vocab_size; i++) { |
| if (td->tokens[i] && td->tokens[i][0] == '<' && |
| td->tokens[i][1] == '0' && td->tokens[i][2] == 'x' && |
| strlen(td->tokens[i]) == 6 && td->tokens[i][5] == '>') { |
| td->token_types[i] = TOK_TYPE_BYTE; |
| } |
| } |
|
|
| free(json); |
|
|
| |
| if (config_json_path) { |
| char *config = tok_read_file(config_json_path); |
| if (config) { |
| tok_parse_config(config, td); |
| free(config); |
| } |
| } |
|
|
| return td; |
| } |
|
|
| static void tok_free(TokenizerData *td) |
| { |
| if (!td) return; |
| if (td->tokens) { |
| for (int i = 0; i < td->vocab_size; i++) |
| free(td->tokens[i]); |
| free(td->tokens); |
| } |
| if (td->merges) { |
| for (int i = 0; i < td->n_merges; i++) |
| free(td->merges[i]); |
| free(td->merges); |
| } |
| free(td->scores); |
| free(td->token_types); |
| free(td); |
| } |
|
|
| |
| static void tok_print_summary(const TokenizerData *td) |
| { |
| printf(" βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ\n"); |
| printf(" β Tokenizer β\n"); |
| printf(" β ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ£\n"); |
| printf(" β Model: %-40s β\n", td->model_type); |
| printf(" β Vocab size: %-40d β\n", td->vocab_size); |
| printf(" β Merges: %-40d β\n", td->n_merges); |
| printf(" β BOS token: %-3d %-36s β\n", td->bos_id, |
| (td->bos_id >= 0 && td->bos_id < td->vocab_size) ? td->tokens[td->bos_id] : ""); |
| printf(" β EOS token: %-3d %-36s β\n", td->eos_id, |
| (td->eos_id >= 0 && td->eos_id < td->vocab_size) ? td->tokens[td->eos_id] : ""); |
| printf(" β UNK token: %-3d %-36s β\n", td->unk_id, |
| (td->unk_id >= 0 && td->unk_id < td->vocab_size) ? td->tokens[td->unk_id] : ""); |
| printf(" βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ\n\n"); |
| } |
|
|
| #endif |
|
|