Text Generation
Transformers
PyTorch
English
experimental
research
bit-level
transformer
reversible
safety
telemetry
language-modeling
Instructions to use WCNegentropy/BitTransformerLM with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use WCNegentropy/BitTransformerLM with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="WCNegentropy/BitTransformerLM")

# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("WCNegentropy/BitTransformerLM", dtype="auto")

- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use WCNegentropy/BitTransformerLM with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "WCNegentropy/BitTransformerLM"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "WCNegentropy/BitTransformerLM",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'

Use Docker
docker model run hf.co/WCNegentropy/BitTransformerLM
- SGLang
How to use WCNegentropy/BitTransformerLM with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "WCNegentropy/BitTransformerLM" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "WCNegentropy/BitTransformerLM",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'

Use Docker images
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "WCNegentropy/BitTransformerLM" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "WCNegentropy/BitTransformerLM",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'

- Docker Model Runner
How to use WCNegentropy/BitTransformerLM with Docker Model Runner:
docker model run hf.co/WCNegentropy/BitTransformerLM
| import os | |
| import time | |
| import math | |
| from itertools import cycle | |
| from typing import Optional | |
| import torch | |
| import torch.nn.functional as F | |
| from bit_transformer import ( | |
| BitTransformerLM, | |
| text_to_bits, | |
| quantize_dynamic, | |
| prepare_qat_fx, | |
| convert_qat_fx, | |
| hil_safe_inference, | |
| collapse_submodel, | |
| diffusion_inference, | |
| TelemetrySynthesizer, | |
| save_distilled_model, | |
| ) | |
| from bit_transformer.training import train_loop as train | |
| from bit_transformer.optimization import configure_optimizer, adjust_learning_rate | |
| from bit_transformer.utils import save_model, load_model, set_dropout | |
| from bit_transformer.torch_utils import cpu_autocast | |
def lines_to_tensor(lines, max_len):
    """Encode text lines as a fixed-width tensor of bits.

    Each line is converted with ``text_to_bits``, truncated to ``max_len``
    entries and right-padded with zeros up to ``max_len``.

    Args:
        lines: Iterable of strings to encode.
        max_len: Number of entries kept per line (the second dimension of
            the result).

    Returns:
        A ``torch.long`` tensor with one row per line and ``max_len``
        columns. When ``lines`` is empty the result has shape
        ``(0, max_len)`` so callers can still slice the second dimension.
    """
    seqs = []
    for text in lines:
        bits = text_to_bits(text)[:max_len]
        # A non-positive repeat count yields an empty list, so this is a
        # no-op for lines that already fill the whole width.
        bits.extend([0] * (max_len - len(bits)))
        seqs.append(bits)
    if not seqs:
        # torch.tensor([]) would be 1-D with shape (0,); keep the result 2-D.
        return torch.empty((0, max_len), dtype=torch.long)
    return torch.tensor(seqs, dtype=torch.long)
def load_wikitext(dataset_size=128, max_len=64):
    """Load WikiText-2 lines as bit tensors, falling back to random bits.

    The first ``dataset_size`` non-blank training lines and the first
    ``max(1, dataset_size // 4)`` non-blank validation lines are encoded
    with ``lines_to_tensor``. If anything in that path raises (the
    ``datasets`` import, the download, or the encoding), the error is
    printed and random 0/1 tensors of the same intended sizes are returned
    instead.

    Args:
        dataset_size: Maximum number of training rows.
        max_len: Number of bits per row.

    Returns:
        A tuple ``(train_bits, valid_bits, train_lines)``. ``train_lines``
        holds the raw training strings, or empty strings (one per training
        row) when the random fallback is used.
    """
    valid_split = max(1, dataset_size // 4)
    try:
        from datasets import load_dataset

        ds = load_dataset("wikitext", "wikitext-2-raw-v1")
        train_lines = [t for t in ds["train"]["text"] if t.strip()][:dataset_size]
        valid_lines = [t for t in ds["validation"]["text"] if t.strip()][:valid_split]
        # Named *_bits so the locals do not shadow the imported ``train``.
        train_bits = lines_to_tensor(train_lines, max_len)
        valid_bits = lines_to_tensor(valid_lines, max_len)
        return train_bits, valid_bits, train_lines
    except Exception as e:
        # Deliberate best-effort boundary: any failure switches to random data.
        print("Dataset load failed, using random bits", e)
        train_bits = torch.randint(0, 2, (dataset_size, max_len), dtype=torch.long)
        # Use the same validation row count as the real-data branch
        # (previously this was max_len rows).
        valid_bits = torch.randint(0, 2, (valid_split, max_len), dtype=torch.long)
        return train_bits, valid_bits, [""] * len(train_bits)
def _warmup(
    model: BitTransformerLM,
    data: torch.Tensor,
    steps: int = 5,
    freeze_old: bool = False,
    old_layers: int = 0,
    *,
    diffusion: bool = False,
    curriculum: bool = False,
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None,
) -> None:
    """Run a short warm-up loop after expansion.

    The model is put in training mode with dropout 0.1, trained for
    ``steps`` optimizer updates on chunks of 8 rows from ``data`` (wrapping
    around when the chunks run out), then switched back to eval mode with
    dropout 0.0.

    Args:
        model: Model to warm up.
        data: Integer bit tensor; it is split along its first dimension.
        steps: Number of optimizer updates.
        freeze_old: When true, parameters of layers whose index is below
            ``old_layers`` get ``requires_grad=False`` during the loop.
        old_layers: Number of leading layers treated as "old".
        diffusion: When true, train on randomly bit-flipped input with
            ``causal=False`` and score every position; otherwise score
            next-bit prediction.
        curriculum: In diffusion mode, decay the flip probability linearly
            from 0.5 to 0 over the steps instead of holding it at 0.5.
        optimizer: Optimizer to use.
        scheduler: Learning-rate scheduler to use. If either ``optimizer``
            or ``scheduler`` is missing, both are created with
            ``configure_optimizer(model, lr=1e-3, total_steps=steps)``.
    """
    model.train()
    set_dropout(model, 0.1)

    if freeze_old:
        old = (layer for i, layer in enumerate(model.layers) if i < old_layers)
        for layer in old:
            for param in layer.parameters():
                param.requires_grad_(False)

    if optimizer is None or scheduler is None:
        optimizer, scheduler = configure_optimizer(model, lr=1e-3, total_steps=steps)

    # Endless stream of 8-row chunks; restarts from the first chunk when the
    # data is exhausted.
    batches = cycle(data.split(8))
    last_step = max(1, steps - 1)
    for step_idx in range(steps):
        batch = next(batches)
        if diffusion:
            flip_prob = 0.5 * (1 - step_idx / last_step) if curriculum else 0.5
            flips = (torch.rand_like(batch.float()) < flip_prob).long()
            # XOR flips each selected bit.
            logits, _ = model(batch ^ flips, causal=False)
            pred = logits.reshape(-1, 2)
            target = batch.reshape(-1)
        else:
            logits, _ = model(batch)
            # Position t predicts bit t + 1.
            pred = logits[:, :-1, :].reshape(-1, 2)
            target = batch[:, 1:].reshape(-1)
        F.cross_entropy(pred, target).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    # Re-enable gradients on every parameter, including any frozen above.
    for param in model.parameters():
        param.requires_grad_(True)
    model.eval()
    set_dropout(model, 0.0)
def _build_model(
    max_len: int, reversible: bool, use_checkpoint: bool
) -> BitTransformerLM:
    """Create the small starting model used when no saved weights are loaded."""
    return BitTransformerLM(
        d_model=32,
        nhead=4,
        num_layers=1,
        dim_feedforward=64,
        max_seq_len=max_len,
        use_act=True,
        act_threshold=0.7,
        reversible=reversible,
        chunk_size=max_len,
        use_autocast=True,
        use_checkpoint=use_checkpoint,
    )


def _validate(model, valid_bits, diffusion):
    """Return ``(val_loss, K, C, S)`` for ``model`` on ``valid_bits``.

    Switches the model to eval mode with dropout 0.0 first. In diffusion
    mode every position is scored non-causally; otherwise the loss is
    next-bit prediction. K, C and S are the means of the telemetry entries
    ``negentropy_logits``, ``lz_complexity_logits`` and ``symbiosis_score``.
    """
    model.eval()
    set_dropout(model, 0.0)
    with torch.no_grad():
        logits, telemetry = model(valid_bits, causal=not diffusion)
        if diffusion:
            pred = logits.reshape(-1, 2)
            target = valid_bits.reshape(-1)
        else:
            pred = logits[:, :-1, :].reshape(-1, 2)
            target = valid_bits[:, 1:].reshape(-1)
        val_loss = F.cross_entropy(pred, target).item()
        k = telemetry["negentropy_logits"].mean().item()
        c = telemetry["lz_complexity_logits"].mean().item()
        s = telemetry["symbiosis_score"].mean().item()
    return val_loss, k, c, s


def integration_schedule(
    steps: int = 10,
    max_len: int = 64,
    dataset_size: int = 128,
    *,
    weights_path: str = "weights/model.pt.gz",
    plateau_steps: int = 0,
    collapsed_path: str | None = None,
    epochs_per_step: int = 2,
    extra_steps: int = 3,
    collapse: bool = True,
    diffusion: bool = False,
    noise_schedule: str = "linear",
    diffusion_steps: int = 8,
    diffusion_curriculum: bool = False,
    use_checkpoint: bool = True,
    reversible: bool = True,
    improve_thresh: float = 0.01,
    qat: bool = False,
):
    """Train, progressively scale, quantize and optionally collapse a model.

    The schedule loads data with ``load_wikitext``, loads a model from
    ``weights_path`` when that file exists (otherwise, or on load failure,
    builds a small new one), then alternates training and validation. When
    the validation loss improves by less than ``improve_thresh`` between
    consecutive steps, the model is scaled using the next strategy in the
    cycle layers -> width -> context and warmed up for 100 steps. After the
    optional plateau phase the model is validated once more, saved back to
    ``weights_path``, quantized, run through ``hil_safe_inference`` and,
    if requested, distilled with ``collapse_submodel``.

    Both training loops stop once 8 minutes of wall-clock time have elapsed
    since the start; the check runs at the end of each iteration.

    Args:
        steps: Number of train/validate iterations in the main loop.
        max_len: Sequence length in bits; doubled by the "context" strategy.
        dataset_size: Forwarded to ``load_wikitext``.
        weights_path: File to load the model from and save the final model to.
        plateau_steps: Extra train/validate iterations without scaling.
        collapsed_path: If not ``None``, the collapsed model is saved here.
        epochs_per_step: Forwarded to ``train`` as ``epochs`` and to
            ``configure_optimizer`` as ``total_steps``.
        extra_steps: Forwarded to ``train``.
        collapse: Whether to build and evaluate a collapsed submodel.
        diffusion: Train with ``diffusion=True`` and ``compress_prob=0.0``,
            validate non-causally and print a ``diffusion_inference`` sample.
        noise_schedule: Forwarded to ``diffusion_inference`` as ``schedule``.
        diffusion_steps: Forwarded to ``diffusion_inference`` as ``steps``.
        diffusion_curriculum: Forwarded to ``train`` and ``_warmup``.
        use_checkpoint: Constructor flag for a newly built model.
        reversible: Constructor flag for a newly built model.
        improve_thresh: Minimum loss improvement that avoids scaling.
        qat: Wrap the model with ``prepare_qat_fx`` before training and
            finish with ``convert_qat_fx`` instead of ``quantize_dynamic``.

    Returns:
        ``(results, collapsed)`` where ``results`` is a list of
        ``(index, val_loss, K, C, S)`` tuples and ``collapsed`` is the
        collapsed model, or ``None`` when ``collapse`` is false.
    """
    start = time.time()
    time_limit = 8 * 60  # seconds of wall-clock time
    train_bits, valid_bits, _ = load_wikitext(dataset_size, max_len)

    model = None
    if os.path.exists(weights_path):
        try:
            model = load_model(weights_path)
            print(f"Loaded model from {weights_path}")
        except Exception as e:
            print("Failed to load weights, initializing new model", e)
    if model is None:
        model = _build_model(max_len, reversible, use_checkpoint)
    if qat:
        model = prepare_qat_fx(model)

    # Arguments shared by every call to ``train`` in this schedule.
    train_kwargs = dict(
        epochs=epochs_per_step,
        extra_steps=extra_steps,
        compress_prob=0.0 if diffusion else 1.0,
        log=True,
        diffusion=diffusion,
        diffusion_curriculum=diffusion_curriculum,
    )

    results = []
    scale_cycle = cycle(["layers", "width", "context"])
    base_lr = 1e-3
    prev_val_loss: Optional[float] = None
    for step in range(steps):
        model.train()
        set_dropout(model, 0.1)
        opt, sched = configure_optimizer(
            model, lr=base_lr, total_steps=epochs_per_step
        )
        train(model, train_bits, optimizer=opt, scheduler=sched, **train_kwargs)

        val_loss, k, c, s = _validate(model, valid_bits, diffusion)
        print(f"Step {step} validation loss: {val_loss:.4f} K={k:.3f} C={c:.3f} S={s:.3f}")
        results.append((step, val_loss, k, c, s))

        if prev_val_loss is not None and prev_val_loss - val_loss < improve_thresh:
            strategy = next(scale_cycle)
            base_lr = adjust_learning_rate(opt, 1 / math.sqrt(2))
            warmup_extra = {}
            if strategy == "layers":
                old_layers = model.num_layers
                model = model.double_layers()
                # Only the newly added layers are trained during warm-up.
                warmup_extra = dict(freeze_old=True, old_layers=old_layers)
            elif strategy == "width":
                model = model.double_width()
            else:
                # Longer context needs data re-encoded at the new length.
                max_len *= 2
                train_bits, valid_bits, _ = load_wikitext(dataset_size, max_len)
                model = model.double_length()
            warm_opt, warm_sched = configure_optimizer(
                model, lr=base_lr, total_steps=100
            )
            _warmup(
                model,
                train_bits,
                steps=100,
                diffusion=diffusion,
                curriculum=diffusion_curriculum,
                optimizer=warm_opt,
                scheduler=warm_sched,
                **warmup_extra,
            )
        prev_val_loss = val_loss
        if time.time() - start > time_limit:
            print("Time limit reached")
            break

    # optional plateau phase at final size
    for p in range(plateau_steps):
        model.train()
        set_dropout(model, 0.1)
        train(model, train_bits, **train_kwargs)

        val_loss, k, c, s = _validate(model, valid_bits, diffusion)
        print(
            f"Plateau {p} validation loss: {val_loss:.4f} K={k:.3f} C={c:.3f} S={s:.3f}"
        )
        results.append((steps + p, val_loss, k, c, s))
        if time.time() - start > time_limit:
            print("Time limit reached")
            break

    # final validation after last step
    val_loss, k, c, s = _validate(model, valid_bits, diffusion)
    print(f"Final validation loss: {val_loss:.4f} K={k:.3f} C={c:.3f} S={s:.3f}")
    results.append((steps + plateau_steps, val_loss, k, c, s))

    # persist final model weights for future runs
    save_model(model, weights_path)

    input_bits = valid_bits[:1]
    if qat:
        qmodel = convert_qat_fx(model)
    else:
        # One forward pass under CPU autocast before dynamic quantization.
        with cpu_autocast():
            model(input_bits)
        qmodel = quantize_dynamic(model)
    qmodel.eval()
    try:
        hil_safe_inference(
            qmodel,
            input_bits,
            c_floor=0.3,
            s_floor=0.5,
            causal=not diffusion,
            strict=not diffusion,
        )
    except RuntimeError as e:
        # The gate is reported but does not abort the schedule.
        print("Safety gate triggered", e)

    collapsed = None
    if collapse:
        synth = TelemetrySynthesizer(n_clusters=8)
        reps = synth.cluster_sequences(model, train_bits[:64])
        floors = {"negentropy": 0.3, "lz_complexity": 0.35, "symbiosis_score": 0.5}
        collapsed, metrics = collapse_submodel(
            reps,
            target_params=dict(
                d_model=16,
                nhead=4,
                num_layers=1,
                dim_feedforward=32,
                max_seq_len=max_len,
            ),
            floors=floors,
        )
        collapsed.eval()
        with torch.no_grad():
            logits, _ = collapsed(valid_bits)
            pred = logits[:, :-1, :].reshape(-1, 2)
            target = valid_bits[:, 1:].reshape(-1)
            c_loss = F.cross_entropy(pred, target).item()
        print("Collapsed model validation loss:", c_loss)
        if collapsed_path is not None:
            save_distilled_model(
                collapsed,
                collapsed_path,
                {**metrics, "val_loss": c_loss},
                floors=floors,
            )

    if diffusion:
        sample = diffusion_inference(
            model, length=max_len, steps=diffusion_steps, schedule=noise_schedule
        )
        print("Diffusion sample:", sample[0].tolist())
    return results, collapsed
if __name__ == "__main__":
    # Script entry point: run the schedule with its default settings.
    integration_schedule()