import os, torch, gradio as gr

from threading import Thread

from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

# Faster Hub downloads; requires the optional hf_transfer package.
os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
MODEL_ID = "TildeAI/TildeOpen-30b" |
|
|
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False) |
|
|
|
|
|
|
|
|

# Load the weights in bfloat16 and let device_map="auto" place them on the available GPUs.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Allow TF32 matmuls on Ampere+ GPUs for a modest speed-up.
torch.backends.cuda.matmul.allow_tf32 = True

SYS = (
    "You are a helpful multilingual assistant. "
    "This is a *base* model (not instruction tuned); follow the user's request precisely."
)


def build_prompt(history, user_msg):
    # Base model, no chat template: flatten the history into a plain User/Assistant transcript.
    parts = [SYS, ""]
    for u, a in history:
        parts += [f"User: {u}", f"Assistant: {a}"]
    parts += [f"User: {user_msg}", "Assistant:"]
    return "\n".join(parts)


def chat_fn(message, history):
    prompt = build_prompt(history, message)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Stream decoded tokens back to the UI as they are generated.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    gen_kwargs = dict(
        **inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        streamer=streamer,
    )

    # Run generation in a background thread so the streamer can be consumed here.
    t = Thread(target=model.generate, kwargs=gen_kwargs)
    t.start()

    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial


demo = gr.ChatInterface(
    fn=chat_fn,
    title="TildeOpen-30B (Transformers, BF16)",
    description="Base model; multilingual. If the build fails with OOM, switch to Option B (GGUF).",
)

demo.queue().launch()
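
Assuming the script above is saved as `app.py`, running `python app.py` serves the chat UI on Gradio's default local address (http://127.0.0.1:7860). Note that the BF16 weights alone come to roughly 60 GB, so `device_map="auto"` will shard the model across however many GPUs are visible; if that still runs out of memory, fall back to Option B (GGUF).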