Spaces:
Sleeping
Sleeping
| import os | |
| from fastapi import FastAPI | |
| from pydantic import BaseModel | |
| import gradio as gr | |
| import threading | |
| import uvicorn | |
| # ======================= | |
| # Load Secrets | |
| # ======================= | |
| SYSTEM_PROMPT = os.environ.get( | |
| "prompt", | |
| "You are a placeholder Sovereign. No secrets found in environment." | |
| ) | |
| # ======================= | |
| # Initialize Unsloth-optimized Falcon-3B | |
| # ======================= | |
| # Install via: pip install unsloth torch transformers | |
| from unsloth import FastLanguageModel | |
| from transformers import AutoTokenizer | |
| MODEL_NAME = "tiiuae/Falcon3-3B-Instruct" | |
| # 1) Load model and tokenizer with 4-bit quantization | |
| model, tokenizer = FastLanguageModel.from_pretrained( | |
| model_name=MODEL_NAME, | |
| max_seq_length=2048, | |
| load_in_4bit=True, | |
| dtype=None, | |
| ) | |
| # 2) Apply inference optimizations (fused kernels, streaming, etc.) | |
| FastLanguageModel.for_inference(model) | |
| # ======================= | |
| # Core Chat Function | |
| # ======================= | |
| def chat_fn(user_input: str) -> str: | |
| messages = [ | |
| {"role": "system", "content": SYSTEM_PROMPT}, | |
| {"role": "user", "content": f"User: {user_input}"} | |
| ] | |
| prompt_text = "\n".join(f"{m['role'].capitalize()}: {m['content']}" for m in messages) | |
| # Tokenize and run generation | |
| inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device) | |
| output_ids = model.generate( | |
| **inputs, | |
| max_new_tokens=256, | |
| do_sample=False, | |
| eos_token_id=tokenizer.eos_token_id | |
| ) | |
| # Decode only the newly generated tokens | |
| gen_tokens = output_ids[0][inputs.input_ids.shape[-1]:] | |
| generated_text = tokenizer.decode(gen_tokens, skip_special_tokens=True) | |
| return generated_text.strip() | |
| # ======================= | |
| # Gradio UI | |
| # ======================= | |
| def gradio_chat(user_input: str) -> str: | |
| return chat_fn(user_input) | |
| iface = gr.Interface( | |
| fn=gradio_chat, | |
| inputs=gr.Textbox(lines=5, placeholder="Enter your prompt…"), | |
| outputs="text", | |
| title="Prompt cracking challenge", | |
| description="Does he really think he is the king?" | |
| ) | |
| # Run Gradio in a separate thread so FastAPI can also start | |
| def run_gradio(): | |
| iface.launch(server_name="0.0.0.0", share=True) | |
| # ======================= | |
| # FastAPI for API access | |
| # ======================= | |
| app = FastAPI(title="Prompt cracking challenge API") | |
| class Request(BaseModel): | |
| prompt: str | |
| def generate(req: Request): | |
| return {"response": chat_fn(req.prompt)} | |
| # ======================= | |
| # Launch Both Servers | |
| # ======================= | |
| if __name__ == "__main__": | |
| threading.Thread(target=run_gradio, daemon=True).start() | |
| uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 8000))) |