import os

import torch
from dotenv import load_dotenv
from transformers import pipeline

from model_config import MODEL_NAME

# Load environment variables from .env file
load_dotenv()

# Get HuggingFace token from environment variable (for gated models only)
HF_TOKEN = os.getenv("HF_TOKEN")

# Initialize the text-generation pipeline.
# Use FP16 on GPU for better performance; fall back to FP32 on CPU.
generator = pipeline(
    "text-generation",
    model=MODEL_NAME,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",  # Automatically place the model (CPU if no GPU)
    token=HF_TOKEN if "bigcode" in MODEL_NAME else None,  # Only for gated repos
)


def generate_response(prompt: str) -> str:
    """
    Generate a response from the model based on the input prompt.
    Returns the raw model output as a string (note: for text-generation
    pipelines, generated_text includes the original prompt).
    """
    # Generate text with controlled sampling parameters
    outputs = generator(
        prompt,
        max_new_tokens=200,     # Limit generation length
        num_return_sequences=1,
        temperature=0.3,        # Lower = more deterministic
        top_p=0.9,
        do_sample=True,
    )
    return outputs[0]["generated_text"]
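

# Usage sketch (an assumption, not part of the original: the script is run
# directly and the sample prompt below is purely illustrative). Shows a
# minimal call to generate_response().
if __name__ == "__main__":
    sample_prompt = "Explain what a Python decorator is."
    print(generate_response(sample_prompt))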