import json
import os
import tempfile
import traceback
from pathlib import Path
from typing import Optional

import gradio as gr
import numpy as np
import soundfile as sf
import torch

# Token MUST be added via HuggingFace Space Secrets
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    print("WARNING: HF_TOKEN missing. Add it in Space → Settings → Variables & Secrets.")
# NOTE(review): HF_TOKEN is only checked here and is never passed to
# from_pretrained() below; presumably the Hugging Face libraries read it from
# the environment themselves — confirm.

MODEL_ID = "ai4bharat/indic-parler-tts"

try:
    from parler_tts import ParlerTTSForConditionalGeneration
    from transformers import AutoTokenizer
except Exception as e:
    # Chain the original exception so the real import failure stays visible.
    raise RuntimeError(
        "Missing required libraries. Install dependencies from requirements.txt. Error: "
        + str(e)
    ) from e

device = "cuda" if torch.cuda.is_available() else "cpu"

print("Loading model…")
model = ParlerTTSForConditionalGeneration.from_pretrained(MODEL_ID).to(device)
text_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# The voice description gets its own tokenizer, loaded from the name recorded
# on the model's text-encoder config. If that lookup or load fails, fall back
# to the prompt tokenizer (best effort).
try:
    desc_encoder_name = model.config.text_encoder._name_or_path
    desc_tokenizer = AutoTokenizer.from_pretrained(desc_encoder_name)
except Exception:
    traceback.print_exc()
    desc_tokenizer = text_tokenizer

sampling_rate = getattr(model.config, "sampling_rate", 22050)

# Optional speaker list stored next to this script.
sp_file = Path(__file__).parent / "speakers.json"
if sp_file.exists():
    with sp_file.open("r", encoding="utf-8") as fh:
        SPEAKERS = json.load(fh)
else:
    SPEAKERS = ["Default"]
if not SPEAKERS:
    # An empty speakers.json would make SPEAKERS[0] below raise IndexError.
    SPEAKERS = ["Default"]


def synthesize(text: str, speaker: str, emotion: str = "Neutral") -> Optional[str]:
    """Generate speech for *text* and return the path of the written WAV file.

    Args:
        text: The sentence to speak. ``None``, empty, or whitespace-only input
            produces no audio.
        speaker: Speaker name interpolated into the voice description.
        emotion: Tone label interpolated into the voice description.

    Returns:
        Path to a newly created ``.wav`` file, or ``None`` when *text* is
        blank.
    """
    if not text or not text.strip():
        return None

    desc = f"{speaker}'s voice. Tone: {emotion}. Natural, clear speech, close mic."

    # Tokenize and move the inputs to the model's device; if that fails,
    # retry leaving the tensors where the tokenizer put them (best effort).
    try:
        desc_ids = desc_tokenizer(desc, return_tensors="pt").to(device)
        text_ids = text_tokenizer(text, return_tensors="pt").to(device)
    except Exception:
        traceback.print_exc()
        desc_ids = desc_tokenizer(desc, return_tensors="pt")
        text_ids = text_tokenizer(text, return_tensors="pt")

    with torch.no_grad():
        try:
            audio = model.generate(
                input_ids=desc_ids.input_ids,
                attention_mask=desc_ids.attention_mask,
                prompt_input_ids=text_ids.input_ids,
                prompt_attention_mask=text_ids.attention_mask,
                max_length=20000,
            )
        except Exception:
            # Fallback call form kept from the original script.
            # NOTE(review): confirm the model actually accepts these keywords.
            traceback.print_exc()
            audio = model.generate(description=desc, text=text)

    arr = audio.cpu().numpy().squeeze()
    if np.issubdtype(arr.dtype, np.integer):
        # Scale integer samples to float32 by the integer type's max value.
        # The divisor is read from the still-integer array before rebinding.
        int_max = np.iinfo(arr.dtype).max
        arr = arr.astype("float32") / int_max

    # Use a unique file per request. A name derived only from hash(text)
    # collides across speakers/emotions and between concurrent requests.
    fd, out_path = tempfile.mkstemp(prefix="out_", suffix=".wav")
    os.close(fd)
    sf.write(out_path, arr, sampling_rate)
    return out_path


with gr.Blocks() as demo:
    gr.Markdown("# Indic Parler-TTS (69 Speakers)")
    txt = gr.Textbox(value="नमस्ते, यह एक परीक्षण वाक्य है।", label="Text")
    sp = gr.Dropdown(SPEAKERS, value=SPEAKERS[0], label="Speaker")
    emo = gr.Dropdown(
        ["Neutral", "Happy", "Sad", "Angry", "Narration"],
        value="Neutral",
        label="Emotion",
    )
    btn = gr.Button("Generate")
    out = gr.Audio()
    btn.click(fn=synthesize, inputs=[txt, sp, emo], outputs=out)

if __name__ == '__main__':
    demo.launch(server_name="0.0.0.0", server_port=7860)