import json
import os
import tempfile
import traceback
from pathlib import Path

import gradio as gr
import numpy as np
import soundfile as sf
import torch

# Hugging Face access token; it must be added via the Space's secrets.
# NOTE(review): HF_TOKEN is only checked here and is never passed to
# from_pretrained() below — presumably the Hugging Face libraries pick it up
# from the environment themselves; confirm for the pinned library versions.
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    print("WARNING: HF_TOKEN missing. Add it in Space → Settings → Variables & Secrets.")

MODEL_ID = "ai4bharat/indic-parler-tts"

# Guarded import so that a missing dependency produces an actionable message.
# The original exception is chained explicitly to keep the full traceback.
try:
    from parler_tts import ParlerTTSForConditionalGeneration
    from transformers import AutoTokenizer
except Exception as e:
    raise RuntimeError(
        "Missing required libraries. Install dependencies from requirements.txt. Error: " + str(e)
    ) from e

# Run on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

print("Loading model…")
model = ParlerTTSForConditionalGeneration.from_pretrained(MODEL_ID).to(device)
# Tokenizer used for the text that is to be spoken.
text_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Tokenizer used for the voice description: taken from the checkpoint named by
# the model's text-encoder config. If that lookup or download fails, fall back
# to the prompt tokenizer, but say so instead of failing silently.
try:
    desc_encoder_name = model.config.text_encoder._name_or_path
    desc_tokenizer = AutoTokenizer.from_pretrained(desc_encoder_name)
except Exception as e:
    print(f"WARNING: could not load the description tokenizer ({e}); using the prompt tokenizer instead.")
    desc_tokenizer = text_tokenizer

# Sample rate used when writing audio; 22050 Hz if the config does not define one.
sampling_rate = getattr(model.config, "sampling_rate", 22050)

def load_speakers(path):
    """Return the list of speaker names stored as JSON at *path*.

    Falls back to ``["Default"]`` when the file does not exist or when its
    content is not a non-empty JSON list, so that the first element can
    always be used as the default selection.  Malformed JSON still raises
    ``json.JSONDecodeError``.
    """
    fallback = ["Default"]
    if not path.exists():
        return fallback

    # The context manager closes the handle instead of leaking it.
    with open(path, "r", encoding="utf-8") as fh:
        speakers = json.load(fh)

    if not isinstance(speakers, list) or not speakers:
        print(f"WARNING: {path} does not contain a non-empty list; using the default speaker.")
        return fallback
    return speakers


# Optional speaker list shipped next to this script.
sp_file = Path(__file__).parent / "speakers.json"
SPEAKERS = load_speakers(sp_file)

def synthesize(text, speaker, emotion="Neutral"):
    """Generate speech for *text* and return the path of the written WAV file.

    Args:
        text: Text to be spoken.
        speaker: Speaker name inserted into the voice description prompt.
        emotion: Tone inserted into the voice description prompt.

    Returns:
        Path of a newly created ``.wav`` file in the system temp directory,
        or ``None`` when *text* is ``None``, empty, or whitespace-only.
    """
    if text is None or not text.strip():
        return None

    desc = f"{speaker}'s voice. Tone: {emotion}. Natural, clear speech, close mic."

    # Tokenize both prompts and move them to the model's device. If that
    # fails, log the reason and fall back to freshly tokenized, unmoved tensors.
    try:
        desc_ids = desc_tokenizer(desc, return_tensors="pt").to(device)
        text_ids = text_tokenizer(text, return_tensors="pt").to(device)
    except Exception:
        traceback.print_exc()
        desc_ids = desc_tokenizer(desc, return_tensors="pt")
        text_ids = text_tokenizer(text, return_tensors="pt")

    with torch.no_grad():
        try:
            audio = model.generate(
                input_ids=desc_ids.input_ids,
                attention_mask=desc_ids.attention_mask,
                prompt_input_ids=text_ids.input_ids,
                prompt_attention_mask=text_ids.attention_mask,
                max_length=20000,
            )
        except Exception:
            # Log the real failure before trying the alternative call form.
            # NOTE(review): this keyword-based fallback signature is kept from
            # the original code; confirm the model actually supports it.
            traceback.print_exc()
            audio = model.generate(description=desc, text=text)

    arr = audio.cpu().numpy().squeeze()
    # Integer PCM is rescaled to float32 by the dtype's maximum value.
    if np.issubdtype(arr.dtype, np.integer):
        arr = arr.astype("float32") / np.iinfo(arr.dtype).max

    # A unique file per call: a name derived from the text alone would let
    # requests with the same text but a different speaker/emotion overwrite
    # each other's output.
    with tempfile.NamedTemporaryFile(prefix="out_", suffix=".wav", delete=False) as tmp:
        out_path = tmp.name
    sf.write(out_path, arr, sampling_rate)
    return out_path

# Single-page interface: text, speaker and emotion controls feed `synthesize`,
# and the resulting audio file is shown in an audio component.
with gr.Blocks() as demo:
    gr.Markdown("# Indic Parler-TTS (69 Speakers)")

    text_box = gr.Textbox(
        label="Text",
        value="नमस्ते, यह एक परीक्षण वाक्य है।",
    )
    speaker_menu = gr.Dropdown(
        choices=SPEAKERS,
        label="Speaker",
        value=SPEAKERS[0],
    )
    emotion_menu = gr.Dropdown(
        choices=["Neutral", "Happy", "Sad", "Angry", "Narration"],
        label="Emotion",
        value="Neutral",
    )
    generate_button = gr.Button("Generate")
    audio_player = gr.Audio()

    # The inputs map positionally onto synthesize(text, speaker, emotion).
    generate_button.click(
        synthesize,
        [text_box, speaker_menu, emotion_menu],
        audio_player,
    )

# When executed as a script, serve on all interfaces at port 7860.
if __name__ == "__main__":
    demo.launch(server_port=7860, server_name="0.0.0.0")