import os

import gradio as gr
import torch
import torchaudio
from huggingface_hub import hf_hub_download
from pyannote.audio import Pipeline
from pyctcdecode import build_ctcdecoder
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
MODEL_ID = "vikram-iitm/indic-asr-hi-kenlm4"
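# Assumption: HF_TOKEN is set as a Space secret; it is needed below to load
# the gated pyannote diarization pipeline.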
HF_TOKEN = os.getenv("HF_TOKEN")
# Load the acoustic model and its processor (feature extractor + tokenizer).
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID).eval()

# The KenLM binary is stored alongside the model weights in the same repo.
lm_path = hf_hub_download(repo_id=MODEL_ID, filename="lm.binary")
# Vocabulary tokens sorted by ID, the label order pyctcdecode expects.
labels = [t for t, _ in sorted(processor.tokenizer.get_vocab().items(), key=lambda x: x[1])]
decoder = build_ctcdecoder(labels, kenlm_model_path=lm_path)
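# Note: decoder.decode() runs a beam search rescored with the KenLM model;
# decoder.decode_beams() additionally returns word-level frame offsets, should
# per-word timestamps inside each turn ever be needed.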
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.0", use_auth_token=HF_TOKEN)
def diarize_asr(path):
    # Diarize first, then transcribe each speaker turn with the CTC model.
    diar = pipeline(path)
    wav, sr = torchaudio.load(path)
    if sr != 16000:
        wav = torchaudio.functional.resample(wav, sr, 16000)
    # Mix down to mono so multi-channel uploads don't break the processor.
    wav = wav.mean(dim=0)
    out = []
    for turn, _, speaker in diar.itertracks(yield_label=True):
        s = int(turn.start * 16000)
        e = int(turn.end * 16000)
        seg = wav[s:e]
        if seg.numel() == 0:  # skip zero-length turns produced by rounding
            continue
        iv = processor(seg.numpy(), sampling_rate=16000, return_tensors="pt", padding=True)
        with torch.no_grad():
            logits = model(**iv).logits[0].cpu().numpy()
        # pyctcdecode's decode() takes the full (time, vocab) logit matrix;
        # passing argmax token IDs (as before) breaks the KenLM beam search.
        text = decoder.decode(logits)
        out.append(f"[{turn.start:.1f}-{turn.end:.1f}] Speaker {speaker}: {text}")
    return "\n".join(out)
# Gradio UI: record or upload audio, return the speaker-labelled transcript.
gr.Interface(
    fn=diarize_asr,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs="text",
    title="Hindi ASR + Speaker Diarization",
).launch()
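
# Quick local check bypassing the UI (hypothetical sample path):
#   print(diarize_asr("sample_hi.wav"))
# When running outside Spaces, .launch(share=True) gives a temporary public URL.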