Spaces:
Sleeping
| import os, torchaudio, torch, gradio as gr | |
| from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC | |
| from pyctcdecode import build_ctcdecoder | |
| from huggingface_hub import hf_hub_download | |
| from pyannote.audio import Pipeline | |
# --- One-time model and pipeline initialization (runs at startup) ---

# Hindi wav2vec2 CTC checkpoint bundled with a 4-gram KenLM.
MODEL_ID = "vikram-iitm/indic-asr-hi-kenlm4"
# Access token for the gated pyannote diarization pipeline.
HF_TOKEN = os.getenv("HF_TOKEN")

processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID).eval()

# Download the KenLM binary and build a beam-search CTC decoder.
# The decoder's label list must be ordered by tokenizer vocabulary index.
lm_path = hf_hub_download(repo_id=MODEL_ID, filename="lm.binary")
vocab = processor.tokenizer.get_vocab()
labels = [token for token, idx in sorted(vocab.items(), key=lambda item: item[1])]
decoder = build_ctcdecoder(labels, kenlm_model_path=lm_path)

pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.0", use_auth_token=HF_TOKEN)
def diarize_asr(path):
    """Diarize an audio file and transcribe each speaker turn with the CTC+LM decoder.

    Args:
        path: Filesystem path to the input audio (any format torchaudio can load).

    Returns:
        A newline-joined string with one line per diarized turn:
        "[start-end] Speaker <label>: <transcript>".
    """
    diar = pipeline(path)
    wav, sr = torchaudio.load(path)
    # The acoustic model was trained on 16 kHz audio.
    if sr != 16000:
        wav = torchaudio.functional.resample(wav, sr, 16000)
    # Down-mix multi-channel input so .squeeze() below yields a 1-D signal;
    # otherwise stereo files would hand the processor a 2-D tensor.
    if wav.size(0) > 1:
        wav = wav.mean(dim=0, keepdim=True)
    out = []
    for turn, _, speaker in diar.itertracks(yield_label=True):
        s = int(turn.start * 16000)
        e = int(turn.end * 16000)
        seg = wav[:, s:e]
        # Diarization can emit zero-length turns at clip boundaries; skip them
        # instead of crashing the feature extractor on an empty tensor.
        if seg.numel() == 0:
            continue
        iv = processor(seg.squeeze(), sampling_rate=16000, return_tensors="pt", padding=True)
        with torch.no_grad():
            logits = model(**iv).logits[0].cpu().numpy()
        # BUG FIX: pyctcdecode's decode() expects the full (time, vocab) logits
        # matrix so it can run beam search against the KenLM. The original passed
        # logits.argmax(-1) (greedy token ids), which bypassed the language model
        # and produced garbage output.
        text = decoder.decode(logits)
        out.append(f"[{turn.start:.1f}-{turn.end:.1f}] Speaker {speaker}: {text}")
    return "\n".join(out)
# Build and serve the demo UI: audio in (microphone or file upload), text out.
demo = gr.Interface(
    fn=diarize_asr,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs="text",
    title="Hindi ASR + Speaker Diarization",
)
demo.launch()