vikram-iitm committed on
Commit
c44777b
·
1 Parent(s): 79b555b

Add app and requirements

Browse files
Files changed (2) hide show
  1. app.py +40 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, torchaudio, torch, gradio as gr
2
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
3
+ from pyctcdecode import build_ctcdecoder
4
+ from huggingface_hub import hf_hub_download
5
+ from pyannote.audio import Pipeline
6
+
7
# Hub repository that supplies the processor, the CTC acoustic model and the
# KenLM binary ("lm.binary") used for decoding.
MODEL_ID = "vikram-iitm/indic-asr-hi-kenlm4"
# Token handed to the pyannote pipeline; None when the variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Acoustic model and its processor, with the model switched to eval mode.
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
model = model.eval()

# Token strings ordered by ascending token id, as the decoder's label list.
_vocab = processor.tokenizer.get_vocab()
labels = sorted(_vocab, key=_vocab.get)

# CTC decoder built from those labels and the downloaded KenLM model.
lm_path = hf_hub_download(repo_id=MODEL_ID, filename="lm.binary")
decoder = build_ctcdecoder(labels, kenlm_model_path=lm_path)

# Speaker-diarization pipeline, authenticated with HF_TOKEN.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.0",
    use_auth_token=HF_TOKEN,
)
17
+
18
def diarize_asr(path):
    """Diarize an audio file and transcribe each speaker turn.

    The module-level ``pipeline`` splits the recording into speaker turns;
    each turn is cut out of the waveform, run through ``model`` and decoded
    with ``decoder``.

    Args:
        path: Filesystem path of the audio file. ``None`` or an empty string
            (nothing recorded or uploaded) yields an empty transcript.

    Returns:
        One line per transcribed turn, formatted as
        ``[start-end] Speaker <label>: <text>`` and joined with newlines.
        Turns too short to transcribe are omitted.
    """
    if not path:
        return ""

    target_sr = 16000
    # Wav2Vec2's convolutional front end needs roughly 25 ms of audio
    # (400 samples at 16 kHz) to emit a single frame; shorter inputs make the
    # forward pass fail, so such turns are skipped.
    min_samples = 400

    diar = pipeline(path)
    wav, sr = torchaudio.load(path)
    # Downmix to one channel. squeeze() would leave stereo audio 2-D and the
    # processor would then treat the channels as a batch of utterances.
    mono = wav.mean(dim=0)
    if sr != target_sr:
        mono = torchaudio.functional.resample(mono, sr, target_sr)
    total = mono.shape[-1]

    out = []
    for turn, _, speaker in diar.itertracks(yield_label=True):
        # Clamp to the waveform so a turn boundary slightly outside the audio
        # cannot produce an empty or wrapped-around slice.
        s = max(int(turn.start * target_sr), 0)
        e = min(int(turn.end * target_sr), total)
        if e - s < min_samples:
            continue
        iv = processor(
            mono[s:e].numpy(),
            sampling_rate=target_sr,
            return_tensors="pt",
            padding=True,
        )
        with torch.no_grad():
            logits = model(**iv).logits[0].cpu().numpy()
        # The decoder takes the full (frames x vocab) logit matrix; passing
        # argmax token ids would bypass or break the LM beam search.
        text = decoder.decode(logits)
        out.append(f"[{turn.start:.1f}-{turn.end:.1f}] Speaker {speaker}: {text}")
    return "\n".join(out)
34
+
35
# Audio widget that accepts a microphone recording or an uploaded file and
# passes the callback a file path.
audio_input = gr.Audio(sources=["microphone","upload"], type="filepath")

# Single-function UI: audio in, diarized transcript out as plain text.
demo = gr.Interface(
    fn=diarize_asr,
    inputs=audio_input,
    outputs="text",
    title="Hindi ASR + Speaker Diarization",
)
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ torch
2
+ torchaudio
3
+ transformers
4
+ pyctcdecode
5
+ kenlm
6
+ soundfile
7
+ gradio
8
+ pyannote.audio