yuripeyamashita committed
Commit 0902d62 · 1 Parent(s): 83c898e

feat: update models

Files changed (2)
  1. app.py +68 -25
  2. models.yaml +9 -0
app.py CHANGED
@@ -15,32 +15,55 @@ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokeni
#     shell=True,
# )

- device = "cuda" if torch.cuda.is_available() else "cpu"
- torch_dtype = torch.float16
- MODEL_NAME = "openai/whisper-large-v3-turbo"
-
- model = AutoModelForSpeechSeq2Seq.from_pretrained(
-     MODEL_NAME, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, attn_implementation="flash_attention_2"
- )
- model.to(device)
-
- processor = AutoProcessor.from_pretrained(MODEL_NAME)
- tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME)
-
- pipe = pipeline(
-     task="automatic-speech-recognition",
-     model=model,
-     tokenizer=tokenizer,
-     feature_extractor=processor.feature_extractor,
-     chunk_length_s=10,
-     torch_dtype=torch_dtype,
-     device=device,
- )
+ # device = "cuda" if torch.cuda.is_available() else "cpu"
+ # torch_dtype = torch.float16
+ # MODEL_NAME = "openai/whisper-large-v3-turbo"
+
+ # model = AutoModelForSpeechSeq2Seq.from_pretrained(
+ #     MODEL_NAME, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, attn_implementation="flash_attention_2"
+ # )
+ # model.to(device)
+
+ # processor = AutoProcessor.from_pretrained(MODEL_NAME)
+ # tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME)
+
+ # pipe = pipeline(
+ #     task="automatic-speech-recognition",
+ #     model=model,
+ #     tokenizer=tokenizer,
+ #     feature_extractor=processor.feature_extractor,
+ #     chunk_length_s=10,
+ #     torch_dtype=torch_dtype,
+ #     device=device,
+ # )
+
+ from omegaconf import OmegaConf
+
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+
+ def load_pipe(model_id: str):
+     return pipeline(
+         "automatic-speech-recognition",
+         model=model_id,
+         max_new_tokens=128,
+         chunk_length_s=30,
+         batch_size=8,
+         torch_dtype=torch_dtype,
+         device=device,
+     )
+
+
+ OmegaConf.register_new_resolver("load_pipe", load_pipe)
+ models_config = OmegaConf.to_object(OmegaConf.load("models.yaml"))
+ default_model_id = "whisper-large-v3"
+ model = models_config[default_model_id]["model"]

# @spaces.GPU


- def stream_transcribe(stream, new_chunk):
+ def stream_transcribe(stream, new_chunk, dialect_id):
    start_time = time.time()
    try:
        sr, y = new_chunk
@@ -57,7 +80,19 @@ def stream_transcribe(stream, new_chunk):
        else:
            stream = y

-         transcription = pipe({"sampling_rate": sr, "raw": stream})["text"]
+         generate_kwargs = {
+             "task": "transcribe",
+             "language": "Chinese",
+             "num_beams": 1,
+             "prompt_ids": torch.from_numpy(model.tokenizer.get_prompt_ids(dialect_id)).to(
+                 device
+             ),
+             "sampling_rate": sr,
+             "raw": stream
+         }
+
+         # transcription = pipe({"sampling_rate": sr, "raw": stream})["text"]
+         transcription = model(generate_kwargs=generate_kwargs)["text"]
        end_time = time.time()
        latency = end_time - start_time

@@ -78,15 +113,23 @@ def clear_state():
with gr.Blocks() as microphone:
    with gr.Column():
        gr.Markdown(
-             f"# Realtime Whisper Large V3 Turbo: \n Transcribe Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: The first token takes about 5 seconds. After that, it works flawlessly.")
+             f"# Realtime Hakka ASR: \n Note: The first token takes about 5 seconds. After that, it works flawlessly.")
        with gr.Row():
            input_audio_microphone = gr.Audio(streaming=True)
+             dialect_drop_down = gr.Dropdown(
+                 choices=[
+                     (k, v)
+                     for k, v in models_config[default_model_id]["dialect_mapping"].items()
+                 ],
+                 value=list(models_config[default_model_id]["dialect_mapping"].values())[0],
+                 label="腔調",
+             )
            output = gr.Textbox(label="Transcription", value="")
            latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
        with gr.Row():
            clear_button = gr.Button("Clear Output")
        state = gr.State()
-         input_audio_microphone.stream(stream_transcribe, [state, input_audio_microphone], [
+         input_audio_microphone.stream(stream_transcribe, [state, input_audio_microphone, dialect_drop_down], [
            state, output, latency_textbox], time_limit=30, stream_every=2, concurrency_limit=None)
        clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])
 
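The rewritten stream_transcribe conditions Whisper on the selected dialect by turning the dialect tag into prompt token ids with the tokenizer. Below is a minimal sketch of that prompt-ids pattern with a transformers ASR pipeline; it is an illustration under assumptions, not the committed code: the checkpoint id and dialect tag are taken from models.yaml, the silent audio array is a placeholder for a streamed microphone chunk, and the audio dict is passed as the pipeline input while only generation options go in generate_kwargs.

import numpy as np
import torch
from transformers import pipeline

# Load the Hakka Whisper checkpoint referenced in models.yaml (CPU here for simplicity).
asr = pipeline(
    "automatic-speech-recognition",
    model="formospeech/whisper-large-v3-taiwanese-hakka",
    torch_dtype=torch.float32,
    device="cpu",
)

# Turn a dialect tag from models.yaml into prompt token ids that steer decoding.
prompt_ids = torch.from_numpy(asr.tokenizer.get_prompt_ids("htia_sixian"))

# Placeholder audio: one second of silence at 16 kHz standing in for a mic chunk.
audio = np.zeros(16000, dtype=np.float32)

result = asr(
    {"sampling_rate": 16000, "raw": audio},
    generate_kwargs={
        "task": "transcribe",
        "language": "Chinese",
        "num_beams": 1,
        "prompt_ids": prompt_ids,
    },
)
print(result["text"])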
models.yaml ADDED
@@ -0,0 +1,9 @@
+ whisper-large-v3:
+   model: ${load_pipe:formospeech/whisper-large-v3-taiwanese-hakka}
+   dialect_mapping:
+     四縣: htia_sixian
+     海陸: htia_hailu
+     大埔: htia_dapu
+     饒平: htia_raoping
+     詔安: htia_zhaoan
+     南四縣: htia_nansixian
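models.yaml relies on an OmegaConf custom resolver: the "${load_pipe:...}" interpolation calls the load_pipe function registered in app.py with the checkpoint id as its string argument, so resolving the config with OmegaConf.to_object yields an entry whose model field is an already constructed pipeline. A self-contained toy sketch of that resolver mechanism, using an uppercasing resolver instead of loading a model:

from omegaconf import OmegaConf

# "${name:arg}" invokes the callable registered under "name" with the string "arg"
# when the config is resolved, e.g. by OmegaConf.to_object().
OmegaConf.register_new_resolver("upper", lambda s: s.upper())

cfg = OmegaConf.create({"greeting": "${upper:hello}"})
print(OmegaConf.to_object(cfg))  # {'greeting': 'HELLO'}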