yuripeyamashita committed · Commit 6e80e89
1 Parent(s): 123c6d0

feat: update app.py
README.md
CHANGED
```diff
@@ -1,15 +1,8 @@
-
-
-
-
-colorTo: red
+title: Realtime Hakka Asr
+emoji: 🐨
+colorFrom: gray
+colorTo: gray
 sdk: gradio
 sdk_version: 5.0.1
 app_file: app.py
 pinned: true
-tags:
-- whisper-event
-short_description: Realtime implementation of Whisper large turbo
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
```
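Assembled from the context and added lines above, the README front matter after this commit should read as follows (the diff suggests the `---` delimiters were dropped along with the `tags` and `short_description` fields):

```yaml
title: Realtime Hakka Asr
emoji: 🐨
colorFrom: gray
colorTo: gray
sdk: gradio
sdk_version: 5.0.1
app_file: app.py
pinned: true
```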
app.py
CHANGED
```diff
@@ -5,15 +5,15 @@ import tempfile
 import os
 import uuid
 import scipy.io.wavfile
-import time
+import time
 import numpy as np
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
-import subprocess
-subprocess.run(
-    "pip install flash-attn --no-build-isolation",
-    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
-    shell=True,
-)
+# import subprocess
+# subprocess.run(
+#     "pip install flash-attn --no-build-isolation",
+#     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+#     shell=True,
+# )
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float16
```
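The runtime `pip install flash-attn` step is now commented out. If FlashAttention-2 is still wanted, the more common route with 🤗 Transformers is to request it when the model is loaded; a minimal sketch, assuming `flash-attn` is already present in the image and that the checkpoint is `openai/whisper-large-v3-turbo` (the actual `MODEL_NAME` is defined outside this hunk):

```python
import torch
from transformers import AutoModelForSpeechSeq2Seq

MODEL_NAME = "openai/whisper-large-v3-turbo"  # assumption; defined elsewhere in app.py

try:
    # Request FlashAttention-2 at load time instead of installing it at runtime.
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        MODEL_NAME, torch_dtype=torch.float16, attn_implementation="flash_attention_2"
    )
except ImportError:
    # flash-attn not installed: fall back to the default attention implementation.
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        MODEL_NAME, torch_dtype=torch.float16
    )
```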
```diff
@@ -37,24 +37,26 @@ pipe = pipeline(
     device=device,
 )
 
-@spaces.GPU
+# @spaces.GPU
+
+
 def stream_transcribe(stream, new_chunk):
-    start_time = time.time()
+    start_time = time.time()
     try:
         sr, y = new_chunk
-
+
         # Convert to mono if stereo
         if y.ndim > 1:
             y = y.mean(axis=1)
-
+
         y = y.astype(np.float32)
         y /= np.max(np.abs(y))
-
+
         if stream is not None:
             stream = np.concatenate([stream, y])
         else:
             stream = y
-
+
         transcription = pipe({"sampling_rate": sr, "raw": stream})["text"]
         end_time = time.time()
         latency = end_time - start_time
```
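With `@spaces.GPU` commented out, `stream_transcribe` now runs as a plain function: each Gradio chunk is down-mixed to mono, peak-normalized, appended to the running buffer held in `stream`, and the whole buffer is re-transcribed. A standalone sketch of that buffering step (the helper name is illustrative, and a zero-peak guard is added that the original omits):

```python
import numpy as np

def accumulate_chunk(stream, new_chunk):
    sr, y = new_chunk              # Gradio streams (sample_rate, int16 ndarray) tuples
    if y.ndim > 1:                 # down-mix stereo to mono
        y = y.mean(axis=1)
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:                   # avoid division by zero on silent chunks
        y /= peak
    return y if stream is None else np.concatenate([stream, y])

# Usage sketch: two fake 0.1 s chunks at 16 kHz.
buf = None
for chunk in [(16000, np.ones(1600, dtype=np.int16)),
              (16000, np.full(1600, 2, dtype=np.int16))]:
    buf = accumulate_chunk(buf, chunk)
print(buf.shape)  # (3200,) — the buffer grows with every chunk
```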
```diff
@@ -64,52 +66,19 @@ def stream_transcribe(stream, new_chunk):
         print(f"Error during Transcription: {e}")
         return stream, e, "Error"
 
-@spaces.GPU
-def transcribe(inputs, previous_transcription):
-    start_time = time.time()
-    try:
-        filename = f"{uuid.uuid4().hex}.wav"
-        sample_rate, audio_data = inputs
-        scipy.io.wavfile.write(filename, sample_rate, audio_data)
-
-        transcription = pipe(filename)["text"]
-        previous_transcription += transcription
-
-        end_time = time.time()
-        latency = end_time - start_time
-        return previous_transcription, f"{latency:.2f}"
-    except Exception as e:
-        print(f"Error during Transcription: {e}")
-        return previous_transcription, "Error"
-
-@spaces.GPU
-def translate_and_transcribe(inputs, previous_transcription, target_language):
-    start_time = time.time()
-    try:
-        filename = f"{uuid.uuid4().hex}.wav"
-        sample_rate, audio_data = inputs
-        scipy.io.wavfile.write(filename, sample_rate, audio_data)
-
-        translation = pipe(filename, generate_kwargs={"task": "translate", "language": target_language} )["text"]
-
-        previous_transcription += translation
-
-        end_time = time.time()
-        latency = end_time - start_time
-        return previous_transcription, f"{latency:.2f}"
-    except Exception as e:
-        print(f"Error during Translation and Transcription: {e}")
-        return previous_transcription, "Error"
 
 def clear():
     return ""
 
+
 def clear_state():
     return None
 
+
 with gr.Blocks() as microphone:
     with gr.Column():
-        gr.Markdown(f"# Realtime Whisper Large V3 Turbo: \n Transcribe Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: The first token takes about 5 seconds. After that, it works flawlessly.")
+        gr.Markdown(
+            f"# Realtime Whisper Large V3 Turbo: \n Transcribe Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: The first token takes about 5 seconds. After that, it works flawlessly.")
         with gr.Row():
             input_audio_microphone = gr.Audio(streaming=True)
             output = gr.Textbox(label="Transcription", value="")
```
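The deleted `transcribe` and `translate_and_transcribe` helpers shared one pattern: write the `(sample_rate, ndarray)` input to a uniquely named wav file, then hand the path to the pipeline (for translation, with `generate_kwargs={"task": "translate", ...}`). A compact sketch of that pattern; the helper name is hypothetical, and the cleanup step is added here since the originals never deleted their temp files:

```python
import os
import uuid

import scipy.io.wavfile

def transcribe_file(pipe, inputs):
    # Hypothetical helper mirroring the removed transcribe().
    sample_rate, audio_data = inputs
    filename = f"{uuid.uuid4().hex}.wav"          # unique name per request
    scipy.io.wavfile.write(filename, sample_rate, audio_data)
    try:
        return pipe(filename)["text"]             # the ASR pipeline accepts a file path
    finally:
        os.remove(filename)                       # added: the originals leaked these files
```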
```diff
@@ -117,49 +86,12 @@ with gr.Blocks() as microphone:
         with gr.Row():
             clear_button = gr.Button("Clear Output")
         state = gr.State()
-        input_audio_microphone.stream(stream_transcribe, [state, input_audio_microphone], [
+        input_audio_microphone.stream(stream_transcribe, [state, input_audio_microphone], [
+                                      state, output, latency_textbox], time_limit=30, stream_every=2, concurrency_limit=None)
         clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])
 
-with gr.Blocks() as file:
-    with gr.Column():
-        gr.Markdown(f"# Realtime Whisper Large V3 Turbo: \n Transcribe Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: The first token takes about 5 seconds. After that, it works flawlessly.")
-        with gr.Row():
-            input_audio_microphone = gr.Audio(sources="upload", type="numpy")
-            output = gr.Textbox(label="Transcription", value="")
-            latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
-        with gr.Row():
-            submit_button = gr.Button("Submit")
-            clear_button = gr.Button("Clear Output")
-
-        submit_button.click(transcribe, [input_audio_microphone, output], [output, latency_textbox], concurrency_limit=None)
-        clear_button.click(clear, outputs=[output])
-
-# with gr.Blocks() as translate:
-#     with gr.Column():
-#         gr.Markdown(f"# Realtime Whisper Large V3 Turbo (Translation): \n Transcribe and Translate Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: The first token takes about 5 seconds. After that, it works flawlessly.")
-#         with gr.Row():
-#             input_audio_microphone = gr.Audio(streaming=True)
-#             output = gr.Textbox(label="Transcription and Translation", value="")
-#             latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
-#             target_language_dropdown = gr.Dropdown(
-#                 choices=["english", "french", "hindi", "spanish", "russian"],
-#                 label="Target Language",
-#                 value="<|es|>"
-#             )
-#         with gr.Row():
-#             clear_button = gr.Button("Clear Output")
-
-#     input_audio_microphone.stream(
-#         translate_and_transcribe,
-#         [input_audio_microphone, output, target_language_dropdown],
-#         [output, latency_textbox],
-#         time_limit=45,
-#         stream_every=2,
-#         concurrency_limit=None
-#     )
-#     clear_button.click(clear, outputs=[output])
 
 with gr.Blocks(theme=gr.themes.Ocean()) as demo:
-    gr.TabbedInterface([microphone
+    gr.TabbedInterface([microphone], ["Microphone"])
 
-demo.launch()
+demo.launch()
```
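Only the microphone tab survives the commit; the upload tab and the commented-out translation tab are removed. The wiring is Gradio's streaming event: `stream_every=2` re-invokes the callback every two seconds, `time_limit=30` caps a streaming session, and `gr.State()` carries the accumulated buffer between calls. A minimal self-contained sketch of the same wiring, with a sample counter standing in for the real transcriber (callback name and labels are illustrative):

```python
import gradio as gr

def count_samples(stream, new_chunk):
    # Stand-in for stream_transcribe: keep a running sample count in state.
    sr, y = new_chunk
    total = (stream or 0) + len(y)
    return total, f"{total} samples @ {sr} Hz"

with gr.Blocks() as demo:
    mic = gr.Audio(streaming=True)
    out = gr.Textbox(label="Output")
    state = gr.State()
    # Same shape as above: state in/out, fire every 2 s, cap a session at 30 s.
    mic.stream(count_samples, [state, mic], [state, out],
               time_limit=30, stream_every=2, concurrency_limit=None)

demo.launch()
```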