yuripeyamashita committed
Commit 6e80e89 · Parent: 123c6d0

feat: update app.py

Files changed (2)
  1. README.md +4 -11
  2. app.py +23 -91
README.md CHANGED
@@ -1,15 +1,8 @@
- ---
- title: Realtime Whisper Turbo
- emoji: 🤯
- colorFrom: indigo
- colorTo: red
+ title: Realtime Hakka Asr
+ emoji: 🐨
+ colorFrom: gray
+ colorTo: gray
  sdk: gradio
  sdk_version: 5.0.1
  app_file: app.py
  pinned: true
- tags:
- - whisper-event
- short_description: Realtime implementation of Whisper large turbo
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
app.py CHANGED
@@ -5,15 +5,15 @@ import tempfile
  import os
  import uuid
  import scipy.io.wavfile
  import time
  import numpy as np
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
- import subprocess
- subprocess.run(
-     "pip install flash-attn --no-build-isolation",
-     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
-     shell=True,
- )
+ # import subprocess
+ # subprocess.run(
+ #     "pip install flash-attn --no-build-isolation",
+ #     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+ #     shell=True,
+ # )
 
  device = "cuda" if torch.cuda.is_available() else "cpu"
  torch_dtype = torch.float16
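
Note: the runtime `pip install flash-attn` workaround is commented out rather than replaced. If FlashAttention is still wanted, the more conventional route is to ship `flash-attn` via `requirements.txt` and request it when the checkpoint is loaded. A minimal sketch, assuming the package is preinstalled and the GPU supports it (the `MODEL_NAME` value below is a placeholder; app.py defines its own):

```python
# Sketch only, not part of this commit: request FlashAttention 2 at load
# time instead of installing flash-attn with subprocess at runtime.
import torch
from transformers import AutoModelForSpeechSeq2Seq

MODEL_NAME = "openai/whisper-large-v3-turbo"  # placeholder checkpoint id

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    attn_implementation="flash_attention_2",  # needs flash-attn installed
)
```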
@@ -37,24 +37,26 @@ pipe = pipeline(
      device=device,
  )
 
- @spaces.GPU
+ # @spaces.GPU
+
+
  def stream_transcribe(stream, new_chunk):
      start_time = time.time()
      try:
          sr, y = new_chunk
 
          # Convert to mono if stereo
          if y.ndim > 1:
              y = y.mean(axis=1)
 
          y = y.astype(np.float32)
          y /= np.max(np.abs(y))
 
          if stream is not None:
              stream = np.concatenate([stream, y])
          else:
              stream = y
 
          transcription = pipe({"sampling_rate": sr, "raw": stream})["text"]
          end_time = time.time()
          latency = end_time - start_time
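
Note: two details in this hunk. Commenting out `@spaces.GPU` means the function no longer requests a ZeroGPU slice; on a Space with dedicated GPU hardware the decorator is unnecessary, which is presumably why it went. Separately, the untouched line `y /= np.max(np.abs(y))` divides by zero on an all-silent chunk and feeds NaNs to the model. A guarded sketch, not part of this commit:

```python
import numpy as np

y = np.zeros(16000, dtype=np.float32)  # example: one silent chunk
peak = np.max(np.abs(y))
if peak > 0:        # normalize only when there is signal
    y = y / peak    # the unguarded in-place divide would produce NaNs here
```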
@@ -64,52 +66,19 @@ def stream_transcribe(stream, new_chunk):
          print(f"Error during Transcription: {e}")
          return stream, e, "Error"
 
- @spaces.GPU
- def transcribe(inputs, previous_transcription):
-     start_time = time.time()
-     try:
-         filename = f"{uuid.uuid4().hex}.wav"
-         sample_rate, audio_data = inputs
-         scipy.io.wavfile.write(filename, sample_rate, audio_data)
-
-         transcription = pipe(filename)["text"]
-         previous_transcription += transcription
-
-         end_time = time.time()
-         latency = end_time - start_time
-         return previous_transcription, f"{latency:.2f}"
-     except Exception as e:
-         print(f"Error during Transcription: {e}")
-         return previous_transcription, "Error"
-
- @spaces.GPU
- def translate_and_transcribe(inputs, previous_transcription, target_language):
-     start_time = time.time()
-     try:
-         filename = f"{uuid.uuid4().hex}.wav"
-         sample_rate, audio_data = inputs
-         scipy.io.wavfile.write(filename, sample_rate, audio_data)
-
-         translation = pipe(filename, generate_kwargs={"task": "translate", "language": target_language} )["text"]
-
-         previous_transcription += translation
-
-         end_time = time.time()
-         latency = end_time - start_time
-         return previous_transcription, f"{latency:.2f}"
-     except Exception as e:
-         print(f"Error during Translation and Transcription: {e}")
-         return previous_transcription, "Error"
+
 
  def clear():
      return ""
 
+
  def clear_state():
      return None
 
+
  with gr.Blocks() as microphone:
      with gr.Column():
-         gr.Markdown(f"# Realtime Whisper Large V3 Turbo: \n Transcribe Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: The first token takes about 5 seconds. After that, it works flawlessly.")
+         gr.Markdown(
+             f"# Realtime Whisper Large V3 Turbo: \n Transcribe Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: The first token takes about 5 seconds. After that, it works flawlessly.")
      with gr.Row():
          input_audio_microphone = gr.Audio(streaming=True)
          output = gr.Textbox(label="Transcription", value="")
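
Note: with `transcribe` and `translate_and_transcribe` deleted, nothing left in the diff uses `uuid` or `scipy.io.wavfile`, yet hunk 1 keeps both imports. Assuming no other callers in the unshown parts of app.py, the header could shrink to roughly this sketch:

```python
# Sketch of a trimmed import block; assumes uuid, scipy.io.wavfile and
# tempfile have no remaining callers outside the hunks shown here.
import gradio as gr
import torch
import time
import numpy as np
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
```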
@@ -117,49 +86,12 @@ with gr.Blocks() as microphone:
      with gr.Row():
          clear_button = gr.Button("Clear Output")
      state = gr.State()
-     input_audio_microphone.stream(stream_transcribe, [state, input_audio_microphone], [state, output, latency_textbox], time_limit=30, stream_every=2, concurrency_limit=None)
+     input_audio_microphone.stream(stream_transcribe, [state, input_audio_microphone], [
+         state, output, latency_textbox], time_limit=30, stream_every=2, concurrency_limit=None)
      clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])
 
- with gr.Blocks() as file:
-     with gr.Column():
-         gr.Markdown(f"# Realtime Whisper Large V3 Turbo: \n Transcribe Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: The first token takes about 5 seconds. After that, it works flawlessly.")
-     with gr.Row():
-         input_audio_microphone = gr.Audio(sources="upload", type="numpy")
-         output = gr.Textbox(label="Transcription", value="")
-         latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
-     with gr.Row():
-         submit_button = gr.Button("Submit")
-         clear_button = gr.Button("Clear Output")
-
-     submit_button.click(transcribe, [input_audio_microphone, output], [output, latency_textbox], concurrency_limit=None)
-     clear_button.click(clear, outputs=[output])
-
- # with gr.Blocks() as translate:
- #     with gr.Column():
- #         gr.Markdown(f"# Realtime Whisper Large V3 Turbo (Translation): \n Transcribe and Translate Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: The first token takes about 5 seconds. After that, it works flawlessly.")
- #     with gr.Row():
- #         input_audio_microphone = gr.Audio(streaming=True)
- #         output = gr.Textbox(label="Transcription and Translation", value="")
- #         latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
- #         target_language_dropdown = gr.Dropdown(
- #             choices=["english", "french", "hindi", "spanish", "russian"],
- #             label="Target Language",
- #             value="<|es|>"
- #         )
- #     with gr.Row():
- #         clear_button = gr.Button("Clear Output")
-
- #     input_audio_microphone.stream(
- #         translate_and_transcribe,
- #         [input_audio_microphone, output, target_language_dropdown],
- #         [output, latency_textbox],
- #         time_limit=45,
- #         stream_every=2,
- #         concurrency_limit=None
- #     )
- #     clear_button.click(clear, outputs=[output])
 
  with gr.Blocks(theme=gr.themes.Ocean()) as demo:
-     gr.TabbedInterface([microphone, file], ["Microphone", "Transcribe from file"])
+     gr.TabbedInterface([microphone], ["Microphone"])
 
  demo.launch()
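
Note on the surviving design: `stream_transcribe` appends every chunk to `stream` and re-transcribes the whole recording every `stream_every=2` seconds, so per-call cost grows until the `time_limit=30` cutoff ends the stream. If longer sessions are ever wanted, capping the buffer keeps each pipeline call bounded. A sketch with a hypothetical `MAX_SECONDS` constant:

```python
import numpy as np

MAX_SECONDS = 30   # hypothetical cap, mirroring the stream's time_limit
sr = 16000         # example sampling rate reported with the chunk

stream = np.zeros(sr * 45, dtype=np.float32)  # stand-in accumulated audio
max_len = MAX_SECONDS * sr
if stream.shape[0] > max_len:
    stream = stream[-max_len:]  # keep only the most recent MAX_SECONDS
```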
 