yuripeyamashita committed
Commit 0902d62 · 1 Parent(s): 83c898e

feat: update models

Files changed (2)
  1. app.py +68 -25
  2. models.yaml +9 -0
app.py CHANGED
@@ -15,32 +15,55 @@ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokeni
#     shell=True,
# )

- device = "cuda" if torch.cuda.is_available() else "cpu"
- torch_dtype = torch.float16
- MODEL_NAME = "openai/whisper-large-v3-turbo"
-
- model = AutoModelForSpeechSeq2Seq.from_pretrained(
-     MODEL_NAME, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, attn_implementation="flash_attention_2"
- )
- model.to(device)
-
- processor = AutoProcessor.from_pretrained(MODEL_NAME)
- tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME)
-
- pipe = pipeline(
-     task="automatic-speech-recognition",
-     model=model,
-     tokenizer=tokenizer,
-     feature_extractor=processor.feature_extractor,
-     chunk_length_s=10,
-     torch_dtype=torch_dtype,
-     device=device,
- )
+ # device = "cuda" if torch.cuda.is_available() else "cpu"
+ # torch_dtype = torch.float16
+ # MODEL_NAME = "openai/whisper-large-v3-turbo"
+
+ # model = AutoModelForSpeechSeq2Seq.from_pretrained(
+ #     MODEL_NAME, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, attn_implementation="flash_attention_2"
+ # )
+ # model.to(device)
+
+ # processor = AutoProcessor.from_pretrained(MODEL_NAME)
+ # tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME)
+
+ # pipe = pipeline(
+ #     task="automatic-speech-recognition",
+ #     model=model,
+ #     tokenizer=tokenizer,
+ #     feature_extractor=processor.feature_extractor,
+ #     chunk_length_s=10,
+ #     torch_dtype=torch_dtype,
+ #     device=device,
+ # )
+
+ from omegaconf import OmegaConf
+
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+
+ def load_pipe(model_id: str):
+     return pipeline(
+         "automatic-speech-recognition",
+         model=model_id,
+         max_new_tokens=128,
+         chunk_length_s=30,
+         batch_size=8,
+         torch_dtype=torch_dtype,
+         device=device,
+     )
+
+
+ OmegaConf.register_new_resolver("load_pipe", load_pipe)
+ models_config = OmegaConf.to_object(OmegaConf.load("models.yaml"))
+ default_model_id = "whisper-large-v3"
+ model = models_config[default_model_id]["model"]

# @spaces.GPU


- def stream_transcribe(stream, new_chunk):
+ def stream_transcribe(stream, new_chunk, dialect_id):
    start_time = time.time()
    try:
        sr, y = new_chunk
@@ -57,7 +80,19 @@ def stream_transcribe(stream, new_chunk):
        else:
            stream = y

-         transcription = pipe({"sampling_rate": sr, "raw": stream})["text"]
+         generate_kwargs = {
+             "task": "transcribe",
+             "language": "Chinese",
+             "num_beams": 1,
+             "prompt_ids": torch.from_numpy(model.tokenizer.get_prompt_ids(dialect_id)).to(
+                 device
+             ),
+             "sampling_rate": sr,
+             "raw": stream
+         }
+
+         # transcription = pipe({"sampling_rate": sr, "raw": stream})["text"]
+         transcription = model(generate_kwargs=generate_kwargs)["text"]
        end_time = time.time()
        latency = end_time - start_time

@@ -78,15 +113,23 @@ def clear_state():
with gr.Blocks() as microphone:
    with gr.Column():
        gr.Markdown(
-             f"# Realtime Whisper Large V3 Turbo: \n Transcribe Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: The first token takes about 5 seconds. After that, it works flawlessly.")
+             f"# Realtime Hakka ASR: \n Note: The first token takes about 5 seconds. After that, it works flawlessly.")
        with gr.Row():
            input_audio_microphone = gr.Audio(streaming=True)
+             dialect_drop_down = gr.Dropdown(
+                 choices=[
+                     (k, v)
+                     for k, v in models_config[default_model_id]["dialect_mapping"].items()
+                 ],
+                 value=list(models_config[default_model_id]["dialect_mapping"].values())[0],
+                 label="腔調",
+             )
            output = gr.Textbox(label="Transcription", value="")
            latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
        with gr.Row():
            clear_button = gr.Button("Clear Output")
        state = gr.State()
-         input_audio_microphone.stream(stream_transcribe, [state, input_audio_microphone], [
+         input_audio_microphone.stream(stream_transcribe, [state, input_audio_microphone, dialect_drop_down], [
            state, output, latency_textbox], time_limit=30, stream_every=2, concurrency_limit=None)
        clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])
 
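The rewritten stream_transcribe conditions Whisper on the selected dialect by turning the dialect tag into prompt token ids with the tokenizer. Below is a minimal sketch of that prompt-ids pattern with a transformers ASR pipeline; it is an illustration under assumptions, not the committed code: the checkpoint id and dialect tag are taken from models.yaml, the silent audio array is a placeholder for a streamed microphone chunk, and the audio dict is passed as the pipeline input while only generation options go in generate_kwargs.

import numpy as np
import torch
from transformers import pipeline

# Load the Hakka Whisper checkpoint referenced in models.yaml (CPU here for simplicity).
asr = pipeline(
    "automatic-speech-recognition",
    model="formospeech/whisper-large-v3-taiwanese-hakka",
    torch_dtype=torch.float32,
    device="cpu",
)

# Turn a dialect tag from models.yaml into prompt token ids that steer decoding.
prompt_ids = torch.from_numpy(asr.tokenizer.get_prompt_ids("htia_sixian"))

# Placeholder audio: one second of silence at 16 kHz standing in for a mic chunk.
audio = np.zeros(16000, dtype=np.float32)

result = asr(
    {"sampling_rate": 16000, "raw": audio},
    generate_kwargs={
        "task": "transcribe",
        "language": "Chinese",
        "num_beams": 1,
        "prompt_ids": prompt_ids,
    },
)
print(result["text"])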
models.yaml ADDED
@@ -0,0 +1,9 @@
+ whisper-large-v3:
+   model: ${load_pipe:formospeech/whisper-large-v3-taiwanese-hakka}
+   dialect_mapping:
+     四縣: htia_sixian
+     海陸: htia_hailu
+     大埔: htia_dapu
+     饒平: htia_raoping
+     詔安: htia_zhaoan
+     南四縣: htia_nansixian
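models.yaml relies on an OmegaConf custom resolver: the "${load_pipe:...}" interpolation calls the load_pipe function registered in app.py with the checkpoint id as its string argument, so resolving the config with OmegaConf.to_object yields an entry whose model field is an already constructed pipeline. A self-contained toy sketch of that resolver mechanism, using an uppercasing resolver instead of loading a model:

from omegaconf import OmegaConf

# "${name:arg}" invokes the callable registered under "name" with the string "arg"
# when the config is resolved, e.g. by OmegaConf.to_object().
OmegaConf.register_new_resolver("upper", lambda s: s.upper())

cfg = OmegaConf.create({"greeting": "${upper:hello}"})
print(OmegaConf.to_object(cfg))  # {'greeting': 'HELLO'}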