yuripeyamashita committed · Commit 0902d62
1 Parent(s): 83c898e

feat: update models

Files changed:
- app.py +68 -25
- models.yaml +9 -0
app.py CHANGED

@@ -15,32 +15,55 @@ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer
 # shell=True,
 # )
 
-device = "cuda" if torch.cuda.is_available() else "cpu"
-torch_dtype = torch.float16
-MODEL_NAME = "openai/whisper-large-v3-turbo"
-
-model = AutoModelForSpeechSeq2Seq.from_pretrained(
-    MODEL_NAME, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, attn_implementation="flash_attention_2"
-)
-model.to(device)
-
-processor = AutoProcessor.from_pretrained(MODEL_NAME)
-tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME)
-
-pipe = pipeline(
-    task="automatic-speech-recognition",
-    model=model,
-    tokenizer=tokenizer,
-    feature_extractor=processor.feature_extractor,
-    chunk_length_s=10,
-    torch_dtype=torch_dtype,
-    device=device,
-)
+# device = "cuda" if torch.cuda.is_available() else "cpu"
+# torch_dtype = torch.float16
+# MODEL_NAME = "openai/whisper-large-v3-turbo"
+
+# model = AutoModelForSpeechSeq2Seq.from_pretrained(
+#     MODEL_NAME, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, attn_implementation="flash_attention_2"
+# )
+# model.to(device)
+
+# processor = AutoProcessor.from_pretrained(MODEL_NAME)
+# tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME)
+
+# pipe = pipeline(
+#     task="automatic-speech-recognition",
+#     model=model,
+#     tokenizer=tokenizer,
+#     feature_extractor=processor.feature_extractor,
+#     chunk_length_s=10,
+#     torch_dtype=torch_dtype,
+#     device=device,
+# )
+
+from omegaconf import OmegaConf
+
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+
+def load_pipe(model_id: str):
+    return pipeline(
+        "automatic-speech-recognition",
+        model=model_id,
+        max_new_tokens=128,
+        chunk_length_s=30,
+        batch_size=8,
+        torch_dtype=torch_dtype,
+        device=device,
+    )
+
+
+OmegaConf.register_new_resolver("load_pipe", load_pipe)
+models_config = OmegaConf.to_object(OmegaConf.load("models.yaml"))
+default_model_id = "whisper-large-v3"
+model = models_config[default_model_id]["model"]
 
 # @spaces.GPU
 
 
-def stream_transcribe(stream, new_chunk):
+def stream_transcribe(stream, new_chunk, dialect_id):
     start_time = time.time()
     try:
         sr, y = new_chunk
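The hunk above swaps the hard-coded Whisper pipeline for a registry driven by models.yaml and a custom OmegaConf resolver. As a hedged illustration of that pattern (a toy config and a stub factory, not the Space's real files), the ${load_pipe:...} interpolation calls the registered function with the string after the colon when the config is resolved:

from omegaconf import OmegaConf

def load_pipe(model_id: str):
    # Stub factory standing in for the real transformers.pipeline(...) call,
    # so this sketch runs without downloading a model.
    return f"<asr pipeline for {model_id}>"

OmegaConf.register_new_resolver("load_pipe", load_pipe)
cfg = OmegaConf.create("demo:\n  model: ${load_pipe:some/model-id}")
print(OmegaConf.to_object(cfg)["demo"]["model"])  # <asr pipeline for some/model-id>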
@@ -57,7 +80,19 @@ def stream_transcribe(stream, new_chunk):
         else:
             stream = y
 
-        transcription = pipe({"sampling_rate": sr, "raw": stream})["text"]
+        generate_kwargs = {
+            "task": "transcribe",
+            "language": "Chinese",
+            "num_beams": 1,
+            "prompt_ids": torch.from_numpy(model.tokenizer.get_prompt_ids(dialect_id)).to(
+                device
+            ),
+            "sampling_rate": sr,
+            "raw": stream
+        }
+
+        # transcription = pipe({"sampling_rate": sr, "raw": stream})["text"]
+        transcription = model(generate_kwargs=generate_kwargs)["text"]
         end_time = time.time()
         latency = end_time - start_time
 
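The dialect control in the rewritten stream_transcribe rides on Whisper's prompt mechanism: get_prompt_ids turns a text prompt (here an htia_* dialect tag) into token ids that are prepended during decoding. A minimal sketch of that tokenizer call, assuming the stock Hugging Face Whisper API and using the base model id only for illustration:

import torch
from transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-large-v3-turbo")
prompt_ids = tokenizer.get_prompt_ids("htia_sixian")  # returns a numpy array by default
prompt = torch.from_numpy(prompt_ids)
# With the stock ASR pipeline, the audio dict is normally the positional input
# and decoding options go in generate_kwargs, e.g.:
# pipe({"sampling_rate": sr, "raw": stream},
#      generate_kwargs={"prompt_ids": prompt, "num_beams": 1})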
@@ -78,15 +113,23 @@ def clear_state():
 with gr.Blocks() as microphone:
     with gr.Column():
         gr.Markdown(
-            f"# Realtime
+            f"# Realtime Hakka ASR: \n Note: The first token takes about 5 seconds. After that, it works flawlessly.")
         with gr.Row():
             input_audio_microphone = gr.Audio(streaming=True)
+            dialect_drop_down = gr.Dropdown(
+                choices=[
+                    (k, v)
+                    for k, v in models_config[default_model_id]["dialect_mapping"].items()
+                ],
+                value=list(models_config[default_model_id]["dialect_mapping"].values())[0],
+                label="腔調",
+            )
             output = gr.Textbox(label="Transcription", value="")
             latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
         with gr.Row():
             clear_button = gr.Button("Clear Output")
         state = gr.State()
-        input_audio_microphone.stream(stream_transcribe, [state, input_audio_microphone], [
+        input_audio_microphone.stream(stream_transcribe, [state, input_audio_microphone, dialect_drop_down], [
             state, output, latency_textbox], time_limit=30, stream_every=2, concurrency_limit=None)
         clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])
 
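For context on the Gradio wiring in the last hunk: gr.Audio(streaming=True) feeds stream_transcribe a (sample_rate, numpy_chunk) tuple every stream_every seconds, gr.State carries the concatenated buffer between calls, and the new dropdown (label 腔調, "accent/dialect") supplies the dialect tag. A hedged, self-contained sketch of just that accumulator loop, with a hypothetical echo handler in place of the ASR call:

import gradio as gr
import numpy as np

def accumulate(stream, new_chunk):
    # new_chunk arrives as (sample_rate, numpy array); keep appending to the
    # buffer held in gr.State and report how much audio has been collected.
    sr, y = new_chunk
    y = y.astype(np.float32)
    stream = y if stream is None else np.concatenate([stream, y])
    return stream, f"{len(stream) / sr:.1f}s buffered"

with gr.Blocks() as demo:
    mic = gr.Audio(sources=["microphone"], streaming=True)
    out = gr.Textbox(label="Buffered audio")
    state = gr.State()
    mic.stream(accumulate, [state, mic], [state, out], stream_every=2)

demo.launch()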
models.yaml ADDED

@@ -0,0 +1,9 @@
+whisper-large-v3:
+  model: ${load_pipe:formospeech/whisper-large-v3-taiwanese-hakka}
+  dialect_mapping:
+    四縣: htia_sixian
+    海陸: htia_hailu
+    大埔: htia_dapu
+    饒平: htia_raoping
+    詔安: htia_zhaoan
+    南四縣: htia_nansixian
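The mapping keys are the Chinese names of the Taiwanese Hakka dialects (四縣 Sixian, 海陸 Hailu, 大埔 Dapu, 饒平 Raoping, 詔安 Zhaoan, 南四縣 Nansixian), shown in the dropdown; the values are the htia_* tags handed to get_prompt_ids. A hedged sketch of reading this mapping without triggering the model-loading resolver:

from omegaconf import OmegaConf

cfg = OmegaConf.load("models.yaml")
# Resolving only the dialect_mapping node leaves ${load_pipe:...} untouched,
# so this runs without downloading the model.
mapping = OmegaConf.to_object(cfg["whisper-large-v3"]["dialect_mapping"])
print(mapping["四縣"])  # htia_sixian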