import spaces
import torch
import gradio as gr
import tempfile
import os
import uuid
import scipy.io.wavfile
import time
import numpy as np
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
# import subprocess
# subprocess.run(
#     "pip install flash-attn --no-build-isolation",
#     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
#     shell=True,
# )
device = "cuda" if torch.cuda.is_available() else "cpu"
# float16 halves memory use on GPU; fall back to float32 on CPU, where half precision is poorly supported
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
MODEL_NAME = "openai/whisper-large-v3-turbo"
# flash_attention_2 requires the flash-attn package and a CUDA device;
# use the built-in SDPA attention otherwise
attn_implementation = "flash_attention_2" if device == "cuda" else "sdpa"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
    attn_implementation=attn_implementation,
)
model.to(device)
processor = AutoProcessor.from_pretrained(MODEL_NAME)
tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME)
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=10,
    torch_dtype=torch_dtype,
    device=device,
)
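# A minimal sketch of how the pipeline is called (this mirrors the call inside
# stream_transcribe below): it takes raw float32 mono audio plus its sampling
# rate and returns a dict whose "text" field holds the transcription.
# `audio_float32` is a placeholder name for any mono float32 numpy array:
#   text = pipe({"sampling_rate": 16000, "raw": audio_float32})["text"]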
# @spaces.GPU
def stream_transcribe(stream, new_chunk):
    """Append the latest microphone chunk to the running waveform and re-transcribe it."""
    start_time = time.time()
    try:
        sr, y = new_chunk
        # Convert to mono if stereo
        if y.ndim > 1:
            y = y.mean(axis=1)
        # Normalize to [-1, 1]; guard against all-zero (silent) chunks
        y = y.astype(np.float32)
        peak = np.max(np.abs(y))
        if peak > 0:
            y /= peak
        # Accumulate audio across calls so the model always sees the full utterance
        if stream is not None:
            stream = np.concatenate([stream, y])
        else:
            stream = y
        transcription = pipe({"sampling_rate": sr, "raw": stream})["text"]
        latency = time.time() - start_time
        return stream, transcription, f"{latency:.2f}"
    except Exception as e:
        print(f"Error during transcription: {e}")
        # Return the error as a string so the Textbox can display it
        return stream, str(e), "Error"
def clear():
    return ""

def clear_state():
    return None
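# UI wiring: gr.Audio(streaming=True) emits (sampling_rate, chunk) tuples, and
# the .stream() event below re-invokes stream_transcribe on each new chunk
# (roughly every `stream_every` seconds), threading the accumulated waveform
# through gr.State so the whole utterance is re-transcribed each time.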
with gr.Blocks() as microphone:
    with gr.Column():
        gr.Markdown(
            f"# Realtime Whisper Large V3 Turbo: \n Transcribe audio in real time. This demo uses the checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: the first token takes about 5 seconds to appear; transcription streams smoothly after that.")
        with gr.Row():
            input_audio_microphone = gr.Audio(streaming=True)
            output = gr.Textbox(label="Transcription", value="")
            latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
        with gr.Row():
            clear_button = gr.Button("Clear Output")
        state = gr.State()
        input_audio_microphone.stream(
            stream_transcribe,
            [state, input_audio_microphone],
            [state, output, latency_textbox],
            time_limit=30,
            stream_every=2,
            concurrency_limit=None,
        )
        clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])
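# A hypothetical way to exercise the handler without the UI, e.g. as a quick
# smoke test with one second of random noise at 16 kHz:
#   state, text, latency = stream_transcribe(
#       None, (16000, np.random.randn(16000).astype(np.float32))
#   )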
with gr.Blocks(theme=gr.themes.Ocean()) as demo:
    gr.TabbedInterface([microphone], ["Microphone"])

demo.launch()