import spaces  # noqa: F401 -- needed on ZeroGPU Spaces for the (optional) @spaces.GPU decorator below
import time

import gradio as gr
import numpy as np
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
from transformers.utils import is_flash_attn_2_available

# Optional: install flash-attn at startup if the Space image does not ship it.
# import subprocess
# subprocess.run(
#     "pip install flash-attn --no-build-isolation",
#     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
#     shell=True,
# )

device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16
MODEL_NAME = "openai/whisper-large-v3-turbo"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_NAME, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, attn_implementation="flash_attention_2"
)
model.to(device)

processor = AutoProcessor.from_pretrained(MODEL_NAME)
tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME)

pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=10,
    torch_dtype=torch_dtype,
    device=device,
)
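
# For reference, the same pipeline also handles one-off (non-streaming) inputs.
# A minimal sketch, assuming a local audio file path (hypothetical, not shipped
# with this Space); the transformers ASR pipeline accepts file paths directly
# and resamples them as needed.
def transcribe_file(path: str) -> str:
    """Transcribe a single audio file with the same Whisper pipeline (illustrative helper)."""
    return pipe(path)["text"]
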

# @spaces.GPU  # uncomment to allocate GPU time per call on ZeroGPU Spaces
def stream_transcribe(stream, new_chunk):
    start_time = time.time()
    try:
        sr, y = new_chunk

        # Convert to mono if stereo
        if y.ndim > 1:
            y = y.mean(axis=1)

        y = y.astype(np.float32)
        # Peak-normalize to [-1, 1]; skip silent chunks to avoid dividing by zero.
        peak = np.max(np.abs(y))
        if peak > 0:
            y /= peak

        if stream is not None:
            stream = np.concatenate([stream, y])
        else:
            stream = y

        # Re-transcribe the entire accumulated buffer; the pipeline resamples the
        # raw audio to the model's expected 16 kHz internally.
        transcription = pipe({"sampling_rate": sr, "raw": stream})["text"]
        end_time = time.time()
        latency = end_time - start_time

        return stream, transcription, f"{latency:.2f}"
    except Exception as e:
        print(f"Error during transcription: {e}")
        # Return the message as text; a Gradio Textbox expects a string, not an exception object.
        return stream, str(e), "Error"

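
# Because stream_transcribe re-runs Whisper over the full accumulated buffer, per-chunk
# latency grows with session length. A minimal sketch of one mitigation (a hypothetical
# helper, not wired into the app): keep only the most recent window of audio.
def trim_buffer(stream, sr, max_seconds=30):
    """Drop all but the last `max_seconds` of audio so each inference stays bounded."""
    max_samples = sr * max_seconds
    return stream[-max_samples:] if stream.shape[0] > max_samples else stream
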

def clear():
    """Reset the transcription textbox."""
    return ""


def clear_state():
    """Drop the accumulated audio buffer between sessions."""
    return None


with gr.Blocks() as microphone:
    with gr.Column():
        gr.Markdown(
            f"# Realtime Whisper Large V3 Turbo\n"
            f"Transcribe audio in realtime. This demo uses the checkpoint "
            f"[{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n"
            f"Note: the first transcription takes about 5 seconds to warm up; after that, "
            f"updates arrive every couple of seconds."
        )
        with gr.Row():
            input_audio_microphone = gr.Audio(streaming=True)
            output = gr.Textbox(label="Transcription", value="")
            latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
        with gr.Row():
            clear_button = gr.Button("Clear Output")
        state = gr.State()
        input_audio_microphone.stream(
            stream_transcribe,
            inputs=[state, input_audio_microphone],
            outputs=[state, output, latency_textbox],
            time_limit=30,            # cap each streaming session at 30 seconds
            stream_every=2,           # send a new chunk to the server every 2 seconds
            concurrency_limit=None,   # no cap on simultaneous users
        )
        clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])


with gr.Blocks(theme=gr.themes.Ocean()) as demo:
    gr.TabbedInterface([microphone], ["Microphone"])

demo.launch()