import spaces
import torch
import gradio as gr
import tempfile
import os
import uuid
import scipy.io.wavfile
import time
import numpy as np
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline

# import subprocess
# subprocess.run(
#     "pip install flash-attn --no-build-isolation",
#     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
#     shell=True,
# )

# device = "cuda" if torch.cuda.is_available() else "cpu"
# torch_dtype = torch.float16
# MODEL_NAME = "openai/whisper-large-v3-turbo"
# model = AutoModelForSpeechSeq2Seq.from_pretrained(
#     MODEL_NAME, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
# )
# model.to(device)
# processor = AutoProcessor.from_pretrained(MODEL_NAME)
# tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME)
# pipe = pipeline(
#     task="automatic-speech-recognition",
#     model=model,
#     tokenizer=tokenizer,
#     feature_extractor=processor.feature_extractor,
#     chunk_length_s=10,
#     torch_dtype=torch_dtype,
#     device=device,
# )

from omegaconf import OmegaConf

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
default_model_id = "whisper-large-v3"


def load_pipe(model_id: str):
    # Build an ASR pipeline for the given model id; invoked from models.yaml
    # through the OmegaConf resolver registered below.
    return pipeline(
        "automatic-speech-recognition",
        model=model_id,
        max_new_tokens=128,
        chunk_length_s=30,
        batch_size=8,
        torch_dtype=torch_dtype,
        device=device,
    )


OmegaConf.register_new_resolver("load_pipe", load_pipe)
# See the sketch of the assumed models.yaml layout at the bottom of this file.
models_config = OmegaConf.to_object(OmegaConf.load("models.yaml"))
model = models_config[default_model_id]["model"]


# One-shot transcription helper for a complete audio file (not wired into the
# streaming UI below).
def automatic_speech_recognition(model_id: str, dialect_id: str, audio_file: str):
    generate_kwargs = {
        "task": "transcribe",
        "language": "Chinese",
        "num_beams": 1,
        "prompt_ids": torch.from_numpy(model.tokenizer.get_prompt_ids(dialect_id)).to(
            device
        ),
    }
    return model(audio_file, generate_kwargs=generate_kwargs)["text"].replace(
        f" {dialect_id}", ""
    )


@spaces.GPU
def stream_transcribe(stream, new_chunk, dialect_id):
    # Streaming callback: accumulate incoming chunks in `stream` and
    # re-transcribe the whole buffer on every tick.
    start_time = time.time()
    try:
        sr, y = new_chunk

        # Convert to mono if stereo
        if y.ndim > 1:
            y = y.mean(axis=1)

        # Normalize to [-1, 1]; guard against all-zero (silent) chunks.
        y = y.astype(np.float32)
        max_abs = np.max(np.abs(y))
        if max_abs > 0:
            y /= max_abs

        if stream is not None:
            stream = np.concatenate([stream, y])
        else:
            stream = y

        generate_kwargs = {
            "task": "transcribe",
            "language": "Chinese",
            "num_beams": 1,
            "prompt_ids": torch.from_numpy(
                model.tokenizer.get_prompt_ids(dialect_id)
            ).to(device),
        }
        # transcription = pipe({"sampling_rate": sr, "raw": stream})["text"]
        # NOTE: passing the raw array assumes the buffer is already at the
        # sampling rate the model expects (16 kHz for Whisper).
        transcription = model(stream, generate_kwargs=generate_kwargs)["text"]
        end_time = time.time()
        latency = end_time - start_time
        return stream, transcription, f"{latency:.2f}"
    except Exception as e:
        print(f"Error during transcription: {e}")
        return stream, str(e), "Error"


def clear():
    return ""


def clear_state():
    return None


with gr.Blocks() as microphone:
    with gr.Column():
        gr.Markdown(
            "# Realtime Hakka ASR\n"
            "Note: the first token takes about 5 seconds; after that, transcription streams smoothly."
        )
        with gr.Row():
            input_audio_microphone = gr.Audio(streaming=True)
            dialect_drop_down = gr.Dropdown(
                choices=[
                    (k, v)
                    for k, v in models_config[default_model_id]["dialect_mapping"].items()
                ],
                value=list(models_config[default_model_id]["dialect_mapping"].values())[0],
                label="Dialect (腔調)",
            )
        with gr.Row():
            output = gr.Textbox(label="Transcription", value="")
            latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
        with gr.Row():
            clear_button = gr.Button("Clear Output")
        state = gr.State()
        input_audio_microphone.stream(
            stream_transcribe,
            [state, input_audio_microphone, dialect_drop_down],
            [state, output, latency_textbox],
            time_limit=30,
            stream_every=2,
            concurrency_limit=None,
        )
        clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])

with gr.Blocks(theme=gr.themes.Ocean()) as demo:
    gr.TabbedInterface([microphone], ["Microphone"])

demo.launch()
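
# A minimal sketch of the models.yaml layout this script assumes. The actual
# config file is not included here; the keys are inferred from the lookups
# above (["model"], ["dialect_mapping"]) and from the `load_pipe` resolver,
# and the repo id / dialect entries are placeholders, not real values:
#
#   whisper-large-v3:
#     model: ${load_pipe:<hf-model-repo-id>}    # resolver returns the ASR pipeline
#     dialect_mapping:
#       <display label>: <dialect prompt id>    # value is fed to tokenizer.get_prompt_ids()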