"""Gradio app that transcribes audio/video with Whisper, translates the
transcript to Spanish, and burns the result into the media as subtitles."""

import os
import pathlib
import textwrap
from io import StringIO

import cv2
import ffmpeg
import gradio as gr
import numpy as np
import pytube as pt
import whisper
from transformers import pipeline

# Working directories for intermediate and final outputs.
APP_DIR = pathlib.Path(__file__).parent.absolute()
LOCAL_DIR = APP_DIR / "local"
LOCAL_DIR.mkdir(exist_ok=True)
save_dir = LOCAL_DIR / "output"
save_dir.mkdir(exist_ok=True)

transcriber = whisper.load_model("base")
print(
    f"Model is {'multilingual' if transcriber.is_multilingual else 'English-only'} "
    f"and has {sum(np.prod(p.shape) for p in transcriber.parameters()):,} parameters."
)

# Whisper's "translate" task renders the transcript in English; the Hugging Face
# pipeline below then translates that English text to Spanish.
options = dict(language='en', beam_size=5, best_of=5)
transcribe_options = dict(task="translate", **options)

translator = pipeline("translation_en_to_es", model="Helsinki-NLP/opus-mt-en-es")


def translate(text):
    return translator(text)[0]["translation_text"]


def format_timestamp(seconds, always_include_hours=False, fractional_separator='.'):
    """Convert a timestamp in seconds to (HH:)MM:SS<sep>mmm form."""
    assert seconds >= 0, "non-negative timestamp expected"
    milliseconds = round(seconds * 1000.0)

    hours = milliseconds // 3_600_000
    milliseconds -= hours * 3_600_000

    minutes = milliseconds // 60_000
    milliseconds -= minutes * 60_000

    seconds = milliseconds // 1_000
    milliseconds -= seconds * 1_000

    hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
    return f"{hours_marker}{minutes:02d}:{seconds:02d}{fractional_separator}{milliseconds:03d}"


def process_text(text: str, max_line_width=None):
    """Hard-wrap subtitle text to at most max_line_width characters per line."""
    if max_line_width is None or max_line_width < 0:
        return text
    lines = textwrap.wrap(text, width=max_line_width, tabsize=4)
    return '\n'.join(lines)


def write_srt(transcript, file, max_line_width=None):
    # Each SRT entry is an index, a "start --> end" timestamp line, the text,
    # and a blank line.
    for i, segment in enumerate(transcript, start=1):
        text = process_text(segment['text'].strip(), max_line_width).replace('-->', '->')
        print(
            f"{i}\n"
            f"{format_timestamp(segment['start'], always_include_hours=True, fractional_separator=',')} --> "
            f"{format_timestamp(segment['end'], always_include_hours=True, fractional_separator=',')}\n"
            f"{text}\n",
            file=file,
            flush=True,
        )


def get_subs(segments, max_line_width):
    # Translate each segment to Spanish, then render all segments as SRT.
    for sentence in segments:
        sentence['text'] = translate(sentence["text"])
    segment_stream = StringIO()
    write_srt(segments, file=segment_stream, max_line_width=max_line_width)
    segment_stream.seek(0)
    return segment_stream.read()


def generate_subtitled_video(video, audio, transcript):
    # Burn the subtitles into the video stream, then recombine it with the audio.
    video_file = ffmpeg.input(video)
    audio_file = ffmpeg.input(audio)
    ffmpeg.concat(
        video_file.filter("subtitles", transcript), audio_file, v=1, a=1
    ).output(f"{save_dir}/final.mp4").run(quiet=True, overwrite_output=True)
    return f"{save_dir}/final.mp4"


def generate_subtitled_audio(audio, transcript):
    # Audio has no video stream, so loop a black cover image and burn the
    # subtitles over it.
    if not os.path.exists(f'{save_dir}/cover.jpg'):
        cover = np.zeros([320, 640, 3], dtype=np.uint8)
        cv2.imwrite(f'{save_dir}/cover.jpg', cover)
    os.system(
        f'ffmpeg -y -loop 1 -i {save_dir}/cover.jpg '
        f'-i {audio} -c:v libx264 '
        f'-tune stillimage -c:a aac -b:a 192k -pix_fmt yuv420p -shortest '
        f'-vf "subtitles={transcript}" {save_dir}/final.mp4'
    )
    return f"{save_dir}/final.mp4"


def transcribe(audio):
    transcription = transcriber.transcribe(audio, **transcribe_options)
    srt = get_subs(transcription["segments"], 80)
    with open(f"{save_dir}/transcript.srt", "w", encoding='utf8') as f:
        f.write(srt)
    return transcription["text"], srt


def transcribe_audio(audio):
    transcription, translation = transcribe(audio)
    return generate_subtitled_audio(audio, f"{save_dir}/transcript.srt"), transcription, translation

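# Whisper operates on 16 kHz mono audio, so transcribe_video first extracts the
# video's audio track as 16-bit PCM WAV before loading and transcribing it.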
def transcribe_video(video):
    audio = ffmpeg.input(video)
    audio = ffmpeg.output(audio, f"{save_dir}/output.wav", acodec="pcm_s16le", ac=1, ar="16k")
    ffmpeg.run(audio, overwrite_output=True)
    audio = whisper.load_audio(f"{save_dir}/output.wav")
    transcription, translation = transcribe(audio)
    return (
        generate_subtitled_video(video, f"{save_dir}/output.wav", f"{save_dir}/transcript.srt"),
        transcription,
        translation,
    )


def youtube_transcribe(url):
    # Download YouTube's audio-only stream and transcribe it.
    yt = pt.YouTube(url)
    stream = yt.streams.filter(only_audio=True).first()
    audio = stream.download(filename='youtube.mp4')
    audio = whisper.load_audio(audio)
    return transcribe(audio)


mic_interface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=["text", "text"],
    title="Transcribe and translate audio",
)

audio_interface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=["video", "text", "text"],
    title="Transcribe and translate audio",
)

video_interface = gr.Interface(
    fn=transcribe_video,
    inputs="video",
    outputs=["video", "text", "text"],
    title="Transcribe and translate audio",
)

youtube_interface = gr.Interface(
    fn=youtube_transcribe,
    inputs="text",
    outputs=["text", "text"],
    title="Transcribe and translate audio",
)

if __name__ == "__main__":
    gr.TabbedInterface(
        [mic_interface, audio_interface, video_interface, youtube_interface],
        ["Microphone", "Audio", "Video", "YouTube"],
    ).launch()
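
# A minimal sketch of how to run this app locally, assuming the Python
# dependencies (openai-whisper, gradio, pytube, ffmpeg-python, opencv-python,
# transformers) and an ffmpeg binary on PATH are installed:
#   python <this_file>.py
# Gradio then serves the tabbed UI at http://127.0.0.1:7860 by default.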