transcriptor / app.py
xangcastle's picture
adding translation
e3eb20a
import os.path
import pathlib
from io import StringIO
import gradio as gr
import whisper
import numpy as np
import pytube as pt
import ffmpeg
import textwrap
import cv2
from transformers import pipeline
APP_DIR = pathlib.Path(__file__).parent.absolute()
LOCAL_DIR = APP_DIR / "local"
LOCAL_DIR.mkdir(exist_ok=True)
save_dir = LOCAL_DIR / "output"
save_dir.mkdir(exist_ok=True)
transcriber = whisper.load_model("base")
print(
f"Model is {'multilingual' if transcriber.is_multilingual else 'English-only'} "
f"and has {sum(np.prod(p.shape) for p in transcriber.parameters()):,} parameters."
)
options = dict(language='en', beam_size=5, best_of=5)
transcribe_options = dict(task="translate", **options)
translator = pipeline("translation_en_to_es", model="Helsinki-NLP/opus-mt-en-es")
def translate(text):
return translator(text)[0]["translation_text"]
def format_timestamp(seconds, always_include_hours=False, fractional_seperator='.'):
assert seconds >= 0, "non-negative timestamp expected"
milliseconds = round(seconds * 1000.0)
hours = milliseconds // 3_600_000
milliseconds -= hours * 3_600_000
minutes = milliseconds // 60_000
milliseconds -= minutes * 60_000
seconds = milliseconds // 1_000
milliseconds -= seconds * 1_000
hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
return f"{hours_marker}{minutes:02d}:{seconds:02d}{fractional_seperator}{milliseconds:03d}"
def process_text(text: str, max_line_width=None):
if max_line_width is None or max_line_width < 0:
return text
lines = textwrap.wrap(text, width=max_line_width, tabsize=4)
return '\n'.join(lines)
def write_srt(transcript, file, max_line_width=None):
for i, segment in enumerate(transcript, start=1):
text = process_text(segment['text'].strip(), max_line_width).replace('-->', '->')
# write srt lines
print(
f"{i}\n"
f"{format_timestamp(segment['start'], always_include_hours=True, fractional_seperator=',')} --> "
f"{format_timestamp(segment['end'], always_include_hours=True, fractional_seperator=',')}\n"
f"{text}\n",
file=file,
flush=True,
)
def get_subs(segments, max_line_width):
for sentence in segments:
sentence['text'] = translate(sentence["text"])
segment_stream = StringIO()
write_srt(segments, file=segment_stream, max_line_width=max_line_width)
segment_stream.seek(0)
return segment_stream.read()
def generate_subtitled_video(video, audio, transcript):
video_file = ffmpeg.input(video)
audio_file = ffmpeg.input(audio)
ffmpeg.concat(video_file.filter(
"subtitles", transcript
), audio_file, v=1, a=1).output(f"{save_dir}/final.mp4").run(quiet=True, overwrite_output=True)
return f"{save_dir}/final.mp4"
def generate_subtitled_audio(audio, transcript):
if not os.path.exists(f'{save_dir}/cover.jpg'):
cover = np.zeros([320, 640, 3], dtype=np.uint8)
cv2.imwrite(f'{save_dir}/cover.jpg', cover)
os.system(f'ffmpeg -y -loop 1 -i {save_dir}/cover.jpg '
f'-i {audio} -c:v libx264 '
f'-tune stillimage -c:a aac -b:a 192k -pix_fmt yuv420p -shortest '
f'-vf "subtitles={transcript}" {save_dir}/final.mp4')
return f"{save_dir}/final.mp4"
def transcribe(audio):
transcription = transcriber.transcribe(audio, **transcribe_options)
srt = get_subs(transcription["segments"], 80)
with open(f"{save_dir}/transcript.srt", "w+", encoding='utf8') as f:
f.writelines(srt)
f.close()
return transcription["text"], srt
def transcribe_audio(audio):
transcription, translation = transcribe(audio)
return generate_subtitled_audio(audio, f"{save_dir}/transcript.srt"), transcription, translation
def transcribe_video(video):
audio = ffmpeg.input(video)
audio = ffmpeg.output(audio, f"{save_dir}/output.wav", acodec="pcm_s16le", ac=1, ar="16k")
ffmpeg.run(audio, overwrite_output=True)
audio = whisper.load_audio(f"{save_dir}/output.wav")
transcription, translation = transcribe(audio)
return generate_subtitled_video(video, f"{save_dir}/output.wav",
f"{save_dir}/transcript.srt"), transcription, translation
def youtube_transcribe(url):
yt = pt.YouTube(url)
stream = yt.streams.filter(only_audio=True).first()
audio = stream.download(filename='youtube.mp4')
audio = whisper.load_audio(audio)
return transcribe(audio)
mic_interface = gr.Interface(
fn=transcribe,
inputs=gr.Audio(source="microphone", type="filepath"),
outputs=["text", "text"],
title="Transcribir y traducir audio",
)
audio_interface = gr.Interface(
fn=transcribe_audio,
inputs=gr.Audio(type="filepath"),
outputs=["video", "text", "text"],
title="Transcribir y traducir audio",
)
video_interface = gr.Interface(
fn=transcribe_video,
inputs="video",
outputs=["video", "text", "text"],
title="Transcribir y traducir audio",
)
youtuve_interface = gr.Interface(
fn=youtube_transcribe,
inputs="text",
outputs=["text", "text"],
title="Transcribir y traducir audio",
)
if __name__ == "__main__":
gr.TabbedInterface(
[mic_interface, audio_interface, video_interface, youtuve_interface],
["Micrófono", "Audio", "Video", "YouTube"],
).launch()