Spaces:
Build error
Build error
File size: 5,434 Bytes
e3eb20a 7d11cd2 bee8bb6 3355b10 e3eb20a 3355b10 e3eb20a bee8bb6 e3eb20a 7630a02 3355b10 e3eb20a 3355b10 7d11cd2 43f52c4 e3eb20a 3355b10 d8cd210 3355b10 e3eb20a d8cd210 e3eb20a 3355b10 7d11cd2 3355b10 e3eb20a 3355b10 e3eb20a 3355b10 d8cd210 3355b10 bee8bb6 3355b10 7d11cd2 3355b10 d8cd210 3355b10 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 |
import os.path
import pathlib
import subprocess
import textwrap
from io import StringIO

import cv2
import ffmpeg
import gradio as gr
import numpy as np
import pytube as pt
import whisper
from transformers import pipeline
# Working directories: everything the app produces lives under ./local/output
# next to this file, created on import if missing.
APP_DIR = pathlib.Path(__file__).parent.absolute()
LOCAL_DIR = APP_DIR / "local"
LOCAL_DIR.mkdir(exist_ok=True)
save_dir = LOCAL_DIR / "output"
save_dir.mkdir(exist_ok=True)
# Whisper "base" model, loaded once at import time and shared by all handlers.
transcriber = whisper.load_model("base")
print(
f"Model is {'multilingual' if transcriber.is_multilingual else 'English-only'} "
f"and has {sum(np.prod(p.shape) for p in transcriber.parameters()):,} parameters."
)
# Decoding options; task="translate" makes Whisper emit English regardless of
# the spoken language.
options = dict(language='en', beam_size=5, best_of=5)
transcribe_options = dict(task="translate", **options)
# HF pipeline for the second hop: English -> Spanish.
translator = pipeline("translation_en_to_es", model="Helsinki-NLP/opus-mt-en-es")
def translate(text):
    """Translate English *text* to Spanish using the module-level HF pipeline."""
    result = translator(text)
    return result[0]["translation_text"]
def format_timestamp(seconds, always_include_hours=False, fractional_seperator='.'):
    """Format a duration in seconds as ``[HH:]MM:SS<sep>mmm`` for subtitle files.

    Args:
        seconds: Non-negative duration in seconds (fractions allowed).
        always_include_hours: Emit the hours field even when it is zero.
        fractional_seperator: Separator before milliseconds ('.' for VTT, ',' for SRT).

    Returns:
        The formatted timestamp string.

    Raises:
        ValueError: If *seconds* is negative.
    """
    # Raise instead of assert: asserts are stripped under `python -O`,
    # which would let negative timestamps through silently.
    if seconds < 0:
        raise ValueError("non-negative timestamp expected")
    milliseconds = round(seconds * 1000.0)
    hours, milliseconds = divmod(milliseconds, 3_600_000)
    minutes, milliseconds = divmod(milliseconds, 60_000)
    seconds, milliseconds = divmod(milliseconds, 1_000)
    hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
    return f"{hours_marker}{minutes:02d}:{seconds:02d}{fractional_seperator}{milliseconds:03d}"
def process_text(text: str, max_line_width=None):
    """Wrap *text* to at most *max_line_width* characters per line.

    Args:
        text: The text to wrap.
        max_line_width: Maximum line width; pass ``None`` (or any non-positive
            value) to return the text unchanged.

    Returns:
        The wrapped text with lines joined by newlines, or *text* unchanged.
    """
    # `<= 0` (not `< 0`): textwrap.wrap raises ValueError on width=0, which the
    # original guard let through.
    if max_line_width is None or max_line_width <= 0:
        return text
    return '\n'.join(textwrap.wrap(text, width=max_line_width, tabsize=4))
def write_srt(transcript, file, max_line_width=None):
    """Write *transcript* segments to *file* in SubRip (.srt) format.

    Each segment becomes a numbered cue: index line, `start --> end`
    timestamps (comma millisecond separator, per SRT), wrapped text,
    and a blank separator line.
    """
    for index, segment in enumerate(transcript, start=1):
        start = format_timestamp(
            segment['start'], always_include_hours=True, fractional_seperator=','
        )
        end = format_timestamp(
            segment['end'], always_include_hours=True, fractional_seperator=','
        )
        # '-->' inside cue text would corrupt the SRT timing syntax.
        body = process_text(segment['text'].strip(), max_line_width).replace('-->', '->')
        file.write(f"{index}\n{start} --> {end}\n{body}\n\n")
        file.flush()
def get_subs(segments, max_line_width):
    """Translate each segment's text in place, then render all segments as SRT.

    Note: mutates the ``'text'`` field of every segment dict in *segments*.
    Returns the full SRT document as a string.
    """
    for seg in segments:
        seg["text"] = translate(seg["text"])
    buffer = StringIO()
    write_srt(segments, file=buffer, max_line_width=max_line_width)
    return buffer.getvalue()
def generate_subtitled_video(video, audio, transcript):
    """Burn *transcript* subtitles into *video*, mux with *audio*, and save.

    Writes the result to ``{save_dir}/final.mp4`` and returns that path.
    """
    output_path = f"{save_dir}/final.mp4"
    subtitled = ffmpeg.input(video).filter("subtitles", transcript)
    combined = ffmpeg.concat(subtitled, ffmpeg.input(audio), v=1, a=1)
    combined.output(output_path).run(quiet=True, overwrite_output=True)
    return output_path
def generate_subtitled_audio(audio, transcript):
    """Render an audio file as a subtitled video over a static black cover image.

    Args:
        audio: Path to the input audio file.
        transcript: Path to the .srt file to burn in via the subtitles filter.

    Returns:
        Path to the generated ``{save_dir}/final.mp4``.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits non-zero.
    """
    cover_path = f'{save_dir}/cover.jpg'
    if not os.path.exists(cover_path):
        # 640x320 black placeholder frame to carry the subtitle track.
        cover = np.zeros([320, 640, 3], dtype=np.uint8)
        cv2.imwrite(cover_path, cover)
    # subprocess.run with an argument list instead of os.system: no shell
    # interpolation, so paths with spaces/special characters can't break or
    # inject into the command; check=True surfaces ffmpeg failures.
    subprocess.run([
        'ffmpeg', '-y', '-loop', '1', '-i', cover_path,
        '-i', str(audio), '-c:v', 'libx264',
        '-tune', 'stillimage', '-c:a', 'aac', '-b:a', '192k',
        '-pix_fmt', 'yuv420p', '-shortest',
        '-vf', f'subtitles={transcript}',
        f'{save_dir}/final.mp4',
    ], check=True)
    return f"{save_dir}/final.mp4"
def transcribe(audio):
    """Transcribe *audio*, translate it to Spanish, and persist the subtitles.

    Args:
        audio: Audio input accepted by ``whisper.transcribe`` (path or waveform).

    Returns:
        Tuple of (original transcription text, translated SRT string).
        Side effect: writes the SRT to ``{save_dir}/transcript.srt``.
    """
    transcription = transcriber.transcribe(audio, **transcribe_options)
    srt = get_subs(transcription["segments"], 80)
    # "w" (not "w+"): we only write. The context manager closes the file —
    # the original's explicit f.close() inside the with-block was redundant.
    with open(f"{save_dir}/transcript.srt", "w", encoding="utf8") as f:
        # f.write, not f.writelines: writelines on a str iterates it
        # character by character.
        f.write(srt)
    return transcription["text"], srt
def transcribe_audio(audio):
    """Gradio handler: transcribe an audio file and return its subtitled video.

    Returns (subtitled video path, transcription text, translated SRT).
    """
    text, srt = transcribe(audio)
    subtitled = generate_subtitled_audio(audio, f"{save_dir}/transcript.srt")
    return subtitled, text, srt
def transcribe_video(video):
    """Gradio handler: extract audio from *video*, transcribe, and subtitle it.

    Returns (subtitled video path, transcription text, translated SRT).
    """
    wav_path = f"{save_dir}/output.wav"
    # Extract a 16 kHz mono PCM track — the format Whisper expects.
    stream = ffmpeg.input(video).output(wav_path, acodec="pcm_s16le", ac=1, ar="16k")
    stream.run(overwrite_output=True)
    samples = whisper.load_audio(wav_path)
    text, srt = transcribe(samples)
    subtitled = generate_subtitled_video(video, wav_path, f"{save_dir}/transcript.srt")
    return subtitled, text, srt
def youtube_transcribe(url):
    """Gradio handler: download a YouTube video's audio track and transcribe it.

    Returns (transcription text, translated SRT).
    """
    audio_stream = pt.YouTube(url).streams.filter(only_audio=True).first()
    downloaded_path = audio_stream.download(filename='youtube.mp4')
    return transcribe(whisper.load_audio(downloaded_path))
# One Gradio Interface per input source; combined into tabs below.
mic_interface = gr.Interface(
fn=transcribe,
inputs=gr.Audio(source="microphone", type="filepath"),
outputs=["text", "text"],
title="Transcribir y traducir audio",
)
audio_interface = gr.Interface(
fn=transcribe_audio,
inputs=gr.Audio(type="filepath"),
outputs=["video", "text", "text"],
title="Transcribir y traducir audio",
)
video_interface = gr.Interface(
fn=transcribe_video,
inputs="video",
outputs=["video", "text", "text"],
title="Transcribir y traducir audio",
)
# Renamed from the misspelled `youtuve_interface` (local to this block).
youtube_interface = gr.Interface(
fn=youtube_transcribe,
inputs="text",
outputs=["text", "text"],
title="Transcribir y traducir audio",
)
if __name__ == "__main__":
    gr.TabbedInterface(
        [mic_interface, audio_interface, video_interface, youtube_interface],
        ["Micrófono", "Audio", "Video", "YouTube"],
    ).launch()
|