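"""Gradio demo: transcribe English speech with Whisper, translate it to Spanish,
and burn the resulting SRT subtitles into an output video with ffmpeg.

Four input sources are exposed as tabs: microphone, audio file, video file,
and YouTube URL.
"""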
import os.path
import pathlib
from io import StringIO
import gradio as gr
import whisper
import numpy as np
import pytube as pt
import ffmpeg
import textwrap
import cv2
from transformers import pipeline

APP_DIR = pathlib.Path(__file__).parent.absolute()

LOCAL_DIR = APP_DIR / "local"
LOCAL_DIR.mkdir(exist_ok=True)
save_dir = LOCAL_DIR / "output"
save_dir.mkdir(exist_ok=True)

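# Load the Whisper "base" checkpoint once at startup and report its size.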
transcriber = whisper.load_model("base")
print(
    f"Model is {'multilingual' if transcriber.is_multilingual else 'English-only'} "
    f"and has {sum(np.prod(p.shape) for p in transcriber.parameters()):,} parameters."
)

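# Decoding options shared by every request; with the language pinned to English,
# task="translate" effectively yields a plain English transcription.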
options = dict(language='en', beam_size=5, best_of=5)
transcribe_options = dict(task="translate", **options)

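# English -> Spanish translation model used for the subtitle text.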
translator = pipeline("translation_en_to_es", model="Helsinki-NLP/opus-mt-en-es")


def translate(text):
    return translator(text)[0]["translation_text"]


def format_timestamp(seconds, always_include_hours=False, fractional_separator='.'):
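    """Render seconds as [HH:]MM:SS<sep>mmm; SRT timestamps use ',' as the separator."""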
    assert seconds >= 0, "non-negative timestamp expected"
    milliseconds = round(seconds * 1000.0)

    hours = milliseconds // 3_600_000
    milliseconds -= hours * 3_600_000

    minutes = milliseconds // 60_000
    milliseconds -= minutes * 60_000

    seconds = milliseconds // 1_000
    milliseconds -= seconds * 1_000

    hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
    return f"{hours_marker}{minutes:02d}:{seconds:02d}{fractional_separator}{milliseconds:03d}"


def process_text(text: str, max_line_width=None):
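    """Wrap text to max_line_width columns; None or a negative width disables wrapping."""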
    if max_line_width is None or max_line_width < 0:
        return text

    lines = textwrap.wrap(text, width=max_line_width, tabsize=4)
    return '\n'.join(lines)


def write_srt(transcript, file, max_line_width=None):
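    """Write numbered SRT cues; '-->' in cue text becomes '->' so it can't clash with timing lines."""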
    for i, segment in enumerate(transcript, start=1):
        text = process_text(segment['text'].strip(), max_line_width).replace('-->', '->')

        # write srt lines
        print(
            f"{i}\n"
            f"{format_timestamp(segment['start'], always_include_hours=True, fractional_separator=',')} --> "
            f"{format_timestamp(segment['end'], always_include_hours=True, fractional_separator=',')}\n"
            f"{text}\n",
            file=file,
            flush=True,
        )


def get_subs(segments, max_line_width):
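    """Translate each segment's text to Spanish in place and serialize the segments as SRT."""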
    for sentence in segments:
        sentence['text'] = translate(sentence["text"])
    segment_stream = StringIO()
    write_srt(segments, file=segment_stream, max_line_width=max_line_width)
    segment_stream.seek(0)
    return segment_stream.read()


def generate_subtitled_video(video, audio, transcript):
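    """Burn the subtitle file into the video stream and mux it with the extracted audio."""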
    video_file = ffmpeg.input(video)
    audio_file = ffmpeg.input(audio)
    ffmpeg.concat(video_file.filter(
        "subtitles", transcript
    ), audio_file, v=1, a=1).output(f"{save_dir}/final.mp4").run(quiet=True, overwrite_output=True)
    return f"{save_dir}/final.mp4"


def generate_subtitled_audio(audio, transcript):
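    """Render audio-only input as a video: a static black cover image with burned-in subtitles."""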
    if not os.path.exists(f'{save_dir}/cover.jpg'):
        cover = np.zeros([320, 640, 3], dtype=np.uint8)
        cv2.imwrite(f'{save_dir}/cover.jpg', cover)
    os.system(f'ffmpeg -y -loop 1 -i {save_dir}/cover.jpg '
              f'-i {audio} -c:v libx264 '
              f'-tune stillimage -c:a aac -b:a 192k -pix_fmt yuv420p -shortest '
              f'-vf "subtitles={transcript}" {save_dir}/final.mp4')
    return f"{save_dir}/final.mp4"


def transcribe(audio):
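    """Transcribe the audio, translate each segment, and save the SRT to disk."""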
    transcription = transcriber.transcribe(audio, **transcribe_options)
    srt = get_subs(transcription["segments"], 80)
    with open(f"{save_dir}/transcript.srt", "w", encoding='utf8') as f:
        f.write(srt)
    return transcription["text"], srt


def transcribe_audio(audio):
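    """Pipeline for uploaded audio: transcribe, then render a subtitled video from it."""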
    transcription, translation = transcribe(audio)
    return generate_subtitled_audio(audio, f"{save_dir}/transcript.srt"), transcription, translation


def transcribe_video(video):
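    """Pipeline for uploaded video: extract 16 kHz mono audio, transcribe, then burn in subtitles."""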
    audio = ffmpeg.input(video)
    audio = ffmpeg.output(audio, f"{save_dir}/output.wav", acodec="pcm_s16le", ac=1, ar="16k")
    ffmpeg.run(audio, overwrite_output=True)
    audio = whisper.load_audio(f"{save_dir}/output.wav")
    transcription, translation = transcribe(audio)
    return generate_subtitled_video(video, f"{save_dir}/output.wav",
                                    f"{save_dir}/transcript.srt"), transcription, translation


def youtube_transcribe(url):
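    """Download a YouTube video's audio stream and transcribe it (text outputs only)."""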
    yt = pt.YouTube(url)
    stream = yt.streams.filter(only_audio=True).first()
    audio = stream.download(filename='youtube.mp4')
    audio = whisper.load_audio(audio)
    return transcribe(audio)


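# One Gradio interface per input source; titles appear in the (Spanish-language) UI.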
mic_interface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=["text", "text"],
    title="Transcribir y traducir audio",
)

audio_interface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=["video", "text", "text"],
    title="Transcribir y traducir audio",
)

video_interface = gr.Interface(
    fn=transcribe_video,
    inputs="video",
    outputs=["video", "text", "text"],
    title="Transcribir y traducir audio",
)

youtube_interface = gr.Interface(
    fn=youtube_transcribe,
    inputs="text",
    outputs=["text", "text"],
    title="Transcribir y traducir audio",
)

if __name__ == "__main__":
    gr.TabbedInterface(
        [mic_interface, audio_interface, video_interface, youtube_interface],
        ["Micrófono", "Audio", "Video", "YouTube"],
    ).launch()