Spaces:
Runtime error
Runtime error
Serhiy Stetskovych
committed on
Commit
·
37f9a5d
1
Parent(s):
39cc8c4
App with new vocoder
Browse files
- app.py +33 -28
- prompt.wav +0 -0
- prompt22050.wav +0 -0
app.py
CHANGED
|
@@ -28,15 +28,17 @@ from vocos import Vocos
|
|
| 28 |
|
| 29 |
PFLOW_MODEL_PATH = 'checkpoints/checkpoint_epoch=649.ckpt'
|
| 30 |
#PFLOW_MODEL_PATH = 'checkpoint_m_epoch=054.ckpt'
|
| 31 |
-
|
|
|
|
|
|
|
| 32 |
HIFIGAN_MODEL_PATH = 'checkpoints/g_00120000'
|
| 33 |
|
| 34 |
|
| 35 |
transform = torchaudio.transforms.Vol(gain=-32, gain_type="db")
|
| 36 |
-
wav, sr = torchaudio.load('
|
| 37 |
|
| 38 |
prompt = mel_spectrogram(
|
| 39 |
-
wav,
|
| 40 |
1024,
|
| 41 |
80,
|
| 42 |
22050,
|
|
@@ -85,7 +87,7 @@ def load_vocos(checkpoint_path, config_path, device):
|
|
| 85 |
|
| 86 |
|
| 87 |
def to_waveform(mel, vocoder, denoiser=None):
|
| 88 |
-
return vocoder.decode(mel).cpu().squeeze()
|
| 89 |
|
| 90 |
# audio = vocoder(mel).clamp(-1, 1)
|
| 91 |
# if denoiser is not None:
|
|
@@ -113,9 +115,10 @@ model = pflowTTS.load_from_checkpoint(PFLOW_MODEL_PATH, map_location=device)
|
|
| 113 |
_ = model.eval()
|
| 114 |
|
| 115 |
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
#vocos_44100 = load_vocos('checkpoints/vocos_checkpoint_epoch=
|
|
|
|
| 119 |
denoiser = None#Denoiser(vocoder, mode="zeros")
|
| 120 |
|
| 121 |
|
|
@@ -134,23 +137,25 @@ def synthesise(text, speed):
|
|
| 134 |
length_scale=1/speed,
|
| 135 |
prompt=normalize(prompt, model.mel_mean, model.mel_std).to(device),
|
| 136 |
|
| 137 |
-
guidance_scale=
|
| 138 |
|
| 139 |
)
|
| 140 |
-
waveform_vocos =
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
|
| 145 |
|
| 146 |
-
return text_processed['x_phones'][1::2], (22050, waveform_vocos.numpy())
|
| 147 |
|
| 148 |
|
| 149 |
description = f'''
|
| 150 |
# Експериментальна апка для генерації аудіо з тексту.
|
| 151 |
|
| 152 |
pflow checkpoint {PFLOW_MODEL_PATH}
|
| 153 |
-
|
|
|
|
|
|
|
| 154 |
'''
|
| 155 |
|
| 156 |
|
|
@@ -164,28 +169,28 @@ if __name__ == "__main__":
|
|
| 164 |
],
|
| 165 |
outputs=[
|
| 166 |
gr.Text(label='Фонемізований текст:', lines=5),
|
| 167 |
-
# gr.Audio(
|
| 168 |
-
# label="Vocos 44100 аудіо:",
|
| 169 |
-
# autoplay=False,
|
| 170 |
-
# streaming=False,
|
| 171 |
-
# type="numpy",
|
| 172 |
-
# ),
|
| 173 |
gr.Audio(
|
| 174 |
-
label="Vocos аудіо:",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
autoplay=False,
|
| 176 |
streaming=False,
|
| 177 |
type="numpy",
|
| 178 |
),
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
|
| 186 |
],
|
| 187 |
allow_flagging ='manual',
|
| 188 |
-
flagging_options=[("Якщо дуже погоне аудіо, тисни цю кнопку.", "negative")],
|
| 189 |
cache_examples=True,
|
| 190 |
title='',
|
| 191 |
# description=description,
|
|
|
|
| 28 |
|
| 29 |
PFLOW_MODEL_PATH = 'checkpoints/checkpoint_epoch=649.ckpt'
|
| 30 |
#PFLOW_MODEL_PATH = 'checkpoint_m_epoch=054.ckpt'
|
| 31 |
+
VOCODER22_MODEL_PATH = 'BSC-LT/vocos-mel-22khz'
|
| 32 |
+
VOCODER44_MODEL_PATH = 'patriotyk/vocos-mel-hifigan-compat-44100khz'
|
| 33 |
+
|
| 34 |
HIFIGAN_MODEL_PATH = 'checkpoints/g_00120000'
|
| 35 |
|
| 36 |
|
| 37 |
transform = torchaudio.transforms.Vol(gain=-32, gain_type="db")
|
| 38 |
+
wav, sr = torchaudio.load('prompt22050.wav')
|
| 39 |
|
| 40 |
prompt = mel_spectrogram(
|
| 41 |
+
transform(wav),
|
| 42 |
1024,
|
| 43 |
80,
|
| 44 |
22050,
|
|
|
|
| 87 |
|
| 88 |
|
| 89 |
def to_waveform(mel, vocoder, denoiser=None):
|
| 90 |
+
return vocoder.decode(mel).clamp(-1, 1).cpu().squeeze()
|
| 91 |
|
| 92 |
# audio = vocoder(mel).clamp(-1, 1)
|
| 93 |
# if denoiser is not None:
|
|
|
|
| 115 |
_ = model.eval()
|
| 116 |
|
| 117 |
|
| 118 |
+
hifigan = load_hifigan(HIFIGAN_MODEL_PATH, device)
|
| 119 |
+
vocos_22050 = Vocos.from_pretrained(VOCODER22_MODEL_PATH)
|
| 120 |
+
#vocos_44100 = load_vocos('checkpoints/vocos_checkpoint_epoch=209_step=3924480_val_loss=3.7036_44100_11.ckpt', 'vocos.yaml', device)
|
| 121 |
+
vocos_44100 = Vocos.from_pretrained(VOCODER44_MODEL_PATH)
|
| 122 |
denoiser = None#Denoiser(vocoder, mode="zeros")
|
| 123 |
|
| 124 |
|
|
|
|
| 137 |
length_scale=1/speed,
|
| 138 |
prompt=normalize(prompt, model.mel_mean, model.mel_std).to(device),
|
| 139 |
|
| 140 |
+
guidance_scale=2.0
|
| 141 |
|
| 142 |
)
|
| 143 |
+
waveform_vocos = vocos_22050.decode(output["mel"]).cpu().squeeze()
|
| 144 |
+
waveform_vocos_44100 = vocos_44100.decode(output["mel"]).cpu().squeeze()
|
| 145 |
+
waveform_hifigan = hifigan(output["mel"]).clamp(-1, 1).cpu().squeeze()
|
| 146 |
+
transform = torchaudio.transforms.Vol(gain=-18, gain_type="db")
|
| 147 |
|
| 148 |
|
| 149 |
+
return text_processed['x_phones'][1::2], (44100, waveform_vocos_44100.numpy()), (22050, waveform_vocos.numpy()), (22050, transform(waveform_hifigan).numpy())
|
| 150 |
|
| 151 |
|
| 152 |
description = f'''
|
| 153 |
# Експериментальна апка для генерації аудіо з тексту.
|
| 154 |
|
| 155 |
pflow checkpoint {PFLOW_MODEL_PATH}
|
| 156 |
+
Vocos 44100 аудіо - {VOCODER44_MODEL_PATH}
|
| 157 |
+
Vocos 22050 аудіо - {VOCODER22_MODEL_PATH}
|
| 158 |
+
HIFIGAN 22050 аудіо - {HIFIGAN_MODEL_PATH}
|
| 159 |
'''
|
| 160 |
|
| 161 |
|
|
|
|
| 169 |
],
|
| 170 |
outputs=[
|
| 171 |
gr.Text(label='Фонемізований текст:', lines=5),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
gr.Audio(
|
| 173 |
+
label="Vocos 44100 аудіо:",
|
| 174 |
+
autoplay=False,
|
| 175 |
+
streaming=False,
|
| 176 |
+
type="numpy",
|
| 177 |
+
),
|
| 178 |
+
gr.Audio(
|
| 179 |
+
label="Vocos 22050 аудіо:",
|
| 180 |
autoplay=False,
|
| 181 |
streaming=False,
|
| 182 |
type="numpy",
|
| 183 |
),
|
| 184 |
+
gr.Audio(
|
| 185 |
+
label="HIFIGAN 22050 аудіо:",
|
| 186 |
+
autoplay=False,
|
| 187 |
+
streaming=False,
|
| 188 |
+
type="numpy",
|
| 189 |
+
)
|
| 190 |
|
| 191 |
],
|
| 192 |
allow_flagging ='manual',
|
| 193 |
+
#flagging_options=[("Якщо дуже погоне аудіо, тисни цю кнопку.", "negative")],
|
| 194 |
cache_examples=True,
|
| 195 |
title='',
|
| 196 |
# description=description,
|
prompt.wav
DELETED
|
Binary file (112 kB)
|
|
|
prompt22050.wav
ADDED
|
Binary file (655 kB). View file
|
|
|