Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -1,98 +1,64 @@
|
|
1 |
from turtle import title
|
2 |
import gradio as gr
|
3 |
-
|
4 |
import git
|
5 |
import os
|
6 |
os.system('git clone https://github.com/Edresson/Coqui-TTS -b multilingual-torchaudio-SE TTS')
|
7 |
os.system('pip install -q -e TTS/')
|
8 |
os.system('pip install -q torchaudio==0.9.0')
|
9 |
-
|
10 |
import sys
|
11 |
TTS_PATH = "TTS/"
|
12 |
-
|
13 |
-
# add libraries into environment
|
14 |
sys.path.append(TTS_PATH) # set this if TTS is not installed globally
|
15 |
-
|
16 |
import os
|
17 |
import string
|
18 |
import time
|
19 |
import argparse
|
20 |
import json
|
21 |
-
|
22 |
import numpy as np
|
23 |
import IPython
|
24 |
from IPython.display import Audio
|
25 |
-
|
26 |
-
|
27 |
import torch
|
28 |
-
|
29 |
from TTS.tts.utils.synthesis import synthesis
|
30 |
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
|
31 |
try:
|
32 |
from TTS.utils.audio import AudioProcessor
|
33 |
except:
|
34 |
from TTS.utils.audio import AudioProcessor
|
35 |
-
|
36 |
-
|
37 |
from TTS.tts.models import setup_model
|
38 |
from TTS.config import load_config
|
39 |
from TTS.tts.models.vits import *
|
40 |
|
41 |
OUT_PATH = 'out/'
|
42 |
-
|
43 |
-
# create output path
|
44 |
os.makedirs(OUT_PATH, exist_ok=True)
|
45 |
|
46 |
-
# model vars
|
47 |
MODEL_PATH = '/home/user/app/best_model_latest.pth.tar'
|
48 |
CONFIG_PATH = '/home/user/app/config.json'
|
49 |
TTS_LANGUAGES = "/home/user/app/language_ids.json"
|
50 |
TTS_SPEAKERS = "/home/user/app/speakers.json"
|
51 |
USE_CUDA = torch.cuda.is_available()
|
52 |
|
53 |
-
# load the config
|
54 |
C = load_config(CONFIG_PATH)
|
55 |
-
|
56 |
-
|
57 |
-
# load the audio processor
|
58 |
ap = AudioProcessor(**C.audio)
|
59 |
-
|
60 |
speaker_embedding = None
|
61 |
-
|
62 |
C.model_args['d_vector_file'] = TTS_SPEAKERS
|
63 |
C.model_args['use_speaker_encoder_as_loss'] = False
|
64 |
-
|
65 |
model = setup_model(C)
|
66 |
model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)
|
67 |
-
# print(model.language_manager.num_languages, model.embedded_language_dim)
|
68 |
-
# print(model.emb_l)
|
69 |
cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))
|
70 |
-
# remove speaker encoder
|
71 |
model_weights = cp['model'].copy()
|
72 |
for key in list(model_weights.keys()):
|
73 |
if "speaker_encoder" in key:
|
74 |
del model_weights[key]
|
75 |
-
|
76 |
model.load_state_dict(model_weights)
|
77 |
-
|
78 |
-
|
79 |
model.eval()
|
80 |
-
|
81 |
if USE_CUDA:
|
82 |
model = model.cuda()
|
83 |
-
|
84 |
-
# synthesize voice
|
85 |
use_griffin_lim = False
|
86 |
-
|
87 |
os.system('pip install -q pydub ffmpeg-normalize')
|
88 |
-
|
89 |
CONFIG_SE_PATH = "config_se.json"
|
90 |
CHECKPOINT_SE_PATH = "SE_checkpoint.pth.tar"
|
91 |
-
|
92 |
from TTS.tts.utils.speakers import SpeakerManager
|
93 |
from pydub import AudioSegment
|
94 |
import librosa
|
95 |
-
|
96 |
SE_speaker_manager = SpeakerManager(encoder_model_path=CHECKPOINT_SE_PATH, encoder_config_path=CONFIG_SE_PATH, use_cuda=USE_CUDA)
|
97 |
|
98 |
def compute_spec(ref_file):
|
@@ -101,8 +67,6 @@ def compute_spec(ref_file):
|
|
101 |
spec = torch.FloatTensor(spec).unsqueeze(0)
|
102 |
return spec
|
103 |
|
104 |
-
|
105 |
-
|
106 |
def greet(Text,Voicetoclone,VoiceMicrophone):
|
107 |
text= "%s" % (Text)
|
108 |
if Voicetoclone is not None:
|
@@ -130,7 +94,6 @@ def greet(Text,Voicetoclone,VoiceMicrophone):
|
|
130 |
text = text
|
131 |
model.language_manager.language_id_mapping
|
132 |
language_id = 0
|
133 |
-
|
134 |
print(" > text: {}".format(text))
|
135 |
wav, alignment, _, _ = synthesis(
|
136 |
model,
|
@@ -160,6 +123,6 @@ demo = gr.Interface(
|
|
160 |
fn=greet,
|
161 |
inputs=[gr.inputs.Textbox(label='What would you like the voice to say? (max. 2000 characters per request)'),gr.Audio(type="filepath", source="upload",label='Please upload a voice to clone (max. 30mb)'),gr.Audio(source="microphone", type="filepath", streaming=True)],
|
162 |
outputs="audio",
|
163 |
-
title="
|
164 |
)
|
165 |
demo.launch()
|
|
|
1 |
from turtle import title
|
2 |
import gradio as gr
|
|
|
3 |
import git
|
4 |
import os
|
5 |
os.system('git clone https://github.com/Edresson/Coqui-TTS -b multilingual-torchaudio-SE TTS')
|
6 |
os.system('pip install -q -e TTS/')
|
7 |
os.system('pip install -q torchaudio==0.9.0')
|
|
|
8 |
import sys
|
9 |
TTS_PATH = "TTS/"
|
|
|
|
|
10 |
sys.path.append(TTS_PATH) # set this if TTS is not installed globally
|
|
|
11 |
import os
|
12 |
import string
|
13 |
import time
|
14 |
import argparse
|
15 |
import json
|
|
|
16 |
import numpy as np
|
17 |
import IPython
|
18 |
from IPython.display import Audio
|
|
|
|
|
19 |
import torch
|
|
|
20 |
from TTS.tts.utils.synthesis import synthesis
|
21 |
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
|
22 |
try:
|
23 |
from TTS.utils.audio import AudioProcessor
|
24 |
except:
|
25 |
from TTS.utils.audio import AudioProcessor
|
|
|
|
|
26 |
from TTS.tts.models import setup_model
|
27 |
from TTS.config import load_config
|
28 |
from TTS.tts.models.vits import *
|
29 |
|
30 |
OUT_PATH = 'out/'
|
|
|
|
|
31 |
os.makedirs(OUT_PATH, exist_ok=True)
|
32 |
|
|
|
33 |
MODEL_PATH = '/home/user/app/best_model_latest.pth.tar'
|
34 |
CONFIG_PATH = '/home/user/app/config.json'
|
35 |
TTS_LANGUAGES = "/home/user/app/language_ids.json"
|
36 |
TTS_SPEAKERS = "/home/user/app/speakers.json"
|
37 |
USE_CUDA = torch.cuda.is_available()
|
38 |
|
|
|
39 |
C = load_config(CONFIG_PATH)
|
|
|
|
|
|
|
40 |
ap = AudioProcessor(**C.audio)
|
|
|
41 |
speaker_embedding = None
|
|
|
42 |
C.model_args['d_vector_file'] = TTS_SPEAKERS
|
43 |
C.model_args['use_speaker_encoder_as_loss'] = False
|
|
|
44 |
model = setup_model(C)
|
45 |
model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)
|
|
|
|
|
46 |
cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))
|
|
|
47 |
model_weights = cp['model'].copy()
|
48 |
for key in list(model_weights.keys()):
|
49 |
if "speaker_encoder" in key:
|
50 |
del model_weights[key]
|
|
|
51 |
model.load_state_dict(model_weights)
|
|
|
|
|
52 |
model.eval()
|
|
|
53 |
if USE_CUDA:
|
54 |
model = model.cuda()
|
|
|
|
|
55 |
use_griffin_lim = False
|
|
|
56 |
os.system('pip install -q pydub ffmpeg-normalize')
|
|
|
57 |
CONFIG_SE_PATH = "config_se.json"
|
58 |
CHECKPOINT_SE_PATH = "SE_checkpoint.pth.tar"
|
|
|
59 |
from TTS.tts.utils.speakers import SpeakerManager
|
60 |
from pydub import AudioSegment
|
61 |
import librosa
|
|
|
62 |
SE_speaker_manager = SpeakerManager(encoder_model_path=CHECKPOINT_SE_PATH, encoder_config_path=CONFIG_SE_PATH, use_cuda=USE_CUDA)
|
63 |
|
64 |
def compute_spec(ref_file):
|
|
|
67 |
spec = torch.FloatTensor(spec).unsqueeze(0)
|
68 |
return spec
|
69 |
|
|
|
|
|
70 |
def greet(Text,Voicetoclone,VoiceMicrophone):
|
71 |
text= "%s" % (Text)
|
72 |
if Voicetoclone is not None:
|
|
|
94 |
text = text
|
95 |
model.language_manager.language_id_mapping
|
96 |
language_id = 0
|
|
|
97 |
print(" > text: {}".format(text))
|
98 |
wav, alignment, _, _ = synthesis(
|
99 |
model,
|
|
|
123 |
fn=greet,
|
124 |
inputs=[gr.inputs.Textbox(label='What would you like the voice to say? (max. 2000 characters per request)'),gr.Audio(type="filepath", source="upload",label='Please upload a voice to clone (max. 30mb)'),gr.Audio(source="microphone", type="filepath", streaming=True)],
|
125 |
outputs="audio",
|
126 |
+
title="Clone Any Voice"
|
127 |
)
|
128 |
demo.launch()
|