gorkemgoknar committed on
Commit
156829e
1 Parent(s): f7c2b84

Use xtts-streaming for generation

Files changed (1)
  1. app.py +227 -136
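
In short: generation switches from the blocking `model.inference(...)` call (kept only as a commented-out "direct mode" block) to chunked generation with `model.inference_stream(...)`, so audio is available after the first decoded chunk instead of after the full utterance. A minimal sketch of the new flow, using only names that appear in the diff below; `model`, `speaker_wav`, `prompt`, and `language` are assumed to be set up as in app.py, with model loading and the Gradio wiring elided:

    import time

    import torch
    import torchaudio

    # One-time cost per reference clip: compute speaker conditioning latents.
    gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(
        audio_path=speaker_wav
    )

    # inference_stream() yields audio chunks as they are decoded.
    chunks = model.inference_stream(
        prompt, language, gpt_cond_latent, speaker_embedding, decoder="ne_hifigan"
    )

    t0 = time.time()
    wav_chunks = []
    for i, chunk in enumerate(chunks):
        if i == 0:
            # Perceived latency: time until the first audible chunk.
            print(f"First chunk after {time.time() - t0:.2f}s")
        wav_chunks.append(chunk)

    # Chunks are 1-D tensors at 24 kHz; concatenate and save as mono.
    wav = torch.cat(wav_chunks, dim=0)
    torchaudio.save("output.wav", wav.squeeze().unsqueeze(0).cpu(), 24000)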
app.py CHANGED
@@ -3,16 +3,17 @@ import io, os, stat
 import subprocess
 import random
 from zipfile import ZipFile
-import uuid
+import uuid
 import time
 import torch
 import torchaudio
+
 # By using XTTS you agree to CPML license https://coqui.ai/cpml
 os.environ["COQUI_TOS_AGREED"] = "1"
 
 # langid is used to detect language for longer text
 # Most users expect text to be their own language, there is checkbox to disable it
-import langid
+import langid
 import base64
 import csv
 from io import StringIO
@@ -39,12 +40,13 @@ repo_id = "coqui/xtts"
 print("Export newer ffmpeg binary for denoise filter")
 ZipFile("ffmpeg.zip").extractall()
 print("Make ffmpeg binary executable")
-st = os.stat('ffmpeg')
-os.chmod('ffmpeg', st.st_mode | stat.S_IEXEC)
+st = os.stat("ffmpeg")
+os.chmod("ffmpeg", st.st_mode | stat.S_IEXEC)
 
 # This will trigger downloading model
 print("Downloading if not downloaded Coqui XTTS V1.1")
 from TTS.utils.manage import ModelManager
+
 model_name = "tts_models/multilingual/multi-dataset/xtts_v1.1"
 ModelManager().download_model(model_name)
 model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
@@ -63,156 +65,188 @@ model.load_checkpoint(
     checkpoint_path=os.path.join(model_path, "model.pth"),
     vocab_path=os.path.join(model_path, "vocab.json"),
     eval=True,
-    use_deepspeed=True
+    use_deepspeed=True,
 )
 model.cuda()
 
 # This is for debugging purposes only
-DEVICE_ASSERT_DETECTED=0
-DEVICE_ASSERT_PROMPT=None
-DEVICE_ASSERT_LANG=None
+DEVICE_ASSERT_DETECTED = 0
+DEVICE_ASSERT_PROMPT = None
+DEVICE_ASSERT_LANG = None
 
 
-
-#supported_languages=["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn"]
-supported_languages=config.languages
+# supported_languages=["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn"]
+supported_languages = config.languages
 
-def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_cleanup, no_lang_auto_detect, agree,):
+
+def predict(
+    prompt,
+    language,
+    audio_file_pth,
+    mic_file_path,
+    use_mic,
+    voice_cleanup,
+    no_lang_auto_detect,
+    agree,
+):
     if agree == True:
-
-
         if language not in supported_languages:
-            gr.Warning(f"Language you put {language} in is not in our Supported Languages, please choose from dropdown")
-
+            gr.Warning(
+                f"Language you put {language} in is not in our Supported Languages, please choose from dropdown"
+            )
+
             return (
-                None,
-                None,
-                None,
-                None,
-            )
+                None,
+                None,
+                None,
+                None,
+            )
 
-        language_predicted=langid.classify(prompt)[0].strip() # strip needed as there is space at end!
+        language_predicted = langid.classify(prompt)[
+            0
+        ].strip()  # strip needed as there is space at end!
 
         # tts expects chinese as zh-cn
-        if language_predicted == "zh":
-            #we use zh-cn
+        if language_predicted == "zh":
+            # we use zh-cn
             language_predicted = "zh-cn"
 
         print(f"Detected language:{language_predicted}, Chosen language:{language}")
 
         # After text character length 15 trigger language detection
-        if len(prompt)>15:
+        if len(prompt) > 15:
             # allow any language for short text as some may be common
             # If user unchecks language autodetection it will not trigger
             # You may remove this completely for own use
             if language_predicted != language and not no_lang_auto_detect:
-                #Please duplicate and remove this check if you really want this
-                #Or auto-detector fails to identify language (which it can on pretty short text or mixed text)
-                gr.Warning(f"It looks like your text isn’t the language you chose, if you’re sure the text is the same language you chose, please check the disable language auto-detection checkbox")
-
+                # Please duplicate and remove this check if you really want this
+                # Or auto-detector fails to identify language (which it can on pretty short text or mixed text)
+                gr.Warning(
+                    f"It looks like your text isn’t the language you chose, if you’re sure the text is the same language you chose, please check the disable language auto-detection checkbox"
+                )
+
                 return (
-                    None,
-                    None,
-                    None,
-                    None,
-                )
+                    None,
+                    None,
+                    None,
+                    None,
+                )
 
-
         if use_mic == True:
             if mic_file_path is not None:
-                speaker_wav=mic_file_path
+                speaker_wav = mic_file_path
             else:
-                gr.Warning("Please record your voice with Microphone, or uncheck Use Microphone to use reference audios")
+                gr.Warning(
+                    "Please record your voice with Microphone, or uncheck Use Microphone to use reference audios"
+                )
                 return (
                     None,
                     None,
                     None,
                     None,
-                )
-
+                )
+
         else:
-            speaker_wav=audio_file_pth
+            speaker_wav = audio_file_pth
 
-
         # Filtering for microphone input, as it has BG noise, maybe silence in beginning and end
         # This is fast filtering not perfect
 
         # Apply all on demand
-        lowpassfilter=denoise=trim=loudness=True
-
+        lowpassfilter = denoise = trim = loudness = True
+
         if lowpassfilter:
-            lowpass_highpass="lowpass=8000,highpass=75,"
+            lowpass_highpass = "lowpass=8000,highpass=75,"
         else:
-            lowpass_highpass=""
+            lowpass_highpass = ""
 
         if trim:
             # better to remove silence in beginning and end for microphone
-            trim_silence="areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
+            trim_silence = "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
         else:
-            trim_silence=""
-
-        if (voice_cleanup):
+            trim_silence = ""
+
+        if voice_cleanup:
             try:
-                out_filename = speaker_wav + str(uuid.uuid4()) + ".wav" #ffmpeg to know output format
-
-                #we will use newer ffmpeg as that has afftdn denoise filter
-                shell_command = f"./ffmpeg -y -i {speaker_wav} -af {lowpass_highpass}{trim_silence} {out_filename}".split(" ")
-
-                command_result = subprocess.run([item for item in shell_command], capture_output=False,text=True, check=True)
-                speaker_wav=out_filename
+                out_filename = (
+                    speaker_wav + str(uuid.uuid4()) + ".wav"
+                )  # ffmpeg to know output format
+
+                # we will use newer ffmpeg as that has afftdn denoise filter
+                shell_command = f"./ffmpeg -y -i {speaker_wav} -af {lowpass_highpass}{trim_silence} {out_filename}".split(
+                    " "
+                )
+
+                command_result = subprocess.run(
+                    [item for item in shell_command],
+                    capture_output=False,
+                    text=True,
+                    check=True,
+                )
+                speaker_wav = out_filename
                 print("Filtered microphone input")
             except subprocess.CalledProcessError:
                 # There was an error - command exited with non-zero code
                 print("Error: failed filtering, use original microphone input")
         else:
-            speaker_wav=speaker_wav
+            speaker_wav = speaker_wav
 
-        if len(prompt)<2:
+        if len(prompt) < 2:
             gr.Warning("Please give a longer prompt text")
             return (
-                None,
-                None,
-                None,
-                None,
-            )
-        if len(prompt)>200:
-            gr.Warning("Text length limited to 200 characters for this demo, please try shorter text. You can clone this space and edit code for your own usage")
+                None,
+                None,
+                None,
+                None,
+            )
+        if len(prompt) > 200:
+            gr.Warning(
+                "Text length limited to 200 characters for this demo, please try shorter text. You can clone this space and edit code for your own usage"
+            )
             return (
-                None,
-                None,
-                None,
-                None,
-            )
+                None,
+                None,
+                None,
+                None,
+            )
         global DEVICE_ASSERT_DETECTED
         if DEVICE_ASSERT_DETECTED:
             global DEVICE_ASSERT_PROMPT
             global DEVICE_ASSERT_LANG
-            #It will likely never come here as we restart space on first unrecoverable error now
-            print(f"Unrecoverable exception caused by language:{DEVICE_ASSERT_LANG} prompt:{DEVICE_ASSERT_PROMPT}")
-
-        try:
-            metrics_text=""
-            t_latent=time.time()
-
+            # It will likely never come here as we restart space on first unrecoverable error now
+            print(
+                f"Unrecoverable exception caused by language:{DEVICE_ASSERT_LANG} prompt:{DEVICE_ASSERT_PROMPT}"
+            )
+
+        try:
+            metrics_text = ""
+            t_latent = time.time()
+
            # note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference
            try:
-                gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav)
+                (
+                    gpt_cond_latent,
+                    diffusion_conditioning,
+                    speaker_embedding,
+                ) = model.get_conditioning_latents(audio_path=speaker_wav)
            except Exception as e:
                print("Speaker encoding error", str(e))
-                gr.Warning("It appears something is wrong with the reference, did you unmute your microphone?")
+                gr.Warning(
+                    "It appears something is wrong with the reference, did you unmute your microphone?"
+                )
                return (
                    None,
                    None,
                    None,
                    None,
-                )
-
-
+                )
+
            latent_calculation_time = time.time() - t_latent
-            #metrics_text=f"Embedding calculation time: {latent_calculation_time:.2f} seconds\n"
+            # metrics_text=f"Embedding calculation time: {latent_calculation_time:.2f} seconds\n"
 
            wav_chunks = []
-
+            ## Direct mode
+            """
            print("I: Generating new audio...")
            t0 = time.time()
            out = model.inference(
@@ -230,29 +264,78 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_clea
             print(f"Real-time factor (RTF): {real_time_factor}")
             metrics_text+=f"Real-time factor (RTF): {real_time_factor:.2f}\n"
             torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
-        except RuntimeError as e :
+            """
+
+            print("I: Generating new audio in streaming mode...")
+            t0 = time.time()
+            chunks = model.inference_stream(
+                prompt,
+                language,
+                gpt_cond_latent,
+                speaker_embedding,
+                decoder="ne_hifigan",
+            )
+
+            first_chunk = True
+            for i, chunk in enumerate(chunks):
+                if first_chunk:
+                    first_chunk_time = time.time() - t0
+                    metrics_text += f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
+                    first_chunk = False
+                wav_chunks.append(chunk)
+                print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
+            inference_time = time.time() - t0
+            print(
+                f"I: Time to generate audio: {round(inference_time*1000)} milliseconds"
+            )
+            metrics_text += (
+                f"Time to generate audio: {round(inference_time*1000)} milliseconds\n"
+            )
+
+            wav = torch.cat(wav_chunks, dim=0)
+            print(wav.shape)
+            real_time_factor = (time.time() - t0) / wav.shape[0] * 24000
+            print(f"Real-time factor (RTF): {real_time_factor}")
+            metrics_text += f"Real-time factor (RTF): {real_time_factor:.2f}\n"
+
+            torchaudio.save("output.wav", wav.squeeze().unsqueeze(0).cpu(), 24000)
+
+        except RuntimeError as e:
             if "device-side assert" in str(e):
                 # cannot do anything on cuda device side error, need to restart
-                print(f"Exit due to: Unrecoverable exception caused by language:{language} prompt:{prompt}", flush=True)
+                print(
+                    f"Exit due to: Unrecoverable exception caused by language:{language} prompt:{prompt}",
+                    flush=True,
+                )
                 gr.Warning("Unhandled Exception encountered, please retry in a minute")
                 print("Cuda device-assert Runtime encountered, need restart")
                 if not DEVICE_ASSERT_DETECTED:
-                    DEVICE_ASSERT_DETECTED=1
-                    DEVICE_ASSERT_PROMPT=prompt
-                    DEVICE_ASSERT_LANG=language
-
+                    DEVICE_ASSERT_DETECTED = 1
+                    DEVICE_ASSERT_PROMPT = prompt
+                    DEVICE_ASSERT_LANG = language
+
                 # just before restarting save what caused the issue so we can handle it in future
                 # Uploading Error data only happens for unrecoverable error
-                error_time = datetime.datetime.now().strftime('%d-%m-%Y-%H:%M:%S')
-                error_data = [error_time, prompt, language, audio_file_pth, mic_file_path, use_mic, voice_cleanup, no_lang_auto_detect, agree]
-                error_data = [str(e) if type(e)!=str else e for e in error_data]
+                error_time = datetime.datetime.now().strftime("%d-%m-%Y-%H:%M:%S")
+                error_data = [
+                    error_time,
+                    prompt,
+                    language,
+                    audio_file_pth,
+                    mic_file_path,
+                    use_mic,
+                    voice_cleanup,
+                    no_lang_auto_detect,
+                    agree,
+                ]
+                error_data = [str(e) if type(e) != str else e for e in error_data]
                 print(error_data)
                 print(speaker_wav)
                 write_io = StringIO()
                 csv.writer(write_io).writerows([error_data])
-                csv_upload= write_io.getvalue().encode()
-
-                filename = error_time+"_" + str(uuid.uuid4()) +".csv"
+                csv_upload = write_io.getvalue().encode()
+
+                filename = error_time + "_" + str(uuid.uuid4()) + ".csv"
                 print("Writing error csv")
                 error_api = HfApi()
                 error_api.upload_file(
@@ -261,10 +344,12 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_clea
                     repo_id="coqui/xtts-flagged-dataset",
                     repo_type="dataset",
                 )
-
-                #speaker_wav
+
+                # speaker_wav
                 print("Writing error reference audio")
-                speaker_filename = error_time+"_reference_"+ str(uuid.uuid4()) +".wav"
+                speaker_filename = (
+                    error_time + "_reference_" + str(uuid.uuid4()) + ".wav"
+                )
                 error_api = HfApi()
                 error_api.upload_file(
                     path_or_fileobj=speaker_wav,
@@ -273,21 +358,23 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_clea
                     repo_type="dataset",
                 )
 
-                # HF Space specific.. This error is unrecoverable, need to restart space
+                # HF Space specific.. This error is unrecoverable, need to restart space
                 api.restart_space(repo_id=repo_id)
             else:
                 if "Failed to decode" in str(e):
                     print("Speaker encoding error", str(e))
-                    gr.Warning("It appears something is wrong with the reference, did you unmute your microphone?")
+                    gr.Warning(
+                        "It appears something is wrong with the reference, did you unmute your microphone?"
+                    )
                 else:
                     print("RuntimeError: non device-side assert error:", str(e))
                     gr.Warning("Something unexpected happened, please retry again.")
                 return (
-                    None,
-                    None,
-                    None,
-                    None,
-                )
+                    None,
+                    None,
+                    None,
+                    None,
+                )
         return (
             gr.make_waveform(
                 audio="output.wav",
@@ -299,11 +386,11 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_clea
     else:
         gr.Warning("Please accept the Terms & Conditions!")
         return (
-            None,
-            None,
-            None,
-            None,
-        )
+            None,
+            None,
+            None,
+            None,
+        )
 
 
 title = "Coqui🐸 XTTS"
@@ -351,7 +438,6 @@ examples = [
         False,
         False,
         True,
-
     ],
     [
         "Lorsque j'avais six ans j'ai vu, une fois, une magnifique image",
@@ -408,7 +494,7 @@ examples = [
         "it",
         "examples/female.wav",
         None,
-        False,
+        False,
         False,
         False,
         True,
@@ -428,7 +514,7 @@ examples = [
         "ru",
         "examples/female.wav",
         None,
-        False,
+        False,
         False,
         False,
         True,
@@ -438,7 +524,7 @@ examples = [
         "nl",
         "examples/male.wav",
         None,
-        False,
+        False,
         False,
         False,
         True,
@@ -448,7 +534,7 @@ examples = [
         "cs",
         "examples/female.wav",
         None,
-        False,
+        False,
         False,
         False,
         True,
@@ -458,7 +544,7 @@ examples = [
         "zh-cn",
         "examples/female.wav",
         None,
-        False,
+        False,
         False,
         False,
         True,
@@ -476,7 +562,6 @@ examples = [
 ]
 
 
-
 gr.Interface(
     fn=predict,
     inputs=[
@@ -502,7 +587,7 @@ gr.Interface(
             "cs",
             "ar",
             "zh-cn",
-            "ja"
+            "ja",
         ],
         max_choices=1,
         value="en",
@@ -513,31 +598,36 @@ gr.Interface(
             type="filepath",
             value="examples/female.wav",
         ),
-        gr.Audio(source="microphone",
-                 type="filepath",
-                 info="Use your microphone to record audio",
-                 label="Use Microphone for Reference"),
-        gr.Checkbox(label="Use Microphone",
-                    value=False,
-                    info="Notice: Microphone input may not work properly under traffic",),
-        gr.Checkbox(label="Cleanup Reference Voice",
-                    value=False,
-                    info="This check can improve output if your microphone or reference voice is noisy",
-                    ),
-        gr.Checkbox(label="Do not use language auto-detect",
-                    value=False,
-                    info="Check to disable language auto-detection",),
+        gr.Audio(
+            source="microphone",
+            type="filepath",
+            info="Use your microphone to record audio",
+            label="Use Microphone for Reference",
+        ),
+        gr.Checkbox(
+            label="Use Microphone",
+            value=False,
+            info="Notice: Microphone input may not work properly under traffic",
+        ),
+        gr.Checkbox(
+            label="Cleanup Reference Voice",
+            value=False,
+            info="This check can improve output if your microphone or reference voice is noisy",
+        ),
+        gr.Checkbox(
+            label="Do not use language auto-detect",
+            value=False,
+            info="Check to disable language auto-detection",
+        ),
         gr.Checkbox(
             label="Agree",
             value=False,
             info="I agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml",
         ),
-
-
     ],
     outputs=[
         gr.Video(label="Waveform Visual"),
-        gr.Audio(label="Synthesised Audio",autoplay=True),
+        gr.Audio(label="Synthesised Audio", autoplay=True),
         gr.Text(label="Metrics"),
         gr.Audio(label="Reference Audio Used"),
     ],
@@ -545,4 +635,5 @@ gr.Interface(
     description=description,
     article=article,
     examples=examples,
-).queue().launch(debug=True,show_api=True)
+).queue().launch(debug=True, show_api=True)
+
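Two notes on the new code path. The "Latency to first audio chunk" metric is the wall-clock time until `inference_stream` yields its first chunk, and the real-time factor is generation time divided by audio duration at 24 kHz, i.e. `(time.time() - t0) / wav.shape[0] * 24000`. As an illustrative (made-up) example: 2.4 s to generate 144000 samples, which is 144000 / 24000 = 6 s of audio, gives RTF = 2.4 / 6 = 0.4, comfortably faster than real time. The voice-cleanup branch is unchanged in behavior: its ffmpeg filter chain combines `lowpass=8000,highpass=75`, which band-limits the reference to the speech range, with a doubled `areverse,silenceremove=...` pass that trims silence from both ends by reversing, trimming the start, and reversing again.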