mrfakename committed on
Commit
d9c8497
1 Parent(s): 364456d

Sync from GitHub repo

Browse files

This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the Space there.

Files changed (1) hide show
  1. inference-cli.py +5 -4
inference-cli.py CHANGED
@@ -118,7 +118,7 @@ if args.load_vocoder_from_local:
118
  vocos.load_state_dict(state_dict)
119
  vocos.eval()
120
  else:
121
- print("Donwload Vocos from huggingface charactr/vocos-mel-24khz")
122
  vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
123
 
124
  print(f"Using {device} device")
@@ -323,7 +323,7 @@ def infer_batch(ref_audio, ref_text, gen_text_batches, model,ckpt_file,file_voca
323
  return final_wave, combined_spectrogram
324
 
325
  def process_voice(ref_audio_orig, ref_text):
326
- print("Converting audio...")
327
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
328
  aseg = AudioSegment.from_file(ref_audio_orig)
329
 
@@ -361,7 +361,6 @@ def process_voice(ref_audio_orig, ref_text):
361
  return ref_audio, ref_text
362
 
363
  def infer(ref_audio, ref_text, gen_text, model,ckpt_file,file_vocab, remove_silence, cross_fade_duration=0.15):
364
- print(gen_text)
365
  # Add the functionality to ensure it ends with ". "
366
  if not ref_text.endswith(". ") and not ref_text.endswith("。"):
367
  if ref_text.endswith("."):
@@ -373,7 +372,6 @@ def infer(ref_audio, ref_text, gen_text, model,ckpt_file,file_vocab, remove_sile
373
  audio, sr = torchaudio.load(ref_audio)
374
  max_chars = int(len(ref_text.encode('utf-8')) / (audio.shape[-1] / sr) * (25 - audio.shape[-1] / sr))
375
  gen_text_batches = chunk_text(gen_text, max_chars=max_chars)
376
- print('ref_text', ref_text)
377
  for i, gen_text in enumerate(gen_text_batches):
378
  print(f'gen_text {i}', gen_text)
379
 
@@ -390,6 +388,9 @@ def process(ref_audio, ref_text, text_gen, model,ckpt_file,file_vocab, remove_si
390
  voices["main"] = main_voice
391
  for voice in voices:
392
  voices[voice]['ref_audio'], voices[voice]['ref_text'] = process_voice(voices[voice]['ref_audio'], voices[voice]['ref_text'])
 
 
 
393
 
394
  generated_audio_segments = []
395
  reg1 = r'(?=\[\w+\])'
 
118
  vocos.load_state_dict(state_dict)
119
  vocos.eval()
120
  else:
121
+ print("Download Vocos from huggingface charactr/vocos-mel-24khz")
122
  vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")
123
 
124
  print(f"Using {device} device")
 
323
  return final_wave, combined_spectrogram
324
 
325
  def process_voice(ref_audio_orig, ref_text):
326
+ print("Converting", ref_audio_orig)
327
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
328
  aseg = AudioSegment.from_file(ref_audio_orig)
329
 
 
361
  return ref_audio, ref_text
362
 
363
  def infer(ref_audio, ref_text, gen_text, model,ckpt_file,file_vocab, remove_silence, cross_fade_duration=0.15):
 
364
  # Add the functionality to ensure it ends with ". "
365
  if not ref_text.endswith(". ") and not ref_text.endswith("。"):
366
  if ref_text.endswith("."):
 
372
  audio, sr = torchaudio.load(ref_audio)
373
  max_chars = int(len(ref_text.encode('utf-8')) / (audio.shape[-1] / sr) * (25 - audio.shape[-1] / sr))
374
  gen_text_batches = chunk_text(gen_text, max_chars=max_chars)
 
375
  for i, gen_text in enumerate(gen_text_batches):
376
  print(f'gen_text {i}', gen_text)
377
 
 
388
  voices["main"] = main_voice
389
  for voice in voices:
390
  voices[voice]['ref_audio'], voices[voice]['ref_text'] = process_voice(voices[voice]['ref_audio'], voices[voice]['ref_text'])
391
+ print("Voice:", voice)
392
+ print("Ref_audio:", voices[voice]['ref_audio'])
393
+ print("Ref_text:", voices[voice]['ref_text'])
394
 
395
  generated_audio_segments = []
396
  reg1 = r'(?=\[\w+\])'