styletts2-public

Running

App Files Files Community

styletts2-public / app.py

Dupaja

Update app.py

4825e21 10 months ago

raw

history blame contribute delete

6.63 kB

	import gradio as gr
	import ljspeechimportable
	import torch
	import numpy as np
	import styletts2importable
	import re
	import phonemizer

	def split_and_recombine_text(text, desired_length=200, max_length=400):
	    """Split text into chunks of a desired length, trying to keep sentences intact.

	    The text is normalized (whitespace runs collapsed to one space, curly
	    double quotes converted to ASCII quotes) and then scanned one character at
	    a time. A chunk is emitted at a sentence boundary once it holds at least
	    ``desired_length`` characters. If a chunk reaches ``max_length`` first, the
	    scanner rewinds to the last recorded sentence boundary, or failing that to
	    the nearest preceding space/punctuation, and splits there. Sentence
	    boundaries inside double quotes are ignored; a closing quote followed by a
	    space counts as a boundary.

	    Args:
	        text: The string to split.
	        desired_length: Minimum chunk length (in characters) at which a
	            sentence boundary triggers a split.
	        max_length: Chunk length (in characters) at which a split is forced.

	    Returns:
	        A list of stripped, non-empty chunks. Chunks that consist only of
	        whitespace and punctuation are dropped, so the list may be empty.
	    """
	    # normalize text, remove redundant whitespace and convert non-ascii quotes to ascii
	    text = re.sub(r'\n\n+', '\n', text)
	    text = re.sub(r'\s+', ' ', text)
	    text = re.sub(r'[“”]', '"', text)

	    rv = []            # finished chunks
	    in_quote = False   # True while `pos` is inside a double-quoted span
	    current = ""       # chunk being accumulated
	    split_pos = []     # absolute positions of sentence boundaries seen in `current`
	    pos = -1           # index of the last character consumed from `text`
	    end_pos = len(text) - 1

	    def seek(delta):
	        """Move the cursor by `delta` characters, keeping `current` and `in_quote` in sync."""
	        nonlocal pos, in_quote, current
	        is_neg = delta < 0
	        for _ in range(abs(delta)):
	            if is_neg:
	                # The character leaving `current` is text[pos]. If it is a
	                # quote, its toggle must be undone *before* stepping back,
	                # otherwise `in_quote` stays inverted once the scanner moves
	                # forward over the same quote again.
	                if text[pos] == '"':
	                    in_quote = not in_quote
	                pos -= 1
	                current = current[:-1]
	            else:
	                pos += 1
	                current += text[pos]
	                if text[pos] == '"':
	                    in_quote = not in_quote
	        return text[pos]

	    def peek(delta):
	        """Return the character `delta` positions away, or "" when out of range."""
	        p = pos + delta
	        # NOTE: the upper bound is `end_pos`, so the final character is never
	        # visible here. Kept as-is: the end-of-quote branch below calls
	        # seek(2) based on peek(), and this bound keeps that call in range.
	        return text[p] if p < end_pos and p >= 0 else ""

	    def commit():
	        """Emit the current chunk and start a new one."""
	        nonlocal rv, current, split_pos
	        rv.append(current)
	        current = ""
	        split_pos = []

	    while pos < end_pos:
	        c = seek(1)
	        # do we need to force a split?
	        if len(current) >= max_length:
	            if len(split_pos) > 0 and len(current) > (desired_length / 2):
	                # we have at least one sentence and we are over half the desired length, seek back to the last split
	                d = pos - split_pos[-1]
	                seek(-d)
	            else:
	                # no full sentences, seek back until we are not in the middle of a word and split there
	                while c not in '!?.\n ' and pos > 0 and len(current) > desired_length:
	                    c = seek(-1)
	            commit()
	        # check for sentence boundaries
	        elif not in_quote and (c in '!?\n' or (c == '.' and peek(1) in '\n ')):
	            # seek forward if we have consecutive boundary markers but still within the max length
	            while pos < len(text) - 1 and len(current) < max_length and peek(1) in '!?.':
	                c = seek(1)
	            split_pos.append(pos)
	            if len(current) >= desired_length:
	                commit()
	        # treat end of quote as a boundary if its followed by a space or newline
	        elif in_quote and peek(1) == '"' and peek(2) in '\n ':
	            seek(2)
	            split_pos.append(pos)
	    # flush whatever is left after the last split
	    rv.append(current)

	    # clean up, remove lines with only whitespace or punctuation
	    rv = [s.strip() for s in rv]
	    rv = [s for s in rv if len(s) > 0 and not re.match(r'^[\s\.,;:!?]*$', s)]

	    return rv

	# Gradio theme for the app: only the font stack is customized.
	theme = gr.themes.Base(
	    font=[
	        gr.themes.GoogleFont('Libre Franklin'),
	        gr.themes.GoogleFont('Public Sans'),
	        'system-ui',
	        'sans-serif',
	    ],
	)

	# Voice identifiers offered in the UI, in display order:
	# f-us-1 .. f-us-4 followed by m-us-1 .. m-us-4.
	voicelist = [f'{prefix}-us-{number}' for prefix in ('f', 'm') for number in (1, 2, 3, 4)]
	# Maps each voice identifier to the style computed from its reference wav.
	voices = {}

	# Module-level phonemizer instance; nothing else in this file references it.
	global_phonemizer = phonemizer.backend.EspeakBackend(
	    language='en-us',
	    preserve_punctuation=True,
	    with_stress=True,
	)

	# Compute every voice style once at import time from voices/<id>.wav.
	for v in voicelist:
	    voices[v] = styletts2importable.compute_style(f'voices/{v}.wav')

	def synthesize(text, voice, lngsteps, password=None, progress=gr.Progress()):
	    """Synthesize `text` with one of the preloaded voices.

	    The text is split into chunks with `split_and_recombine_text`, each chunk
	    is synthesized separately, and the results are concatenated.

	    Args:
	        text: Text to read aloud.
	        voice: Voice identifier; matched case-insensitively against `voices`.
	        lngsteps: Number of diffusion steps passed to the model.
	        password: Unused. Optional because the click handler in this file
	            supplies only text, voice and steps; kept so existing callers
	            that pass a fourth argument keep working.
	        progress: Gradio progress tracker used to report per-chunk progress.

	    Returns:
	        A ``(24000, samples)`` tuple in the (sample rate, data) form accepted
	        by ``gr.Audio``.

	    Raises:
	        gr.Error: If the text is blank, contains nothing speakable, or the
	            voice is unknown.
	    """
	    if text.strip() == "":
	        raise gr.Error("You must enter some text")

	    v = voice.lower()
	    if v not in voices:
	        # Surface a readable error instead of a bare KeyError.
	        raise gr.Error(f"Unknown voice: {voice}")

	    texts = split_and_recombine_text(text)
	    if not texts:
	        # Punctuation-only input yields no chunks; np.concatenate([]) would raise ValueError.
	        raise gr.Error("You must enter some text")

	    audios = [
	        styletts2importable.inference(t, voices[v], alpha=0.3, beta=0.7, diffusion_steps=lngsteps, embedding_scale=1)
	        for t in progress.tqdm(texts)
	    ]
	    return (24000, np.concatenate(audios))


	def ljsynthesize(text, steps, progress=gr.Progress()):
	    """Synthesize `text` with the single-speaker LJSpeech model.

	    The text is split into chunks with `split_and_recombine_text`; every chunk
	    is synthesized with the same random noise tensor and the results are
	    concatenated.

	    Args:
	        text: Text to read aloud.
	        steps: Number of diffusion steps passed to the model.
	        progress: Gradio progress tracker used to report per-chunk progress.

	    Returns:
	        A ``(24000, samples)`` tuple in the (sample rate, data) form accepted
	        by ``gr.Audio``.

	    Raises:
	        gr.Error: If the text is blank or contains nothing speakable.
	    """
	    # Validate first so rejected requests do not allocate a tensor.
	    if text.strip() == "":
	        raise gr.Error("You must enter some text")
	    texts = split_and_recombine_text(text)
	    if not texts:
	        # Punctuation-only input yields no chunks; np.concatenate([]) would raise ValueError.
	        raise gr.Error("You must enter some text")

	    # One noise tensor shared by all chunks of this request.
	    noise = torch.randn(1,1,256).to('cuda' if torch.cuda.is_available() else 'cpu')
	    audios = [
	        ljspeechimportable.inference(t, noise, diffusion_steps=steps, embedding_scale=1)
	        for t in progress.tqdm(texts)
	    ]
	    return (24000, np.concatenate(audios))

	# Multi-voice tab: text, voice and step controls on the left, button and audio output on the right.
	with gr.Blocks() as libritts: # just realized it isn't vctk but libritts but i'm too lazy to change it rn
	    with gr.Row():
	        with gr.Column(scale=1):
	            inp = gr.Textbox(
	                label="Text",
	                info="What would you like StyleTTS 2 to read? It works better on full sentences.",
	                interactive=True,
	            )
	            voice = gr.Dropdown(
	                voicelist,
	                label="Voice",
	                info="Select a default voice.",
	                value='m-us-2',
	                interactive=True,
	            )
	            multispeakersteps = gr.Slider(
	                minimum=3,
	                maximum=15,
	                value=3,
	                step=1,
	                label="Diffusion Steps",
	                info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster",
	                interactive=True,
	            )
	            # use_gruut = gr.Checkbox(label="Use alternate phonemizer (Gruut) - Experimental")
	        with gr.Column(scale=1):
	            btn = gr.Button("Synthesize", variant="primary")
	            audio = gr.Audio(interactive=False, label="Synthesized Audio")
	            # Three inputs are wired to synthesize: text, voice, diffusion steps.
	            btn.click(
	                synthesize,
	                inputs=[inp, voice, multispeakersteps],
	                outputs=[audio],
	                concurrency_limit=4,
	            )

	# LJSpeech tab: text and step controls on the left, button and audio output on the right.
	with gr.Blocks() as lj:
	    with gr.Row():
	        with gr.Column(scale=1):
	            ljinp = gr.Textbox(
	                label="Text",
	                info="What would you like StyleTTS 2 to read? It works better on full sentences.",
	                interactive=True,
	            )
	            ljsteps = gr.Slider(
	                minimum=3,
	                maximum=20,
	                value=3,
	                step=1,
	                label="Diffusion Steps",
	                info="Theoretically, higher should be better quality but slower, but we cannot notice a difference. Try with lower steps first - it is faster",
	                interactive=True,
	            )
	        with gr.Column(scale=1):
	            ljbtn = gr.Button("Synthesize", variant="primary")
	            ljaudio = gr.Audio(interactive=False, label="Synthesized Audio")
	            # Two inputs are wired to ljsynthesize: text and diffusion steps.
	            ljbtn.click(
	                ljsynthesize,
	                inputs=[ljinp, ljsteps],
	                outputs=[ljaudio],
	                concurrency_limit=4,
	            )

	# Top-level app: duplicate button, the two synthesis tabs, and a credits/usage footer.
	with gr.Blocks(theme=theme, title="StyleTTS 2", css="") as demo:
	    gr.DuplicateButton(value="Duplicate Space")
	    gr.TabbedInterface(
	        interface_list=[libritts, lj],
	        tab_names=['Multi-Voice', 'LJSpeech'],
	    )
	    gr.Markdown("""
	Original Demo by [mrfakename](https://twitter.com/realmrfakename). I am not affiliated with the StyleTTS 2 authors.
	Run this demo locally using Docker:
	```bash
	docker run -it -p 7860:7860 --platform=linux/amd64 --gpus all registry.hf.space/styletts2-styletts2:latest python app.py
	```
	""") # Please do not remove this line.

	if __name__ == "__main__":
	    # Alternative with the API closed and hidden:
	    # demo.queue(api_open=False, max_size=15).launch(show_api=False)
	    # Enable the request queue (at most 15 waiting, API open), then start the server with the API page shown.
	    queued_demo = demo.queue(max_size=15, api_open=True)
	    queued_demo.launch(show_api=True)