Spaces:

sanchit-gandhi
/

parler-tts-streaming

Running on Zero

App Files Files Community

sanchit-gandhi HF staff committed on Apr 24

Commit

efcdb1c

•

1 Parent(s): ab3a30c

for parler

Browse files

Files changed (2) hide show

app.py +16 -13
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 from queue import Queue
 from threading import Thread
 from typing import Optional
@@ -11,12 +12,14 @@ from parler_tts import ParlerTTSForConditionalGeneration
 from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
 from transformers.generation.streamers import BaseStreamer
-device = "cuda:0" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else  "cpu"
 torch_dtype = torch.float16 if device != "cpu" else torch.float32
 repo_id = "parler-tts/parler_tts_mini_v0.1"
-model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch_dtype).to(device)
 tokenizer = AutoTokenizer.from_pretrained(repo_id)
 feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
@@ -83,7 +86,7 @@ class ParlerTTSStreamer(BaseStreamer):
         if stride is not None:
             self.stride = stride
         else:
-            hop_length = np.prod(self.audio_encoder.config.upsampling_ratios)
             self.stride = hop_length * (play_steps - self.decoder.num_codebooks) // 6
         self.token_cache = None
         self.to_yield = 0
@@ -95,19 +98,18 @@ class ParlerTTSStreamer(BaseStreamer):
     def apply_delay_pattern_mask(self, input_ids):
         # build the delay pattern mask for offsetting each codebook prediction by 1 (this behaviour is specific to MusicGen)
-        _, decoder_delay_pattern_mask = self.decoder.build_delay_pattern_mask(
             input_ids[:, :1],
             pad_token_id=self.generation_config.decoder_start_token_id,
             max_length=input_ids.shape[-1],
         )
         # apply the pattern mask to the input ids
-        input_ids = self.decoder.apply_delay_pattern_mask(input_ids, decoder_delay_pattern_mask)
         # revert the pattern delay mask by filtering the pad token id
-        input_ids = input_ids[input_ids != self.generation_config.pad_token_id].reshape(
-            1, self.decoder.num_codebooks, -1
-        )
         # append the frame dimension back to the audio codes
         input_ids = input_ids[None, ...]
@@ -169,7 +171,7 @@ target_dtype = np.int16
 max_range = np.iinfo(target_dtype).max
 @spaces.GPU
-def gen_tts(text, description, play_steps_in_s=2.0):
     play_steps = int(frame_rate * play_steps_in_s)
     streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
@@ -182,6 +184,7 @@ def gen_tts(text, description, play_steps_in_s=2.0):
         streamer=streamer,
         do_sample=True,
         temperature=1.0,
     )
     set_seed(SEED)
@@ -267,12 +270,12 @@ with gr.Blocks(css=css) as block:
             description = gr.Textbox(label="Description", lines=2, value="", elem_id="input_description")
             run_button = gr.Button("Generate Audio", variant="primary")
         with gr.Column():
-            audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out")
     inputs = [input_text, description]
     outputs = [audio_out]
-    gr.Examples(examples=examples, fn=gen_tts, inputs=inputs, outputs=outputs, cache_examples=True)
-    run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs, queue=True)
     gr.HTML(
         """
         <p>To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data to 50k hours of speech.

+import math
 from queue import Queue
 from threading import Thread
 from typing import Optional
 from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
 from transformers.generation.streamers import BaseStreamer
+device = "cuda:0" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
 torch_dtype = torch.float16 if device != "cpu" else torch.float32
 repo_id = "parler-tts/parler_tts_mini_v0.1"
+model = ParlerTTSForConditionalGeneration.from_pretrained(
+    repo_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
+).to(device)
 tokenizer = AutoTokenizer.from_pretrained(repo_id)
 feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
         if stride is not None:
             self.stride = stride
         else:
+            hop_length = math.floor(self.audio_encoder.config.sampling_rate / self.audio_encoder.config.frame_rate)
             self.stride = hop_length * (play_steps - self.decoder.num_codebooks) // 6
         self.token_cache = None
         self.to_yield = 0
     def apply_delay_pattern_mask(self, input_ids):
         # build the delay pattern mask for offsetting each codebook prediction by 1 (this behaviour is specific to MusicGen)
+        _, delay_pattern_mask = self.decoder.build_delay_pattern_mask(
             input_ids[:, :1],
+            bos_token_id=self.generation_config.bos_token_id,
             pad_token_id=self.generation_config.decoder_start_token_id,
             max_length=input_ids.shape[-1],
         )
         # apply the pattern mask to the input ids
+        input_ids = self.decoder.apply_delay_pattern_mask(input_ids, delay_pattern_mask)
         # revert the pattern delay mask by dropping the positions the mask fills with the bos or pad token id
+        mask = (delay_pattern_mask != self.generation_config.bos_token_id) & (delay_pattern_mask != self.generation_config.pad_token_id)
+        input_ids = input_ids[mask].reshape(1, self.decoder.num_codebooks, -1)
         # append the frame dimension back to the audio codes
         input_ids = input_ids[None, ...]
 max_range = np.iinfo(target_dtype).max
 @spaces.GPU
+def generate_tts(text, description, play_steps_in_s=2.0):
     play_steps = int(frame_rate * play_steps_in_s)
     streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
         streamer=streamer,
         do_sample=True,
         temperature=1.0,
+        min_new_tokens=10,
     )
     set_seed(SEED)
             description = gr.Textbox(label="Description", lines=2, value="", elem_id="input_description")
             run_button = gr.Button("Generate Audio", variant="primary")
         with gr.Column():
+            audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out", streaming=True, autoplay=True)
     inputs = [input_text, description]
     outputs = [audio_out]
+    gr.Examples(examples=examples, fn=generate_tts, inputs=inputs, outputs=outputs, cache_examples=False)
+    run_button.click(fn=generate_tts, inputs=inputs, outputs=outputs, queue=True)
     gr.HTML(
         """
         <p>To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data to 50k hours of speech.

requirements.txt CHANGED Viewed

	@@ -1 +1,2 @@
1	- git+https://github.com/huggingface/parler-tts.git


1	+ git+https://github.com/huggingface/parler-tts.git
2	+ accelerate