gabrielchua committed
Commit 8a1ab06
Parent: 0d77404

use Parler-TTS Mini

Files changed (3):
  1. app.py +31 -7
  2. requirements.txt +1 -0
  3. utils.py +46 -16
app.py CHANGED
@@ -25,7 +25,7 @@ from utils import generate_script, generate_audio, parse_url
 class DialogueItem(BaseModel):
     """A single dialogue item."""
 
-    speaker: Literal["Host (Jane)", "Guest"]
+    speaker: Literal["Host (Jenna)", "Guest"]
     text: str
 
 
@@ -41,10 +41,12 @@ def generate_podcast(
     files: List[str],
     url: Optional[str],
     tone: Optional[str],
+    voice: Optional[str],
     length: Optional[str],
     language: str
 ) -> Tuple[str, str]:
     """Generate the audio and transcript from the PDFs and/or URL."""
+    print(tone, voice, length, language)
     text = ""
 
     # Change language to the appropriate code
@@ -57,6 +59,12 @@ def generate_podcast(
         "Korean": "KR",
     }
 
+    # Change voice to the appropriate code
+    voice_mapping = {
+        "Male": "Gary",
+        "Female": "Laura",
+    }
+
     # Check if at least one input is provided
     if not files and not url:
         raise gr.Error("Please provide at least one PDF file or a URL.")
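An aside on the hunk above: "Gary" and "Laura" are two of the named voices the Parler-TTS Mini v1 model card lists among its most consistent speakers, and the host's rename from Jane to Jenna matches another name on that list. Note also that voice_mapping[voice] raises KeyError for any value outside the two Radio choices; a defensive variant (a sketch, not what this commit does) would fall back to the default:

    speaker_name = voice_mapping.get(voice, "Laura")  # fall back to the UI default ("Female")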
@@ -109,16 +117,17 @@ def generate_podcast(
     total_characters = 0
 
     for line in llm_output.dialogue:
+        print(line.speaker, line.text, language_mapping[language], voice_mapping[voice])
         logger.info(f"Generating audio for {line.speaker}: {line.text}")
-        if line.speaker == "Host (Jane)":
-            speaker = f"**Jane**: {line.text}"
+        if line.speaker == "Host (Jenna)":
+            speaker = f"**Jenna**: {line.text}"
         else:
             speaker = f"**{llm_output.name_of_guest}**: {line.text}"
         transcript += speaker + "\n\n"
         total_characters += len(line.text)
 
         # Get audio file path
-        audio_file_path = generate_audio(line.text, line.speaker, language_mapping[language])
+        audio_file_path = generate_audio(line.text, line.speaker, language_mapping[language], voice_mapping[voice])
         # Read the audio file into an AudioSegment
         audio_segment = AudioSegment.from_file(audio_file_path)
         audio_segments.append(audio_segment)
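For context on the loop above: each generate_audio call returns a file path, and the clips are collected as pydub AudioSegment objects. A minimal sketch of the stitch-and-export step that follows (the output filename is illustrative):

    from pydub import AudioSegment

    combined = AudioSegment.empty()
    for segment in audio_segments:  # per-line clips, in dialogue order
        combined += segment         # pydub's + operator concatenates audio
    combined.export("podcast.mp3", format="mp3")  # export requires ffmpeg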
@@ -166,15 +175,20 @@ demo = gr.Interface(
             label="3. 🎭 Choose the tone",
             value="Fun"
         ),
+        gr.Radio(
+            choices=["Male", "Female"],
+            label="4. 🎭 Choose the guest's voice",
+            value="Female"
+        ),
         gr.Radio(
             choices=["Short (1-2 min)", "Medium (3-5 min)"],
-            label="4. ⏱️ Choose the length",
+            label="5. ⏱️ Choose the length",
             value="Medium (3-5 min)"
         ),
         gr.Dropdown(
             choices=["English", "Spanish", "French", "Chinese", "Japanese", "Korean"],
             value="English",
-            label="5. 🌐 Choose the language (Highly experimental, English is recommended)",
+            label="6. 🌐 Choose the language (Highly experimental, English is recommended)",
         ),
     ],
     outputs=[
@@ -190,13 +204,23 @@ demo = gr.Interface(
         [str(Path("examples/1310.4546v1.pdf"))],
         "",
         "Fun",
-        "Short (1-2 min)",
+        "Female",
+        "Medium (3-5 min)",
         "English"
     ],
     [
         [],
         "https://en.wikipedia.org/wiki/Hugging_Face",
         "Fun",
+        "Male",
+        "Short (1-2 min)",
+        "English"
+    ],
+    [
+        [],
+        "https://simple.wikipedia.org/wiki/Taylor_Swift",
+        "Fun",
+        "Female",
         "Short (1-2 min)",
         "English"
     ],
requirements.txt CHANGED
@@ -2,6 +2,7 @@ gradio==4.44.0
 granian==1.4
 loguru==0.7
 openai==1.50.2
+parler-tts @ git+https://github.com/huggingface/parler-tts@main
 promptic==0.7.5
 pydantic==2.7
 pypdf==4.1
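The new dependency is pulled from the upstream repo's main branch rather than a tagged release, so rebuilds can silently pick up breaking changes. A quick post-install sanity check (illustrative; these are the imports utils.py relies on below):

    from parler_tts import ParlerTTSForConditionalGeneration  # provided by the git dependency
    from transformers import AutoTokenizer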
utils.py CHANGED
@@ -7,12 +7,19 @@ Functions:
 - get_audio: Get the audio from the TTS model from HF Spaces.
 """
 
-import os
+import os
 import requests
+import tempfile
 
+
+import soundfile as sf
+import torch
 from gradio_client import Client
 from openai import OpenAI
+from parler_tts import ParlerTTSForConditionalGeneration
 from pydantic import ValidationError
+from transformers import AutoTokenizer
+
 
 MODEL_ID = "accounts/fireworks/models/llama-v3p1-405b-instruct"
 JINA_URL = "https://r.jina.ai/"
@@ -24,6 +31,10 @@ client = OpenAI(
 
 hf_client = Client("mrfakename/MeloTTS")
 
+# Initialize the model and tokenizer (do this outside the function for efficiency)
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-mini-v1").to(device)
+tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-v1")
 
 def generate_script(system_prompt: str, input_text: str, output_model):
     """Get the dialogue from the LLM."""
@@ -68,19 +79,38 @@ def parse_url(url: str) -> str:
     return response.text
 
 
-def generate_audio(text: str, speaker: str, language: str) -> bytes:
-    """Get the audio from the TTS model from HF Spaces and adjust pitch if necessary."""
-    if speaker == "Guest":
-        accent = "EN-US" if language == "EN" else language
-        speed = 0.9
-    else:  # host
-        accent = "EN-Default" if language == "EN" else language
-        speed = 1
-    if language != "EN" and speaker != "Guest":
-        speed = 1.1
+def generate_audio(text: str, speaker: str, language: str, voice: str) -> str:
+    """Generate audio using the local Parler TTS model or HuggingFace client."""
+
+    if language == "EN":
+        # Adjust the description based on speaker and language
+        if speaker == "Guest":
+            description = f"{voice} has a slightly expressive and animated speech, speaking at a moderate speed with natural pitch variations. The voice is clear and close-up, as if recorded in a professional studio."
+        else:  # host
+            description = f"{voice} has a professional and engaging tone, speaking at a moderate to slightly faster pace. The voice is clear, warm, and sounds like a seasoned podcast host."
+
+        # Prepare inputs
+        input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
+        prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
+
+        # Generate audio
+        generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+        audio_arr = generation.cpu().numpy().squeeze()
+
+        # Save to temporary file
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
+            sf.write(temp_file.name, audio_arr, model.config.sampling_rate, format='mp3')
+
+        return temp_file.name
 
-    # Generate audio
-    result = hf_client.predict(
-        text=text, language=language, speaker=accent, speed=speed, api_name="/synthesize"
-    )
-    return result
+    else:
+        accent = language
+        if speaker == "Guest":
+            speed = 0.9
+        else:  # host
+            speed = 1.1
+        # Generate audio
+        result = hf_client.predict(
+            text=text, language=language, speaker=accent, speed=speed, api_name="/synthesize"
+        )
+        return result
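One hedged refinement to the English path above (a sketch, not part of the commit): model.generate runs with autograd enabled; wrapping it in torch.inference_mode() skips gradient bookkeeping and reduces memory use during TTS inference:

    import torch

    with torch.inference_mode():  # no gradients needed for inference
        generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)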
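Taken together, the new generate_audio has two paths: English goes through the local Parler-TTS Mini model, while every other language still goes through the MeloTTS Space. A hedged usage sketch mirroring how app.py calls it (argument values follow the UI mappings; the return value is a temporary audio file path):

    # English: Parler-TTS Mini runs locally and writes a temp .mp3.
    path = generate_audio("Welcome to the show!", "Host (Jenna)", "EN", "Laura")

    # Any other language code: falls back to the MeloTTS Space via gradio_client.
    path = generate_audio("Bienvenue !", "Guest", "FR", "Laura")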