gabrielchua committed
Commit 8a1ab06
Parent: 0d77404

use Parler-TTS Mini

Files changed (3):
  1. app.py +31 -7
  2. requirements.txt +1 -0
  3. utils.py +46 -16
app.py CHANGED
@@ -25,7 +25,7 @@ from utils import generate_script, generate_audio, parse_url
 class DialogueItem(BaseModel):
     """A single dialogue item."""
 
-    speaker: Literal["Host (Jane)", "Guest"]
+    speaker: Literal["Host (Jenna)", "Guest"]
     text: str
 
 
@@ -41,10 +41,12 @@ def generate_podcast(
     files: List[str],
     url: Optional[str],
     tone: Optional[str],
+    voice: Optional[str],
     length: Optional[str],
     language: str
 ) -> Tuple[str, str]:
     """Generate the audio and transcript from the PDFs and/or URL."""
+    print(tone, voice, length, language)
     text = ""
 
     # Change language to the appropriate code
@@ -57,6 +59,12 @@ def generate_podcast(
         "Korean": "KR",
     }
 
+    # Change voice to the appropriate code
+    voice_mapping = {
+        "Male": "Gary",
+        "Female": "Laura",
+    }
+
     # Check if at least one input is provided
     if not files and not url:
         raise gr.Error("Please provide at least one PDF file or a URL.")
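An aside on the hunk above: "Gary" and "Laura" are two of the named voices the Parler-TTS Mini v1 model card lists among its most consistent speakers, and the host's rename from Jane to Jenna matches another name on that list. Note also that voice_mapping[voice] raises KeyError for any value outside the two Radio choices; a defensive variant (a sketch, not what this commit does) would fall back to the default:

    speaker_name = voice_mapping.get(voice, "Laura")  # fall back to the UI default ("Female")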
@@ -109,16 +117,17 @@ def generate_podcast(
     total_characters = 0
 
     for line in llm_output.dialogue:
+        print(line.speaker, line.text, language_mapping[language], voice_mapping[voice])
         logger.info(f"Generating audio for {line.speaker}: {line.text}")
-        if line.speaker == "Host (Jane)":
-            speaker = f"**Jane**: {line.text}"
+        if line.speaker == "Host (Jenna)":
+            speaker = f"**Jenna**: {line.text}"
         else:
             speaker = f"**{llm_output.name_of_guest}**: {line.text}"
         transcript += speaker + "\n\n"
         total_characters += len(line.text)
 
         # Get audio file path
-        audio_file_path = generate_audio(line.text, line.speaker, language_mapping[language])
+        audio_file_path = generate_audio(line.text, line.speaker, language_mapping[language], voice_mapping[voice])
         # Read the audio file into an AudioSegment
         audio_segment = AudioSegment.from_file(audio_file_path)
         audio_segments.append(audio_segment)
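For context on the loop above: each generate_audio call returns a file path, and the clips are collected as pydub AudioSegment objects. A minimal sketch of the stitch-and-export step that follows (the output filename is illustrative):

    from pydub import AudioSegment

    combined = AudioSegment.empty()
    for segment in audio_segments:  # per-line clips, in dialogue order
        combined += segment         # pydub's + operator concatenates audio
    combined.export("podcast.mp3", format="mp3")  # export requires ffmpeg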
@@ -166,15 +175,20 @@ demo = gr.Interface(
             label="3. 🎭 Choose the tone",
             value="Fun"
         ),
+        gr.Radio(
+            choices=["Male", "Female"],
+            label="4. 🎭 Choose the guest's voice",
+            value="Female"
+        ),
         gr.Radio(
             choices=["Short (1-2 min)", "Medium (3-5 min)"],
-            label="4. ⏱️ Choose the length",
+            label="5. ⏱️ Choose the length",
             value="Medium (3-5 min)"
         ),
         gr.Dropdown(
             choices=["English", "Spanish", "French", "Chinese", "Japanese", "Korean"],
             value="English",
-            label="5. 🌐 Choose the language (Highly experimental, English is recommended)",
+            label="6. 🌐 Choose the language (Highly experimental, English is recommended)",
         ),
     ],
     outputs=[
@@ -190,13 +204,23 @@ demo = gr.Interface(
         [str(Path("examples/1310.4546v1.pdf"))],
         "",
         "Fun",
-        "Short (1-2 min)",
+        "Female",
+        "Medium (3-5 min)",
         "English"
     ],
     [
         [],
         "https://en.wikipedia.org/wiki/Hugging_Face",
         "Fun",
+        "Male",
+        "Short (1-2 min)",
+        "English"
+    ],
+    [
+        [],
+        "https://simple.wikipedia.org/wiki/Taylor_Swift",
+        "Fun",
+        "Female",
         "Short (1-2 min)",
         "English"
     ],
requirements.txt CHANGED
@@ -2,6 +2,7 @@ gradio==4.44.0
 granian==1.4
 loguru==0.7
 openai==1.50.2
+parler-tts @ git+https://github.com/huggingface/parler-tts@main
 promptic==0.7.5
 pydantic==2.7
 pypdf==4.1
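The new dependency is pulled from the upstream repo's main branch rather than a tagged release, so rebuilds can silently pick up breaking changes. A quick post-install sanity check (illustrative; these are the imports utils.py relies on below):

    from parler_tts import ParlerTTSForConditionalGeneration  # provided by the git dependency
    from transformers import AutoTokenizer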
utils.py CHANGED
@@ -7,12 +7,19 @@ Functions:
 - get_audio: Get the audio from the TTS model from HF Spaces.
 """
 
-import os
+import os
 import requests
+import tempfile
 
+
+import soundfile as sf
+import torch
 from gradio_client import Client
 from openai import OpenAI
+from parler_tts import ParlerTTSForConditionalGeneration
 from pydantic import ValidationError
+from transformers import AutoTokenizer
+
 
 MODEL_ID = "accounts/fireworks/models/llama-v3p1-405b-instruct"
 JINA_URL = "https://r.jina.ai/"
@@ -24,6 +31,10 @@ client = OpenAI(
 
 hf_client = Client("mrfakename/MeloTTS")
 
+# Initialize the model and tokenizer (do this outside the function for efficiency)
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-mini-v1").to(device)
+tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-v1")
 
 def generate_script(system_prompt: str, input_text: str, output_model):
     """Get the dialogue from the LLM."""
@@ -68,19 +79,38 @@ def parse_url(url: str) -> str:
     return response.text
 
 
-def generate_audio(text: str, speaker: str, language: str) -> bytes:
-    """Get the audio from the TTS model from HF Spaces and adjust pitch if necessary."""
-    if speaker == "Guest":
-        accent = "EN-US" if language == "EN" else language
-        speed = 0.9
-    else:  # host
-        accent = "EN-Default" if language == "EN" else language
-        speed = 1
-    if language != "EN" and speaker != "Guest":
-        speed = 1.1
+def generate_audio(text: str, speaker: str, language: str, voice: str) -> str:
+    """Generate audio using the local Parler TTS model or HuggingFace client."""
+
+    if language == "EN":
+        # Adjust the description based on speaker and language
+        if speaker == "Guest":
+            description = f"{voice} has a slightly expressive and animated speech, speaking at a moderate speed with natural pitch variations. The voice is clear and close-up, as if recorded in a professional studio."
+        else:  # host
+            description = f"{voice} has a professional and engaging tone, speaking at a moderate to slightly faster pace. The voice is clear, warm, and sounds like a seasoned podcast host."
+
+        # Prepare inputs
+        input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
+        prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
+
+        # Generate audio
+        generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+        audio_arr = generation.cpu().numpy().squeeze()
+
+        # Save to temporary file
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
+            sf.write(temp_file.name, audio_arr, model.config.sampling_rate, format='mp3')
+
+        return temp_file.name
 
-    # Generate audio
-    result = hf_client.predict(
-        text=text, language=language, speaker=accent, speed=speed, api_name="/synthesize"
-    )
-    return result
+    else:
+        accent = language
+        if speaker == "Guest":
+            speed = 0.9
+        else:  # host
+            speed = 1.1
+        # Generate audio
+        result = hf_client.predict(
+            text=text, language=language, speaker=accent, speed=speed, api_name="/synthesize"
+        )
+        return result
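One hedged refinement to the English path above (a sketch, not part of the commit): model.generate runs with autograd enabled; wrapping it in torch.inference_mode() skips gradient bookkeeping and reduces memory use during TTS inference:

    import torch

    with torch.inference_mode():  # no gradients needed for inference
        generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)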
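Taken together, the new generate_audio has two paths: English goes through the local Parler-TTS Mini model, while every other language still goes through the MeloTTS Space. A hedged usage sketch mirroring how app.py calls it (argument values follow the UI mappings; the return value is a temporary audio file path):

    # English: Parler-TTS Mini runs locally and writes a temp .mp3.
    path = generate_audio("Welcome to the show!", "Host (Jenna)", "EN", "Laura")

    # Any other language code: falls back to the MeloTTS Space via gradio_client.
    path = generate_audio("Bienvenue !", "Guest", "FR", "Laura")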