Update app.py
app.py CHANGED
@@ -25,6 +25,7 @@ from prompt_examples import TEXT_CHAT_EXAMPLES, IMG_GEN_PROMPT_EXAMPLES, AUDIO_E
 from preambles import CHAT_PREAMBLE, AUDIO_RESPONSE_PREAMBLE, IMG_DESCRIPTION_PREAMBLE
 from constants import LID_LANGUAGES, NEETS_AI_LANGID_MAP, AYA_MODEL_NAME, BATCH_SIZE, USE_ELVENLABS, USE_REPLICATE

+
 HF_API_TOKEN = os.getenv("HF_API_KEY")
 ELEVEN_LABS_KEY = os.getenv("ELEVEN_LABS_KEY")
 NEETS_AI_API_KEY = os.getenv("NEETS_AI_API_KEY")
@@ -93,54 +94,50 @@ def replicate_api_inference(input_prompt):
     return image

 def generate_image(input_prompt, model_id="black-forest-labs/FLUX.1-schnell"):
-    if input_prompt!="":
-        if
-
+    if input_prompt is not None and input_prompt!="":
+        if USE_REPLICATE:
+            print("using replicate for image generation")
+            image = replicate_api_inference(input_prompt)
         else:
-
-            print("using
+            try:
+                print("using HF inference API for image generation")
+                image_bytes = get_hf_inference_api_response({ "inputs": input_prompt}, model_id)
+                image = np.array(Image.open(io.BytesIO(image_bytes)))
+            except Exception as e:
+                print("HF API error:", e)
+                # generate image with help replicate in case of error
                 image = replicate_api_inference(input_prompt)
-
-        try:
-            print("using HF inference API for image generation")
-            image_bytes = get_hf_inference_api_response({ "inputs": input_prompt}, model_id)
-            image = np.array(Image.open(io.BytesIO(image_bytes)))
-        except Exception as e:
-            print("HF API error:", e)
-            # generate image with help replicate in case of error
-            image = replicate_api_inference(input_prompt)
-    return image
+        return image
     else:
         return None

 def generate_img_prompt(input_prompt):
-
-
-
-
-
-
+    if input_prompt is not None and input_prompt!="":
+        # clean prompt before doing language detection
+        cleaned_prompt = clean_text(input_prompt, remove_bullets=True, remove_newline=True)
+        text_lang_code = predict_language(cleaned_prompt)
+
+        gr.Info("Generating Image", duration=2)
+
+        if text_lang_code!="eng_Latn":
+            text = f"""
+            Translate the given input prompt to English.
+            Input Prompt: {input_prompt}
+            Then based on the English translation of the prompt, generate a detailed image description which can be used to generate an image using a text-to-image model.
+            Do not use more than 3-4 lines for the image description. Respond with only the image description.
+            """
+        else:
+            text = f"""Generate a detailed image description which can be used to generate an image using a text-to-image model based on the given input prompt:
+            Input Prompt: {input_prompt}
+            Do not use more than 3-4 lines for the description.
+            """
+
+        response = img_prompt_client.chat(message=text, preamble=IMG_DESCRIPTION_PREAMBLE, model=AYA_MODEL_NAME)
+        output = response.text

-
-    text = f"""
-    Translate the given input prompt to English.
-    Input Prompt: {input_prompt}
-
-    Then based on the English translation of the prompt, generate a detailed image description which can be used to generate an image using a text-to-image model.
-    Do not use more than 3-4 lines for the image description. Respond with only the image description.
-    """
+        return output
     else:
-
-
-    Input Prompt: {input_prompt}
-
-    Do not use more than 3-4 lines for the description.
-    """
-
-    response = img_prompt_client.chat(message=text, preamble=IMG_DESCRIPTION_PREAMBLE, model=AYA_MODEL_NAME)
-    output = response.text
-
-    return output
+        return None


 # Chat with Aya util functions
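Note: generate_image above relies on get_hf_inference_api_response, which is defined elsewhere in app.py and is not part of this diff. A minimal sketch of what such a helper typically looks like, assuming the hosted Inference API endpoint and the HF_API_TOKEN read near the top of the file; the exact implementation in the Space may differ:

import os
import requests

HF_API_TOKEN = os.getenv("HF_API_KEY")

def get_hf_inference_api_response(payload, model_id):
    # POST the prompt to the hosted Inference API and return the raw image bytes;
    # generate_image decodes them with Image.open(io.BytesIO(...)) above.
    url = f"https://api-inference.huggingface.co/models/{model_id}"
    headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}
    response = requests.post(url, headers=headers, json=payload, timeout=60)
    # raising here is what sends generate_image into its Replicate fallback branch
    response.raise_for_status()
    return response.content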
@@ -151,7 +148,8 @@ def trigger_example(example):

 def generate_aya_chat_response(user_message, cid, token, history=None):
     if not token:
-
+        print("no token")
+        #raise gr.Error("Error loading.")

     if history is None:
         history = []
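The chat calls in generate_img_prompt and generate_aya_chat_response go through Cohere clients that are created outside this diff. A minimal sketch of the call pattern, assuming the Cohere v1 Client (which matches the message=/preamble= keywords used above); the model id and environment variable name below are assumptions, not values taken from the Space:

import os
import cohere

AYA_MODEL_NAME = "c4ai-aya-expanse-32b"  # assumed value; the real one lives in constants.py

img_prompt_client = cohere.Client(api_key=os.getenv("COHERE_API_KEY"))  # env var name assumed

response = img_prompt_client.chat(
    message="Generate a detailed image description for: a cat playing chess",
    preamble="You write concise prompts for a text-to-image model.",  # stand-in for IMG_DESCRIPTION_PREAMBLE
    model=AYA_MODEL_NAME,
)
print(response.text)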
@@ -186,7 +184,7 @@ def clear_chat():

 # Audio Pipeline util functions

-def transcribe_and_stream(inputs,
+def transcribe_and_stream(inputs, model_name="groq_whisper", show_info="show_info", language="english"):
     if inputs is not None and inputs!="":
         if show_info=="show_info":
             gr.Info("Processing Audio", duration=1)
@@ -242,11 +240,10 @@ def convert_text_to_speech(text, language="english"):
     # clean text before doing language detection
     cleaned_text = clean_text(text, remove_bullets=True, remove_newline=True)
     text_lang_code = predict_language(cleaned_text)
-    language = LID_LANGUAGES[text_lang_code]

     if not USE_ELVENLABS:
-        if
-            audio_path = neetsai_tts(text,
+        if text_lang_code!= "jpn_Jpan":
+            audio_path = neetsai_tts(text, text_lang_code)
         else:
             print("DEVICE:", DEVICE)
             # if language is japanese then use XTTS for TTS since neets_ai doesn't support japanese voice
@@ -274,10 +271,16 @@ def elevenlabs_generate_audio(text):
     save(audio, audio_path)
     return audio_path

-def neetsai_tts(input_text,
+def neetsai_tts(input_text, text_lang_code):
+
+    if text_lang_code in LID_LANGUAGES.keys():
+        language = LID_LANGUAGES[text_lang_code]
+    else:
+        # use english voice as default for languages outside 23 languages of Aya Expanse
+        language = "english"

-
-    neets_vits_voice_id = f"vits-{
+    neets_lang_id = NEETS_AI_LANGID_MAP[language]
+    neets_vits_voice_id = f"vits-{neets_lang_id}"

     response = requests.request(
         method="POST",
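A small self-contained sketch of the voice-selection fallback that neetsai_tts now implements. The dictionary contents below are placeholders; the real LID_LANGUAGES and NEETS_AI_LANGID_MAP live in constants.py and are not shown in this diff:

LID_LANGUAGES = {"eng_Latn": "english", "fra_Latn": "french"}  # assumed subset
NEETS_AI_LANGID_MAP = {"english": "en", "french": "fr"}        # assumed voice id fragments

def pick_neets_voice(text_lang_code):
    # fall back to the English voice for languages outside the supported set
    language = LID_LANGUAGES.get(text_lang_code, "english")
    neets_lang_id = NEETS_AI_LANGID_MAP[language]
    return f"vits-{neets_lang_id}"

print(pick_neets_voice("fra_Latn"))  # vits-fr
print(pick_neets_voice("xxx_Latn"))  # vits-en (English fallback)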
@@ -344,7 +347,7 @@ with demo:
         **Developed by**: [Cohere for AI](https://cohere.com/research) and [Cohere](https://cohere.com/)
         """
     )
-
+
     with gr.TabItem("Chat with Aya") as chat_with_aya:
         cid = gr.State("")
         token = gr.State(value=None)
@@ -385,12 +388,13 @@ with demo:
                 example_labels=TEXT_CHAT_EXAMPLES_LABELS,
             )

-    #
+    # End to End Testing Pipeline for speak with Aya
     with gr.TabItem("Speak with Aya") as speak_with_aya:

         with gr.Row():
             with gr.Column():
                 e2e_audio_file = gr.Audio(sources="microphone", type="filepath", min_length=None)
+                e2_audio_submit_button = gr.Button(value="Get Aya's Response", variant="primary")

                 clear_button_microphone = gr.ClearButton()
                 gr.Examples(
@@ -407,14 +411,14 @@ with demo:
                 e2e_audio_file_aya_response = gr.Textbox(lines=3,label="Aya's Response", show_copy_button=True, container=True, interactive=False)
                 e2e_aya_audio_response = gr.Audio(type="filepath", label="Aya's Audio Response")

-                show_info = gr.Textbox(value="show_info", visible=False)
-                stt_model = gr.Textbox(value="groq_whisper", visible=False)
+                # show_info = gr.Textbox(value="show_info", visible=False)
+                # stt_model = gr.Textbox(value="groq_whisper", visible=False)

         with gr.Accordion("See Details", open=False):
             gr.Markdown("To enable voice interaction with Aya Expanse, this space uses [Whisper large-v3-turbo](https://huggingface.co/openai/whisper-large-v3-turbo) and [Groq](https://groq.com/) for STT and [neets.ai](http://neets.ai/) for TTS.")


-    #
+    # Generate Images
     with gr.TabItem("Visualize with Aya") as visualize_with_aya:
         with gr.Row():
             with gr.Column():
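The hidden show_info and stt_model textboxes are commented out here because the same values now arrive as default arguments on transcribe_and_stream (see the earlier hunk). An illustrative before/after sketch of that wiring pattern, using hypothetical component names rather than the app's real ones:

import gradio as gr

def transcribe_and_stream(inputs, model_name="groq_whisper", show_info="show_info"):
    # fixed settings are defaults on the handler instead of hidden components
    return f"({model_name}) transcript of {inputs}"

with gr.Blocks() as demo:
    audio = gr.Audio(sources="microphone", type="filepath")
    out = gr.Textbox(label="Transcript")

    # before: hidden components carried the constants into the event
    # stt_model = gr.Textbox(value="groq_whisper", visible=False)
    # gr.Button("Transcribe").click(transcribe_and_stream, inputs=[audio, stt_model], outputs=[out])

    # after: only the real input is wired
    gr.Button("Transcribe").click(transcribe_and_stream, inputs=[audio], outputs=[out])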
@@ -465,31 +469,33 @@ with demo:
             generate_image, #run_flux,
             inputs=[generated_img_desc],
             outputs=[generated_img],
-            show_progress="
+            show_progress="full",
         )

         # Audio Pipeline
         clear_button_microphone.click(lambda: None, None, e2e_audio_file)
-        clear_button_microphone.click(lambda: None, None, e2e_audio_file_trans)
         clear_button_microphone.click(lambda: None, None, e2e_aya_audio_response)
+        clear_button_microphone.click(lambda: None, None, e2e_audio_file_aya_response)
+        clear_button_microphone.click(lambda: None, None, e2e_audio_file_trans)

-        e2e_audio_file.change(
+        #e2e_audio_file.change(
+        e2_audio_submit_button.click(
             transcribe_and_stream,
-            inputs=[e2e_audio_file
+            inputs=[e2e_audio_file],
             outputs=[e2e_audio_file_trans],
-            show_progress="
+            show_progress="full",
         ).then(
             aya_speech_text_response,
             inputs=[e2e_audio_file_trans],
             outputs=[e2e_audio_file_aya_response],
-            show_progress="
+            show_progress="full",
         ).then(
             convert_text_to_speech,
             inputs=[e2e_audio_file_aya_response],
             outputs=[e2e_aya_audio_response],
-            show_progress="
+            show_progress="full",
         )

     demo.load(lambda: secrets.token_hex(16), None, token)

-demo.queue(api_open=False, max_size=
+demo.queue(api_open=False, max_size=20, default_concurrency_limit=4).launch(show_api=False, allowed_paths=['/home/user/app'])
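The new wiring drives the audio pipeline from an explicit submit button and chains the three stages with .then(), so each handler starts only after the previous output lands in its component. A stripped-down, self-contained sketch of the same pattern; the handler bodies are placeholders, not the app's real functions:

import gradio as gr

def transcribe(audio_path):
    return f"transcript of {audio_path}"

def respond(transcript):
    return transcript.upper()

with gr.Blocks() as demo:
    audio = gr.Audio(sources="microphone", type="filepath")
    transcript = gr.Textbox(label="Transcript")
    reply = gr.Textbox(label="Aya's Response")
    submit = gr.Button("Get Aya's Response", variant="primary")

    submit.click(
        transcribe, inputs=[audio], outputs=[transcript], show_progress="full"
    ).then(
        respond, inputs=[transcript], outputs=[reply], show_progress="full"
    )

# queue settings mirror the new demo.queue call: bounded queue, capped concurrency, API surface disabled
demo.queue(api_open=False, max_size=20, default_concurrency_limit=4).launch(show_api=False)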
|