shivi committed on
Commit
c27baa1
1 Parent(s): 8831a61

Update app.py

Files changed (1)
  1. app.py +67 -61
app.py CHANGED
@@ -25,6 +25,7 @@ from prompt_examples import TEXT_CHAT_EXAMPLES, IMG_GEN_PROMPT_EXAMPLES, AUDIO_E
 from preambles import CHAT_PREAMBLE, AUDIO_RESPONSE_PREAMBLE, IMG_DESCRIPTION_PREAMBLE
 from constants import LID_LANGUAGES, NEETS_AI_LANGID_MAP, AYA_MODEL_NAME, BATCH_SIZE, USE_ELVENLABS, USE_REPLICATE
 
+
 HF_API_TOKEN = os.getenv("HF_API_KEY")
 ELEVEN_LABS_KEY = os.getenv("ELEVEN_LABS_KEY")
 NEETS_AI_API_KEY = os.getenv("NEETS_AI_API_KEY")
@@ -93,54 +94,50 @@ def replicate_api_inference(input_prompt):
     return image
 
 def generate_image(input_prompt, model_id="black-forest-labs/FLUX.1-schnell"):
-    if input_prompt!="":
-        if input_prompt=='Image generation blocked for prompts that include humans, kids, or children.':
-            return None
+    if input_prompt is not None and input_prompt!="":
+        if USE_REPLICATE:
+            print("using replicate for image generation")
+            image = replicate_api_inference(input_prompt)
         else:
-            if USE_REPLICATE:
-                print("using replicate for image generation")
+            try:
+                print("using HF inference API for image generation")
+                image_bytes = get_hf_inference_api_response({ "inputs": input_prompt}, model_id)
+                image = np.array(Image.open(io.BytesIO(image_bytes)))
+            except Exception as e:
+                print("HF API error:", e)
+                # generate image with help replicate in case of error
                 image = replicate_api_inference(input_prompt)
-            else:
-                try:
-                    print("using HF inference API for image generation")
-                    image_bytes = get_hf_inference_api_response({ "inputs": input_prompt}, model_id)
-                    image = np.array(Image.open(io.BytesIO(image_bytes)))
-                except Exception as e:
-                    print("HF API error:", e)
-                    # generate image with help replicate in case of error
-                    image = replicate_api_inference(input_prompt)
-            return image
+        return image
     else:
         return None
 
 def generate_img_prompt(input_prompt):
-    # clean prompt before doing language detection
-    cleaned_prompt = clean_text(input_prompt, remove_bullets=True, remove_newline=True)
-    text_lang_code = predict_language(cleaned_prompt)
-    language = LID_LANGUAGES[text_lang_code]
-
-    gr.Info("Generating Image", duration=2)
+    if input_prompt is not None and input_prompt!="":
+        # clean prompt before doing language detection
+        cleaned_prompt = clean_text(input_prompt, remove_bullets=True, remove_newline=True)
+        text_lang_code = predict_language(cleaned_prompt)
+
+        gr.Info("Generating Image", duration=2)
+
+        if text_lang_code!="eng_Latn":
+            text = f"""
+            Translate the given input prompt to English.
+            Input Prompt: {input_prompt}
+            Then based on the English translation of the prompt, generate a detailed image description which can be used to generate an image using a text-to-image model.
+            Do not use more than 3-4 lines for the image description. Respond with only the image description.
+            """
+        else:
+            text = f"""Generate a detailed image description which can be used to generate an image using a text-to-image model based on the given input prompt:
+            Input Prompt: {input_prompt}
+            Do not use more than 3-4 lines for the description.
+            """
+
+        response = img_prompt_client.chat(message=text, preamble=IMG_DESCRIPTION_PREAMBLE, model=AYA_MODEL_NAME)
+        output = response.text
 
-    if language!="english":
-        text = f"""
-        Translate the given input prompt to English.
-        Input Prompt: {input_prompt}
-
-        Then based on the English translation of the prompt, generate a detailed image description which can be used to generate an image using a text-to-image model.
-        Do not use more than 3-4 lines for the image description. Respond with only the image description.
-        """
+        return output
     else:
-        text = f"""Generate a detailed image description which can be used to generate an image using a text-to-image model based on the given input prompt:
-
-        Input Prompt: {input_prompt}
-
-        Do not use more than 3-4 lines for the description.
-        """
-
-    response = img_prompt_client.chat(message=text, preamble=IMG_DESCRIPTION_PREAMBLE, model=AYA_MODEL_NAME)
-    output = response.text
-
-    return output
+        return None
 
 
 # Chat with Aya util functions
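
Note (not part of the committed diff): the hunk above reworks generate_image so the HF Inference API is tried first and Replicate is only used when USE_REPLICATE is set or the HF call raises. A minimal standalone sketch of that fallback flow; the stubbed helpers and the generate_image_sketch name are illustrative, not code from this repo.

import io

import numpy as np
from PIL import Image


def get_hf_inference_api_response(payload, model_id):
    # Stub: the real helper in app.py calls the HF Inference API and returns raw image bytes.
    raise NotImplementedError


def replicate_api_inference(input_prompt):
    # Stub: the real helper in app.py generates the image via Replicate.
    raise NotImplementedError


def generate_image_sketch(input_prompt, model_id="black-forest-labs/FLUX.1-schnell", use_replicate=False):
    # Mirrors the committed control flow: empty prompt -> None; Replicate when forced;
    # otherwise HF Inference API with Replicate as the error fallback.
    if not input_prompt:
        return None
    if use_replicate:
        return replicate_api_inference(input_prompt)
    try:
        image_bytes = get_hf_inference_api_response({"inputs": input_prompt}, model_id)
        return np.array(Image.open(io.BytesIO(image_bytes)))
    except Exception as e:
        print("HF API error:", e)
        return replicate_api_inference(input_prompt)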
@@ -151,7 +148,8 @@ def trigger_example(example):
 
 def generate_aya_chat_response(user_message, cid, token, history=None):
     if not token:
-        raise gr.Error("Error loading.")
+        print("no token")
+        #raise gr.Error("Error loading.")
 
     if history is None:
         history = []
@@ -186,7 +184,7 @@ def clear_chat():
 
 # Audio Pipeline util functions
 
-def transcribe_and_stream(inputs, show_info="no", model_name="openai/whisper-large-v3-turbo", language="english"):
+def transcribe_and_stream(inputs, model_name="groq_whisper", show_info="show_info", language="english"):
     if inputs is not None and inputs!="":
         if show_info=="show_info":
             gr.Info("Processing Audio", duration=1)
@@ -242,11 +240,10 @@ def convert_text_to_speech(text, language="english"):
     # clean text before doing language detection
     cleaned_text = clean_text(text, remove_bullets=True, remove_newline=True)
     text_lang_code = predict_language(cleaned_text)
-    language = LID_LANGUAGES[text_lang_code]
 
     if not USE_ELVENLABS:
-        if language!= "japanese":
-            audio_path = neetsai_tts(text, language)
+        if text_lang_code!= "jpn_Jpan":
+            audio_path = neetsai_tts(text, text_lang_code)
         else:
             print("DEVICE:", DEVICE)
             # if language is japanese then use XTTS for TTS since neets_ai doesn't support japanese voice
@@ -274,10 +271,16 @@ def elevenlabs_generate_audio(text):
     save(audio, audio_path)
     return audio_path
 
-def neetsai_tts(input_text, language):
+def neetsai_tts(input_text, text_lang_code):
+
+    if text_lang_code in LID_LANGUAGES.keys():
+        language = LID_LANGUAGES[text_lang_code]
+    else:
+        # use english voice as default for languages outside 23 languages of Aya Expanse
+        language = "english"
 
-    lang_id = NEETS_AI_LANGID_MAP[language]
-    neets_vits_voice_id = f"vits-{lang_id}"
+    neets_lang_id = NEETS_AI_LANGID_MAP[language]
+    neets_vits_voice_id = f"vits-{neets_lang_id}"
 
     response = requests.request(
         method="POST",
@@ -344,7 +347,7 @@ with demo:
     **Developed by**: [Cohere for AI](https://cohere.com/research) and [Cohere](https://cohere.com/)
     """
     )
-    # Text Chat
+
    with gr.TabItem("Chat with Aya") as chat_with_aya:
        cid = gr.State("")
        token = gr.State(value=None)
@@ -385,12 +388,13 @@ with demo:
                 example_labels=TEXT_CHAT_EXAMPLES_LABELS,
             )
 
-    # Audio Pipeline
+    # End to End Testing Pipeline for speak with Aya
    with gr.TabItem("Speak with Aya") as speak_with_aya:
 
        with gr.Row():
            with gr.Column():
                e2e_audio_file = gr.Audio(sources="microphone", type="filepath", min_length=None)
+                e2_audio_submit_button = gr.Button(value="Get Aya's Response", variant="primary")
 
                clear_button_microphone = gr.ClearButton()
                gr.Examples(
@@ -407,14 +411,14 @@ with demo:
                 e2e_audio_file_aya_response = gr.Textbox(lines=3,label="Aya's Response", show_copy_button=True, container=True, interactive=False)
                 e2e_aya_audio_response = gr.Audio(type="filepath", label="Aya's Audio Response")
 
-        show_info = gr.Textbox(value="show_info", visible=False)
-        stt_model = gr.Textbox(value="groq_whisper", visible=False)
+        # show_info = gr.Textbox(value="show_info", visible=False)
+        # stt_model = gr.Textbox(value="groq_whisper", visible=False)
 
        with gr.Accordion("See Details", open=False):
            gr.Markdown("To enable voice interaction with Aya Expanse, this space uses [Whisper large-v3-turbo](https://huggingface.co/openai/whisper-large-v3-turbo) and [Groq](https://groq.com/) for STT and [neets.ai](http://neets.ai/) for TTS.")
 
 
-    # Image Generation
+    # Generate Images
    with gr.TabItem("Visualize with Aya") as visualize_with_aya:
        with gr.Row():
            with gr.Column():
@@ -465,31 +469,33 @@ with demo:
         generate_image, #run_flux,
         inputs=[generated_img_desc],
         outputs=[generated_img],
-        show_progress="hidden",
+        show_progress="full",
     )
 
    # Audio Pipeline
    clear_button_microphone.click(lambda: None, None, e2e_audio_file)
-    clear_button_microphone.click(lambda: None, None, e2e_audio_file_trans)
    clear_button_microphone.click(lambda: None, None, e2e_aya_audio_response)
+    clear_button_microphone.click(lambda: None, None, e2e_audio_file_aya_response)
+    clear_button_microphone.click(lambda: None, None, e2e_audio_file_trans)
 
-    e2e_audio_file.change(
+    #e2e_audio_file.change(
+    e2_audio_submit_button.click(
        transcribe_and_stream,
-        inputs=[e2e_audio_file, show_info, stt_model],
+        inputs=[e2e_audio_file],
        outputs=[e2e_audio_file_trans],
-        show_progress="hidden",
+        show_progress="full",
    ).then(
        aya_speech_text_response,
        inputs=[e2e_audio_file_trans],
        outputs=[e2e_audio_file_aya_response],
-        show_progress="minimal",
+        show_progress="full",
    ).then(
        convert_text_to_speech,
        inputs=[e2e_audio_file_aya_response],
        outputs=[e2e_aya_audio_response],
-        show_progress="minimal",
+        show_progress="full",
    )
 
    demo.load(lambda: secrets.token_hex(16), None, token)
 
-demo.queue(api_open=False, max_size=40).launch(show_api=False, allowed_paths=['/home/user/app'])
+demo.queue(api_open=False, max_size=20, default_concurrency_limit=4).launch(show_api=False, allowed_paths=['/home/user/app'])
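
Note (not part of the committed diff): the last hunk rewires the audio pipeline from an implicit e2e_audio_file.change trigger to an explicit submit button, chaining STT, chat, and TTS with .then(), and tightens the queue (max_size=20, default_concurrency_limit=4). A stripped-down sketch of that wiring, assuming Gradio 4.x; the placeholder stt/chat/tts functions stand in for the app's real handlers.

import gradio as gr


def stt(audio_path):
    return "transcript of the recording"   # placeholder for transcribe_and_stream


def chat(text):
    return f"Aya's reply to: {text}"        # placeholder for aya_speech_text_response


def tts(text):
    return None                             # placeholder for convert_text_to_speech (would return an audio filepath)


with gr.Blocks() as demo:
    audio_in = gr.Audio(sources="microphone", type="filepath")
    submit = gr.Button("Get Aya's Response", variant="primary")
    transcript = gr.Textbox(label="Transcript")
    reply = gr.Textbox(label="Response")
    audio_out = gr.Audio(type="filepath", label="Audio Response")

    # An explicit button click drives the three-step chain, each step showing full progress.
    submit.click(
        stt, inputs=[audio_in], outputs=[transcript], show_progress="full"
    ).then(
        chat, inputs=[transcript], outputs=[reply], show_progress="full"
    ).then(
        tts, inputs=[reply], outputs=[audio_out], show_progress="full"
    )

demo.queue(api_open=False, max_size=20, default_concurrency_limit=4).launch(show_api=False)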