Update app.py
app.py CHANGED
@@ -25,6 +25,7 @@ from prompt_examples import TEXT_CHAT_EXAMPLES, IMG_GEN_PROMPT_EXAMPLES, AUDIO_E
 from preambles import CHAT_PREAMBLE, AUDIO_RESPONSE_PREAMBLE, IMG_DESCRIPTION_PREAMBLE
 from constants import LID_LANGUAGES, NEETS_AI_LANGID_MAP, AYA_MODEL_NAME, BATCH_SIZE, USE_ELVENLABS, USE_REPLICATE

+
 HF_API_TOKEN = os.getenv("HF_API_KEY")
 ELEVEN_LABS_KEY = os.getenv("ELEVEN_LABS_KEY")
 NEETS_AI_API_KEY = os.getenv("NEETS_AI_API_KEY")
@@ -93,54 +94,50 @@ def replicate_api_inference(input_prompt):
     return image

 def generate_image(input_prompt, model_id="black-forest-labs/FLUX.1-schnell"):
-    if input_prompt!="":
-        if
-
+    if input_prompt is not None and input_prompt!="":
+        if USE_REPLICATE:
+            print("using replicate for image generation")
+            image = replicate_api_inference(input_prompt)
         else:
-
-            print("using
+            try:
+                print("using HF inference API for image generation")
+                image_bytes = get_hf_inference_api_response({ "inputs": input_prompt}, model_id)
+                image = np.array(Image.open(io.BytesIO(image_bytes)))
+            except Exception as e:
+                print("HF API error:", e)
+                # generate image with help replicate in case of error
                 image = replicate_api_inference(input_prompt)
-
-        try:
-            print("using HF inference API for image generation")
-            image_bytes = get_hf_inference_api_response({ "inputs": input_prompt}, model_id)
-            image = np.array(Image.open(io.BytesIO(image_bytes)))
-        except Exception as e:
-            print("HF API error:", e)
-            # generate image with help replicate in case of error
-            image = replicate_api_inference(input_prompt)
-    return image
+        return image
     else:
         return None

 def generate_img_prompt(input_prompt):
-
-
-
-
-
-
+    if input_prompt is not None and input_prompt!="":
+        # clean prompt before doing language detection
+        cleaned_prompt = clean_text(input_prompt, remove_bullets=True, remove_newline=True)
+        text_lang_code = predict_language(cleaned_prompt)
+
+        gr.Info("Generating Image", duration=2)
+
+        if text_lang_code!="eng_Latn":
+            text = f"""
+            Translate the given input prompt to English.
+            Input Prompt: {input_prompt}
+            Then based on the English translation of the prompt, generate a detailed image description which can be used to generate an image using a text-to-image model.
+            Do not use more than 3-4 lines for the image description. Respond with only the image description.
+            """
+        else:
+            text = f"""Generate a detailed image description which can be used to generate an image using a text-to-image model based on the given input prompt:
+            Input Prompt: {input_prompt}
+            Do not use more than 3-4 lines for the description.
+            """
+
+        response = img_prompt_client.chat(message=text, preamble=IMG_DESCRIPTION_PREAMBLE, model=AYA_MODEL_NAME)
+        output = response.text

-
-    text = f"""
-    Translate the given input prompt to English.
-    Input Prompt: {input_prompt}
-
-    Then based on the English translation of the prompt, generate a detailed image description which can be used to generate an image using a text-to-image model.
-    Do not use more than 3-4 lines for the image description. Respond with only the image description.
-    """
+        return output
     else:
-
-
-    Input Prompt: {input_prompt}
-
-    Do not use more than 3-4 lines for the description.
-    """
-
-    response = img_prompt_client.chat(message=text, preamble=IMG_DESCRIPTION_PREAMBLE, model=AYA_MODEL_NAME)
-    output = response.text
-
-    return output
+        return None


 # Chat with Aya util functions
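Note: generate_image above relies on get_hf_inference_api_response, which is defined elsewhere in app.py and is not part of this diff. A minimal sketch of what such a helper typically looks like, assuming the hosted Inference API endpoint and the HF_API_TOKEN read near the top of the file; the exact implementation in the Space may differ:

import os
import requests

HF_API_TOKEN = os.getenv("HF_API_KEY")

def get_hf_inference_api_response(payload, model_id):
    # POST the prompt to the hosted Inference API and return the raw image bytes;
    # generate_image decodes them with Image.open(io.BytesIO(...)) above.
    url = f"https://api-inference.huggingface.co/models/{model_id}"
    headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}
    response = requests.post(url, headers=headers, json=payload, timeout=60)
    # raising here is what sends generate_image into its Replicate fallback branch
    response.raise_for_status()
    return response.content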
@@ -151,7 +148,8 @@ def trigger_example(example):

 def generate_aya_chat_response(user_message, cid, token, history=None):
     if not token:
-
+        print("no token")
+        #raise gr.Error("Error loading.")

     if history is None:
         history = []
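The chat calls in generate_img_prompt and generate_aya_chat_response go through Cohere clients that are created outside this diff. A minimal sketch of the call pattern, assuming the Cohere v1 Client (which matches the message=/preamble= keywords used above); the model id and environment variable name below are assumptions, not values taken from the Space:

import os
import cohere

AYA_MODEL_NAME = "c4ai-aya-expanse-32b"  # assumed value; the real one lives in constants.py

img_prompt_client = cohere.Client(api_key=os.getenv("COHERE_API_KEY"))  # env var name assumed

response = img_prompt_client.chat(
    message="Generate a detailed image description for: a cat playing chess",
    preamble="You write concise prompts for a text-to-image model.",  # stand-in for IMG_DESCRIPTION_PREAMBLE
    model=AYA_MODEL_NAME,
)
print(response.text)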
@@ -186,7 +184,7 @@ def clear_chat():

 # Audio Pipeline util functions

-def transcribe_and_stream(inputs,
+def transcribe_and_stream(inputs, model_name="groq_whisper", show_info="show_info", language="english"):
     if inputs is not None and inputs!="":
         if show_info=="show_info":
             gr.Info("Processing Audio", duration=1)
@@ -242,11 +240,10 @@ def convert_text_to_speech(text, language="english"):
     # clean text before doing language detection
     cleaned_text = clean_text(text, remove_bullets=True, remove_newline=True)
     text_lang_code = predict_language(cleaned_text)
-    language = LID_LANGUAGES[text_lang_code]

     if not USE_ELVENLABS:
-        if
-            audio_path = neetsai_tts(text,
+        if text_lang_code!= "jpn_Jpan":
+            audio_path = neetsai_tts(text, text_lang_code)
         else:
             print("DEVICE:", DEVICE)
             # if language is japanese then use XTTS for TTS since neets_ai doesn't support japanese voice
@@ -274,10 +271,16 @@ def elevenlabs_generate_audio(text):
     save(audio, audio_path)
     return audio_path

-def neetsai_tts(input_text,
+def neetsai_tts(input_text, text_lang_code):
+
+    if text_lang_code in LID_LANGUAGES.keys():
+        language = LID_LANGUAGES[text_lang_code]
+    else:
+        # use english voice as default for languages outside 23 languages of Aya Expanse
+        language = "english"

-
-    neets_vits_voice_id = f"vits-{
+    neets_lang_id = NEETS_AI_LANGID_MAP[language]
+    neets_vits_voice_id = f"vits-{neets_lang_id}"

     response = requests.request(
         method="POST",
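A small self-contained sketch of the voice-selection fallback that neetsai_tts now implements. The dictionary contents below are placeholders; the real LID_LANGUAGES and NEETS_AI_LANGID_MAP live in constants.py and are not shown in this diff:

LID_LANGUAGES = {"eng_Latn": "english", "fra_Latn": "french"}  # assumed subset
NEETS_AI_LANGID_MAP = {"english": "en", "french": "fr"}        # assumed voice id fragments

def pick_neets_voice(text_lang_code):
    # fall back to the English voice for languages outside the supported set
    language = LID_LANGUAGES.get(text_lang_code, "english")
    neets_lang_id = NEETS_AI_LANGID_MAP[language]
    return f"vits-{neets_lang_id}"

print(pick_neets_voice("fra_Latn"))  # vits-fr
print(pick_neets_voice("xxx_Latn"))  # vits-en (English fallback)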
@@ -344,7 +347,7 @@ with demo:
         **Developed by**: [Cohere for AI](https://cohere.com/research) and [Cohere](https://cohere.com/)
         """
     )
-
+
     with gr.TabItem("Chat with Aya") as chat_with_aya:
         cid = gr.State("")
         token = gr.State(value=None)
@@ -385,12 +388,13 @@ with demo:
                 example_labels=TEXT_CHAT_EXAMPLES_LABELS,
             )

-    #
+    # End to End Testing Pipeline for speak with Aya
     with gr.TabItem("Speak with Aya") as speak_with_aya:

         with gr.Row():
             with gr.Column():
                 e2e_audio_file = gr.Audio(sources="microphone", type="filepath", min_length=None)
+                e2_audio_submit_button = gr.Button(value="Get Aya's Response", variant="primary")

                 clear_button_microphone = gr.ClearButton()
                 gr.Examples(
@@ -407,14 +411,14 @@ with demo:
                 e2e_audio_file_aya_response = gr.Textbox(lines=3,label="Aya's Response", show_copy_button=True, container=True, interactive=False)
                 e2e_aya_audio_response = gr.Audio(type="filepath", label="Aya's Audio Response")

-                show_info = gr.Textbox(value="show_info", visible=False)
-                stt_model = gr.Textbox(value="groq_whisper", visible=False)
+                # show_info = gr.Textbox(value="show_info", visible=False)
+                # stt_model = gr.Textbox(value="groq_whisper", visible=False)

         with gr.Accordion("See Details", open=False):
             gr.Markdown("To enable voice interaction with Aya Expanse, this space uses [Whisper large-v3-turbo](https://huggingface.co/openai/whisper-large-v3-turbo) and [Groq](https://groq.com/) for STT and [neets.ai](http://neets.ai/) for TTS.")


-    #
+    # Generate Images
     with gr.TabItem("Visualize with Aya") as visualize_with_aya:
         with gr.Row():
             with gr.Column():
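The hidden show_info and stt_model textboxes are commented out here because the same values now arrive as default arguments on transcribe_and_stream (see the earlier hunk). An illustrative before/after sketch of that wiring pattern, using hypothetical component names rather than the app's real ones:

import gradio as gr

def transcribe_and_stream(inputs, model_name="groq_whisper", show_info="show_info"):
    # fixed settings are defaults on the handler instead of hidden components
    return f"({model_name}) transcript of {inputs}"

with gr.Blocks() as demo:
    audio = gr.Audio(sources="microphone", type="filepath")
    out = gr.Textbox(label="Transcript")

    # before: hidden components carried the constants into the event
    # stt_model = gr.Textbox(value="groq_whisper", visible=False)
    # gr.Button("Transcribe").click(transcribe_and_stream, inputs=[audio, stt_model], outputs=[out])

    # after: only the real input is wired
    gr.Button("Transcribe").click(transcribe_and_stream, inputs=[audio], outputs=[out])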
@@ -465,31 +469,33 @@ with demo:
             generate_image, #run_flux,
             inputs=[generated_img_desc],
             outputs=[generated_img],
-            show_progress="
+            show_progress="full",
         )

         # Audio Pipeline
         clear_button_microphone.click(lambda: None, None, e2e_audio_file)
-        clear_button_microphone.click(lambda: None, None, e2e_audio_file_trans)
         clear_button_microphone.click(lambda: None, None, e2e_aya_audio_response)
+        clear_button_microphone.click(lambda: None, None, e2e_audio_file_aya_response)
+        clear_button_microphone.click(lambda: None, None, e2e_audio_file_trans)

-        e2e_audio_file.change(
+        #e2e_audio_file.change(
+        e2_audio_submit_button.click(
             transcribe_and_stream,
-            inputs=[e2e_audio_file
+            inputs=[e2e_audio_file],
             outputs=[e2e_audio_file_trans],
-            show_progress="
+            show_progress="full",
         ).then(
             aya_speech_text_response,
             inputs=[e2e_audio_file_trans],
             outputs=[e2e_audio_file_aya_response],
-            show_progress="
+            show_progress="full",
         ).then(
             convert_text_to_speech,
             inputs=[e2e_audio_file_aya_response],
             outputs=[e2e_aya_audio_response],
-            show_progress="
+            show_progress="full",
         )

     demo.load(lambda: secrets.token_hex(16), None, token)

-demo.queue(api_open=False, max_size=
+demo.queue(api_open=False, max_size=20, default_concurrency_limit=4).launch(show_api=False, allowed_paths=['/home/user/app'])
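The new wiring drives the audio pipeline from an explicit submit button and chains the three stages with .then(), so each handler starts only after the previous output lands in its component. A stripped-down, self-contained sketch of the same pattern; the handler bodies are placeholders, not the app's real functions:

import gradio as gr

def transcribe(audio_path):
    return f"transcript of {audio_path}"

def respond(transcript):
    return transcript.upper()

with gr.Blocks() as demo:
    audio = gr.Audio(sources="microphone", type="filepath")
    transcript = gr.Textbox(label="Transcript")
    reply = gr.Textbox(label="Aya's Response")
    submit = gr.Button("Get Aya's Response", variant="primary")

    submit.click(
        transcribe, inputs=[audio], outputs=[transcript], show_progress="full"
    ).then(
        respond, inputs=[transcript], outputs=[reply], show_progress="full"
    )

# queue settings mirror the new demo.queue call: bounded queue, capped concurrency, API surface disabled
demo.queue(api_open=False, max_size=20, default_concurrency_limit=4).launch(show_api=False)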
|