image-to-audio init commit
- .gitignore +6 -1
- README.md +19 -2
- app.py +178 -0
- audio/.placeholder +0 -0
- ref/gpt_chatbot_with_langchain.py +21 -0
- ref/text-to-audio.py +23 -0
- requirements.txt +5 -0
.gitignore
CHANGED
@@ -157,4 +157,9 @@ cython_debug/
 # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
-
+.idea/
+.vscode/
+
+audio/*.wav
+img/*.jpeg
+img/*.jpg
README.md
CHANGED
@@ -1,2 +1,19 @@
-#
-
+# The Image Reader 📢
+
+[The Image Reader 📢 - Playground](www.google.com)
+
+This application analyzes the uploaded image, generates an imaginative phrase, and then converts it into audio.
+
+- For **image_to_audio**, the following technologies were used:
+    - **Image Reader:**
+        - The Hugging Face ```image-to-text``` task is used with the ```Salesforce/blip-image-captioning-base``` pretrained model, which produces a short description of the image.
+        - [Salesforce/blip-image-captioning-base](https://huggingface.co/Salesforce/blip-image-captioning-base)
+        - BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation
+    - **Generate an imaginative phrase:**
+        - OpenAI ```GPT-3.5-Turbo``` is used to produce an imaginative narrative from the description generated earlier.
+        - The generated phrase is no more than 40 words.
+        - [GPT-3.5 Turbo](https://openai.com/blog/gpt-3-5-turbo-fine-tuning-and-api-updates)
+    - **text-to-audio:**
+        - ```suno/bark-small``` is used to generate the audio version of the imaginative narrative.
+        - [suno/bark-small](https://huggingface.co/suno/bark-small)
+        - **BARK**: Bark is a transformer-based text-to-audio model created by [Suno](https://www.suno.ai/). Bark can generate highly realistic, multilingual speech as well as other audio, including music, background noise and simple sound effects. The model can also produce nonverbal communications like laughing, sighing and crying.
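A minimal sketch of the captioning step described above, assuming a local JPEG exists at `img/uploaded_image.jpg` (a hypothetical path used only for illustration); the pipeline call mirrors the one in `app.py`:

```python
from transformers import pipeline

# Load the BLIP captioning model through the image-to-text pipeline
captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")

# Caption a local image; the pipeline returns a list of dicts with "generated_text"
caption = captioner("img/uploaded_image.jpg")[0]["generated_text"]
print(caption)  # a short, one-sentence description of the image
```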
app.py
ADDED
@@ -0,0 +1,178 @@
+import os
+
+import scipy.io.wavfile
+import streamlit as st
+import torch
+from langchain.chat_models import ChatOpenAI
+from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
+from langchain_core.messages import SystemMessage
+from transformers import AutoProcessor, BarkModel, pipeline
+
+
+# create image-to-text pipeline
+@st.cache_resource
+def create_image_to_text_pipeline():
+    """create image to text pipeline"""
+
+    task = "image-to-text"
+    model = "Salesforce/blip-image-captioning-base"
+    img_to_text_pipeline = pipeline(task, model=model)
+    return img_to_text_pipeline
+
+
+# generate information about the image
+def image_to_text(url):
+    """image to text"""
+
+    generate_kwargs = {
+        "do_sample": True,
+        "temperature": 0.7,
+        "max_new_tokens": 256,
+    }
+
+    pipe = create_image_to_text_pipeline()
+    txt = pipe(url, generate_kwargs=generate_kwargs)[0]["generated_text"]
+    return txt
+
+
+# load language models
+@st.cache_resource
+def load_llm_model(openai_key):
+    """load llm model"""
+
+    model = ChatOpenAI(
+        model_name="gpt-3.5-turbo", openai_api_key=openai_key, temperature=0
+    )
+    return model
+
+
+# generate audio script
+def generate_audio_script(openai_key, scenario):
+    """generate audio script"""
+
+    chat_template = ChatPromptTemplate.from_messages(
+        [
+            SystemMessage(
+                content=(
+                    "You are a story teller. "
+                    "You can generate a story based on a simple narrative, "
+                    "the story should be no more than 40 words."
+                )
+            ),
+            HumanMessagePromptTemplate.from_template("{scenario}"),
+        ]
+    )
+
+    llm_model = load_llm_model(openai_key)
+    ai_response = llm_model(chat_template.format_messages(scenario=scenario))
+    script = ai_response.content
+    return script
+
+
+# load audio pipeline
+@st.cache_resource
+def load_audio_pipeline():
+    """load audio pipeline"""
+
+    synthesiser = BarkModel.from_pretrained("suno/bark-small")
+    audio_processor = AutoProcessor.from_pretrained("suno/bark")
+    return synthesiser, audio_processor
+
+
+def generate_audio(script):
+    """generate audio"""
+
+    device = "cuda:0" if torch.cuda.is_available() else "cpu"
+    print("Device: ", device)
+
+    print("Script: ", script)
+    model, processor = load_audio_pipeline()
+
+    inputs = processor(script)
+    model = model.to(device)
+
+    speech_output = model.generate(**inputs.to(device))
+    sampling_rate = model.generation_config.sample_rate
+    scipy.io.wavfile.write(
+        "audio/bark_output.wav", rate=sampling_rate, data=speech_output[0].cpu().numpy()
+    )
+
+
+def main():
+    """main"""
+
+    st.set_page_config(
+        page_title="Image to Speech",
+        page_icon="📢",
+        layout="centered",
+        initial_sidebar_state="collapsed",
+    )
+
+    st.header("The Image Reader 📢", divider="rainbow")
+
+    st.subheader(
+        "This application :red[analyzes] the uploaded image, generates an :green[imaginative phrase], and then converts it into :blue[audio] :sunglasses:"
+    )
+
+    st.markdown("[check out the repository](https://github.com/ThivaV/image_to_audio)")
+
+    openai_key = st.text_input("Enter your OpenAI key 🔑", type="password")
+
+    progress_bar_message = "Operation in progress. Please wait."
+
+    uploaded_image = st.file_uploader("Choose an image...", type=["jpg", "jpeg"])
+    if uploaded_image is not None:
+        progress_bar = st.progress(0, text=progress_bar_message)
+
+        # rename all the uploaded images to "uploaded_image"
+        image_ext = os.path.splitext(uploaded_image.name)[1]
+        new_image_name = "uploaded_image" + image_ext
+        image_save_path = "img/" + new_image_name
+
+        byte_data = uploaded_image.getvalue()
+        with open(image_save_path, "wb") as file:
+            file.write(byte_data)
+
+        # 10% completed
+        progress_bar.progress(10, text=progress_bar_message)
+
+        col_1, col_2 = st.columns([6, 4])
+
+        with col_1:
+            st.image(uploaded_image, caption="Uploaded image.", use_column_width=True)
+
+        # 20% completed
+        progress_bar.progress(20, text=progress_bar_message)
+
+        scenario = image_to_text(image_save_path)
+
+        # 40% completed
+        progress_bar.progress(40, text=progress_bar_message)
+
+        script = generate_audio_script(openai_key, scenario)
+
+        # 60% completed
+        progress_bar.progress(60, text=progress_bar_message)
+
+        generate_audio(script)
+
+        # 90% completed
+        progress_bar.progress(90, text=progress_bar_message)
+
+        with col_2:
+            with st.expander("About the image"):
+                st.write(scenario)
+
+            with st.expander("Script"):
+                st.write(script)
+
+            st.audio("audio/bark_output.wav")
+
+        # 100% completed
+        progress_bar.progress(
+            100, text="Operation completed. Thank you for your patience."
+        )
+
+
+if __name__ == "__main__":
+    main()
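A hedged usage note: the app can be launched locally by installing the packages from `requirements.txt` and running `streamlit run app.py`. The script saves the upload to `img/` and the generated speech to `audio/bark_output.wav`, so both directories need to exist in the working directory (this commit adds a placeholder only for `audio/`).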
audio/.placeholder
ADDED
File without changes
ref/gpt_chatbot_with_langchain.py
ADDED
@@ -0,0 +1,21 @@
+from langchain.chat_models import ChatOpenAI
+from langchain.prompts import HumanMessagePromptTemplate
+from langchain_core.messages import SystemMessage
+from langchain.prompts import ChatPromptTemplate
+
+chat_template = ChatPromptTemplate.from_messages(
+    [
+        SystemMessage(
+            content=(
+                "You are a story teller. "
+                "You can generate a story based on a simple narrative, "
+                "the story should be no more than 20 words."
+            )
+        ),
+        HumanMessagePromptTemplate.from_template("{text}")
+    ]
+)
+
+llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
+resp = llm(chat_template.format_messages(text="white wool shirt"))
+print(resp.content)
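Since no `openai_api_key` argument is passed here, this reference script expects the key to be available to LangChain another way, typically through the `OPENAI_API_KEY` environment variable.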
ref/text-to-audio.py
ADDED
@@ -0,0 +1,23 @@
+# text-to-audio using suno/bark
+import scipy.io.wavfile
+import torch
+from transformers import AutoProcessor
+from transformers import BarkModel
+
+model = BarkModel.from_pretrained("suno/bark-small")
+
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+print("device: ", device)
+
+processor = AutoProcessor.from_pretrained("suno/bark")
+
+# prepare the inputs
+text_prompt = "You are a story teller. You can generate a story based on a simple narrative, the story should be no more than 20 words."
+inputs = processor(text_prompt)
+
+# generate speech
+model = model.to(device)
+speech_output = model.generate(**inputs.to(device))
+
+sampling_rate = model.generation_config.sample_rate
+scipy.io.wavfile.write("bark_out.wav", rate=sampling_rate, data=speech_output[0].cpu().numpy())
requirements.txt
ADDED
@@ -0,0 +1,5 @@
+streamlit
+scipy
+transformers[torch]
+langchain==0.0.352
+openai==1.6.1