thivav committed on

Commit c826025 • 1 Parent(s): 7143f85

image-to-audio init commit

.gitignore CHANGED
@@ -157,4 +157,9 @@ cython_debug/
 # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea/
+.vscode/
+
+audio/*.wav
+img/*.jpeg
+img/*.jpg
README.md CHANGED
@@ -1,2 +1,19 @@
-# image_to_audio
-image_to_audio
+# The Image Reader 📒
+
+[The Image Reader 📒 - Playground](www.google.com)
+
+This application analyzes the uploaded image, generates an imaginative phrase, and then converts it into audio.
+
+- The following technologies were used to build **image_to_audio** (a minimal end-to-end sketch follows this list):
+  - **Image Reader:**
+    - The Hugging Face ```image-to-text``` task with the ```Salesforce/blip-image-captioning-base``` pretrained model, which produces a short description of the image.
+    - [Salesforce/blip-image-captioning-base](https://huggingface.co/Salesforce/blip-image-captioning-base)
+    - BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation
+  - **Generate an imaginative phrase:**
+    - OpenAI ```GPT-3.5-Turbo``` produces an imaginative narrative from the description generated in the previous step.
+    - The generated narrative is kept to no more than 40 words.
+    - [GPT-3.5 Turbo](https://openai.com/blog/gpt-3-5-turbo-fine-tuning-and-api-updates)
+  - **Text-to-audio:**
+    - ```suno/bark-small``` generates the audio version of the imaginative narrative.
+    - [suno/bark-small](https://huggingface.co/suno/bark-small)
+    - **BARK**: Bark is a transformer-based text-to-audio model created by [Suno](https://www.suno.ai/). Bark can generate highly realistic, multilingual speech as well as other audio, including music, background noise, and simple sound effects. The model can also produce nonverbal communication like laughing, sighing, and crying.
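Taken together, the three stages above form one pipeline: BLIP caption, then GPT-3.5-Turbo narrative, then Bark speech. The sketch below condenses that flow into a plain script mirroring the logic added in `app.py`; the input image path, the output WAV path, and reading the OpenAI key from an `OPENAI_API_KEY` environment variable are illustrative assumptions, not the app's exact behaviour.

```python
# Minimal sketch of the caption -> narrative -> audio pipeline (paths and key handling are assumptions).
import os

import scipy
import torch
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_core.messages import SystemMessage
from transformers import AutoProcessor, BarkModel, pipeline

# 1. image-to-text: caption the image with BLIP
captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
caption = captioner("img/uploaded_image.jpg")[0]["generated_text"]  # hypothetical input path

# 2. caption -> imaginative narrative with GPT-3.5-Turbo (key taken from the environment here)
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    openai_api_key=os.environ["OPENAI_API_KEY"],
    temperature=0,
)
prompt = ChatPromptTemplate.from_messages(
    [
        SystemMessage(
            content=(
                "You are a story teller. "
                "You can generate a story based on a simple narrative, "
                "the story should be no more than 40 words."
            )
        ),
        HumanMessagePromptTemplate.from_template("{scenario}"),
    ]
)
story = llm(prompt.format_messages(scenario=caption)).content

# 3. narrative -> speech with Bark, written out as a WAV file
device = "cuda:0" if torch.cuda.is_available() else "cpu"
processor = AutoProcessor.from_pretrained("suno/bark")
bark = BarkModel.from_pretrained("suno/bark-small").to(device)
speech = bark.generate(**processor(story).to(device))
scipy.io.wavfile.write(
    "audio/bark_output.wav",
    rate=bark.generation_config.sample_rate,
    data=speech[0].cpu().numpy(),
)
```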
app.py ADDED
@@ -0,0 +1,178 @@
+import os
+
+import scipy
+import streamlit as st
+import torch
+from langchain.chat_models import ChatOpenAI
+from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
+from langchain_core.messages import SystemMessage
+from transformers import AutoProcessor, BarkModel, pipeline
+
+
+# create image-to-text pipeline
+@st.cache_resource
+def create_image_to_text_pipeline():
+    """create image to text pipeline"""
+
+    task = "image-to-text"
+    model = "Salesforce/blip-image-captioning-base"
+    img_to_text_pipeline = pipeline(task, model=model)
+    return img_to_text_pipeline
+
+
+# generate information about the image
+def image_to_text(url):
+    """image to text"""
+
+    generate_kwargs = {
+        "do_sample": True,
+        "temperature": 0.7,
+        "max_new_tokens": 256,
+    }
+
+    pipe = create_image_to_text_pipeline()
+    txt = pipe(url, generate_kwargs=generate_kwargs)[0]["generated_text"]
+    return txt
+
+
+# load language models
+@st.cache_resource
+def load_llm_model(openai_key):
+    """load llm model"""
+
+    model = ChatOpenAI(
+        model_name="gpt-3.5-turbo", openai_api_key=openai_key, temperature=0
+    )
+    return model
+
+
+# generate audio script
+def generate_audio_script(openai_key, scenario):
+    """generate audio script"""
+
+    chat_template = ChatPromptTemplate.from_messages(
+        [
+            SystemMessage(
+                content=(
+                    "You are a story teller. "
+                    "You can generate a story based on a simple narrative, "
+                    "the story should be no more than 40 words."
+                )
+            ),
+            HumanMessagePromptTemplate.from_template("{scenario}"),
+        ]
+    )
+
+    llm_model = load_llm_model(openai_key)
+    ai_response = llm_model(chat_template.format_messages(scenario=scenario))
+    script = ai_response.content
+    return script
+
+
+# load audio pipeline
+@st.cache_resource
+def load_audio_pipeline():
+    """load audio pipeline"""
+
+    synthesiser = BarkModel.from_pretrained("suno/bark-small")
+    audio_processor = AutoProcessor.from_pretrained("suno/bark")
+    return synthesiser, audio_processor
+
+
+def generate_audio(script):
+    """generate audio"""
+
+    device = "cuda:0" if torch.cuda.is_available() else "cpu"
+    print("Device: ", device)
+
+    print("Script: ", script)
+    model, processor = load_audio_pipeline()
+
+    inputs = processor(script)
+    model = model.to(device)
+
+    speech_output = model.generate(**inputs.to(device))
+    sampling_rate = model.generation_config.sample_rate
+    scipy.io.wavfile.write(
+        "audio/bark_output.wav", rate=sampling_rate, data=speech_output[0].cpu().numpy()
+    )
+
+
+def main():
+    """main"""
+
+    st.set_page_config(
+        page_title="Image to Speech",
+        page_icon="📒",
+        layout="centered",
+        initial_sidebar_state="collapsed",
+    )
+
+    st.header("The Image Reader 📒", divider="rainbow")
+
+    st.subheader(
+        "This application :red[analyzes] the uploaded image, generates an :green[imaginative phrase], and then converts it into :blue[audio] :sunglasses:"
+    )
+
+    st.markdown("[check out the repository](https://github.com/ThivaV/image_to_audio)")
+
+    openai_key = st.text_input("Enter your OpenAI key 👇", type="password")
+
+    progress_bar_message = "Operation in progress. Please wait."
+
+    uploaded_image = st.file_uploader("Choose an image...", type=["jpg", "jpeg"])
+    if uploaded_image is not None:
+        progress_bar = st.progress(0, text=progress_bar_message)
+
+        # rename all the uploaded images to "uploaded_image"
+        image_ext = os.path.splitext(uploaded_image.name)[1]
+        new_image_name = "uploaded_image" + image_ext
+        image_save_path = "img/" + new_image_name
+
+        byte_data = uploaded_image.getvalue()
+        with open(image_save_path, "wb") as file:
+            file.write(byte_data)
+
+        # 10% completed
+        progress_bar.progress(10, text=progress_bar_message)
+
+        col_1, col_2 = st.columns([6, 4])
+
+        with col_1:
+            st.image(uploaded_image, caption="Uploaded image.", use_column_width=True)
+
+        # 20% completed
+        progress_bar.progress(20, text=progress_bar_message)
+
+        scenario = image_to_text(image_save_path)
+
+        # 40% completed
+        progress_bar.progress(40, text=progress_bar_message)
+
+        script = generate_audio_script(openai_key, scenario)
+
+        # 60% completed
+        progress_bar.progress(60, text=progress_bar_message)
+
+        generate_audio(script)
+
+        # 90% completed
+        progress_bar.progress(90, text=progress_bar_message)
+
+        with col_2:
+            with st.expander("About the image"):
+                st.write(scenario)
+
+            with st.expander("Script"):
+                st.write(script)
+
+            st.audio("audio/bark_output.wav")
+
+        # 100% completed
+        progress_bar.progress(
+            100, text="Operation completed. Thank you for your patience."
+        )
+
+
+if __name__ == "__main__":
+    main()
audio/.placeholder ADDED
File without changes
ref/gpt_chatbot_with_langchain.py ADDED
@@ -0,0 +1,21 @@
+from langchain.chat_models import ChatOpenAI
+from langchain.prompts import HumanMessagePromptTemplate
+from langchain_core.messages import SystemMessage
+from langchain.prompts import ChatPromptTemplate
+
+chat_template = ChatPromptTemplate.from_messages(
+    [
+        SystemMessage(
+            content=(
+                "You are a story teller. "
+                "You can generate a story based on a simple narrative, "
+                "the story should be no more than 20 words."
+            )
+        ),
+        HumanMessagePromptTemplate.from_template("{text}")
+    ]
+)
+
+llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
+resp = llm(chat_template.format_messages(text="white wool shirt"))
+print(resp.content)
ref/text-to-audio.py ADDED
@@ -0,0 +1,23 @@
+# text-to-audio using suno/bark
+import scipy
+import torch
+from transformers import AutoProcessor
+from transformers import BarkModel
+
+model = BarkModel.from_pretrained("suno/bark-small")
+
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+print("device: ", device)
+
+processor = AutoProcessor.from_pretrained("suno/bark")
+
+# prepare the inputs
+text_prompt = "You are a story teller. You can generate a story based on a simple narrative, the story be no more than 20 words."
+inputs = processor(text_prompt)
+
+# generate speech
+model = model.to(device)
+speech_output = model.generate(**inputs.to(device))
+
+sampling_rate = model.generation_config.sample_rate
+scipy.io.wavfile.write("bark_out.wav", rate=sampling_rate, data=speech_output[0].cpu().numpy())
requirements.txt ADDED
@@ -0,0 +1,5 @@
+streamlit
+scipy
+transformers[torch]
+langchain==0.0.352
+openai==1.6.1
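Assuming the standard Streamlit workflow, these pinned dependencies would typically be installed with `pip install -r requirements.txt` and the app launched with `streamlit run app.py`.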