Spaces:

thivav
/

image-to-audio

Sleeping

App Files Files Community

image-to-audio / app.py

thivav

image-to-audio init commit

c826025 9 months ago

raw

history blame contribute delete

No virus

4.98 kB

	import os

	import scipy
	import streamlit as st
	import torch
	from langchain.chat_models import ChatOpenAI
	from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
	from langchain_core.messages import SystemMessage
	from transformers import AutoProcessor, BarkModel, pipeline


	# create image-to-text pipeline
	@st.cache_resource
	def create_image_to_text_pipeline():
	    """Build the image-captioning pipeline.

	    The function is decorated with ``st.cache_resource`` so Streamlit caches
	    the returned pipeline object.

	    Returns:
	        The ``transformers`` pipeline for the ``image-to-text`` task, backed
	        by the ``Salesforce/blip-image-captioning-base`` checkpoint.
	    """
	    return pipeline(
	        "image-to-text",
	        model="Salesforce/blip-image-captioning-base",
	    )


	# generate information about the image
	def image_to_text(url):
	    """Generate a text caption for an image.

	    Args:
	        url: Image reference handed straight to the captioning pipeline
	            (``main`` passes the path of the saved upload).

	    Returns:
	        The ``generated_text`` field of the first pipeline result.
	    """
	    captioner = create_image_to_text_pipeline()

	    # Sampling settings forwarded to the model's generate call.
	    sampling_options = dict(do_sample=True, temperature=0.7, max_new_tokens=256)

	    results = captioner(url, generate_kwargs=sampling_options)
	    first_result = results[0]
	    return first_result["generated_text"]


	# load language models
	@st.cache_resource
	def load_llm_model(openai_key):
	    """Create the chat model used for story writing.

	    Args:
	        openai_key: OpenAI API key passed to ``ChatOpenAI``.

	    Returns:
	        A ``ChatOpenAI`` instance configured for ``gpt-3.5-turbo`` with a
	        temperature of 0.
	    """
	    llm_settings = {
	        "model_name": "gpt-3.5-turbo",
	        "openai_api_key": openai_key,
	        "temperature": 0,
	    }
	    return ChatOpenAI(**llm_settings)


	# generate audio script
	def generate_audio_script(openai_key, scenario):
	    """Turn an image caption into a short story script.

	    Args:
	        openai_key: OpenAI API key forwarded to ``load_llm_model``.
	        scenario: Narrative text (the image caption) sent as the human
	            message of the prompt.

	    Returns:
	        The ``content`` of the chat model's reply.
	    """
	    chat_template = ChatPromptTemplate.from_messages(
	        [
	            SystemMessage(
	                content=(
	                    "You are a story teller. "
	                    "You can generate a story based on a simple narrative, "
	                    "the story should be no more than 40 words."
	                )
	            ),
	            HumanMessagePromptTemplate.from_template("{scenario}"),
	        ]
	    )
	    messages = chat_template.format_messages(scenario=scenario)

	    llm_model = load_llm_model(openai_key)
	    # Use the Runnable ``invoke`` API: calling the chat model object directly
	    # is deprecated in langchain-core.
	    ai_response = llm_model.invoke(messages)
	    return ai_response.content


	# load audio pipeline
	@st.cache_resource
	def load_audio_pipeline():
	    """Load the Bark text-to-speech model and its processor.

	    Both objects are loaded from the same checkpoint so that the processor
	    always corresponds to the model it feeds, and only one repository has to
	    be downloaded. The function is decorated with ``st.cache_resource`` so
	    Streamlit caches the returned pair.

	    Returns:
	        A ``(model, processor)`` tuple.
	    """
	    checkpoint = "suno/bark-small"
	    synthesiser = BarkModel.from_pretrained(checkpoint)
	    audio_processor = AutoProcessor.from_pretrained(checkpoint)
	    return synthesiser, audio_processor


	def generate_audio(script):
	    """Synthesise speech for ``script`` and save it as a WAV file.

	    The model runs on ``cuda:0`` when CUDA is available and on the CPU
	    otherwise. The audio is written to ``audio/bark_output.wav``; the
	    ``audio`` directory is created if it does not exist yet.

	    Args:
	        script: Text to be spoken.
	    """
	    # ``import scipy`` alone does not guarantee that the ``io.wavfile``
	    # submodule is loaded on every SciPy version, so import it explicitly.
	    import scipy.io.wavfile

	    output_path = "audio/bark_output.wav"

	    device = "cuda:0" if torch.cuda.is_available() else "cpu"
	    print("Device: ", device)

	    print("Script: ", script)
	    model, processor = load_audio_pipeline()

	    inputs = processor(script)
	    model = model.to(device)

	    speech_output = model.generate(**inputs.to(device))
	    sampling_rate = model.generation_config.sample_rate

	    # The output directory may be missing (e.g. a fresh checkout); without
	    # this the write below fails with FileNotFoundError.
	    os.makedirs(os.path.dirname(output_path), exist_ok=True)

	    # Only the first generated waveform is saved.
	    scipy.io.wavfile.write(
	        output_path, rate=sampling_rate, data=speech_output[0].cpu().numpy()
	    )


	def main():
	    """Render the Streamlit page and run the image-to-audio flow.

	    Once an image is uploaded it is saved under ``img/``, captioned with
	    ``image_to_text``, expanded into a short story with
	    ``generate_audio_script``, and converted to speech with
	    ``generate_audio``. A progress bar is advanced after each stage and the
	    caption, script and audio player are shown next to the image.
	    """
	    st.set_page_config(
	        page_title="Image to Speech",
	        page_icon="📢",
	        layout="centered",
	        initial_sidebar_state="collapsed",
	    )

	    st.header("The Image Reader 📢", divider="rainbow")

	    st.subheader(
	        "This application :red[analyzes] the uploaded image, generates an :green[imaginative phrase], and then converts it into :blue[audio] :sunglasses:"
	    )

	    st.markdown("[check out the repository](https://github.com/ThivaV/image_to_audio)")

	    openai_key = st.text_input("Enter your OpenAI key 👇", type="password")

	    progress_bar_message = "Operation in progress. Please wait."

	    uploaded_image = st.file_uploader("Choose an image...", type=["jpg", "jpeg"])
	    if uploaded_image is None:
	        # Nothing to process until the user uploads a file.
	        return

	    progress_bar = st.progress(0, text=progress_bar_message)

	    # Every upload is stored under the same base name, keeping only the
	    # original extension, so a new upload overwrites the previous one.
	    image_ext = os.path.splitext(uploaded_image.name)[1]
	    new_image_name = "uploaded_image" + image_ext
	    image_save_path = "img/" + new_image_name

	    # The target directory may be missing (e.g. a fresh checkout); without
	    # this the open() below fails with FileNotFoundError.
	    os.makedirs("img", exist_ok=True)

	    byte_data = uploaded_image.getvalue()
	    with open(image_save_path, "wb") as file:
	        file.write(byte_data)

	    # 10% completed
	    progress_bar.progress(10, text=progress_bar_message)

	    col_1, col_2 = st.columns([6, 4])

	    with col_1:
	        st.image(uploaded_image, caption="Uploaded image.", use_column_width=True)

	    # 20% completed
	    progress_bar.progress(20, text=progress_bar_message)

	    scenario = image_to_text(image_save_path)

	    # 40% completed
	    progress_bar.progress(40, text=progress_bar_message)

	    script = generate_audio_script(openai_key, scenario)

	    # 60% completed
	    progress_bar.progress(60, text=progress_bar_message)

	    generate_audio(script)

	    # 90% completed
	    progress_bar.progress(90, text=progress_bar_message)

	    with col_2:
	        with st.expander("About the image"):
	            st.write(scenario)

	        with st.expander("Script"):
	            st.write(script)

	        st.audio("audio/bark_output.wav")

	    # 100% completed
	    progress_bar.progress(
	        100, text="Operation completed. Thank you for your patience."
	    )


	# Script entry point: build the page only when this file is executed as the
	# main module, not when it is imported.
	if __name__ == "__main__":
	    main()