# rag_system / app.py
import streamlit as st
from langchain_core.messages import SystemMessage
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
import os
import io
import nltk
import fitz  # PyMuPDF

nltk.download("punkt")
nltk.download("punkt_tab")  # needed on newer NLTK releases, where punkt was split into punkt_tab

st.title(':blue[LangChain:] A RAG System on the “Leave No Context Behind” Paper')
st.header("AI Chatbot :robot_face:")

# The Google API key must be available as an environment variable
# (e.g., a Hugging Face Space secret)
if not os.getenv("GOOGLE_API_KEY"):
    st.error("GOOGLE_API_KEY environment variable is not set.")
    st.stop()
# Create the prompt template
chat_template = ChatPromptTemplate.from_messages([
    # The system message establishes the bot's role and behavior
    SystemMessage(content="""You are a helpful AI bot.
You take the context and the question from the user. Your answer should be based on the specific context."""),
    # The human message carries the retrieved context and the user's question
    HumanMessagePromptTemplate.from_template("""Answer the question based on the given context.
Context:
{context}
Question:
{question}
Answer: """)
])
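# Illustrative only (hypothetical values): the template renders into a
# two-message prompt, e.g.
#   chat_template.format_messages(context="...retrieved chunks...",
#                                 question="What is Infini-attention?")
# returns [SystemMessage(...), HumanMessage("Answer the question based on ...")].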
# Initialize the chat model
from langchain_google_genai import ChatGoogleGenerativeAI
chat_model = ChatGoogleGenerativeAI(model="gemini-1.5-pro-latest")

# Initialize the output parser (extracts the model's reply as a plain string)
from langchain_core.output_parsers import StrOutputParser
output_parser = StrOutputParser()
# Imports for text splitting, embeddings, and the vector store
from langchain_text_splitters import NLTKTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_core.documents import Document
def extract_text_from_pdf(pdf_file):
    # Open the PDF from an in-memory stream and concatenate every page's text
    document = fitz.open(stream=pdf_file, filetype="pdf")
    text = ""
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        text += page.get_text()
    return text
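# Usage sketch (hypothetical file name): reading a local PDF into memory and
# extracting its text would look like
#   extract_text_from_pdf(io.BytesIO(open("paper.pdf", "rb").read()))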
# Streamlit file uploader
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file is not None:
    # Extract text from the uploaded PDF
    pdf_file = io.BytesIO(uploaded_file.read())
    text = extract_text_from_pdf(pdf_file)

    # Split the text into overlapping chunks; split_documents expects Document
    # objects, so wrap the raw string first
    text_splitter = NLTKTextSplitter(chunk_size=500, chunk_overlap=100)
    chunks = text_splitter.split_documents([Document(page_content=text)])

    # Embed the chunks and build the Chroma vector store
    embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    db = Chroma.from_documents(chunks, embedding_model, persist_directory="./chroma_db")
print(f"Current working directory: {os.getcwd()}")
# Check if the 'static' directory exists
if not os.path.exists('static'):
print("'static' directory does not exist. Creating it...")
os.makedirs('static')
    # Persist the index to disk (a no-op on Chroma >= 0.4, which persists automatically)
    db.persist()
    db_connection = Chroma(persist_directory="./chroma_db", embedding_function=embedding_model)
    retriever = db_connection.as_retriever(search_kwargs={"k": 5})

    def format_docs(docs):
        # Join the retrieved chunks into a single context block
        return "\n\n".join(doc.page_content for doc in docs)

    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | chat_template
        | chat_model
        | output_parser
    )
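    # How the chain runs: the dict fans the input question out to both branches
    # in parallel. The retriever fetches the top-5 chunks and format_docs joins
    # them into {context}, while RunnablePassthrough() forwards the question
    # unchanged into {question}; the filled prompt then flows to the model and
    # the string parser.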
    user_input = st.text_area("Ask Questions to AI")
    if st.button("Submit"):
        st.subheader(":green[Query:]")
        st.subheader(user_input)
        # The chain expects the bare question string, not a dict; the parallel
        # branches above route it to the retriever and the prompt
        response = rag_chain.invoke(user_input)
        st.subheader(":green[Response:]")
        st.write(response)
else:
    st.write("Please upload a PDF file to get started.")