"""Streamlit RAG chatbot over the "Leave No Context Behind" paper.

The app accepts a PDF upload, splits it into chunks, indexes the chunks in a
local Chroma vector store with Google embeddings, and answers questions with
Gemini through a LangChain LCEL pipeline.
"""
import io
import os

import fitz  # PyMuPDF, used to extract text from the uploaded PDF
import nltk
import streamlit as st
from langchain_community.vectorstores import Chroma
from langchain_core.messages import SystemMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_text_splitters import NLTKTextSplitter

# NLTKTextSplitter relies on NLTK's "punkt" sentence tokenizer
nltk.download("punkt", quiet=True)

st.title(':blue[LangChain:] A RAG System on the “Leave No Context Behind” Paper')
st.header("AI Chatbot :robot_face:")

# The Google Generative AI clients read GOOGLE_API_KEY from the environment;
# stop early with a clear message if it is missing.
if not os.getenv("GOOGLE_API_KEY"):
    st.error("GOOGLE_API_KEY environment variable is not set.")
    st.stop()

# Creating a template
chat_template = ChatPromptTemplate.from_messages([
    SystemMessage(content="""You are a Helpful AI Bot. 
    You take the context and question from user. Your answer should be based on the specific context."""),
    HumanMessagePromptTemplate.from_template("""Answer the question based on the given context.
    Context:
    {context} 
    
    Question: 
    {question}
    
    Answer: """)
])
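# {context} and {question} are filled in at invoke time by the RAG chain below.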

# Initialize the chat model and output parser
chat_model = ChatGoogleGenerativeAI(model="gemini-1.5-pro-latest")
output_parser = StrOutputParser()

def extract_text_from_pdf(pdf_file):
    """Extract the plain-text layer from every page of a PDF byte stream."""
    document = fitz.open(stream=pdf_file, filetype="pdf")
    text = ""
    for page in document:
        text += page.get_text()
    return text
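# fitz is the import name of PyMuPDF; get_text() returns each page's embedded
# text layer, so scanned PDFs without OCR text would come back empty.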

# Streamlit file uploader
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file is not None:
    # Extract text from the uploaded PDF
    pdf_file = io.BytesIO(uploaded_file.read())
    text = extract_text_from_pdf(pdf_file)

    # Split the raw text into overlapping chunks; create_documents accepts
    # plain strings and wraps each chunk as a Document for the vector store
    text_splitter = NLTKTextSplitter(chunk_size=500, chunk_overlap=100)
    chunks = text_splitter.create_documents([text])

    # Embed the chunks and persist them in a local Chroma vector store
    embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    db = Chroma.from_documents(chunks, embedding_model, persist_directory="./chroma_db")
    db.persist()

    # Retrieve the 5 most similar chunks for each question
    retriever = db.as_retriever(search_kwargs={"k": 5})
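    # Note: Streamlit reruns the whole script on every widget interaction, so
    # the PDF is re-embedded on each rerun. Caching the index (for example with
    # st.cache_resource keyed on the uploaded file) is a possible optimization.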

    def format_docs(docs):
        return "\n\n".join(doc.page_content for doc in docs)

    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | chat_template
        | chat_model
        | output_parser
    )
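    # The dict of runnables fans the input out: the retriever fetches and
    # formats matching chunks as {context}, while RunnablePassthrough forwards
    # the question string itself as {question}.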

    user_input = st.text_area("Ask Questions to AI")
    if st.button("Submit"):
        st.subheader(":green[Query:]")
        st.subheader(user_input)
        # Pass the raw question string; the chain's input mapping routes it
        # to both the retriever and the prompt
        response = rag_chain.invoke(user_input)
        st.subheader(":green[Response:]")
        st.write(response)
else:
    st.write("Please upload a PDF file to get started.")
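# To run locally (assuming this file is saved as app.py):
#   streamlit run app.py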