"""Streamlit RAG chatbot over the "Leave No Context Behind" paper.

The app accepts a PDF upload, splits it into chunks, indexes the chunks in a
local Chroma vector store with Google embeddings, and answers questions with
Gemini through a LangChain LCEL pipeline.
"""
import io
import os

import fitz  # PyMuPDF, used to extract text from the uploaded PDF
import nltk
import streamlit as st
from langchain_community.vectorstores import Chroma
from langchain_core.messages import SystemMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_text_splitters import NLTKTextSplitter

# NLTKTextSplitter relies on NLTK's "punkt" sentence tokenizer
nltk.download("punkt", quiet=True)

st.title(':blue[LangChain:] A RAG System on the “Leave No Context Behind” Paper')
st.header("AI Chatbot :robot_face:")

# The Google Generative AI clients read GOOGLE_API_KEY from the environment;
# stop early with a clear message if it is missing.
if not os.getenv("GOOGLE_API_KEY"):
    st.error("GOOGLE_API_KEY environment variable is not set.")
    st.stop()

# Creating a template
chat_template = ChatPromptTemplate.from_messages([
    SystemMessage(content="""You are a Helpful AI Bot. 
    You take the context and question from user. Your answer should be based on the specific context."""),
    HumanMessagePromptTemplate.from_template("""Answer the question based on the given context.
    Context:
    {context} 
    
    Question: 
    {question}
    
    Answer: """)
])
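# {context} and {question} are filled in at invoke time by the RAG chain below.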

# Initialize the chat model and output parser
chat_model = ChatGoogleGenerativeAI(model="gemini-1.5-pro-latest")
output_parser = StrOutputParser()

def extract_text_from_pdf(pdf_file):
    """Extract the plain-text layer from every page of a PDF byte stream."""
    document = fitz.open(stream=pdf_file, filetype="pdf")
    text = ""
    for page in document:
        text += page.get_text()
    return text
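# fitz is the import name of PyMuPDF; get_text() returns each page's embedded
# text layer, so scanned PDFs without OCR text would come back empty.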

# Streamlit file uploader
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file is not None:
    # Extract text from the uploaded PDF
    pdf_file = io.BytesIO(uploaded_file.read())
    text = extract_text_from_pdf(pdf_file)

    # Split the raw text into overlapping chunks; create_documents accepts
    # plain strings and wraps each chunk as a Document for the vector store
    text_splitter = NLTKTextSplitter(chunk_size=500, chunk_overlap=100)
    chunks = text_splitter.create_documents([text])

    # Embed the chunks and persist them in a local Chroma vector store
    embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    db = Chroma.from_documents(chunks, embedding_model, persist_directory="./chroma_db")
    db.persist()

    # Retrieve the 5 most similar chunks for each question
    retriever = db.as_retriever(search_kwargs={"k": 5})
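    # Note: Streamlit reruns the whole script on every widget interaction, so
    # the PDF is re-embedded on each rerun. Caching the index (for example with
    # st.cache_resource keyed on the uploaded file) is a possible optimization.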

    def format_docs(docs):
        return "\n\n".join(doc.page_content for doc in docs)

    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | chat_template
        | chat_model
        | output_parser
    )
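    # The dict of runnables fans the input out: the retriever fetches and
    # formats matching chunks as {context}, while RunnablePassthrough forwards
    # the question string itself as {question}.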

    user_input = st.text_area("Ask Questions to AI")
    if st.button("Submit"):
        st.subheader(":green[Query:]")
        st.subheader(user_input)
        # Pass the raw question string; the chain's input mapping routes it
        # to both the retriever and the prompt
        response = rag_chain.invoke(user_input)
        st.subheader(":green[Response:]")
        st.write(response)
else:
    st.write("Please upload a PDF file to get started.")
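# To run locally (assuming this file is saved as app.py):
#   streamlit run app.py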