File size: 3,261 Bytes
eaf0e00
 
952eb35
5aee298
eaf0e00
 
 
 
 
 
 
 
 
 
 
 
 
8c5d334
 
 
eaf0e00
 
 
 
8c5d334
 
 
 
 
 
952eb35
8c5d334
 
 
952eb35
8c5d334
 
952eb35
8c5d334
 
 
 
 
 
 
 
 
952eb35
8c5d334
 
952eb35
8c5d334
 
952eb35
8c5d334
 
952eb35
8c5d334
 
 
952eb35
8c5d334
 
 
 
 
 
5aee298
8c5d334
 
 
5aee298
8c5d334
5aee298
 
 
 
8c5d334
 
 
5aee298
8c5d334
 
eaf0e00
8c5d334
 
eaf0e00
8c5d334
eaf0e00
8c5d334
 
eaf0e00
8c5d334
 
 
eaf0e00
8c5d334
 
eaf0e00
8c5d334
 
 
 
37c7e44
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import os
import streamlit as st
from pathlib import Path
from io import StringIO

#for textfiles
from langchain.document_loaders import TextLoader
#text splitter
from langchain.text_splitter import CharacterTextSplitter
#for using HuggingFace models & embeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain import HuggingFaceHub
# Vectorstore: https://python.langchain.com/en/latest/modules/indexes/vectorstores.html
from langchain.vectorstores import FAISS
#facebook vectorization
from langchain.chains.question_answering import load_qa_chain
#load pdf
#vectorize db index with chromadb
from langchain.indexes import VectorstoreIndexCreator
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredPDFLoader

# Copy the `hf_api_key` entry from Streamlit's secrets store into the
# HUGGINGFACEHUB_API_TOKEN environment variable at import time.
# Presumably this is where the HuggingFaceHub client looks for its
# credentials -- verify against the installed langchain version.
# A missing `hf_api_key` secret makes this line raise before any UI is drawn.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets["hf_api_key"]

def init():
    """Create the shared embedding model, LLM client and QA chain.

    Binds three module-level globals that the rest of the script reads:

    * ``embeddings`` -- a ``HuggingFaceEmbeddings`` instance built with its
      default settings.
    * ``llm`` -- a ``HuggingFaceHub`` client for
      ``declare-lab/flan-alpaca-large`` (temperature 0, max_length 512).
    * ``chain`` -- a "stuff" question-answering chain built on ``llm``.

    NOTE(review): every call constructs all three objects from scratch.
    If this script is re-executed on each UI interaction, consider caching
    them -- confirm which caching API the deployed Streamlit version offers.
    """
    # Only names that are actually assigned below are declared global.
    global embeddings, llm, chain

    embeddings = HuggingFaceEmbeddings()
    llm = HuggingFaceHub(
        repo_id="declare-lab/flan-alpaca-large",
        model_kwargs={"temperature": 0, "max_length": 512},
    )
    chain = load_qa_chain(llm, chain_type="stuff")

def pdf_file(txtFileObj):
    """Index an uploaded PDF and answer one question about it.

    The upload is staged in a private temporary directory, parsed with
    ``UnstructuredPDFLoader``, split into 1000-character chunks (no overlap)
    and indexed through ``VectorstoreIndexCreator`` using the module-level
    ``embeddings``. A ``RetrievalQA`` "stuff" chain over that index, backed
    by the module-level ``llm``, answers the query typed into the page.

    Args:
        txtFileObj: Uploaded file object exposing ``.name`` and
            ``.getbuffer()`` (presumably Streamlit's ``UploadedFile``).

    Returns:
        None. All output is rendered through Streamlit.
    """
    import tempfile  # local import: only needed to stage the upload

    st.subheader('Uploaded PDF File:')
    st.write(txtFileObj.name)

    # Stage the bytes under a fixed name inside a throwaway directory rather
    # than under the client-supplied name in the working directory: that
    # could overwrite existing files and left every upload behind on disk.
    with tempfile.TemporaryDirectory() as staging_dir:
        staged_path = os.path.join(staging_dir, "upload.pdf")
        with open(staged_path, "wb") as staged:
            staged.write(txtFileObj.getbuffer())

        # The index must be built while the staged file still exists;
        # from_loaders() reads the document before returning.
        index = VectorstoreIndexCreator(
            embedding=embeddings,
            text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0),
        ).from_loaders([UnstructuredPDFLoader(staged_path)])

    # Named qa_chain so it does not shadow the module-level `chain`.
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=index.vectorstore.as_retriever(),
        input_key="question",
    )

    st.subheader('Enter query')
    query = st.text_input('Ask anything about the Document you uploaded')

    # An empty text box yields a falsy string; wait for real input.
    if not query:
        return

    answer = qa_chain.run(question=query)

    st.subheader('Answer')
    st.write(answer)

def text_file(txtFileObj):
    """Index an uploaded text file and answer one question about it.

    The upload is staged in a private temporary directory, read with
    ``TextLoader``, split into 1000-character chunks (overlap 10) and
    embedded into a FAISS store using the module-level ``embeddings``.
    The chunks most similar to the query are passed to the module-level
    ``chain`` to produce the answer.

    Args:
        txtFileObj: Uploaded file object exposing ``.name`` and
            ``.getbuffer()`` (presumably Streamlit's ``UploadedFile``).

    Returns:
        None. All output is rendered through Streamlit.
    """
    import tempfile  # local import: only needed to stage the upload

    st.subheader('Uploaded Text File:')
    st.write(txtFileObj.name)

    # Stage the bytes under a fixed name inside a throwaway directory rather
    # than under the client-supplied name in the working directory: an upload
    # called e.g. "requirements.txt" would otherwise overwrite the app's own
    # file, and every upload was left behind on disk.
    with tempfile.TemporaryDirectory() as staging_dir:
        staged_path = os.path.join(staging_dir, "upload.txt")
        with open(staged_path, "wb") as staged:
            staged.write(txtFileObj.getbuffer())

        # load() reads the file contents, so it must run before cleanup.
        documents = TextLoader(staged_path).load()

    # Text Splitter
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
    chunks = text_splitter.split_documents(documents)

    db = FAISS.from_documents(chunks, embeddings)

    st.subheader('Enter query')
    query = st.text_input('Ask anything about the Document you uploaded')

    # An empty text box yields a falsy string; wait for real input.
    if not query:
        return

    matching_chunks = db.similarity_search(query)
    answer = chain.run(input_documents=matching_chunks, question=query)

    st.subheader('Answer')
    st.write(answer)

# ---- Page layout and dispatch -------------------------------------------
st.title('Document Q&A - Ask anything in your Document')
st.subheader('This application can be used to upload text(.txt) and PDF(.pdf) files and ask questions about their contents.')

# Build the shared embeddings / LLM / QA chain before any handler runs.
init()

st.sidebar.subheader('Upload document')
uploaded_file = st.sidebar.file_uploader("Upload File",type=['txt','pdf'])

# Normalise the extension once. The comparison used to be case-sensitive, so
# an upload named "REPORT.PDF" or "NOTES.TXT" matched neither branch and was
# silently ignored.
file_suffix = Path(uploaded_file.name).suffix.lower() if uploaded_file else ''

# The two file types are mutually exclusive, hence if/elif.
if file_suffix == '.txt':
    st.sidebar.info(Path(uploaded_file.name))
    text_file(uploaded_file)
elif file_suffix == '.pdf':
    pdf_file(uploaded_file)

with st.sidebar.expander('File'):
    if uploaded_file:
        st.info(uploaded_file.name)

# Shows the contents of /content/ when that directory exists.
# NOTE(review): looks like a leftover debugging aid for a notebook-style
# host -- confirm whether it should ship.
if os.path.exists('/content/'):
    st.info(os.listdir('/content/'))