import os
import streamlit as st
# For text files
from langchain.document_loaders import TextLoader
# Text splitter
from langchain.text_splitter import CharacterTextSplitter
# For using Hugging Face models & embeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain import HuggingFaceHub
# Vectorstore (Facebook's FAISS): https://python.langchain.com/en/latest/modules/indexes/vectorstores.html
from langchain.vectorstores import FAISS
# Question-answering chain
from langchain.chains.question_answering import load_qa_chain
# For loading PDFs
from langchain.document_loaders import UnstructuredPDFLoader
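
# Read the Hugging Face Hub API token from Streamlit secrets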
os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets["hf_api_key"]
st.title('Document Q&A - Ask anything in your Document')
st.sidebar.subheader('Upload document')
uploaded_file = st.sidebar.file_uploader("Upload File", type=['txt', 'pdf'])
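
# A minimal sketch (not part of the original app) of wiring the upload into
# a LangChain loader: Streamlit uploads live in memory, so the assumption
# here is that persisting the file locally first lets TextLoader /
# UnstructuredPDFLoader read it.
# if uploaded_file is not None:
#     with open(uploaded_file.name, "wb") as f:
#         f.write(uploaded_file.getbuffer())
#     if uploaded_file.name.lower().endswith(".pdf"):
#         loader = UnstructuredPDFLoader(uploaded_file.name)
#     else:
#         loader = TextLoader(uploaded_file.name)
#     documents = loader.load()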
# # Sample document download for local testing (needs `import requests`)
# url2 = "https://github.com/fabiomatricardi/cdQnA/raw/main/KS-all-info_rev1.txt"
# res = requests.get(url2)
# with open("KS-all-info_rev1.txt", "w") as f:
#     f.write(res.text)
st.subheader('Enter query')
query = st.text_input('Ask anything about the Document you uploaded')
st.subheader('Answer')
st.write('Answer from document')
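
# A minimal sketch of answering the query in the UI, assuming `db` and
# `chain` from the commented-out pipeline below have been built:
# if query:
#     docs = db.similarity_search(query)
#     answer = chain.run(input_documents=docs, question=query)
#     st.write(answer)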
# # Document Loader
# loader = TextLoader('./KS-all-info_rev1.txt')
# documents = loader.load()
# import textwrap
# def wrap_text_preserve_newlines(text, width=110):
#     # Split the input text into lines based on newline characters
#     lines = text.split('\n')
#     # Wrap each line individually
#     wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
#     # Join the wrapped lines back together using newline characters
#     wrapped_text = '\n'.join(wrapped_lines)
#     return wrapped_text
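# # Example usage (hypothetical `answer` string; a sketch, not in the
# # original app): render a wrapped answer as fixed-width text
# st.text(wrap_text_preserve_newlines(answer, width=80))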
# # Text Splitter
# text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
# docs = text_splitter.split_documents(documents)
# # Embeddings
# embeddings = HuggingFaceEmbeddings()
# # Create the FAISS vector store from the chunks
# db = FAISS.from_documents(docs, embeddings)
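# # Optional sketch (not in the original app): persist the index with
# # FAISS.save_local / FAISS.load_local so it isn't rebuilt on every
# # Streamlit rerun; the folder name "faiss_index" is an assumption.
# db.save_local("faiss_index")
# db = FAISS.load_local("faiss_index", embeddings)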
# llm=HuggingFaceHub(repo_id="google/flan-t5-xl", model_kwargs={"temperature":0, "max_length":512})
# llm2=HuggingFaceHub(repo_id="declare-lab/flan-alpaca-large", model_kwargs={"temperature":0, "max_length":512})
# chain = load_qa_chain(llm2, chain_type="stuff")
# # Sample question
# # query = "What the actual issues and drawbacks ?"
# # docs = db.similarity_search(query)
# # chain.run(input_documents=docs, question=query)
# # PDFs
# # !wget https://github.com/fabiomatricardi/cdQnA/raw/main/PLC_mediumArticle.pdf
# # !wget https://github.com/fabiomatricardi/cdQnA/raw/main/BridgingTheGaap_fromMedium.pdf
# # !mkdir pdfs
# # !cp *pdf '/content/pdfs'
# # pdf_folder_path = '/content/pdfs'
# # os.listdir(pdf_folder_path)
# # loaders = [UnstructuredPDFLoader(os.path.join(pdf_folder_path, fn)) for fn in os.listdir(pdf_folder_path)]
# # loaders
# from langchain.indexes import VectorstoreIndexCreator
# index = VectorstoreIndexCreator(
#     embedding=HuggingFaceEmbeddings(),
#     text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)).from_loaders(loaders)
# # Load the selected LLM
# llm2=HuggingFaceHub(repo_id="declare-lab/flan-alpaca-large", model_kwargs={"temperature":0, "max_length":512})
# # Prepare the retrieval pipeline
# from langchain.chains import RetrievalQA
# chain = RetrievalQA.from_chain_type(llm=llm2,
#                                     chain_type="stuff",
#                                     retriever=index.vectorstore.as_retriever(),
#                                     input_key="question")
# # Get a reply to our questions
# # chain.run('What is the difference between a PLC and a PC?')