import streamlit as st

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.memory import ConversationBufferMemory
from langchain.llms import HuggingFaceHub
from langchain.chains import RetrievalQA

from pdfminer.high_level import extract_text


def get_pdf_text(files):
    """Extract and concatenate the plain text of every uploaded PDF."""
    full_text = ""
    for file in files:
        text = extract_text(file)
        # Collapse PDF line breaks so the text splitter sees continuous prose.
        text = text.replace("\n", " ")
        # Append (not prepend), so the documents keep their upload order.
        full_text += text
    return full_text

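# Note: pdfminer's extract_text() accepts file-like objects as well as file
# paths, so the UploadedFile objects returned by st.file_uploader below can be
# passed in directly, without writing them to disk first.
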
st.title("Embedding Creation for Langchain")

st.header("File Upload")
files = st.file_uploader("Upload your files", accept_multiple_files=True, type="pdf")
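
# With accept_multiple_files=True the uploader hands back the uploaded files as
# a list, so the truthiness check below also covers "nothing uploaded yet".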

if files:
    st.header("Start Conversion")
    if st.button("Ready!"):
        with st.spinner("Creating chain..."):
            full_text = get_pdf_text(files)
            # ~1000-character chunks with 150 characters of overlap, so context
            # is not cut off at chunk boundaries.
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
            chunks = text_splitter.split_text(full_text)
            embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
            vectorstore = FAISS.from_texts(chunks, embeddings)
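
            # The FAISS index lives only in memory. To reuse it across sessions,
            # LangChain's FAISS wrapper can persist it ("faiss_index" is an
            # arbitrary directory name, not required by this app):
            # vectorstore.save_local("faiss_index")
            # ...and later: FAISS.load_local("faiss_index", embeddings)
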
            memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
            # A raw transformers model (AutoModelForCausalLM) is not a LangChain
            # LLM, so it cannot be handed to RetrievalQA; wrap the Hub repo with
            # HuggingFaceHub instead. (This call requires the
            # HUGGINGFACEHUB_API_TOKEN environment variable to be set.)
            llm = HuggingFaceHub(repo_id="red1xe/Llama-2-7B-codeGPT")
            chain = RetrievalQA.from_chain_type(
                llm=llm,
                chain_type="stuff",  # "retrieval-qa" is not a valid chain_type
                retriever=vectorstore.as_retriever(),
                memory=memory,
            )
            # Keep the chain in session_state so it survives the rerun that the
            # next button click triggers.
            st.session_state.chain = chain
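            # Caveat: RetrievalQA's default prompt has no chat_history slot, so
            # the memory above only records the turns; it does not make answers
            # depend on earlier questions. ConversationalRetrievalChain is the
            # usual substitute when true conversational behaviour is wanted.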
        st.success("Done!")

if "chain" in st.session_state:
    st.header("Start Chat")
    st.subheader("Ask a question")
    question = st.text_input("Question")
    if st.button("Ask"):
        with st.spinner("Thinking..."):
            answer = st.session_state.chain.run(question)
        st.success(answer)
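
# To try it, save this script (e.g. as app.py) and launch it with:
#   streamlit run app.py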