# aerospace_chatbot_ams/pages/2_Document_Upload.py
import glob
import logging
import os
import time

import streamlit as st
from dotenv import load_dotenv, find_dotenv
from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import VoyageEmbeddings
from ragatouille import RAGPretrainedModel

import data_import
import setup

# Load environment variables (API keys, etc.) and enable logging
load_dotenv(find_dotenv(), override=True)
logging.basicConfig(filename='app_2_document_upload.log', filemode='w',
                    format='%(name)s - %(levelname)s - %(message)s', level=logging.DEBUG)
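# Note: filemode='w' means app_2_document_upload.log is overwritten on each run of this page.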
# Set the page title
st.set_page_config(
    page_title='Upload PDFs',
)
st.title('Upload PDFs')
sb = setup.load_sidebar(config_file='../config/config.json',
                        index_data_file='../config/index_data.json',
                        vector_databases=True,
                        embeddings=True,
                        index_name=True,
                        secret_keys=True)
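# Assumption: load_sidebar returns a dict of the user's sidebar selections
# (e.g. 'index_type', 'query_model', 'embedding_name', 'index_name'), which is how sb is used below.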
secrets = setup.set_secrets(sb)  # Take secrets from .env file first, otherwise from sidebar
# Populate the main screen
logging.info(f'index_type: {sb["index_type"]}')
if sb['index_type'] == 'RAGatouille':
    logging.info('Set Hugging Face model for queries.')
    query_model = sb['query_model']
elif sb['query_model'] in ('Openai', 'Voyage'):
    logging.info('Set embeddings model for queries.')
    if sb['query_model'] == 'Openai':
        query_model = OpenAIEmbeddings(model=sb['embedding_name'], openai_api_key=secrets['OPENAI_API_KEY'])
    elif sb['query_model'] == 'Voyage':
        query_model = VoyageEmbeddings(voyage_api_key=secrets['VOYAGE_API_KEY'])
logging.info('Query model set: ' + str(query_model))
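# At this point query_model is either a model-name string (RAGatouille) or a LangChain embeddings
# object (OpenAI/Voyage); data_import.load_docs is expected to handle both cases.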
# Find docs
st.markdown('Enter a directory relative to the current directory, or an absolute path.')
data_folder = st.text_input('Enter a directory', '../data/AMS/')
if not os.path.isdir(data_folder):
    st.error('The entered directory does not exist')
docs = glob.glob(os.path.join(data_folder, '*.pdf'))  # Only get the PDFs in the directory
st.markdown('PDFs found: ' + str(docs))
st.markdown('Number of PDFs found: ' + str(len(docs)))
logging.info('Docs: ' + str(docs))
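# If the directory is invalid, glob returns an empty list and the actions below have nothing to process.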
# Add an expandable box for options
with st.expander("Options"):
    use_json = st.checkbox('Use existing jsonl, if available (will ignore chunk method, size, and overlap)?', value=True)
    json_file = st.text_input('Jsonl file', os.path.join(data_folder, 'ams_data.jsonl'))
    clear_database = st.checkbox('Clear existing database?')
    chunk_method = st.selectbox('Chunk method', ['tiktoken_recursive'], index=0)
    if sb['query_model'] == 'Openai' or sb['index_type'] == 'ChromaDB':
        # OpenAI will time out if the batch size is too large
        batch_size = st.number_input('Batch size for upsert', min_value=1, step=1, value=100)
    else:
        batch_size = None
    if chunk_method == 'tiktoken_recursive':
        chunk_size = st.number_input('Chunk size (tokens)', min_value=1, step=1, value=500)
        chunk_overlap = st.number_input('Chunk overlap (tokens)', min_value=0, step=1, value=0)
    else:
        raise NotImplementedError
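# Assumption: for 'tiktoken_recursive', chunk_size and chunk_overlap are token counts measured with a
# tiktoken encoder inside data_import.chunk_docs (the UI labels them as tokens).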
# Add a button to chunk the docs into a jsonl file
if st.button('Chunk docs to jsonl file'):
    start_time = time.time()  # Start the timer
    data_import.chunk_docs(docs,
                           file=json_file,
                           chunk_method=chunk_method,
                           chunk_size=chunk_size,
                           chunk_overlap=chunk_overlap,
                           use_json=False)
    end_time = time.time()  # Stop the timer
    elapsed_time = end_time - start_time
    st.write(f"Elapsed Time: {elapsed_time:.2f} seconds")
# Add a button to load the docs into the vector database
if st.button('Load docs into vector database'):
    start_time = time.time()  # Start the timer
    data_import.load_docs(sb['index_type'],
                          docs,
                          query_model=query_model,
                          index_name=sb['index_name'],
                          chunk_size=chunk_size,
                          chunk_overlap=chunk_overlap,
                          use_json=use_json,
                          clear=clear_database,
                          file=json_file,
                          batch_size=batch_size)
    end_time = time.time()  # Stop the timer
    elapsed_time = end_time - start_time
    st.write(f"Elapsed Time: {elapsed_time:.2f} seconds")
# Add a button to delete the index
if st.button('Delete existing index'):
    start_time = time.time()  # Start the timer
    data_import.delete_index(sb['index_type'], sb['index_name'])
    end_time = time.time()  # Stop the timer
    elapsed_time = end_time - start_time
    st.write(f"Elapsed Time: {elapsed_time:.2f} seconds")