# aerospace_chatbot_ams/pages/2_Document_Upload.py
import glob
import logging
import os
import time

import streamlit as st
from dotenv import load_dotenv, find_dotenv
from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import VoyageEmbeddings
from ragatouille import RAGPretrainedModel

import data_import
import setup

# Load environment variables (API keys, etc.) and enable logging
load_dotenv(find_dotenv(), override=True)
logging.basicConfig(filename='app_2_document_upload.log', filemode='w',
                    format='%(name)s - %(levelname)s - %(message)s', level=logging.DEBUG)
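# Note: filemode='w' means app_2_document_upload.log is overwritten on each run of this page.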
# Set the page title
st.set_page_config(
    page_title='Upload PDFs',
)
st.title('Upload PDFs')
sb = setup.load_sidebar(config_file='../config/config.json',
                        index_data_file='../config/index_data.json',
                        vector_databases=True,
                        embeddings=True,
                        index_name=True,
                        secret_keys=True)
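# Assumption: load_sidebar returns a dict of the user's sidebar selections
# (e.g. 'index_type', 'query_model', 'embedding_name', 'index_name'), which is how sb is used below.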
secrets = setup.set_secrets(sb)  # Take secrets from .env file first, otherwise from sidebar
# Populate the main screen
logging.info(f'index_type: {sb["index_type"]}')
if sb['index_type'] == 'RAGatouille':
    logging.info('Set Hugging Face model for queries.')
    query_model = sb['query_model']
elif sb['query_model'] in ('Openai', 'Voyage'):
    logging.info('Set embeddings model for queries.')
    if sb['query_model'] == 'Openai':
        query_model = OpenAIEmbeddings(model=sb['embedding_name'], openai_api_key=secrets['OPENAI_API_KEY'])
    elif sb['query_model'] == 'Voyage':
        query_model = VoyageEmbeddings(voyage_api_key=secrets['VOYAGE_API_KEY'])
logging.info('Query model set: ' + str(query_model))
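# At this point query_model is either a model-name string (RAGatouille) or a LangChain embeddings
# object (OpenAI/Voyage); data_import.load_docs is expected to handle both cases.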
# Find docs
st.markdown('Enter a directory relative to the current directory, or an absolute path.')
data_folder = st.text_input('Enter a directory', '../data/AMS/')
if not os.path.isdir(data_folder):
    st.error('The entered directory does not exist')
docs = glob.glob(os.path.join(data_folder, '*.pdf'))  # Only get the PDFs in the directory
st.markdown('PDFs found: ' + str(docs))
st.markdown('Number of PDFs found: ' + str(len(docs)))
logging.info('Docs: ' + str(docs))
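# If the directory is invalid, glob returns an empty list and the actions below have nothing to process.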
# Add an expandable box for options
with st.expander("Options"):
    use_json = st.checkbox('Use existing jsonl, if available (will ignore chunk method, size, and overlap)?', value=True)
    json_file = st.text_input('Jsonl file', os.path.join(data_folder, 'ams_data.jsonl'))
    clear_database = st.checkbox('Clear existing database?')
    chunk_method = st.selectbox('Chunk method', ['tiktoken_recursive'], index=0)
    if sb['query_model'] == 'Openai' or sb['index_type'] == 'ChromaDB':
        # OpenAI will time out if the batch size is too large
        batch_size = st.number_input('Batch size for upsert', min_value=1, step=1, value=100)
    else:
        batch_size = None
    if chunk_method == 'tiktoken_recursive':
        chunk_size = st.number_input('Chunk size (tokens)', min_value=1, step=1, value=500)
        chunk_overlap = st.number_input('Chunk overlap (tokens)', min_value=0, step=1, value=0)
    else:
        raise NotImplementedError
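# Assumption: for 'tiktoken_recursive', chunk_size and chunk_overlap are token counts measured with a
# tiktoken encoder inside data_import.chunk_docs (the UI labels them as tokens).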
# Add a button to chunk the docs into a jsonl file
if st.button('Chunk docs to jsonl file'):
    start_time = time.time()  # Start the timer
    data_import.chunk_docs(docs,
                           file=json_file,
                           chunk_method=chunk_method,
                           chunk_size=chunk_size,
                           chunk_overlap=chunk_overlap,
                           use_json=False)
    end_time = time.time()  # Stop the timer
    elapsed_time = end_time - start_time
    st.write(f"Elapsed Time: {elapsed_time:.2f} seconds")
# Add a button to load the docs into the vector database
if st.button('Load docs into vector database'):
    start_time = time.time()  # Start the timer
    data_import.load_docs(sb['index_type'],
                          docs,
                          query_model=query_model,
                          index_name=sb['index_name'],
                          chunk_size=chunk_size,
                          chunk_overlap=chunk_overlap,
                          use_json=use_json,
                          clear=clear_database,
                          file=json_file,
                          batch_size=batch_size)
    end_time = time.time()  # Stop the timer
    elapsed_time = end_time - start_time
    st.write(f"Elapsed Time: {elapsed_time:.2f} seconds")
# Add a button to delete the index
if st.button('Delete existing index'):
    start_time = time.time()  # Start the timer
    data_import.delete_index(sb['index_type'], sb['index_name'])
    end_time = time.time()  # Stop the timer
    elapsed_time = end_time - start_time
    st.write(f"Elapsed Time: {elapsed_time:.2f} seconds")