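"""Streamlit app for chunking PDF documents and loading them into a vector database."""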
import os
import time
import logging
import glob

import streamlit as st
from dotenv import load_dotenv, find_dotenv
from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import VoyageEmbeddings
from ragatouille import RAGPretrainedModel

import data_import
import setup

load_dotenv(find_dotenv(), override=True)
logging.basicConfig(filename='app_2_document_upload.log', filemode='w',
                    format='%(name)s - %(levelname)s - %(message)s', level=logging.DEBUG)
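
# Configure the Streamlit page shown to the user.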
st.set_page_config(
    page_title='Upload PDFs',
)
st.title('Upload PDFs')
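
# Load the sidebar controls (vector database, embeddings, index name, API keys)
# from the config files, then resolve the secrets they reference.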
sb = setup.load_sidebar(config_file='../config/config.json',
                        index_data_file='../config/index_data.json',
                        vector_databases=True,
                        embeddings=True,
                        index_name=True,
                        secret_keys=True)
secrets = setup.set_secrets(sb)
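
# Set the query model from the sidebar selections: RAGatouille takes a Hugging
# Face model name directly, while Openai and Voyage use LangChain embedding
# wrappers.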
logging.info(f'index_type: {sb["index_type"]}')

if sb['index_type'] == 'RAGatouille':
    logging.info('Set hugging face model for queries.')
    query_model = sb['query_model']
elif sb['query_model'] in ('Openai', 'Voyage'):
    logging.info('Set embeddings model for queries.')
    if sb['query_model'] == 'Openai':
        query_model = OpenAIEmbeddings(model=sb['embedding_name'],
                                       openai_api_key=secrets['OPENAI_API_KEY'])
    elif sb['query_model'] == 'Voyage':
        query_model = VoyageEmbeddings(voyage_api_key=secrets['VOYAGE_API_KEY'])
else:
    raise NotImplementedError
logging.info('Query model set: ' + str(query_model))
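
# Ask for the folder of PDFs to process and list what is found there.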
st.markdown('Enter a directory relative to the current directory, or an absolute path.')
data_folder = st.text_input('Enter a directory', '../data/AMS/')
if not os.path.isdir(data_folder):
    st.error('The entered directory does not exist')
    st.stop()
docs = glob.glob(os.path.join(data_folder, '*.pdf'))
st.markdown('PDFs found: ' + str(docs))
st.markdown('Number of PDFs found: ' + str(len(docs)))
logging.info('Docs: ' + str(docs))
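
# Chunking and upsert options.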
with st.expander("Options"): |
|
use_json = st.checkbox('Use existing jsonl, if available (will ignore chunk method, size, and overlap)?', value=True) |
|
json_file=st.text_input('Jsonl file',data_folder+'ams_data.jsonl') |
|
clear_database = st.checkbox('Clear existing database?') |
|
chunk_method= st.selectbox('Chunk method', ['tiktoken_recursive'], index=0) |
|
if sb['query_model']=='Openai' or 'ChromaDB': |
|
|
|
batch_size=st.number_input('Batch size for upsert', min_value=1, step=1, value=100) |
|
else: |
|
batch_size=None |
|
if chunk_method=='tiktoken_recursive': |
|
chunk_size=st.number_input('Chunk size (tokens)', min_value=1, step=1, value=500) |
|
chunk_overlap=st.number_input('Chunk overlap (tokens)', min_value=0, step=1, value=0) |
|
else: |
|
raise NotImplementedError |
|
|
|
|
|
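
# Chunk the PDFs and write the result to the jsonl file (always re-chunks,
# ignoring any existing jsonl).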
if st.button('Chunk docs to jsonl file'):
    start_time = time.time()
    data_import.chunk_docs(docs,
                           file=json_file,
                           chunk_method=chunk_method,
                           chunk_size=chunk_size,
                           chunk_overlap=chunk_overlap,
                           use_json=False)
    end_time = time.time()
    elapsed_time = end_time - start_time
    st.write(f"Elapsed Time: {elapsed_time:.2f} seconds")
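
# Embed the chunks and upsert them into the selected vector database.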
if st.button('Load docs into vector database'):
    start_time = time.time()
    data_import.load_docs(sb['index_type'],
                          docs,
                          query_model=query_model,
                          index_name=sb['index_name'],
                          chunk_size=chunk_size,
                          chunk_overlap=chunk_overlap,
                          use_json=use_json,
                          clear=clear_database,
                          file=json_file,
                          batch_size=batch_size)
    end_time = time.time()
    elapsed_time = end_time - start_time
    st.write(f"Elapsed Time: {elapsed_time:.2f} seconds")
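
# Delete the selected index from the vector database.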
if st.button('Delete existing index'):
    start_time = time.time()
    data_import.delete_index(sb['index_type'], sb['index_name'])
    end_time = time.time()
    elapsed_time = end_time - start_time
    st.write(f"Elapsed Time: {elapsed_time:.2f} seconds")