import data_import, setup
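# data_import and setup are local helper modules from this repo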

import os
import time
import logging
import glob

from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import VoyageEmbeddings

from ragatouille import RAGPretrainedModel

import streamlit as st

# Set up the page, enable logging 
from dotenv import load_dotenv,find_dotenv
load_dotenv(find_dotenv(),override=True)
logging.basicConfig(filename='app_2_document_upload.log', filemode='w', format='%(name)s - %(levelname)s - %(message)s', level=logging.DEBUG)
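# filemode='w' starts a fresh log file on each run, overwriting the previous one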

# Set the page title
st.set_page_config(
    page_title='Upload PDFs',
)
st.title('Upload PDFs')

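# load_sidebar renders the sidebar widgets from the config files and returns
# the user's selections (index type, embedding model, index name, secret keys)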
sb=setup.load_sidebar(config_file='../config/config.json',
                      index_data_file='../config/index_data.json',
                      vector_databases=True,
                      embeddings=True,
                      index_name=True,
                      secret_keys=True)

secrets=setup.set_secrets(sb) # Take secrets from .env file first, otherwise from sidebar

# Populate the main screen
logging.info(f'index_type: {sb["index_type"]}')

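# RAGatouille takes a Hugging Face model name directly; OpenAI and Voyage
# queries go through LangChain embedding objects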
if sb["index_type"]=='RAGatouille':
    logging.info('Set hugging face model for queries.')
    query_model=sb['query_model']
elif sb['query_model'] in ('Openai','Voyage'):   # 'or' on a bare string is always truthy; test membership instead
    logging.info('Set embeddings model for queries.')
    if sb['query_model']=='Openai':
        query_model=OpenAIEmbeddings(model=sb['embedding_name'],openai_api_key=secrets['OPENAI_API_KEY'])
    elif sb['query_model']=='Voyage':
        query_model=VoyageEmbeddings(voyage_api_key=secrets['VOYAGE_API_KEY'])
else:
    raise NotImplementedError   # Avoid a NameError on query_model below for unsupported models
logging.info('Query model set: '+str(query_model))

# Find docs
st.markdown('Enter a directory relative to the current directory, or an absolute path.')
data_folder = st.text_input('Enter a directory','../data/AMS/')
if not os.path.isdir(data_folder):
    st.error('The entered directory does not exist')
docs = glob.glob(os.path.join(data_folder,'*.pdf'))   # Only get the PDFs in the directory; os.path.join handles a missing trailing slash
st.markdown('PDFs found: '+str(docs))
st.markdown('Number of PDFs found: ' + str(len(docs)))
logging.info('Docs: '+str(docs))

# Add an expandable box for options
with st.expander("Options"):
    use_json = st.checkbox('Use existing jsonl, if available (will ignore chunk method, size, and overlap)?', value=True)
    json_file=st.text_input('Jsonl file',data_folder+'ams_data.jsonl')
    clear_database = st.checkbox('Clear existing database?')
    chunk_method = st.selectbox('Chunk method', ['tiktoken_recursive'], index=0)
    if sb['query_model']=='Openai' or sb['index_type']=='ChromaDB':   # Compare against the values; 'or' on a bare string is always truthy
        # OpenAI will time out if the batch size is too large
        batch_size=st.number_input('Batch size for upsert', min_value=1, step=1, value=100)
    else:
        batch_size=None
    if chunk_method=='tiktoken_recursive':
        chunk_size=st.number_input('Chunk size (tokens)', min_value=1, step=1, value=500)
        chunk_overlap=st.number_input('Chunk overlap (tokens)', min_value=0, step=1, value=0)
    else:
        raise NotImplementedError

# Add a button to chunk the docs into a jsonl file
if st.button('Chunk docs to jsonl file'):
    start_time = time.time()  # Start the timer
    data_import.chunk_docs(docs,
                           file=json_file,
                           chunk_method=chunk_method,
                           chunk_size=chunk_size,
                           chunk_overlap=chunk_overlap,
                           use_json=False)   # Always re-chunk here; the use_json checkbox applies to loading below
    end_time = time.time()  # Stop the timer
    elapsed_time = end_time - start_time 
    st.write(f"Elapsed Time: {elapsed_time:.2f} seconds")
if st.button('Load docs into vector database'):
    start_time = time.time()  # Start the timer
    data_import.load_docs(sb['index_type'],
                          docs,
                          query_model=query_model,
                          index_name=sb['index_name'],
                          chunk_size=chunk_size,
                          chunk_overlap=chunk_overlap,
                          use_json=use_json,
                          clear=clear_database,
                          file=json_file,
                          batch_size=batch_size)
    end_time = time.time()  # Stop the timer
    elapsed_time = end_time - start_time 
    st.write(f"Elapsed Time: {elapsed_time:.2f} seconds")
# Add a button to delete the index
if st.button('Delete existing index'):
    start_time = time.time()  # Start the timer
    data_import.delete_index(sb['index_type'],sb['index_name'])
    end_time = time.time()  # Stop the timer
    elapsed_time = end_time - start_time 
    st.write(f"Elapsed Time: {elapsed_time:.2f} seconds")