big upload
- Dockerfile +17 -0
- README.md +8 -1
- app/api/__init__.py +0 -0
- app/api/routers/__init__.py +0 -0
- app/engine/__init__.py +0 -0
- app/engine/chunk_embed.py +90 -0
- app/engine/llm.py +0 -0
- app/engine/loaders/__init__.py +0 -0
- app/engine/loaders/file.py +105 -0
- app/engine/logger.py +10 -0
- app/engine/processing.py +48 -0
- app/engine/vectorstore.py +178 -0
- app/engine/weaviate_interface_v4.py +526 -0
- app/main.py +231 -0
- app/notebooks/chunking_indexing.ipynb +0 -0
- app/notebooks/lite_lll.ipynb +158 -0
- app/notebooks/pdf_readers.ipynb +0 -0
- app/notebooks/upload_index.ipynb +0 -0
- app/notebooks/weaviate.ipynb +0 -0
- app/rag/__init__.py +0 -0
- app/rag/llm.py +149 -0
- app/rag/rag.py +56 -0
- app/requirements.txt +20 -0
- app/settings.py +4 -0
Dockerfile
ADDED
@@ -0,0 +1,17 @@
+FROM python:3.10-slim
+
+ENV PYTHONDONTWRITEBYTECODE 1
+# ^ saves space by not writing .pyc files
+ENV PYTHONUNBUFFERED 1
+# ^ ensures that the output from the Python app is sent straight to the terminal without being buffered -> real-time monitoring
+
+ENV ENVIRONMENT=dev
+
+COPY ./app /app
+WORKDIR /app
+RUN mkdir /data
+
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+# ^ no caching of the packages to save space
+
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
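For a quick smoke test once the image is built and running (a minimal sketch; the `finrag` image tag and the local port mapping are assumptions, not part of this commit):

    # assumes: docker build -t finrag .  &&  docker run -p 7860:7860 finrag
    import requests  # client-side check, not part of the image

    r = requests.get("http://localhost:7860/ping/")  # /ping/ is defined in app/main.py
    print(r.status_code, r.json())                   # expect 200 and {"answer": "..."}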
README.md
CHANGED
@@ -1,4 +1,11 @@
 ---
-license: mit
 title: FinRAG
+emoji: 🐢
+colorFrom: blue
+colorTo: blue
+sdk: docker
+pinned: false
+license: mit
 ---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app/api/__init__.py
ADDED
File without changes

app/api/routers/__init__.py
ADDED
File without changes

app/engine/__init__.py
ADDED
File without changes
app/engine/chunk_embed.py
ADDED
@@ -0,0 +1,90 @@
+
+
+import os
+import pandas as pd
+import torch
+
+from settings import parquet_file
+
+import tiktoken  # tokenizer library for use with OpenAI LLMs
+from llama_index.legacy.text_splitter import SentenceSplitter
+from sentence_transformers import SentenceTransformer
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# create tensors on GPU if available
+if torch.cuda.is_available():
+    torch.set_default_tensor_type('torch.cuda.FloatTensor')
+
+
+def chunk_vectorize(doc_content: dict = None,
+                    chunk_size: int = 256,  # limit for 'all-mpnet-base-v2'
+                    chunk_overlap: int = 20,  # some overlap to link the chunks
+                    encoder: str = 'gpt-3.5-turbo-0613',
+                    model_name: str = 'sentence-transformers/all-mpnet-base-v2'):  # can try all-MiniLM-L6-v2
+    # see tests in chunking_indexing.ipynb for more details
+
+    encoding = tiktoken.encoding_for_model(encoder)
+
+    splitter = SentenceSplitter(chunk_size=chunk_size,
+                                tokenizer=encoding.encode,
+                                chunk_overlap=chunk_overlap)
+
+    # let's create the splits for every document
+    contents_splits = {}
+    for fname, content in doc_content.items():
+        splits = [splitter.split_text(page) for page in content]
+        contents_splits[fname] = [split for sublist in splits for split in sublist]
+
+    model = SentenceTransformer(model_name)
+
+    content_emb = {}
+    for fname, splits in contents_splits.items():
+        content_emb[fname] = [(split, model.encode(split)) for split in splits]
+
+    # save fname since it carries information, and could be used as a property in Weaviate
+    text_vector_tuples = [(fname, split, emb.tolist()) for fname, splits_emb in content_emb.items() for split, emb in splits_emb]
+
+    new_df = pd.DataFrame(
+        text_vector_tuples,
+        columns=['file', 'content', 'content_embedding']
+    )
+
+    # load the existing parquet file if it exists and update it
+    if os.path.exists(parquet_file):
+        new_df = pd.concat([pd.read_parquet(parquet_file), new_df])
+
+    # no optimization here (zipping etc.) since the data is small
+    new_df.to_parquet(parquet_file, index=False)
+
+    return
+
+# TODO
+# import unittest
+# from unitesting_utils import load_impact_theory_data
+
+# class TestSplitContents(unittest.TestCase):
+#     '''
+#     Unit test to ensure proper functionality of split_contents function
+#     '''
+
+#     def test_split_contents(self):
+#         import tiktoken
+#         from llama_index.text_splitter import SentenceSplitter
+
+#         data = load_impact_theory_data()
+
+#         subset = data[:3]
+#         chunk_size = 256
+#         chunk_overlap = 0
+#         encoding = tiktoken.encoding_for_model('gpt-3.5-turbo-0613')
+#         gpt35_txt_splitter = SentenceSplitter(chunk_size=chunk_size, tokenizer=encoding.encode, chunk_overlap=chunk_overlap)
+#         results = split_contents(subset, gpt35_txt_splitter)
+#         self.assertEqual(len(results), 3)
+#         self.assertEqual(len(results[0]), 83)
+#         self.assertEqual(len(results[1]), 178)
+#         self.assertEqual(len(results[2]), 144)
+#         self.assertTrue(isinstance(results, list))
+#         self.assertTrue(isinstance(results[0], list))
+#         self.assertTrue(isinstance(results[0][0], str))
+# unittest.TextTestRunner().run(unittest.TestLoader().loadTestsFromTestCase(TestSplitContents))
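For reference, a minimal sketch of how `chunk_vectorize` is meant to be called; the input dict follows the shape produced by the extractors in app/engine/loaders/file.py (filename -> list of page strings), and the sample content is made up:

    # hypothetical input: one file, two pages
    doc_content = {"report.pdf": ["page one text ...", "page two text ..."]}
    chunk_vectorize(doc_content)  # appends (file, content, content_embedding) rows to parquet_file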
app/engine/llm.py
ADDED
File without changes

app/engine/loaders/__init__.py
ADDED
File without changes
app/engine/loaders/file.py
ADDED
@@ -0,0 +1,105 @@
+import os
+
+# from langchain.document_loaders import PyPDFLoader  # deprecated
+from langchain_community.document_loaders import PyPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from llama_parse import LlamaParse
+
+from typing import Union, List, Dict
+
+from abc import ABC, abstractmethod
+
+class PDFExtractor(ABC):
+
+    def __init__(self, file_or_list: Union[str, List[str]], num_workers: int = 1, verbose: bool = False):
+        """ We can provide a list of files or a single file """
+        if isinstance(file_or_list, str):
+            self.filelist = [file_or_list]
+        else:
+            self.filelist = file_or_list
+        self.num_workers = num_workers
+        self.verbose = verbose
+        super().__init__()
+
+    @abstractmethod
+    def extract_text(self) -> Dict[str, List[str]]:
+        """ Extracts text from the PDF, no processing.
+            Returns a dictionary: key = filename, value = list of strings, one per page.
+        """
+        pass
+
+    @abstractmethod
+    def extract_images(self):
+        """ Extracts images from the PDF, no processing. """
+        pass
+
+    @abstractmethod
+    def extract_tables(self):
+        """ Extracts tables from the PDF, no processing.
+            Returns them in json format.
+        """
+        pass
+
+class _PyPDFLoader(PDFExtractor):
+
+    def extract_text(self):
+        output_dict = {}
+        for fpath in self.filelist:
+            fname = fpath.split('/')[-1]
+            output_dict[fname] = [p.page_content for p in PyPDFLoader(fpath).load()]
+        return output_dict
+
+    def extract_images(self):
+        raise NotImplementedError("Not implemented or PyPDFLoader does not support image extraction")
+
+    def extract_tables(self):
+        raise NotImplementedError("Not implemented or PyPDFLoader does not support table extraction")
+
+
+class _LlamaParse(PDFExtractor):
+
+    def extract_text(self):
+        # https://github.com/run-llama/llama_parse
+        if os.getenv("LLAMA_PARSE_API_KEY") is None:
+            raise ValueError("LLAMA_PARSE_API_KEY is not set.")
+
+        parser = LlamaParse(
+            api_key=os.getenv("LLAMA_PARSE_API_KEY"),
+            num_workers=self.num_workers,
+            verbose=self.verbose,
+            language="en",
+            result_type="text"  # or "markdown"
+        )
+        output_dict = {}
+        for fpath in self.filelist:
+            # https://github.com/run-llama/llama_parse/blob/main/examples/demo_json.ipynb
+            docs = parser.get_json_result(fpath)
+            fname = fpath.split('/')[-1]
+            # one string per page, matching _PyPDFLoader's output format
+            output_dict[fname] = [page['text'] for page in docs[0]['pages']]
+        return output_dict
+
+    def extract_images(self):
+        raise NotImplementedError("Not implemented or LlamaParse does not support image extraction")
+
+    def extract_tables(self):
+        raise NotImplementedError("Not implemented or LlamaParse does not support table extraction")
+
+
+def pdf_extractor(extractor_type: str, *args, **kwargs) -> PDFExtractor:
+    """ Factory function to return the appropriate PDF extractor instance, properly initialized """
+
+    if extractor_type == 'PyPDFLoader':
+        return _PyPDFLoader(*args, **kwargs)
+
+    elif extractor_type == 'LlamaParse':
+        return _LlamaParse(*args, **kwargs)
+    else:
+        raise ValueError(f"Unsupported PDF extractor type: {extractor_type}")
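A minimal usage sketch of the factory above (the file path is a placeholder):

    extractor = pdf_extractor('PyPDFLoader', 'data/report.pdf')  # path is hypothetical
    pages = extractor.extract_text()  # {'report.pdf': ['page 1 text', 'page 2 text', ...]}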
app/engine/logger.py
ADDED
@@ -0,0 +1,10 @@
+import os, logging
+
+environment = os.getenv("ENVIRONMENT", "dev")  # TODO put the logger creation in its own file
+if environment == "dev":
+    logger = logging.getLogger("uvicorn")
+else:
+    logger = logging.getLogger(__name__)  # was a no-op lambda, which would break logger.info() calls
+# we should also log in production TODO
+# check how it works on HuggingFace, if possible,
+# because we don't have access to the container's file system
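If the production TODO above is tackled, a stderr StreamHandler is one option that still works on HuggingFace, where the container's file system isn't accessible; a sketch under that assumption, not part of this commit:

    # hypothetical production branch for the logger
    prod_logger = logging.getLogger("finrag")
    handler = logging.StreamHandler()  # stderr shows up in the Space logs; file logging would not
    handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
    prod_logger.addHandler(handler)
    prod_logger.setLevel(logging.INFO)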
app/engine/processing.py
ADDED
@@ -0,0 +1,48 @@
+import os, pickle
+from typing import List
+from engine.loaders.file import pdf_extractor
+from engine.chunk_embed import chunk_vectorize
+from settings import parquet_file
+from .logger import logger
+from .vectorstore import VectorStore
+# I allow relative imports inside the engine package
+# I could have created a module but things are still changing
+
+finrag_vectorstore = VectorStore(model_path='sentence-transformers/all-mpnet-base-v2')
+
+
+def empty_collection():
+    """ Deletes the Finrag collection if it exists """
+    status = finrag_vectorstore.empty_collection()
+    return status
+
+
+def index_data():
+
+    if not os.path.exists(parquet_file):
+        logger.info(f"Parquet file {parquet_file} does not exist")
+        return 'no data to index'
+
+    # load the parquet file into the vectorstore
+    finrag_vectorstore.index_data()
+    os.remove(parquet_file)
+    # delete the file so we can load several files and index them when we want,
+    # without having to keep track of those that have been indexed already
+    # this is a simple solution for now, but we can do better
+
+    return "Index creation successful"
+
+
+def process_pdf(filepath: str) -> dict:
+
+    new_content = pdf_extractor('PyPDFLoader', filepath).extract_text()
+    logger.info("Successfully extracted text from PDF")
+
+    chunk_vectorize(new_content)
+    logger.info("Successfully vectorized PDF content")
+    return new_content
+
+def process_question(question: str) -> List[str]:
+
+    ans = finrag_vectorstore.hybrid_search(query=question, limit=3, alpha=0.8)
+    return ans
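End to end, the module above is used like this (the path and question are placeholders):

    process_pdf('data/report.pdf')   # extract -> chunk -> embed -> append to the parquet file
    index_data()                     # push the parquet rows into Weaviate, then delete the file
    hits = process_question("What was the operating income?")  # top-3 chunks via hybrid search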
app/engine/vectorstore.py
ADDED
@@ -0,0 +1,178 @@
+import os, logging
+from typing import List, Any
+import pandas as pd
+from weaviate.classes.config import Property, DataType
+
+from .weaviate_interface_v4 import WeaviateWCS, WeaviateIndexer
+from .logger import logger
+
+from settings import parquet_file
+
+class VectorStore:
+    def __init__(self, model_path: str = 'sentence-transformers/all-mpnet-base-v2'):
+        # we can create several instances to test various models, especially if we finetune one
+
+        self.finrag_properties = [
+            Property(name='filename',
+                     data_type=DataType.TEXT,
+                     description='Name of the file',
+                     index_filterable=True,
+                     index_searchable=True),
+            # Property(name='keywords',
+            #          data_type=DataType.TEXT_ARRAY,
+            #          description='Keywords associated with the file',
+            #          index_filterable=True,
+            #          index_searchable=True),
+            Property(name='content',
+                     data_type=DataType.TEXT,
+                     description='Splits of the article',
+                     index_filterable=True,
+                     index_searchable=True),
+        ]
+
+        self.class_name = "FinRag_all-mpnet-base-v2"
+
+        self.class_config = {'classes': [
+            {"class": self.class_name,
+             "description": "Financial reports",
+             "vectorIndexType": "hnsw",
+             # Vector index specific settings for HNSW
+             "vectorIndexConfig": {
+                 "ef": 64,               # higher is better quality vs slower search
+                 "efConstruction": 128,  # higher = better index but slower build
+                 "maxConnections": 32,   # max conn per layer - higher = more memory
+             },
+             "vectorizer": "none",
+             "properties": self.finrag_properties}
+        ]}
+
+        self.model_path = model_path
+        try:
+            self.api_key = os.environ['FINRAG_WEAVIATE_API_KEY']
+            self.url = os.environ['FINRAG_WEAVIATE_ENDPOINT']
+            self.client = WeaviateWCS(endpoint=self.url,
+                                      api_key=self.api_key,
+                                      model_name_or_path=self.model_path)
+        except Exception as e:
+            # re-raise: silently passing here would leave self.client undefined
+            raise Exception(f"Could not create Weaviate client: {e}")
+
+        assert self.client._client.is_live(), "Weaviate is not live"
+        assert self.client._client.is_ready(), "Weaviate is not ready"
+        # careful with accessing '_client' since the weaviate helper usually closes the connection every time
+
+        self.indexer = None
+
+        self.create_collection()
+
+    @property
+    def collections(self):
+
+        return self.client.show_all_collections()
+
+    def create_collection(self, collection_name: str = 'Finrag', description: str = 'Financial reports'):
+
+        self.collection_name = collection_name
+        if collection_name not in self.collections:
+            self.client.create_collection(collection_name=collection_name,
+                                          properties=self.finrag_properties,
+                                          description=description)
+        else:
+            logging.warning(f"Collection {collection_name} already exists")
+
+
+    def empty_collection(self, collection_name: str = 'Finrag') -> bool:
+
+        # not in the library yet, so I simply delete and recreate it
+        if collection_name in self.collections:
+            self.client.delete_collection(collection_name=collection_name)
+            self.create_collection()
+            return True
+        else:
+            logging.warning(f"Collection {collection_name} doesn't exist")
+            return False
+
+
+    def index_data(self, data: List[dict] = None, collection_name: str = 'Finrag'):
+
+        if self.indexer is None:
+            self.indexer = WeaviateIndexer(self.client)
+
+        if data is None:
+            # use the parquet file, otherwise use the data passed
+            data = pd.read_parquet(parquet_file).to_dict('records')
+            # the parquet file was created/incremented when a new article was uploaded
+            # it is a dataframe with columns: file, content, content_embedding
+            # and reflects exactly the data that we want to index at all times
+        self.status = self.indexer.batch_index_data(data, collection_name, 256)
+        # note: the third positional argument maps to batch_index_data's error_threshold
+
+        # batch_index_data returns a dict keyed by 'num_errors', 'error_messages', 'doc_ids'
+        self.num_errors = self.status['num_errors']
+        self.error_messages = self.status['error_messages']
+        self.doc_ids = self.status['doc_ids']
+
+        # in this case with few articles, we don't tolerate errors
+        # batch_index_data already tests errors against a threshold
+        # assert self.num_errors == 0, f"Errors: {self.num_errors}"
+
+
+    def keyword_search(self,
+                       query: str,
+                       limit: int = 5,
+                       return_properties: List[str] = ['filename', 'content'],
+                       alpha=None  # dummy parameter to match the hybrid_search signature
+                       ) -> List[str]:
+        response = self.client.keyword_search(
+            request=query,
+            collection_name=self.collection_name,
+            query_properties=['content'],
+            limit=limit,
+            filter=None,
+            return_properties=return_properties,
+            return_raw=False)
+
+        return [res['content'] for res in response]
+
+
+    def vector_search(self,
+                      query: str,
+                      limit: int = 5,
+                      return_properties: List[str] = ['filename', 'content'],
+                      alpha=None  # dummy parameter to match the hybrid_search signature
+                      ) -> List[str]:
+
+        response = self.client.vector_search(
+            request=query,
+            collection_name=self.collection_name,
+            limit=limit,
+            filter=None,
+            return_properties=return_properties,
+            return_raw=False)
+
+        return [res['content'] for res in response]
+
+
+    def hybrid_search(self,
+                      query: str,
+                      limit: int = 5,
+                      alpha=0.5,  # higher = more vector search
+                      return_properties: List[str] = ['filename', 'content']
+                      ) -> List[str]:
+
+        response = self.client.hybrid_search(
+            request=query,
+            collection_name=self.collection_name,
+            query_properties=['content'],
+            alpha=alpha,
+            limit=limit,
+            filter=None,
+            return_properties=return_properties,
+            return_raw=False)
+
+        return [res['content'] for res in response]
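A minimal sketch of the search entry points (assumes both FINRAG_WEAVIATE_* env vars are set; the query is made up):

    store = VectorStore()  # connects and creates the 'Finrag' collection if needed
    chunks = store.hybrid_search("Amazon operating income", limit=3, alpha=0.8)
    # keyword_search and vector_search share the same signature, so they can be swapped in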
app/engine/weaviate_interface_v4.py
ADDED
@@ -0,0 +1,526 @@
+# Disclaimer: I didn't write this module
+
+from weaviate.auth import AuthApiKey
+from weaviate.collections.classes.internal import (MetadataReturn, QueryReturn,
+                                                   MetadataQuery)
+import weaviate
+from weaviate.classes.config import Property
+from weaviate.classes.query import Filter
+from weaviate.config import ConnectionConfig
+from openai import OpenAI
+from sentence_transformers import SentenceTransformer
+from typing import Any
+from torch import cuda
+from tqdm import tqdm
+import time
+import os
+from dataclasses import dataclass
+
+class WeaviateWCS:
+    '''
+    A python native Weaviate Client class that encapsulates Weaviate functionalities
+    in one object. Several convenience methods are added for ease of use.
+
+    Args
+    ----
+    api_key: str
+        The API key for the Weaviate Cloud Service (WCS) instance.
+        https://console.weaviate.cloud/dashboard
+    endpoint: str
+        The url endpoint for the Weaviate Cloud Service instance.
+    model_name_or_path: str='sentence-transformers/all-MiniLM-L6-v2'
+        The name or path of the SentenceTransformer model to use for vector search.
+        Will also support OpenAI text-embedding-ada-002 model. This param enables
+        the use of most leading models on the MTEB Leaderboard:
+        https://huggingface.co/spaces/mteb/leaderboard
+    openai_api_key: str=None
+        The API key for the OpenAI API. Only required if using OpenAI text-embedding-ada-002 model.
+    '''
+    def __init__(self,
+                 endpoint: str=None,
+                 api_key: str=None,
+                 model_name_or_path: str='sentence-transformers/all-MiniLM-L6-v2',
+                 embedded: bool=False,
+                 openai_api_key: str=None,
+                 skip_init_checks: bool=False,
+                 **kwargs
+                 ):
+
+        self.endpoint = endpoint
+        if embedded:
+            self._client = weaviate.connect_to_embedded(**kwargs)
+        else:
+            auth_config = AuthApiKey(api_key=api_key)
+            self._client = weaviate.connect_to_wcs(cluster_url=endpoint,
+                                                   auth_credentials=auth_config,
+                                                   skip_init_checks=skip_init_checks)
+        self.model_name_or_path = model_name_or_path
+        self._openai_model = False
+        if self.model_name_or_path == 'text-embedding-ada-002':
+            if not openai_api_key:
+                raise ValueError(f'OpenAI API key must be provided to use this model: {self.model_name_or_path}')
+            self.model = OpenAI(api_key=openai_api_key)
+            self._openai_model = True
+        else:
+            self.model = SentenceTransformer(self.model_name_or_path) if self.model_name_or_path else None
+
+        self.return_properties = ['guest', 'title', 'summary', 'content', 'video_id', 'doc_id', 'episode_url', 'thumbnail_url']
+
+    def _connect(self) -> None:
+        '''
+        Connects to Weaviate instance.
+        '''
+        if not self._client.is_connected():
+            self._client.connect()
+
+    def create_collection(self,
+                          collection_name: str,
+                          properties: list[Property],
+                          description: str=None,
+                          **kwargs
+                          ) -> None:
+        '''
+        Creates a collection (index) on the Weaviate instance.
+
+        Args
+        ----
+        collection_name: str
+            Name of the collection to create.
+        properties: list[Property]
+            List of properties to add to data objects in the collection.
+        description: str=None
+            User-defined description of the collection.
+        '''
+        self._connect()
+        if self._client.collections.exists(collection_name):
+            print(f'Collection "{collection_name}" already exists')
+            return
+        else:
+            try:
+                self._client.collections.create(name=collection_name,
+                                                properties=properties,
+                                                description=description,
+                                                **kwargs)
+                print(f'Collection "{collection_name}" created')
+            except Exception as e:
+                print(f'Error creating collection, due to: {e}')
+        self._client.close()
+        return
+
+    def show_all_collections(self,
+                             detailed: bool=False,
+                             max_details: bool=False
+                             ) -> list[str] | dict:
+        '''
+        Shows all available collections (indexes) on the Weaviate cluster.
+        By default will only return a list of collection names.
+        Otherwise, increasing details about each collection can be returned.
+        '''
+        self._connect()
+        collections = self._client.collections.list_all(simple=not max_details)
+        self._client.close()
+        if not detailed and not max_details:
+            return list(collections.keys())
+        else:
+            if not any(collections):
+                print('No collections found on host')
+            return collections
+
+    def show_collection_config(self, collection_name: str) -> ConnectionConfig:
+        '''
+        Shows all information of a specific collection.
+        '''
+        self._connect()
+        if self._client.collections.exists(collection_name):
+            collection = self.show_all_collections(max_details=True)[collection_name]
+            self._client.close()
+            return collection
+        else:
+            print(f'Collection "{collection_name}" not found on host')
+
+    def show_collection_properties(self, collection_name: str) -> dict | str:
+        '''
+        Shows all properties of a collection (index) on the Weaviate instance.
+        '''
+        self._connect()
+        if self._client.collections.exists(collection_name):
+            collection = self.show_all_collections(max_details=True)[collection_name]
+            self._client.close()
+            return collection.properties
+        else:
+            print(f'Collection "{collection_name}" not found on host')
+
+    def delete_collection(self, collection_name: str) -> None:
+        '''
+        Deletes a collection (index) on the Weaviate instance, if it exists.
+        '''
+        self._connect()
+        if self._client.collections.exists(collection_name):
+            try:
+                self._client.collections.delete(collection_name)
+                self._client.close()
+                print(f'Collection "{collection_name}" deleted')
+            except Exception as e:
+                print(f'Error deleting collection, due to: {e}')
+        else:
+            print(f'Collection "{collection_name}" not found on host')
+
+    def get_doc_count(self, collection_name: str) -> int:
+        '''
+        Returns the number of documents in a collection.
+        '''
+        self._connect()
+        if self._client.collections.exists(collection_name):
+            collection = self._client.collections.get(collection_name)
+            aggregate = collection.aggregate.over_all()
+            total_count = aggregate.total_count
+            print(f'Found {total_count} documents in collection "{collection_name}"')
+            return total_count
+        else:
+            print(f'Collection "{collection_name}" not found on host')
+
+    def format_response(self,
+                        response: QueryReturn,
+                        ) -> list[dict]:
+        '''
+        Formats json response from Weaviate into a list of dictionaries.
+        Expands _additional fields if present into top-level dictionary.
+        '''
+        results = [{**o.properties, **self._get_meta(o.metadata)} for o in response.objects]
+        return results
+
+    def _get_meta(self, metadata: MetadataReturn):
+        '''
+        Extracts metadata from MetadataQuery object if meta exists.
+        '''
+        temp_dict = metadata.__dict__
+        return {k: v for k, v in temp_dict.items() if v}
+
+    def keyword_search(self,
+                       request: str,
+                       collection_name: str,
+                       query_properties: list[str]=['content'],
+                       limit: int=10,
+                       filter: Filter=None,
+                       return_properties: list[str]=None,
+                       return_raw: bool=False
+                       ) -> dict | list[dict]:
+        '''
+        Executes Keyword (BM25) search.
+
+        Args
+        ----
+        request: str
+            User query.
+        collection_name: str
+            Collection (index) to search.
+        query_properties: list[str]
+            List of properties to search across.
+        limit: int=10
+            Number of results to return.
+        filter: Filter=None
+            Property filter to apply to search results.
+        return_properties: list[str]=None
+            List of properties to return in response.
+            If None, returns self.return_properties.
+        return_raw: bool=False
+            If True, returns raw response from Weaviate.
+        '''
+        self._connect()
+        return_properties = return_properties if return_properties else self.return_properties
+        collection = self._client.collections.get(collection_name)
+        response = collection.query.bm25(query=request,
+                                         query_properties=query_properties,
+                                         limit=limit,
+                                         filters=filter,
+                                         return_metadata=MetadataQuery(score=True),
+                                         return_properties=return_properties)
+        # response = response.with_where(where_filter).do() if where_filter else response.do()
+        if return_raw:
+            return response
+        else:
+            return self.format_response(response)
+
+    def vector_search(self,
+                      request: str,
+                      collection_name: str,
+                      limit: int=10,
+                      return_properties: list[str]=None,
+                      filter: Filter=None,
+                      return_raw: bool=False,
+                      device: str='cuda:0' if cuda.is_available() else 'cpu'
+                      ) -> dict | list[dict]:
+        '''
+        Executes vector search using the embedding model defined on instantiation
+        of the WeaviateClient instance.
+
+        Args
+        ----
+        request: str
+            User query.
+        collection_name: str
+            Collection (index) to search.
+        limit: int=10
+            Number of results to return.
+        return_properties: list[str]=None
+            List of properties to return in response.
+            If None, returns all properties.
+        return_raw: bool=False
+            If True, returns raw response from Weaviate.
+        device: str
+            Device to use for encoding the query.
+        '''
+        self._connect()
+        return_properties = return_properties if return_properties else self.return_properties
+        query_vector = self._create_query_vector(request, device=device)
+        collection = self._client.collections.get(collection_name)
+        response = collection.query.near_vector(near_vector=query_vector,
+                                                limit=limit,
+                                                filters=filter,
+                                                return_metadata=MetadataQuery(distance=True),
+                                                return_properties=return_properties)
+        # response = response.with_where(where_filter).do() if where_filter else response.do()
+        if return_raw:
+            return response
+        else:
+            return self.format_response(response)
+
+    def _create_query_vector(self, query: str, device: str) -> list[float]:
+        '''
+        Creates an embedding vector from a text query.
+        '''
+        return self.get_openai_embedding(query) if self._openai_model else self.model.encode(query, device=device).tolist()
+
+    def get_openai_embedding(self, query: str) -> list[float]:
+        '''
+        Gets an embedding from the OpenAI API for the query.
+        '''
+        embedding = self.model.embeddings.create(input=query, model='text-embedding-ada-002').model_dump()
+        if embedding:
+            return embedding['data'][0]['embedding']
+        else:
+            raise ValueError(f'No embedding found for query: {query}')
+
+    def hybrid_search(self,
+                      request: str,
+                      collection_name: str,
+                      query_properties: list[str]=['content'],
+                      alpha: float=0.5,
+                      limit: int=10,
+                      filter: Filter=None,
+                      return_properties: list[str]=None,
+                      return_raw: bool=False,
+                      device: str='cuda:0' if cuda.is_available() else 'cpu'
+                      ) -> dict | list[dict]:
+        '''
+        Executes Hybrid (Keyword + Vector) search.
+
+        Args
+        ----
+        request: str
+            User query.
+        collection_name: str
+            Collection (index) to search.
+        query_properties: list[str]
+            List of properties to search across (using BM25).
+        alpha: float=0.5
+            Weighting factor for BM25 and Vector search.
+            alpha can be any number from 0 to 1, defaulting to 0.5:
+                alpha = 0 executes a pure keyword search method (BM25)
+                alpha = 0.5 weighs the BM25 and vector methods evenly
+                alpha = 1 executes a pure vector search method
+        limit: int=10
+            Number of results to return.
+        filter: Filter=None
+            Property filter to apply to search results.
+        return_properties: list[str]=None
+            List of properties to return in response.
+            If None, returns all properties.
+        return_raw: bool=False
+            If True, returns raw response from Weaviate.
+        '''
+        self._connect()
+        return_properties = return_properties if return_properties else self.return_properties
+        query_vector = self._create_query_vector(request, device=device)
+        collection = self._client.collections.get(collection_name)
+        response = collection.query.hybrid(query=request,
+                                           query_properties=query_properties,
+                                           filters=filter,
+                                           vector=query_vector,
+                                           alpha=alpha,
+                                           limit=limit,
+                                           return_metadata=MetadataQuery(score=True, distance=True),
+                                           return_properties=return_properties)
+        if return_raw:
+            return response
+        else:
+            return self.format_response(response)
+
+
+class WeaviateIndexer:
+
+    def __init__(self,
+                 client: WeaviateWCS
+                 ):
+        '''
+        Class designed to batch index documents into Weaviate. Instantiating
+        this class will automatically configure the Weaviate batch client.
+        '''
+
+        self._client = client._client
+
+    def _connect(self):
+        '''
+        Connects to Weaviate instance.
+        '''
+        if not self._client.is_connected():
+            self._client.connect()
+
+    def create_collection(self,
+                          collection_name: str,
+                          properties: list[Property],
+                          description: str=None,
+                          **kwargs
+                          ) -> None:
+        '''
+        Creates a collection (index) on the Weaviate instance.
+        '''
+        if collection_name.find('-') != -1:
+            raise ValueError('Collection name cannot contain hyphens')
+        try:
+            self._connect()
+            self._client.collections.create(name=collection_name,
+                                            description=description,
+                                            properties=properties,
+                                            **kwargs
+                                            )
+            if self._client.collections.exists(collection_name):
+                print(f'Collection "{collection_name}" created')
+            else:
+                print('Collection not found at the moment, try again later')
+            self._client.close()
+        except Exception as e:
+            print(f'Error creating collection, due to: {e}')
+
+    def batch_index_data(self,
+                         data: list[dict],
+                         collection_name: str,
+                         error_threshold: float=0.01,
+                         vector_property: str='content_embedding',
+                         unique_id_field: str='doc_id',
+                         properties: list[Property]=None,
+                         collection_description: str=None,
+                         **kwargs
+                         ) -> dict:
+        '''
+        Batch function for fast indexing of data onto Weaviate cluster.
+
+        Args
+        ----
+        data: list[dict]
+            List of dictionaries where each dictionary represents a document.
+        collection_name: str
+            Name of the collection to index data into.
+        error_threshold: float=0.01
+            Threshold for error rate during batch upload. This value is a percentage of the total data
+            that the end user is willing to tolerate as errors. If the error rate exceeds this threshold,
+            the batch job will be aborted.
+        vector_property: str='content_embedding'
+            Name of the property that contains the vector representation of the document.
+        unique_id_field: str='doc_id'
+            Name of the unique identifier field in the document.
+        properties: list[Property]=None
+            List of properties to create the collection with. Required if the collection does not exist.
+        collection_description: str=None
+            Description of the collection. Optional parameter.
+
+        Returns
+        -------
+        dict
+            Dictionary containing error information, if any, with the following keys:
+            ['num_errors', 'error_messages', 'doc_ids']
+        '''
+        self._connect()
+        if not self._client.collections.exists(collection_name):
+            print(f'Collection "{collection_name}" not found on host, creating Collection first...')
+            if properties is None:
+                raise ValueError(f'Tried to create Collection <{collection_name}> but no properties were provided.')
+            self.create_collection(collection_name=collection_name,
+                                   properties=properties,
+                                   description=collection_description,
+                                   **kwargs)
+            self._client.close()
+
+        self._connect()
+        error_threshold_size = int(len(data) * error_threshold)
+        collection = self._client.collections.get(collection_name)
+
+        start = time.perf_counter()
+        completed_job = True
+
+        with collection.batch.dynamic() as batch:
+            for doc in tqdm(data):
+                batch.add_object(properties={k: v for k, v in doc.items() if k != vector_property},
+                                 vector=doc[vector_property])
+                if batch.number_errors > error_threshold_size:
+                    print('Upload errors exceed error_threshold...')
+                    completed_job = False
+                    break
+        end = time.perf_counter() - start
+        print(f'Processing finished in {round(end/60, 2)} minutes.')
+
+        failed_objects = collection.batch.failed_objects
+        if any(failed_objects):
+            error_messages = [obj.message for obj in failed_objects]
+            doc_ids = [obj.object_.properties.get(unique_id_field, 'Not Found') for obj in failed_objects]
+        else:
+            error_messages, doc_ids = [], []
+        error_object = {'num_errors': batch.number_errors,
+                        'error_messages': error_messages,
+                        'doc_ids': doc_ids}
+        if not completed_job:
+            print(f'Batch job failed. Review errors using these keys: {list(error_object.keys())}')
+            return error_object
+        if batch.number_errors > 0:
+            print(f'Batch job completed with {batch.number_errors} errors. Review errors using these keys: {list(error_object.keys())}')
+        else:
+            print('Batch job completed with zero errors.')
+        return error_object
+
+
+@dataclass
+class SearchFilter(Filter):
+
+    '''
+    Simplified interface for constructing a Filter object.
+
+    Args
+    ----
+    property : str
+        Property to filter on.
+    query_value : str
+        Query value to filter on.
+    '''
+    property: str
+    query_value: str
+
+    def exact_match(self):
+        return self.by_property(self.property).equal(self.query_value)
+
+    def fuzzy_match(self):
+        return self.by_property(self.property).like(f'*{self.query_value}*')
+
+
+def get_weaviate_client(endpoint: str=os.getenv('FINRAG_WEAVIATE_ENDPOINT'),
+                        api_key: str=os.getenv('FINRAG_WEAVIATE_API_KEY'),
+                        model_name_or_path: str='sentence-transformers/all-MiniLM-L6-v2',
+                        embedded: bool=False,
+                        openai_api_key: str=None,
+                        skip_init_checks: bool=False,
+                        **kwargs
+                        ) -> WeaviateWCS:
+    return WeaviateWCS(endpoint, api_key, model_name_or_path, embedded, openai_api_key, skip_init_checks, **kwargs)
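The convenience constructor at the bottom is used like this (collection name and query are placeholders):

    client = get_weaviate_client()  # reads FINRAG_WEAVIATE_ENDPOINT / FINRAG_WEAVIATE_API_KEY
    results = client.keyword_search(request="operating income",
                                    collection_name="Finrag",
                                    return_properties=['filename', 'content'])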
app/main.py
ADDED
@@ -0,0 +1,231 @@
+
+import os, random, logging, pickle, shutil
+from dotenv import load_dotenv, find_dotenv
+from typing import Optional
+from pydantic import BaseModel, Field
+
+from fastapi import FastAPI, HTTPException, File, UploadFile, status
+from fastapi.responses import HTMLResponse
+from fastapi.middleware.cors import CORSMiddleware
+
+from engine.processing import process_pdf, index_data, empty_collection
+from engine.processing import process_question as vector_search  # processing.py exposes process_question, not vector_search
+from rag.rag import rag_it
+
+from engine.logger import logger
+
+from settings import datadir
+
+os.makedirs(datadir, exist_ok=True)
+
+app = FastAPI()
+
+environment = os.getenv("ENVIRONMENT", "dev")  # created by dockerfile
+
+if environment == "dev":
+    logger.warning("Running in development mode - allowing CORS for all origins")
+    app.add_middleware(
+        CORSMiddleware,
+        allow_origins=["*"],
+        allow_credentials=True,
+        allow_methods=["*"],
+        allow_headers=["*"],
+    )
+
+try:
+    # will not work on HuggingFace,
+    # and Liquidity doesn't have the env anyway
+    load_dotenv(find_dotenv('env'))
+
+except Exception as e:
+    pass
+
+
+@app.get("/", response_class=HTMLResponse)
+def read_root():
+    logger.info("Title displayed on home page")
+    return """
+    <html>
+        <body>
+            <h1>Welcome to FinExpert, a RAG system designed by JP Bianchi!</h1>
+        </body>
+    </html>
+    """
+
+
+@app.get("/ping/")
+def ping():
+    """ Testing """
+    logger.info("Someone is pinging the server")
+    return {"answer": str(random.random() * 100)}
+
+
+@app.delete("/erase_data/")
+def erase_data():
+    """ Erase all files in the data directory, but not the vector store """
+    if len(os.listdir(datadir)) == 0:
+        logger.info("No data to erase")
+        return {"message": "No data to erase"}
+
+    shutil.rmtree(datadir, ignore_errors=True)
+    os.mkdir(datadir)
+    logger.warning("All data has been erased")
+    return {"message": "All data has been erased"}
+
+
+@app.delete("/empty_collection/")
+def delete_vectors():
+    """ Empty the collection in the vector store """
+    try:
+        ok = empty_collection()  # don't shadow fastapi's 'status' import
+        return {"message": f"Collection{'' if ok else ' NOT'} erased!"}
+    except Exception as e:
+        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
+
+@app.get("/list_files/")
+def list_files():
+    """ List all files in the data directory """
+    files = os.listdir(datadir)
+    logger.info(f"Files in data directory: {files}")
+    return {"files": files}
+
+
+@app.post("/upload/")
+# @limiter.limit("5/minute")  see 'slowapi' for rate limiting
+async def upload_file(file: UploadFile = File(...)):
+    """ Uploads a file into the data directory, for later indexing """
+    try:
+        filepath = os.path.join(datadir, file.filename)
+        logger.info(f"Filename detected: {file.filename}")
+        if os.path.exists(filepath):
+            logger.warning(f"File {file.filename} already exists: no processing done")
+            return {"message": f"File {file.filename} already exists: no processing done"}
+
+        else:
+            logger.info(f"Receiving file: {file.filename}")
+            contents = await file.read()
+            logger.info("File reception complete!")
+
+    except Exception as e:
+        logger.error(f"Error during file upload: {str(e)}")
+        return {"message": f"Error during file upload: {str(e)}"}
+
+    if file.filename.endswith('.pdf'):
+
+        # let's save the file in /data even if it's temp storage on HF
+        with open(filepath, 'wb') as f:
+            f.write(contents)
+
+        try:
+            logger.info(f"Starting to process {file.filename}")
+            new_content = process_pdf(filepath)
+            success = {"message": f"Successfully uploaded {file.filename}"}
+            success.update(new_content)
+            return success
+
+        except Exception as e:
+            return {"message": f"Failed to extract text from PDF: {str(e)}"}
+    else:
+        return {"message": "Only PDF files are accepted"}
+
+
+@app.post("/create_index/")
+async def create_index():
+    """ Create an index for the uploaded files """
+
+    logger.info("Creating index for uploaded files")
+    try:
+        msg = index_data()
+        return {"message": msg}
+    except Exception as e:
+        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
+
+
+class Question(BaseModel):
+    question: str
+
+@app.post("/ask/")
+async def hybrid_search(question: Question):
+    logger.info(f"Processing question: {question.question}")
+    try:
+        search_results = vector_search(question.question)
+        logger.info(f"Answer: {search_results}")
+        return {"answer": search_results}
+    except Exception as e:
+        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
+
+
+@app.post("/ragit/")
+async def ragit(question: Question):
+    logger.info(f"Processing question: {question.question}")
+    try:
+        search_results = vector_search(question.question)
+        logger.info(f"Search results generated: {search_results}")
+
+        answer = rag_it(question.question, search_results)
+
+        logger.info(f"Answer: {answer}")
+        return {"answer": answer}
+    except Exception as e:
+        raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e))
+
+
+# TODO
+# reject searches with a search score below a threshold
+# scrape the tables (and find a way to reject them from the text search -> LlamaParse)
+# see why the filename in search results is always empty
+# -> add it to the search results to avoid Google-Amazon confusion, for instance
+# add python scripts to create index, rag etc
+
+if __name__ == '__main__':
+    import uvicorn
+    from os import getenv
+    port = int(getenv("PORT", 80))
+    print(f"Starting server on port {port}")
+    reload = environment == "dev"
+    uvicorn.run("main:app", host="0.0.0.0", port=port, reload=reload)
+
+
+# Examples (the attached file name is a placeholder):
+# curl -X POST "http://localhost:80/upload/" -F "file=@<filename>.pdf"
+# curl -X DELETE "http://localhost:80/erase_data/"
+# curl -X GET "http://localhost:80/list_files/"
+
+# hf space is at https://jpbianchi-finrag.hf.space/
+# code given by https://jpbianchi-finrag.hf.space/docs
+# Space must be public
+# curl -X POST "https://jpbianchi-finrag.hf.space/upload/" -F "file=@<filename>.pdf"
+
+# curl -X POST http://localhost:80/ask/ -H "Content-Type: application/json" -d '{"question": "what is Amazon loss"}'
+# curl -X POST http://localhost:80/ragit/ -H "Content-Type: application/json" -d '{"question": "Does ATT have postpaid phone customers?"}'
+
+
+# TODO
+# import unittest
+# from unitesting_utils import load_impact_theory_data
+
+# class TestSplitContents(unittest.TestCase):
+#     '''
+#     Unit test to ensure proper functionality of split_contents function
+#     '''
+
+#     def test_split_contents(self):
+#         import tiktoken
+#         from llama_index.text_splitter import SentenceSplitter
+
+#         data = load_impact_theory_data()
+
+#         subset = data[:3]
+#         chunk_size = 256
+#         chunk_overlap = 0
+#         encoding = tiktoken.encoding_for_model('gpt-3.5-turbo-0613')
+#         gpt35_txt_splitter = SentenceSplitter(chunk_size=chunk_size, tokenizer=encoding.encode, chunk_overlap=chunk_overlap)
+#         results = split_contents(subset, gpt35_txt_splitter)
+#         self.assertEqual(len(results), 3)
+#         self.assertEqual(len(results[0]), 83)
+#         self.assertEqual(len(results[1]), 178)
+#         self.assertEqual(len(results[2]), 144)
+#         self.assertTrue(isinstance(results, list))
+#         self.assertTrue(isinstance(results[0], list))
+#         self.assertTrue(isinstance(results[0][0], str))
+# unittest.TextTestRunner().run(unittest.TestLoader().loadTestsFromTestCase(TestSplitContents))
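The curl examples above translate to Python as follows (a sketch against a local instance; the base URL and file name are placeholders):

    import requests

    base = "http://localhost:80"
    with open("report.pdf", "rb") as f:  # 'report.pdf' is hypothetical
        requests.post(f"{base}/upload/", files={"file": f})
    requests.post(f"{base}/create_index/")
    r = requests.post(f"{base}/ragit/", json={"question": "Does ATT have postpaid phone customers?"})
    print(r.json()["answer"])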
app/notebooks/chunking_indexing.ipynb
ADDED
The diff for this file is too large to render.
app/notebooks/lite_lll.ipynb
ADDED
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 4,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [
|
8 |
+
{
|
9 |
+
"name": "stdout",
|
10 |
+
"output_type": "stream",
|
11 |
+
"text": [
|
12 |
+
"The autoreload extension is already loaded. To reload it, use:\n",
|
13 |
+
" %reload_ext autoreload\n"
|
14 |
+
]
|
15 |
+
}
|
16 |
+
],
|
17 |
+
"source": [
|
18 |
+
"%load_ext autoreload\n",
|
19 |
+
"%autoreload 2\n",
|
20 |
+
"\n",
|
21 |
+
"import sys\n",
|
22 |
+
"sys.path.append('../')\n",
|
23 |
+
"\n",
|
24 |
+
"from dotenv import load_dotenv, find_dotenv\n",
|
25 |
+
"envs = load_dotenv(find_dotenv('env'), override=True)\n",
|
26 |
+
"\n",
|
27 |
+
"from warnings import filterwarnings\n",
|
28 |
+
"filterwarnings('ignore')\n",
|
29 |
+
"\n",
|
30 |
+
"from llm.llm import LLM\n",
|
31 |
+
"\n",
|
32 |
+
"from litellm import ModelResponse\n",
|
33 |
+
"\n",
|
34 |
+
"from typing import Literal\n",
|
35 |
+
"from rich import print\n",
|
36 |
+
"import os"
|
37 |
+
]
|
38 |
+
},
|
39 |
+
{
|
40 |
+
"cell_type": "code",
|
41 |
+
"execution_count": 5,
|
42 |
+
"metadata": {},
|
43 |
+
"outputs": [],
|
44 |
+
"source": [
|
45 |
+
"#instantiate the LLM Class\n",
|
46 |
+
"turbo = 'gpt-3.5-turbo-0125'\n",
|
47 |
+
"#the LLM Class will use the OPENAI_API_KEY env var as the default api_key \n",
|
48 |
+
"llm = LLM(turbo)\n",
|
49 |
+
"\n",
|
50 |
+
"# use the gpt3.5 model that is free - recent"
|
51 |
+
]
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"cell_type": "code",
|
55 |
+
"execution_count": 11,
|
56 |
+
"metadata": {},
|
57 |
+
"outputs": [],
|
58 |
+
"source": [
|
59 |
+
"vs = [\"Lastly, during the quarter, we increased our reserves for general product and automobile self-\\ninsurance liabilities, driven by changes in our estimates about the cost of asserted and unasserted \\nclaims, resulting in additional expense of $1.3 billion. This impact is primarily recorded in cost of \\nsales on our income statement. As our business has grown quickly over the last several years, \\nparticularly as we've built out our fulfillment and transportation network, and claim amounts have \\nseen industry-wide inflation, we've continued to evaluate and adjust this reserve for both asserted \\nclaims, as well as our estimate for unasserted claims.\\nWe reported overall net income of $278 million in the fourth quarter. While we primarily focus our \\ncomments on operating income, I'd point out that this net income includes a pre-tax valuation loss \\nof $2.3 billion included in non-operating income from our common stock investment in Rivian \\nAutomotive. As we've noted in recent quarters, this activity is not related to Amazon's ongoing \\noperations, but rather the quarter-to-quarter fluctuations in Rivian's stock price. As we head into \\nthe New Year, we remain heads down focused on driving a better customer experience.\",\"tenet of we want to find a way to meaningfully streamline our costs in all of our businesses, not \\njust their existing large businesses, but also in some of the investments we're making, we want to \\nactually do a pretty good thorough look about what we're investing and how much we think we \\nneed to. But doing so, without having to give up our ability to invest in the key long-term strategic \\ninvestments that we think could change broad customer experiences, and change Amazon over \\ntime.\\nAnd you saw that process led to us choosing to pause on incremental headcount, as we tried to \\nassess what was happening in the economy, and we eliminated some programs, Fabric.com, and \\nAmazon Care, and Amazon Glow, and Amazon Explore, and we decided to go slower on some -- \\non the physical store expansion and the grocery space until we had a format that we really \\nbelieved in rolling out, and we went a little bit slower on some devices. Until we made the very \\nhard decision that Brian talked about earlier, which was the hardest decision I think we've all been \\na part of, which was to reduce or eliminate 18,000 roles.\",\"operating income. This operating income was negatively impacted by three large items, which \\nadded approximately $2.7 billion of costs in the quarter. This is related to employee severance, \\nimpairments of property and equipment and operating leases, and changes in estimates related to \\nself-insurance liabilities. These costs primarily impacted our North America segment. If we had not \\nincurred these charges in Q4, our operating income would have been approximately $5.4 billion. \\nWe are encouraged with the progress we continue to make in streamlining the costs in our \\nAmazon Stores business. We entered the quarter with labor more appropriately matched to \\ndemand across our operations network, compared to Q4 of last year, allowing us to have the right \\nlabor, in the right place, at the right time, and drive productivity gains. We also saw continued \\nefficiencies across our transportation network, where process and tech improvements resulted in \\nhigher Amazon Logistics productivity and improved line haul fill rates. 
While transportation \\noverperformed expectations in the quarter, we also saw productivity improvements across our \\nfulfillment centers, in line with our plan. We also saw good leverage driven by strong holiday \\nvolumes.\"]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Search result 0: The 2022 decline reflects the separation of U.S. videoand lower personnel costs associated with ongoingtransformation initiatives, partially offset by higher baddebt expense, the elimination of Connect America FundPhase II (CAF II) government credits and increasedwholesale network access charges. Wireless equipmentcosts were up slightly, with higher sales volumes and thesale of higher-priced smartphones largely offset by lower3G shutdown costs in 2022. In the first quarter of 2022, weupdated the expected economic lives of customerrelationships, which extended the amortization period ofdeferred acquisition and fulfillment costs and reducedexpenses approximately $395, with $150 recorded toMobility, $115 to Business Wireline and $130 to ConsumerWireline.\\nThe 2021 decline reflects our 2021 business divestitures,\\nlower bad debt expense and lower personnel costsassociated with our transformation initiatives. Declineswere mostly offset by increased domestic wirelessequipment expense from higher volumes.\\nAsset impairments and abandonments and\\nrestructuring increased in 2022 and decreased in 2021.\\nSearch result 1: The 2022 decline reflects the separation of U.S. videoand lower personnel costs associated with ongoingtransformation initiatives, partially offset by higher baddebt expense, the elimination of Connect America FundPhase II (CAF II) government credits and increasedwholesale network access charges. Wireless equipmentcosts were up slightly, with higher sales volumes and thesale of higher-priced smartphones largely offset by lower3G shutdown costs in 2022. In the first quarter of 2022, weupdated the expected economic lives of customerrelationships, which extended the amortization period ofdeferred acquisition and fulfillment costs and reducedexpenses approximately $395, with $150 recorded toMobility, $115 to Business Wireline and $130 to ConsumerWireline.\\nThe 2021 decline reflects our 2021 business divestitures,\\nlower bad debt expense and lower personnel costsassociated with our transformation initiatives. Declineswere mostly offset by increased domestic wirelessequipment expense from higher volumes.\\nAsset impairments and abandonments and\\nrestructuring increased in 2022 and decreased in 2021.\\nSearch result 2: Credit Losses As of January 1, 2020, we adopted,\\nthrough modified retrospective application, ASU No.2016-13, “Financial Instruments—Credit Losses (Topic 326):Measurement of Credit Losses on Financial Instruments,”or Accounting Standards Codification (ASC) 326 (ASC 326),which replaces the incurred loss impairment methodologyunder prior GAAP with an expected credit loss model. ASC326 affects trade receivables, loans, contract assets,certain beneficial interests, off-balance-sheet creditexposures not accounted for as insurance and otherfinancial assets that are not subject to fair value throughnet income, as defined by the standard. Under theexpected credit loss model, we are required to considerfuture economic trends to estimate expected creditlosses over the lifetime of the asset. Upon adoption onJanuary 1, 2020, we recorded a $293 reduction to“Retained earnings,” $395 increase to “Allowances forcredit losses” applicable to our trade and loan receivables,$10 reduction of contract assets, $105 reduction of netdeferred income tax liability and $7 reduction of“Noncontrolling interest.” Our adoption of ASC 326 did nothave a material impact on our financial statements.'"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"searches"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\"Amazon's loss includes a pre-tax valuation loss of $2.3 billion included in non-operating income from their common stock investment in Rivian Automotive. This loss is not related to Amazon's ongoing operations but rather reflects quarter-to-quarter fluctuations in Rivian's stock price.\""
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"system_message = \"\"\"\n",
"You are a financial analyst, with a deep expertise in financial reports.\n",
"You are able to quickly understand a series of paragraphs, or quips even, extracted from financial reports by a vector search system. \n",
"\"\"\" \n",
"searches = \"\\n\".join([f\"Search result {i}: {v}\" for i,v in enumerate(vs,1)])\n",
"\n",
"question = \"What is Amazon's loss?\"\n",
"\n",
"user_prompt = f\"\"\"\n",
"Use the below context enclosed in triple back ticks to answer the question. \\n\n",
"The context is given by a vector search into a vector database of financial reports, so you can assume the context is accurate.\n",
"The search results are given in order of relevance (most relevant first). \\n\n",
"```\n",
"Context:\n",
"```\n",
"{searches}\n",
"```\n",
"Question:\\n\n",
"{question}\\n\n",
"------------------------\n",
"1. If the context does not provide enough information to answer the question, then\n",
"state that you cannot answer the question with the provided context.\n",
"2. Do not use any external knowledge or resources to answer the question.\n",
"3. Answer the question directly and with as much detail as possible, within the limits of the context.\n",
"------------------------\n",
"Answer:\\n\n",
"\"\"\"\n",
"\n",
"\n",
"response = llm.chat_completion(system_message=system_message,\n",
"                               user_message=user_prompt,\n",
"                               temperature=0.01,\n",
"                               stream=False,\n",
"                               raw_response=False)\n",
"response\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
app/notebooks/pdf_readers.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
app/notebooks/upload_index.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
app/notebooks/weaviate.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
app/rag/__init__.py
ADDED
File without changes
app/rag/llm.py
ADDED
@@ -0,0 +1,149 @@
# I didn't write this code

from litellm import completion, acompletion
from litellm.utils import CustomStreamWrapper, ModelResponse
import os


class LLM:
    '''
    Creates primary Class instance for interacting with various LLM model APIs.
    Primary APIs supported are OpenAI and Anthropic.
    '''
    # non-exhaustive list of supported models
    # these models are known to work
    valid_models = {'openai': ["gpt-4-turbo-preview",
                               "gpt-4-0125-preview",
                               "gpt-4-1106-preview",
                               "gpt-3.5-turbo",
                               "gpt-3.5-turbo-1106",
                               "gpt-3.5-turbo-0125",
                               ],
                    'anthropic': ['claude-3-haiku-20240307',
                                  'claude-3-sonnet-20240229',
                                  'claude-3-opus-20240229'
                                  ],
                    'cohere': ['command-r',
                               'command-r-plus'
                               ]
                    }

    def __init__(self,
                 model_name: str = 'gpt-3.5-turbo-0125',
                 api_key: str = None,
                 api_version: str = None,
                 api_base: str = None
                 ):
        self.model_name = model_name
        if not api_key:
            try:
                self._api_key = os.environ['OPENAI_API_KEY']
            except KeyError:
                raise ValueError('Default api_key expects OPENAI_API_KEY environment variable. Check that you have this variable or pass in another api_key.')
        else:
            self._api_key = api_key
        self.api_version = api_version
        self.api_base = api_base

    def chat_completion(self,
                        system_message: str,
                        user_message: str = '',
                        temperature: float = 0,
                        max_tokens: int = 500,
                        stream: bool = False,
                        raw_response: bool = False,
                        **kwargs
                        ) -> str | CustomStreamWrapper | ModelResponse:
        '''
        Generative text completion method.

        Args:
        -----
        system_message: str
            The system message to be sent to the model.
        user_message: str
            The user message to be sent to the model.
        temperature: float
            The temperature parameter for the model.
        max_tokens: int
            The maximum tokens to be generated.
        stream: bool
            Whether to stream the response.
        raw_response: bool
            If True, returns the raw model response.
        '''
        # reformat roles for claude models
        initial_role = 'user' if self.model_name.startswith('claude') else 'system'
        secondary_role = 'assistant' if self.model_name.startswith('claude') else 'user'

        # handle temperature for claude models (Anthropic expects 0-1 rather than 0-2)
        if self.model_name.startswith('claude'):
            temperature = temperature / 2

        messages = [
            {'role': initial_role, 'content': system_message},
            {'role': secondary_role, 'content': user_message}
        ]

        response = completion(model=self.model_name,
                              messages=messages,
                              temperature=temperature,
                              max_tokens=max_tokens,
                              stream=stream,
                              api_key=self._api_key,
                              api_base=self.api_base,
                              api_version=self.api_version,
                              **kwargs)

        if raw_response or stream:
            return response
        return response.choices[0].message.content

    async def achat_completion(self,
                               system_message: str,
                               user_message: str = None,
                               temperature: float = 0,
                               max_tokens: int = 500,
                               stream: bool = False,
                               raw_response: bool = False,
                               **kwargs
                               ) -> str | CustomStreamWrapper | ModelResponse:
        '''
        Asynchronous generative text completion method.

        Args:
        -----
        system_message: str
            The system message to be sent to the model.
        user_message: str
            The user message to be sent to the model.
        temperature: float
            The temperature parameter for the model.
        max_tokens: int
            The maximum tokens to be generated.
        stream: bool
            Whether to stream the response.
        raw_response: bool
            If True, returns the raw model response.
        '''
        # same role and temperature handling for claude models as in chat_completion
        initial_role = 'user' if self.model_name.startswith('claude') else 'system'
        secondary_role = 'assistant' if self.model_name.startswith('claude') else 'user'
        if self.model_name.startswith('claude'):
            temperature = temperature / 2
        messages = [
            {'role': initial_role, 'content': system_message},
            {'role': secondary_role, 'content': user_message}
        ]
        response = await acompletion(model=self.model_name,
                                     messages=messages,
                                     temperature=temperature,
                                     max_tokens=max_tokens,
                                     stream=stream,
                                     api_key=self._api_key,
                                     api_base=self.api_base,
                                     api_version=self.api_version,
                                     **kwargs)
        if raw_response or stream:
            return response
        return response.choices[0].message.content
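For orientation, here is a minimal usage sketch of the LLM class above. It is not part of this upload; it assumes OPENAI_API_KEY is set in the environment and that the default OpenAI model is reachable through litellm, and the prompts are illustrative.

from rag.llm import LLM

llm = LLM('gpt-3.5-turbo-0125')
answer = llm.chat_completion(
    system_message='You are a concise assistant.',
    user_message='In one sentence, what does operating income measure?',
    temperature=0.0,
    max_tokens=100)
print(answer)  # a plain string, since stream=False and raw_response=False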
app/rag/rag.py
ADDED
@@ -0,0 +1,56 @@

from typing import List

from .llm import LLM
# the LLM Class uses the OPENAI_API_KEY env var as the default api_key


def rag_it(question: str,
           search_results: List[str],
           model: str = 'gpt-3.5-turbo-0125',
           ) -> str:

    # TODO turn this into a class if time allows
    llm = LLM(model)

    system_message = """
    You are a financial analyst, with a deep expertise in financial reports.
    You are able to quickly understand a series of paragraphs, or quips even, extracted
    from financial reports by a vector search system.
    """

    searches = "\n".join([f"Search result {i}: {v}" for i, v in enumerate(search_results, 1)])

    user_prompt = f"""
    Use the below context enclosed in triple back ticks to answer the question. \n
    The context is given by a vector search into a vector database of financial reports,
    so you can assume the context is accurate.
    The search results are given in order of relevance (most relevant first). \n
    ```
    Context:
    ```
    {searches}
    ```
    Question:\n
    {question}\n
    ------------------------
    1. If the context does not provide enough information to answer the question, then
    state that you cannot answer the question with the provided context.
    Pay great attention to making sure your answer is relevant to the question
    (for instance, never answer a question about a topic or company that is not explicitly mentioned in the context)
    2. Do not use any external knowledge or resources to answer the question.
    3. Answer the question directly and with as much detail as possible, within the limits of the context.
    4. Avoid mentioning 'search results' in the answer.
    Instead, incorporate the information from the search results into the answer.
    5. Create a clean answer, without backticks, or starting with a new line for instance.
    ------------------------
    Answer:\n
    """

    response = llm.chat_completion(system_message=system_message,
                                   user_message=user_prompt,
                                   temperature=0.01,  # let's not allow the model to be creative
                                   stream=False,
                                   raw_response=False)
    return response
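A hypothetical end-to-end sketch of rag_it, assuming it is run from the app/ directory where rag is a package. In production the search_results list comes from the Weaviate vector store elsewhere in this upload; the snippets below are placeholders drawn from the notebook above.

from rag.rag import rag_it

hits = [
    "We reported overall net income of $278 million in the fourth quarter.",
    "Operating income was negatively impacted by three large items, which added approximately $2.7 billion of costs in the quarter.",
]
print(rag_it("What was net income in the fourth quarter?", hits))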
app/requirements.txt
ADDED
@@ -0,0 +1,20 @@
requests==2.31.0
pydantic==2.7.1
pydantic_core==2.18.2
fastapi==0.111.0
uvicorn[standard]
pdfplumber==0.11.0
weaviate-client==4.5.4
PyPDF2==3.0.1
PyMuPDF==1.24.3
llama-parse==0.4.2
llama-index-readers-file==0.1.22
nest_asyncio==1.6.0
llama-index==0.10.37
sentence-transformers==2.7.0
fastparquet==2024.2.0
litellm==1.37.12
langchain==0.1.20
langchain-community==0.0.38
langchain-core==0.1.52
langchain-text-splitters==0.0.1
app/settings.py
ADDED
@@ -0,0 +1,4 @@
import os

datadir = '../data'  # will be used in main.py
parquet_file = os.path.join(datadir, 'text_vectors.parquet')  # used by the files in 'engine'
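As a pointer, a sketch of how these settings are consumed: chunk_embed.py imports parquet_file, and reading the file back with pandas is an assumption based on the fastparquet dependency above.

import pandas as pd
from settings import parquet_file

df = pd.read_parquet(parquet_file)  # chunk texts and vectors, presumably written by the engine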