Spaces:
Sleeping
Sleeping
Asaad Almutareb
committed on
Commit
•
7acac3e
1
Parent(s):
f5d22a4
added S3 locations to example.env
Browse files
expanded test use case in test_this.py
generate summaries in German now
- example.env +9 -8
- rag_app/generate_summary.py +6 -6
- rag_app/handle_vector_store.py +5 -5
- test_this.py +12 -6
example.env
CHANGED
@@ -3,13 +3,14 @@ HUGGINGFACEHUB_API_TOKEN=""
|
|
3 |
GOOGLE_CSE_ID=""
|
4 |
GOOGLE_API_KEY=""
|
5 |
|
6 |
-
#
|
7 |
-
S3_LOCATION=""
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
|
13 |
# llm and embedding models
|
14 |
-
|
15 |
-
|
|
|
|
3 |
GOOGLE_CSE_ID=""
|
4 |
GOOGLE_API_KEY=""
|
5 |
|
6 |
+
# Vectorstore storage on S3 and locally
|
7 |
+
S3_LOCATION="rad-rag-demos"
|
8 |
+
FAISS_VS_NAME="vectorstores/faiss-insurance-agent-500.zip"
|
9 |
+
CHROMA_VS_NAME=""
|
10 |
+
FAISS_INDEX_PATH = "./vectorstore/faiss-insurance-agent-500"
|
11 |
+
CHROMA_DIRECTORY = "./vectorstore/chroma-insurance-agent-500"
|
12 |
|
13 |
# llm and embedding models
|
14 |
+
EMBEDDING_MODEL="sentence-transformers/multi-qa-mpnet-base-dot-v1"
|
15 |
+
LLM_MODEL="mistralai/Mixtral-8x7B-Instruct-v0.1"
|
16 |
+
LLM_MODEL_ARGS=
|
rag_app/generate_summary.py
CHANGED
@@ -24,10 +24,10 @@ def generate_keywords(document:dict,
|
|
24 |
|
25 |
template = (
|
26 |
"""
|
27 |
-
|
28 |
-
|
29 |
|
30 |
-
|
31 |
|
32 |
<<<
|
33 |
Document: {document}
|
@@ -49,10 +49,10 @@ def generate_description(document:dict,
|
|
49 |
|
50 |
template = (
|
51 |
"""
|
52 |
-
|
53 |
-
|
54 |
|
55 |
-
|
56 |
|
57 |
<<<
|
58 |
Document: {document}
|
|
|
24 |
|
25 |
template = (
|
26 |
"""
|
27 |
+
Du bist ein SEO-Experten-Bot. Deine Aufgabe ist es, 5 aussagekräftige Schlüsselwörtern zu erstellen, um Dokumente zu organisieren.
|
28 |
+
Die Schlüsselwörter sollen uns später beim Suchen, Filtern und Abrufen der Dokumente helfen.
|
29 |
|
30 |
+
Antworte nur mit 5 klaren, prägnanten und aussagekräftigen Schlüsselwörtern, getrennt durch Kommas.
|
31 |
|
32 |
<<<
|
33 |
Document: {document}
|
|
|
49 |
|
50 |
template = (
|
51 |
"""
|
52 |
+
Du bist ein SEO-Experten-Bot. Deine Aufgabe ist es, eine aussagekräftige Zusammenfassung zu erstellen, um Dokumente zu beschreiben und zu organisieren.
|
53 |
+
Die Beschreibung sollte eine aussagekräftige Zusammenfassung des Dokumentinhalts sein und uns später beim Suchen und Abrufen der Dokumente helfen.
|
54 |
|
55 |
+
Antworte nur mit einer klaren, prägnanten und aussagekräftigen Beschreibung in Deutsch.
|
56 |
|
57 |
<<<
|
58 |
Document: {document}
|
rag_app/handle_vector_store.py
CHANGED
@@ -29,11 +29,11 @@ def build_vector_store(
|
|
29 |
FAISS_INDEX_PATH = db_path
|
30 |
|
31 |
embeddings,chunks = create_embeddings(docs, chunk_size, chunk_overlap, embedding_model)
|
32 |
-
for chunk in chunks:
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
|
38 |
#load chunks into vector store
|
39 |
print(f'Loading chunks into faiss vector store ...')
|
|
|
29 |
FAISS_INDEX_PATH = db_path
|
30 |
|
31 |
embeddings,chunks = create_embeddings(docs, chunk_size, chunk_overlap, embedding_model)
|
32 |
+
# for chunk in chunks:
|
33 |
+
# keywords=generate_keywords(chunk)
|
34 |
+
# description=generate_description(chunk)
|
35 |
+
# chunk.metadata['keywords']=keywords
|
36 |
+
# chunk.metadata['description']=description
|
37 |
|
38 |
#load chunks into vector store
|
39 |
print(f'Loading chunks into faiss vector store ...')
|
test_this.py
CHANGED
@@ -3,13 +3,19 @@ from rag_app.create_embedding import create_embeddings
|
|
3 |
from rag_app.generate_summary import generate_description, generate_keywords
|
4 |
from rag_app.handle_vector_store import build_vector_store
|
5 |
|
6 |
-
|
|
|
|
|
|
|
7 |
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
13 |
|
14 |
build_vector_store(docs, './vectorstore/faiss-insurance-agent-1500','sentence-transformers/multi-qa-mpnet-base-dot-v1',True,1500,150)
|
15 |
|
|
|
3 |
from rag_app.generate_summary import generate_description, generate_keywords
|
4 |
from rag_app.handle_vector_store import build_vector_store
|
5 |
|
6 |
+
# 1. load the urls
|
7 |
+
# 2. build the vectorstore -> the function will create the chunking and embeddings
|
8 |
+
# 3. initialize the db retriever
|
9 |
+
# 4.
|
10 |
|
11 |
+
docs = load_docs_from_urls(["https://www.wuerttembergische.de/"],6)
|
12 |
+
|
13 |
+
# for doc in docs:
|
14 |
+
# keywords=generate_keywords(doc)
|
15 |
+
# description=generate_description(doc)
|
16 |
+
# doc.metadata['keywords']=keywords
|
17 |
+
# doc.metadata['description']=description
|
18 |
+
# print(doc.metadata)
|
19 |
|
20 |
build_vector_store(docs, './vectorstore/faiss-insurance-agent-1500','sentence-transformers/multi-qa-mpnet-base-dot-v1',True,1500,150)
|
21 |
|