include title, authors and year in the data store
- document_qa/document_qa_engine.py +30 -8
- streamlit_app.py +2 -1
document_qa/document_qa_engine.py
CHANGED
@@ -1,4 +1,5 @@
 import copy
+import json
 import os
 from pathlib import Path
 from typing import Union, Any
@@ -173,8 +174,10 @@ class DocumentQAEngine:
         relevant_documents = multi_query_retriever.get_relevant_documents(query)
         return relevant_documents

-    def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1, verbose=False):
-        """
+    def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1, include=(), verbose=False):
+        """
+        Extract text from documents using Grobid, if chunk_size is < 0 it keeps each paragraph separately
+        """
         if verbose:
             print("File", pdf_file_path)
         filename = Path(pdf_file_path).stem
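The new include parameter makes the bibliographic entries opt-in: callers pass a collection containing "biblio" to get title/authors/year entries alongside the regular passages, and existing callers are unaffected. A minimal sketch of a direct call, assuming an already-constructed DocumentQAEngine instance named engine (the instance and file path are illustrative, not part of this diff):

    # engine is a hypothetical DocumentQAEngine instance; construction omitted.
    texts, metadatas, ids = engine.get_text_from_document(
        "paper.pdf",          # illustrative path
        chunk_size=-1,        # < 0 keeps each paragraph as its own chunk
        include=("biblio",))  # opt in to title/authors/year entries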
@@ -189,6 +192,7 @@ class DocumentQAEngine:
         texts = []
         metadatas = []
         ids = []
+
         if chunk_size < 0:
             for passage in structure['passages']:
                 biblio_copy = copy.copy(biblio)
@@ -212,10 +216,25 @@ class DocumentQAEngine:
             metadatas = [biblio for _ in range(len(texts))]
             ids = [id for id, t in enumerate(texts)]

+        if "biblio" in include:
+            biblio_metadata = copy.copy(biblio)
+            biblio_metadata['type'] = "biblio"
+            biblio_metadata['section'] = "header"
+            for key in ['title', 'authors', 'year']:
+                if key in biblio_metadata:
+                    texts.append("{}: {}".format(key, biblio_metadata[key]))
+                    metadatas.append(biblio_metadata)
+                    ids.append(key)
+
         return texts, metadatas, ids

-    def create_memory_embeddings(self, pdf_path, doc_id=None, chunk_size=500, perc_overlap=0.1):
-        texts, metadata, ids = self.get_text_from_document(pdf_path, chunk_size=chunk_size, perc_overlap=perc_overlap)
+    def create_memory_embeddings(self, pdf_path, doc_id=None, chunk_size=500, perc_overlap=0.1, include_biblio=False):
+        include = ["biblio"] if include_biblio else []
+        texts, metadata, ids = self.get_text_from_document(
+            pdf_path,
+            chunk_size=chunk_size,
+            perc_overlap=perc_overlap,
+            include=include)
         if doc_id:
             hash = doc_id
         else:
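To make the effect of the new branch concrete, here is a self-contained sketch of the same appending logic run against an invented biblio dict (the sample values are made up for illustration; in the engine this dict is presumably built from the Grobid header):

    import copy

    # Invented sample; the real dict comes from the parsed document header.
    biblio = {"filename": "paper", "title": "A Study of X",
              "authors": "Doe, J.; Roe, R.", "year": "2023"}
    texts, metadatas, ids = [], [], []

    biblio_metadata = copy.copy(biblio)
    biblio_metadata['type'] = "biblio"
    biblio_metadata['section'] = "header"
    for key in ['title', 'authors', 'year']:
        if key in biblio_metadata:
            texts.append("{}: {}".format(key, biblio_metadata[key]))
            metadatas.append(biblio_metadata)
            ids.append(key)

    # texts -> ['title: A Study of X', 'authors: Doe, J.; Roe, R.', 'year: 2023']
    # ids   -> ['title', 'authors', 'year']

Note that the three entries share one shallow-copied metadata dict, so they all carry type "biblio" and section "header".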
@@ -233,7 +252,7 @@ class DocumentQAEngine:

         return hash

-    def create_embeddings(self, pdfs_dir_path: Path, chunk_size=500, perc_overlap=0.1):
+    def create_embeddings(self, pdfs_dir_path: Path, chunk_size=500, perc_overlap=0.1, include_biblio=False):
         input_files = []
         for root, dirs, files in os.walk(pdfs_dir_path, followlinks=False):
             for file_ in files:
@@ -250,9 +269,12 @@ class DocumentQAEngine:
             if os.path.exists(data_path):
                 print(data_path, "exists. Skipping it ")
                 continue
-
-            texts, metadata, ids = self.get_text_from_document(
-                input_file, chunk_size=chunk_size, perc_overlap=perc_overlap)
+            include = ["biblio"] if include_biblio else []
+            texts, metadata, ids = self.get_text_from_document(
+                input_file,
+                chunk_size=chunk_size,
+                perc_overlap=perc_overlap,
+                include=include)
             filename = metadata[0]['filename']

             vector_db_document = Chroma.from_texts(texts,
streamlit_app.py
CHANGED
@@ -283,7 +283,8 @@ if uploaded_file and not st.session_state.loaded_embeddings:
         # hash = get_file_hash(tmp_file.name)[:10]
         st.session_state['doc_id'] = hash = st.session_state['rqa'][model].create_memory_embeddings(tmp_file.name,
                                                                                                     chunk_size=chunk_size,
-                                                                                                    perc_overlap=0.1)
+                                                                                                    perc_overlap=0.1,
+                                                                                                    include_biblio=True)
         st.session_state['loaded_embeddings'] = True
         st.session_state.messages = []
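The same opt-in works outside Streamlit wherever an engine exists; a rough sketch, assuming a DocumentQAEngine constructed elsewhere (constructor arguments are not part of this diff and are omitted):

    # engine = DocumentQAEngine(...)  # hypothetical; construction not shown here
    doc_id = engine.create_memory_embeddings(
        "/tmp/uploaded.pdf",   # illustrative path
        chunk_size=500,
        perc_overlap=0.1,
        include_biblio=True)   # also store title/authors/year in the vector db

With include_biblio=True the header entries land in the same store as the passage chunks, which is presumably what the commit title means by including title, authors and year in the data store.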