Spaces:
Running
Running
Merge branch 'main' into question-coefficient
Browse files
document_qa/document_qa_engine.py
CHANGED
@@ -494,7 +494,7 @@ class DocumentQAEngine:
|
|
494 |
print("File", pdf_file_path)
|
495 |
filename = Path(pdf_file_path).stem
|
496 |
coordinates = True # if chunk_size == -1 else False
|
497 |
-
structure = self.grobid_processor.
|
498 |
|
499 |
biblio = structure['biblio']
|
500 |
biblio['filename'] = filename.replace(" ", "_")
|
|
|
494 |
print("File", pdf_file_path)
|
495 |
filename = Path(pdf_file_path).stem
|
496 |
coordinates = True # if chunk_size == -1 else False
|
497 |
+
structure = self.grobid_processor.process(pdf_file_path, coordinates=coordinates)
|
498 |
|
499 |
biblio = structure['biblio']
|
500 |
biblio['filename'] = filename.replace(" ", "_")
|
document_qa/grobid_processors.py
CHANGED
@@ -110,10 +110,10 @@ class GrobidProcessor(BaseProcessor):
|
|
110 |
if status != 200:
|
111 |
return
|
112 |
|
113 |
-
|
114 |
-
|
115 |
|
116 |
-
return
|
117 |
|
118 |
def process_single(self, input_file):
|
119 |
doc = self.process_structure(input_file)
|
@@ -152,8 +152,8 @@ class GrobidProcessor(BaseProcessor):
|
|
152 |
"text": f"authors: {biblio['authors']}",
|
153 |
"type": passage_type,
|
154 |
"section": "<header>",
|
155 |
-
"subSection": "<
|
156 |
-
"passage_id": "
|
157 |
"coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
|
158 |
blocks_header['authors']])
|
159 |
})
|
@@ -258,7 +258,7 @@ class GrobidQuantitiesProcessor(BaseProcessor):
|
|
258 |
def __init__(self, grobid_quantities_client):
|
259 |
self.grobid_quantities_client = grobid_quantities_client
|
260 |
|
261 |
-
def
|
262 |
status, result = self.grobid_quantities_client.process_text(text.strip())
|
263 |
|
264 |
if status != 200:
|
@@ -430,7 +430,7 @@ class GrobidMaterialsProcessor(BaseProcessor):
|
|
430 |
def __init__(self, grobid_superconductors_client):
|
431 |
self.grobid_superconductors_client = grobid_superconductors_client
|
432 |
|
433 |
-
def
|
434 |
preprocessed_text = text.strip()
|
435 |
status, result = self.grobid_superconductors_client.process_text(preprocessed_text,
|
436 |
"processText_disable_linking")
|
@@ -534,22 +534,21 @@ class GrobidAggregationProcessor(GrobidQuantitiesProcessor, GrobidMaterialsProce
|
|
534 |
self.gmp = GrobidMaterialsProcessor(grobid_superconductors_client)
|
535 |
|
536 |
def process_single_text(self, text):
|
537 |
-
extracted_quantities_spans = self.
|
538 |
-
extracted_materials_spans = self.
|
539 |
all_entities = extracted_quantities_spans + extracted_materials_spans
|
540 |
entities = self.prune_overlapping_annotations(all_entities)
|
541 |
return entities
|
542 |
|
543 |
-
def
|
544 |
if self.gqp:
|
545 |
-
return self.gqp.
|
546 |
else:
|
547 |
return []
|
548 |
|
549 |
-
|
550 |
-
def extract_materials(self, text):
|
551 |
if self.gmp:
|
552 |
-
return self.gmp.
|
553 |
else:
|
554 |
return []
|
555 |
|
@@ -688,8 +687,8 @@ class GrobidAggregationProcessor(GrobidQuantitiesProcessor, GrobidMaterialsProce
|
|
688 |
|
689 |
|
690 |
class XmlProcessor(BaseProcessor):
|
691 |
-
def __init__(self
|
692 |
-
super().__init__(
|
693 |
|
694 |
def process_structure(self, input_file):
|
695 |
text = ""
|
@@ -701,16 +700,16 @@ class XmlProcessor(BaseProcessor):
|
|
701 |
|
702 |
return output_data
|
703 |
|
704 |
-
def process_single(self, input_file):
|
705 |
-
|
706 |
-
|
707 |
-
|
708 |
-
|
709 |
-
|
710 |
-
|
711 |
-
|
712 |
|
713 |
-
def
|
714 |
output_data = OrderedDict()
|
715 |
soup = BeautifulSoup(text, 'xml')
|
716 |
text_blocks_children = get_children_list_supermat(soup, verbose=False)
|
|
|
110 |
if status != 200:
|
111 |
return
|
112 |
|
113 |
+
document_object = self.parse_grobid_xml(text, coordinates=coordinates)
|
114 |
+
document_object['filename'] = Path(pdf_file).stem.replace(".tei", "")
|
115 |
|
116 |
+
return document_object
|
117 |
|
118 |
def process_single(self, input_file):
|
119 |
doc = self.process_structure(input_file)
|
|
|
152 |
"text": f"authors: {biblio['authors']}",
|
153 |
"type": passage_type,
|
154 |
"section": "<header>",
|
155 |
+
"subSection": "<authors>",
|
156 |
+
"passage_id": "hauthors",
|
157 |
"coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
|
158 |
blocks_header['authors']])
|
159 |
})
|
|
|
258 |
def __init__(self, grobid_quantities_client):
|
259 |
self.grobid_quantities_client = grobid_quantities_client
|
260 |
|
261 |
+
def process(self, text) -> list:
|
262 |
status, result = self.grobid_quantities_client.process_text(text.strip())
|
263 |
|
264 |
if status != 200:
|
|
|
430 |
def __init__(self, grobid_superconductors_client):
|
431 |
self.grobid_superconductors_client = grobid_superconductors_client
|
432 |
|
433 |
+
def process(self, text):
|
434 |
preprocessed_text = text.strip()
|
435 |
status, result = self.grobid_superconductors_client.process_text(preprocessed_text,
|
436 |
"processText_disable_linking")
|
|
|
534 |
self.gmp = GrobidMaterialsProcessor(grobid_superconductors_client)
|
535 |
|
536 |
def process_single_text(self, text):
|
537 |
+
extracted_quantities_spans = self.process_properties(text)
|
538 |
+
extracted_materials_spans = self.process_materials(text)
|
539 |
all_entities = extracted_quantities_spans + extracted_materials_spans
|
540 |
entities = self.prune_overlapping_annotations(all_entities)
|
541 |
return entities
|
542 |
|
543 |
+
def process_properties(self, text):
|
544 |
if self.gqp:
|
545 |
+
return self.gqp.process(text)
|
546 |
else:
|
547 |
return []
|
548 |
|
549 |
+
def process_materials(self, text):
|
|
|
550 |
if self.gmp:
|
551 |
+
return self.gmp.process(text)
|
552 |
else:
|
553 |
return []
|
554 |
|
|
|
687 |
|
688 |
|
689 |
class XmlProcessor(BaseProcessor):
|
690 |
+
def __init__(self):
|
691 |
+
super().__init__()
|
692 |
|
693 |
def process_structure(self, input_file):
|
694 |
text = ""
|
|
|
700 |
|
701 |
return output_data
|
702 |
|
703 |
+
# def process_single(self, input_file):
|
704 |
+
# doc = self.process_structure(input_file)
|
705 |
+
#
|
706 |
+
# for paragraph in doc['passages']:
|
707 |
+
# entities = self.process_single_text(paragraph['text'])
|
708 |
+
# paragraph['spans'] = entities
|
709 |
+
#
|
710 |
+
# return doc
|
711 |
|
712 |
+
def process(self, text):
|
713 |
output_data = OrderedDict()
|
714 |
soup = BeautifulSoup(text, 'xml')
|
715 |
text_blocks_children = get_children_list_supermat(soup, verbose=False)
|