Add dependency comp general functionality, fix issues and add more examples
Browse files- ExampleParsing.svg +71 -0
- __pycache__/custom_renderer.cpython-37.pyc +0 -0
- app.py +123 -39
- custom_renderer.py +80 -26
- dependency-specific-text/article11.txt +4 -0
- dependency-specific-text/article13.txt +3 -0
- dependency-specific-text/example.txt +1 -0
- entity-specific-text/article11.txt +3 -0
- entity-specific-text/article13.txt +2 -0
- sample-articles/article11.txt +13 -0
- sample-summaries/article11.txt +1 -0
- sample-summaries/article13.txt +1 -1
ExampleParsing.svg
ADDED
__pycache__/custom_renderer.cpython-37.pyc
CHANGED
Binary files a/__pycache__/custom_renderer.cpython-37.pyc and b/__pycache__/custom_renderer.cpython-37.pyc differ
|
|
app.py
CHANGED
@@ -8,7 +8,9 @@ from bs4 import BeautifulSoup
|
|
8 |
import numpy as np
|
9 |
import base64
|
10 |
|
|
|
11 |
from spacy_streamlit.util import get_svg
|
|
|
12 |
|
13 |
from custom_renderer import render_sentence_custom
|
14 |
from flair.data import Sentence
|
@@ -134,6 +136,18 @@ def fetch_summary_contents(filename: str) -> AnyStr:
|
|
134 |
return data
|
135 |
|
136 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
137 |
def classify_comment(comment, selected_model):
|
138 |
"""Classify the given comment and augment with additional information."""
|
139 |
toxicity_pipeline, cls_explainer = load_pipeline(selected_model)
|
@@ -162,9 +176,10 @@ def classify_comment(comment, selected_model):
|
|
162 |
|
163 |
def display_summary(article_name: str):
|
164 |
summary_content = fetch_summary_contents(article_name)
|
|
|
165 |
soup = BeautifulSoup(summary_content, features="html.parser")
|
166 |
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
|
167 |
-
|
168 |
|
169 |
|
170 |
##@st.cache(hash_funcs={preshed.maps.PreshMap: my_hash_func})
|
@@ -215,12 +230,12 @@ def get_all_entities(text):
|
|
215 |
def get_and_compare_entities(article_name: str):
|
216 |
article_content = fetch_article_contents(article_name)
|
217 |
all_entities_per_sentence = get_all_entities_per_sentence(article_content)
|
218 |
-
#st.session_state.entities_per_sentence_article = all_entities_per_sentence
|
219 |
entities_article = list(itertools.chain.from_iterable(all_entities_per_sentence))
|
220 |
|
221 |
summary_content = fetch_summary_contents(article_name)
|
222 |
all_entities_per_sentence = get_all_entities_per_sentence(summary_content)
|
223 |
-
#st.session_state.entities_per_sentence_summary = all_entities_per_sentence
|
224 |
entities_summary = list(itertools.chain.from_iterable(all_entities_per_sentence))
|
225 |
|
226 |
matched_entities = []
|
@@ -268,45 +283,55 @@ def check_dependency(article: bool):
|
|
268 |
if article:
|
269 |
text = st.session_state.article_text
|
270 |
all_entities = get_all_entities_per_sentence(text)
|
271 |
-
#all_entities = st.session_state.entities_per_sentence_article
|
272 |
else:
|
273 |
text = st.session_state.summary_output
|
274 |
all_entities = get_all_entities_per_sentence(text)
|
275 |
-
#all_entities = st.session_state.entities_per_sentence_summary
|
276 |
doc = nlp(text)
|
277 |
tok_l = doc.to_json()['tokens']
|
278 |
-
all_deps = ""
|
279 |
-
|
280 |
-
print("OOPS")
|
281 |
|
282 |
sentences = list(doc.sents)
|
283 |
-
print(sentences)
|
284 |
for i, sentence in enumerate(sentences):
|
285 |
-
#TODO MONDAY: THE PROBLEM LIES HERE WITH THE SENTENCE!!! (I THINK I KNOW PROBLEM: TEXT SAVED AS SESSION STATE IS HTML NOT PURE TEXT!)
|
286 |
-
print(str(sentence))
|
287 |
start_id = sentence.start
|
288 |
end_id = sentence.end
|
289 |
for t in tok_l:
|
|
|
290 |
if t["id"] < start_id or t["id"] > end_id:
|
291 |
continue
|
292 |
head = tok_l[t['head']]
|
293 |
-
if t['dep'] == 'amod':
|
294 |
-
print("AMOD FOUND")
|
295 |
object_here = text[t['start']:t['end']]
|
296 |
object_target = text[head['start']:head['end']]
|
297 |
-
|
|
|
298 |
# ONE NEEDS TO BE ENTITY
|
299 |
if object_here in all_entities[i]:
|
300 |
-
|
301 |
-
|
302 |
-
|
|
|
|
|
303 |
elif object_target in all_entities[i]:
|
304 |
-
all_deps = all_deps.join(str(sentence))
|
|
|
|
|
|
|
|
|
305 |
else:
|
306 |
continue
|
307 |
-
#print(f'
|
308 |
-
|
309 |
-
return all_deps
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
310 |
|
311 |
|
312 |
# Start session
|
@@ -359,13 +384,28 @@ if st.session_state.article_text:
|
|
359 |
with st.spinner('Generating summary...'):
|
360 |
# classify_comment(article_text, selected_model)
|
361 |
|
362 |
-
display_summary(selected_article)
|
363 |
|
364 |
-
st.write("**Generated summary:**",
|
365 |
else:
|
366 |
st.error('**Error**: No comment to classify. Please provide a comment.',
|
367 |
help="Generate summary for the given article text")
|
368 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
369 |
# ENTITY MATCHING PART
|
370 |
st.header("Entity matching")
|
371 |
st.markdown("**Named entity recognition** (NER) is the task of identifying and categorising key information ("
|
@@ -376,23 +416,67 @@ st.markdown("**Named entity recognition** (NER) is the task of identifying and c
|
|
376 |
with st.spinner("Calculating and matching entities..."):
|
377 |
entity_match_html = highlight_entities(selected_article)
|
378 |
st.write(entity_match_html, unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
379 |
|
380 |
# DEPENDENCY PARSING PART
|
381 |
st.header("Dependency comparison")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
382 |
with st.spinner("Doing dependency parsing..."):
|
383 |
-
|
384 |
-
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
-
|
389 |
-
#
|
390 |
-
#
|
391 |
-
|
392 |
-
|
393 |
-
|
394 |
-
|
395 |
-
|
396 |
-
|
397 |
-
#
|
398 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
import numpy as np
|
9 |
import base64
|
10 |
|
11 |
+
import validators
|
12 |
from spacy_streamlit.util import get_svg
|
13 |
+
from validators import ValidationFailure
|
14 |
|
15 |
from custom_renderer import render_sentence_custom
|
16 |
from flair.data import Sentence
|
|
|
136 |
return data
|
137 |
|
138 |
|
139 |
+
def fetch_entity_specific_contents(filename: str) -> AnyStr:
    """Return the entity-matching explanation text stored for an article.

    The text is read from ``./entity-specific-text/<name>.txt`` (relative to the
    current working directory), where ``<name>`` is ``filename`` lower-cased.

    Args:
        filename: Article name without the ``.txt`` extension.

    Returns:
        The complete contents of the text file.

    Raises:
        OSError: Propagated from ``open`` (e.g. ``FileNotFoundError`` when no
            file exists for ``filename``).
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding (assumes the text files are UTF-8 -- TODO confirm).
    with open(f'./entity-specific-text/{filename.lower()}.txt', 'r', encoding='utf-8') as f:
        return f.read()
|
143 |
+
|
144 |
+
|
145 |
+
def fetch_dependency_specific_contents(filename: str) -> AnyStr:
    """Return the dependency-comparison explanation text stored for an article.

    The text is read from ``./dependency-specific-text/<name>.txt`` (relative to
    the current working directory), where ``<name>`` is ``filename`` lower-cased.

    Args:
        filename: Article name without the ``.txt`` extension.

    Returns:
        The complete contents of the text file.

    Raises:
        OSError: Propagated from ``open`` (e.g. ``FileNotFoundError`` when no
            file exists for ``filename``).
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding (assumes the text files are UTF-8 -- TODO confirm).
    with open(f'./dependency-specific-text/{filename.lower()}.txt', 'r', encoding='utf-8') as f:
        return f.read()
|
149 |
+
|
150 |
+
|
151 |
def classify_comment(comment, selected_model):
|
152 |
"""Classify the given comment and augment with additional information."""
|
153 |
toxicity_pipeline, cls_explainer = load_pipeline(selected_model)
|
|
|
176 |
|
177 |
def display_summary(article_name: str):
    """Load the summary for ``article_name`` and return it inside a bordered HTML box.

    The raw summary text is also stored in ``st.session_state.summary_output``.
    The text is passed through BeautifulSoup's ``html.parser`` before being
    placed in the wrapper ``<div>``.
    """
    summary_text = fetch_summary_contents(article_name)
    st.session_state.summary_output = summary_text
    box_template = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
    parsed_summary = BeautifulSoup(summary_text, features="html.parser")
    return box_template.format(parsed_summary)
|
183 |
|
184 |
|
185 |
##@st.cache(hash_funcs={preshed.maps.PreshMap: my_hash_func})
|
|
|
230 |
def get_and_compare_entities(article_name: str):
|
231 |
article_content = fetch_article_contents(article_name)
|
232 |
all_entities_per_sentence = get_all_entities_per_sentence(article_content)
|
233 |
+
# st.session_state.entities_per_sentence_article = all_entities_per_sentence
|
234 |
entities_article = list(itertools.chain.from_iterable(all_entities_per_sentence))
|
235 |
|
236 |
summary_content = fetch_summary_contents(article_name)
|
237 |
all_entities_per_sentence = get_all_entities_per_sentence(summary_content)
|
238 |
+
# st.session_state.entities_per_sentence_summary = all_entities_per_sentence
|
239 |
entities_summary = list(itertools.chain.from_iterable(all_entities_per_sentence))
|
240 |
|
241 |
matched_entities = []
|
|
|
283 |
if article:
|
284 |
text = st.session_state.article_text
|
285 |
all_entities = get_all_entities_per_sentence(text)
|
286 |
+
# all_entities = st.session_state.entities_per_sentence_article
|
287 |
else:
|
288 |
text = st.session_state.summary_output
|
289 |
all_entities = get_all_entities_per_sentence(text)
|
290 |
+
# all_entities = st.session_state.entities_per_sentence_summary
|
291 |
doc = nlp(text)
|
292 |
tok_l = doc.to_json()['tokens']
|
293 |
+
# all_deps = ""
|
294 |
+
test_list_dict_output = []
|
|
|
295 |
|
296 |
sentences = list(doc.sents)
|
|
|
297 |
for i, sentence in enumerate(sentences):
|
|
|
|
|
298 |
start_id = sentence.start
|
299 |
end_id = sentence.end
|
300 |
for t in tok_l:
|
301 |
+
# print(t)
|
302 |
if t["id"] < start_id or t["id"] > end_id:
|
303 |
continue
|
304 |
head = tok_l[t['head']]
|
305 |
+
if t['dep'] == 'amod' or t['dep'] == "pobj":
|
|
|
306 |
object_here = text[t['start']:t['end']]
|
307 |
object_target = text[head['start']:head['end']]
|
308 |
+
if t['dep'] == "pobj" and str.lower(object_target) != "in":
|
309 |
+
continue
|
310 |
# ONE NEEDS TO BE ENTITY
|
311 |
if object_here in all_entities[i]:
|
312 |
+
# all_deps = all_deps.join(str(sentence))
|
313 |
+
identifier = object_here + t['dep'] + object_target
|
314 |
+
test_list_dict_output.append({"dep": t['dep'], "cur_word_index": (t['id'] - sentence.start),
|
315 |
+
"target_word_index": (t['head'] - sentence.start),
|
316 |
+
"identifier": identifier, "sentence": str(sentence)})
|
317 |
elif object_target in all_entities[i]:
|
318 |
+
# all_deps = all_deps.join(str(sentence))
|
319 |
+
identifier = object_here + t['dep'] + object_target
|
320 |
+
test_list_dict_output.append({"dep": t['dep'], "cur_word_index": (t['id'] - sentence.start),
|
321 |
+
"target_word_index": (t['head'] - sentence.start),
|
322 |
+
"identifier": identifier, "sentence": str(sentence)})
|
323 |
else:
|
324 |
continue
|
325 |
+
# print(f'NOW TEST LIST DICT: {test_list_dict_output}')
|
326 |
+
return test_list_dict_output
|
327 |
+
# return all_deps
|
328 |
+
|
329 |
+
|
330 |
+
def is_valid_url(url: str) -> bool:
    """Report whether ``url`` is accepted by ``validators.url``.

    Returns ``False`` when the validator yields a ``ValidationFailure`` and
    ``True`` for any other result.
    """
    return not isinstance(validators.url(url), ValidationFailure)
|
335 |
|
336 |
|
337 |
# Start session
|
|
|
384 |
with st.spinner('Generating summary...'):
|
385 |
# classify_comment(article_text, selected_model)
|
386 |
|
387 |
+
summary_displayed = display_summary(selected_article)
|
388 |
|
389 |
+
st.write("**Generated summary:**", summary_displayed, unsafe_allow_html=True)
|
390 |
else:
|
391 |
st.error('**Error**: No comment to classify. Please provide a comment.',
|
392 |
help="Generate summary for the given article text")
|
393 |
|
394 |
+
if is_valid_url(article_text):
|
395 |
+
print("YES")
|
396 |
+
else:
|
397 |
+
print("NO")
|
398 |
+
def render_svg(svg_file):
    """Return an HTML ``<img>`` tag that embeds an SVG file as a base64 data URI.

    Args:
        svg_file: Path of the SVG file to embed.

    Returns:
        A string of the form ``<img src="data:image/svg+xml;base64,..."/>``.

    Raises:
        OSError: Propagated from ``open`` when the file cannot be read.
    """
    # Read as UTF-8 so decoding mirrors the UTF-8 encode below instead of
    # depending on the platform's locale encoding.
    with open(svg_file, "r", encoding="utf-8") as f:
        svg = f.read()

    b64 = base64.b64encode(svg.encode("utf-8")).decode("utf-8")
    return r'<img src="data:image/svg+xml;base64,%s"/>' % b64
|
407 |
+
|
408 |
+
|
409 |
# ENTITY MATCHING PART
|
410 |
st.header("Entity matching")
|
411 |
st.markdown("**Named entity recognition** (NER) is the task of identifying and categorising key information ("
|
|
|
416 |
with st.spinner("Calculating and matching entities..."):
|
417 |
entity_match_html = highlight_entities(selected_article)
|
418 |
st.write(entity_match_html, unsafe_allow_html=True)
|
419 |
+
red_text = """<font color="black"><span style="background-color: rgb(238, 135, 135); opacity:
|
420 |
+
1;">red</span></font> """
|
421 |
+
green_text = """<font color="black">
|
422 |
+
<span style="background-color: rgb(121, 236, 121); opacity: 1;">green</span>
|
423 |
+
</font>"""
|
424 |
+
|
425 |
+
markdown_start_red = "<mark class=\"entity\" style=\"background: rgb(238, 135, 135);\">"
|
426 |
+
markdown_start_green = "<mark class=\"entity\" style=\"background: rgb(121, 236, 121);\">"
|
427 |
+
st.markdown("Here you can see what this looks like when we apply entity-matching on the summary (compared to the "
|
428 |
+
"original article). Entities in this summary are marked " + green_text + " when the entity also "
|
429 |
+
"exists in the article, while unmatched entities are marked " + red_text + ".",
|
430 |
+
unsafe_allow_html=True)
|
431 |
+
entity_specific_text = fetch_entity_specific_contents(selected_article)
|
432 |
+
st.markdown(entity_specific_text)
|
433 |
|
434 |
# DEPENDENCY PARSING PART
|
435 |
st.header("Dependency comparison")
|
436 |
+
st.markdown("**Dependency parsing** is the process in which the grammatical structure in a sentence is analysed, "
|
437 |
+
"to find out related words as well as the type of the relationship between them. For the sentence “Jan’s "
|
438 |
+
"wife is called Sarah” you would get the following dependency graph:")
|
439 |
+
|
440 |
+
# TODO: I wonder why the first doesn't work but the second does (it doesn't show deps otherwise)
|
441 |
+
# st.image("ExampleParsing.svg")
|
442 |
+
st.write(render_svg('ExampleParsing.svg'), unsafe_allow_html=True)
|
443 |
+
st.markdown("Here, “Jan” is the “poss” (possession modifier) of “wife”. If suddenly the summary would read “Jan’s "
|
444 |
+
"husband…”, there would be a dependency in the summary that is non-existent in the article itself. "
|
445 |
+
"However, it could be that such a new dependency is not per se correct, “The borders of Ukraine” have a "
|
446 |
+
"different dependency between “borders” and “Ukraine” than “Ukraine’s borders”, while this would also be "
|
447 |
+
"correct. So general matching between summary and article wont work.")
|
448 |
+
st.markdown("There is however a simple method that we found has potential in post-processing. Based on empirical "
|
449 |
+
"results, we have found that when there are specific kinds of dependencies in the summary that are not in "
|
450 |
+
"the article, these specific types are often an indication of a wrongly constructed sentence. Let’s take "
|
451 |
+
"a look at an example:")
|
452 |
with st.spinner("Doing dependency parsing..."):
|
453 |
+
summary_deps = check_dependency(False)
|
454 |
+
article_deps = check_dependency(True)
|
455 |
+
total_unmatched_deps = []
|
456 |
+
for summ_dep in summary_deps:
|
457 |
+
if not any(summ_dep['identifier'] in art_dep['identifier'] for art_dep in article_deps):
|
458 |
+
total_unmatched_deps.append(summ_dep)
|
459 |
+
# print(f'ALL UNMATCHED DEPS ARE: {total_unmatched_deps}')
|
460 |
+
# render_dependency_parsing(check_dependency(False))
|
461 |
+
if total_unmatched_deps:
|
462 |
+
for current_drawing_list in total_unmatched_deps:
|
463 |
+
render_dependency_parsing(current_drawing_list)
|
464 |
+
dep_spec_text = fetch_dependency_specific_contents(selected_article)
|
465 |
+
st.markdown(dep_spec_text)
|
466 |
+
soup = BeautifulSoup("Example text option with box", features="html.parser")
|
467 |
+
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem;
|
468 |
+
margin-bottom: 2.5rem">{}</div> """
|
469 |
+
st.write(HTML_WRAPPER.format(soup), unsafe_allow_html=True)
|
470 |
+
|
471 |
+
# OUTRO/CONCLUSION
|
472 |
+
st.header("Wrapping up")
|
473 |
+
st.markdown("We have presented 2 methods that try to improve summaries via post-processing steps. Entity matching can "
|
474 |
+
"be used to solve hallucinations, while checking if specific dependencies are matched between summary and "
|
475 |
+
"article can be used to filter out some bad sentences (and thus worse summaries). Of course these are "
|
476 |
+
"only basic methods which were empirically tested, but they are a start at actually making something good "
|
477 |
+
"(???). (something about that we tested also RE and maybe other things).")
|
478 |
+
st.markdown("####")
|
479 |
+
st.markdown("Now based on these methods you can check summaries and whether they are “good” or “bad”. Below you can "
|
480 |
+
"generate 5 different kind of summaries for the starting article (based on different model params) in "
|
481 |
+
"which their ranks are estimated, and hopefully the best summary (read: the one that a human would prefer "
|
482 |
+
"or indicate as the best one) will be at the top.")
|
custom_renderer.py
CHANGED
@@ -1,10 +1,12 @@
|
|
1 |
from typing import Dict, Any
|
2 |
|
|
|
3 |
import spacy
|
4 |
from PIL import ImageFont
|
5 |
|
6 |
from spacy.tokens import Doc
|
7 |
|
|
|
8 |
def get_pil_text_size(text, font_size, font_name):
|
9 |
font = ImageFont.truetype(font_name, font_size)
|
10 |
size = font.getsize(text)
|
@@ -32,8 +34,8 @@ def render_arrow(
|
|
32 |
<path class="displacy-arrowhead" d="{head}" fill="red"/>
|
33 |
</g>
|
34 |
"""
|
35 |
-
arc = get_arc(start +
|
36 |
-
arrowhead = get_arrowhead(direction, start +
|
37 |
label_side = "right" if direction == "rtl" else "left"
|
38 |
return TPL_DEP_ARCS.format(
|
39 |
id=0,
|
@@ -77,7 +79,7 @@ def get_arrowhead(direction: str, x: int, y: int, end: int) -> str:
|
|
77 |
|
78 |
|
79 |
# parsed = [{'words': [{'text': 'The', 'tag': 'DET', 'lemma': None}, {'text': 'OnePlus', 'tag': 'PROPN', 'lemma': None}, {'text': '10', 'tag': 'NUM', 'lemma': None}, {'text': 'Pro', 'tag': 'PROPN', 'lemma': None}, {'text': 'is', 'tag': 'AUX', 'lemma': None}, {'text': 'the', 'tag': 'DET', 'lemma': None}, {'text': 'company', 'tag': 'NOUN', 'lemma': None}, {'text': "'s", 'tag': 'PART', 'lemma': None}, {'text': 'first', 'tag': 'ADJ', 'lemma': None}, {'text': 'flagship', 'tag': 'NOUN', 'lemma': None}, {'text': 'phone.', 'tag': 'NOUN', 'lemma': None}], 'arcs': [{'start': 0, 'end': 3, 'label': 'det', 'dir': 'left'}, {'start': 1, 'end': 3, 'label': 'nmod', 'dir': 'left'}, {'start': 1, 'end': 2, 'label': 'nummod', 'dir': 'right'}, {'start': 3, 'end': 4, 'label': 'nsubj', 'dir': 'left'}, {'start': 5, 'end': 6, 'label': 'det', 'dir': 'left'}, {'start': 6, 'end': 10, 'label': 'poss', 'dir': 'left'}, {'start': 6, 'end': 7, 'label': 'case', 'dir': 'right'}, {'start': 8, 'end': 10, 'label': 'amod', 'dir': 'left'}, {'start': 9, 'end': 10, 'label': 'compound', 'dir': 'left'}, {'start': 4, 'end': 10, 'label': 'attr', 'dir': 'right'}], 'settings': {'lang': 'en', 'direction': 'ltr'}}]
|
80 |
-
def render_sentence_custom(
|
81 |
TPL_DEP_WORDS = """
|
82 |
<text class="displacy-token" fill="currentColor" text-anchor="start" y="{y}">
|
83 |
<tspan class="displacy-word" fill="currentColor" x="{x}">{text}</tspan>
|
@@ -89,43 +91,94 @@ def render_sentence_custom(parsed: str):
|
|
89 |
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:lang="{lang}" id="{id}" class="displacy" width="{width}" height="{height}" direction="{dir}" style="max-width: none; height: {height}px; color: {color}; background: {bg}; font-family: {font}; direction: {dir}">{content}</svg>
|
90 |
"""
|
91 |
arcs_svg = []
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
|
105 |
x_value_counter = 10
|
106 |
index_counter = 0
|
107 |
svg_words = []
|
|
|
108 |
coords_test = []
|
109 |
-
|
110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
word = word + " "
|
112 |
pixel_x_length = get_pil_text_size(word, 16, 'arial.ttf')[0]
|
113 |
svg_words.append(TPL_DEP_WORDS.format(text=word, tag="", x=x_value_counter, y=70))
|
114 |
-
if
|
115 |
coords_test.append(x_value_counter)
|
116 |
-
|
|
|
117 |
index_counter += 1
|
118 |
x_value_counter += pixel_x_length + 4
|
119 |
-
|
120 |
-
|
121 |
-
|
|
|
122 |
|
123 |
content = "".join(svg_words) + "".join(arcs_svg)
|
124 |
|
125 |
full_svg = TPL_DEP_SVG.format(
|
126 |
id=0,
|
127 |
-
width=
|
128 |
-
height=
|
129 |
color="#00000",
|
130 |
bg="#ffffff",
|
131 |
font="Arial",
|
@@ -133,9 +186,9 @@ def render_sentence_custom(parsed: str):
|
|
133 |
dir="ltr",
|
134 |
lang="en",
|
135 |
)
|
136 |
-
|
137 |
return full_svg
|
138 |
|
|
|
139 |
def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
|
140 |
"""Generate dependency parse in {'words': [], 'arcs': []} format.
|
141 |
|
@@ -196,8 +249,9 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
|
|
196 |
)
|
197 |
return {"words": words, "arcs": arcs, "settings": get_doc_settings(orig_doc)}
|
198 |
|
|
|
199 |
def get_doc_settings(doc: Doc) -> Dict[str, Any]:
|
200 |
return {
|
201 |
"lang": doc.lang_,
|
202 |
"direction": doc.vocab.writing_system.get("direction", "ltr"),
|
203 |
-
}
|
|
|
1 |
from typing import Dict, Any
|
2 |
|
3 |
+
import numpy as np
|
4 |
import spacy
|
5 |
from PIL import ImageFont
|
6 |
|
7 |
from spacy.tokens import Doc
|
8 |
|
9 |
+
|
10 |
def get_pil_text_size(text, font_size, font_name):
|
11 |
font = ImageFont.truetype(font_name, font_size)
|
12 |
size = font.getsize(text)
|
|
|
34 |
<path class="displacy-arrowhead" d="{head}" fill="red"/>
|
35 |
</g>
|
36 |
"""
|
37 |
+
arc = get_arc(start + 10, 50, 5, end + 10)
|
38 |
+
arrowhead = get_arrowhead(direction, start + 10, 50, end + 10)
|
39 |
label_side = "right" if direction == "rtl" else "left"
|
40 |
return TPL_DEP_ARCS.format(
|
41 |
id=0,
|
|
|
79 |
|
80 |
|
81 |
# parsed = [{'words': [{'text': 'The', 'tag': 'DET', 'lemma': None}, {'text': 'OnePlus', 'tag': 'PROPN', 'lemma': None}, {'text': '10', 'tag': 'NUM', 'lemma': None}, {'text': 'Pro', 'tag': 'PROPN', 'lemma': None}, {'text': 'is', 'tag': 'AUX', 'lemma': None}, {'text': 'the', 'tag': 'DET', 'lemma': None}, {'text': 'company', 'tag': 'NOUN', 'lemma': None}, {'text': "'s", 'tag': 'PART', 'lemma': None}, {'text': 'first', 'tag': 'ADJ', 'lemma': None}, {'text': 'flagship', 'tag': 'NOUN', 'lemma': None}, {'text': 'phone.', 'tag': 'NOUN', 'lemma': None}], 'arcs': [{'start': 0, 'end': 3, 'label': 'det', 'dir': 'left'}, {'start': 1, 'end': 3, 'label': 'nmod', 'dir': 'left'}, {'start': 1, 'end': 2, 'label': 'nummod', 'dir': 'right'}, {'start': 3, 'end': 4, 'label': 'nsubj', 'dir': 'left'}, {'start': 5, 'end': 6, 'label': 'det', 'dir': 'left'}, {'start': 6, 'end': 10, 'label': 'poss', 'dir': 'left'}, {'start': 6, 'end': 7, 'label': 'case', 'dir': 'right'}, {'start': 8, 'end': 10, 'label': 'amod', 'dir': 'left'}, {'start': 9, 'end': 10, 'label': 'compound', 'dir': 'left'}, {'start': 4, 'end': 10, 'label': 'attr', 'dir': 'right'}], 'settings': {'lang': 'en', 'direction': 'ltr'}}]
|
82 |
+
def render_sentence_custom(unmatched_list: Dict):
|
83 |
TPL_DEP_WORDS = """
|
84 |
<text class="displacy-token" fill="currentColor" text-anchor="start" y="{y}">
|
85 |
<tspan class="displacy-word" fill="currentColor" x="{x}">{text}</tspan>
|
|
|
91 |
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:lang="{lang}" id="{id}" class="displacy" width="{width}" height="{height}" direction="{dir}" style="max-width: none; height: {height}px; color: {color}; background: {bg}; font-family: {font}; direction: {dir}">{content}</svg>
|
92 |
"""
|
93 |
arcs_svg = []
|
94 |
+
nlp = spacy.load('en_core_web_lg')
|
95 |
+
doc = nlp(unmatched_list["sentence"])
|
96 |
+
# words = {}
|
97 |
+
# unmatched_list = [parse_deps(doc)]
|
98 |
+
# #print(parsed)
|
99 |
+
# for i, p in enumerate(unmatched_list):
|
100 |
+
# arcs = p["arcs"]
|
101 |
+
# words = p["words"]
|
102 |
+
# for i, a in enumerate(arcs):
|
103 |
+
# #CHECK CERTAIN DEPS (ALSO ADD/CHANGE BELOW WHEN CHANGING HERE)
|
104 |
+
# if a["label"] == "amod":
|
105 |
+
# couples = (a["start"], a["end"])
|
106 |
+
# elif a["label"] == "pobj":
|
107 |
+
# couples = (a["start"], a["end"])
|
108 |
+
# #couples = (3,5)
|
109 |
+
#
|
110 |
+
# x_value_counter = 10
|
111 |
+
# index_counter = 0
|
112 |
+
# svg_words = []
|
113 |
+
# coords_test = []
|
114 |
+
# for i, word in enumerate(words):
|
115 |
+
# word = word["text"]
|
116 |
+
# word = word + " "
|
117 |
+
# pixel_x_length = get_pil_text_size(word, 16, 'arial.ttf')[0]
|
118 |
+
# svg_words.append(TPL_DEP_WORDS.format(text=word, tag="", x=x_value_counter, y=70))
|
119 |
+
# if index_counter >= couples[0] and index_counter <= couples[1]:
|
120 |
+
# coords_test.append(x_value_counter)
|
121 |
+
# x_value_counter += 50
|
122 |
+
# index_counter += 1
|
123 |
+
# x_value_counter += pixel_x_length + 4
|
124 |
+
# for i, a in enumerate(arcs):
|
125 |
+
# if a["label"] == "amod":
|
126 |
+
# arcs_svg.append(render_arrow(a["label"], coords_test[0], coords_test[-1], a["dir"], i))
|
127 |
+
# elif a["label"] == "pobj":
|
128 |
+
# arcs_svg.append(render_arrow(a["label"], coords_test[0], coords_test[-1], a["dir"], i))
|
129 |
+
#
|
130 |
+
# content = "".join(svg_words) + "".join(arcs_svg)
|
131 |
+
#
|
132 |
+
# full_svg = TPL_DEP_SVG.format(
|
133 |
+
# id=0,
|
134 |
+
# width=1200, #600
|
135 |
+
# height=250, #125
|
136 |
+
# color="#00000",
|
137 |
+
# bg="#ffffff",
|
138 |
+
# font="Arial",
|
139 |
+
# content=content,
|
140 |
+
# dir="ltr",
|
141 |
+
# lang="en",
|
142 |
+
# )
|
143 |
|
144 |
x_value_counter = 10
|
145 |
index_counter = 0
|
146 |
svg_words = []
|
147 |
+
words = unmatched_list["sentence"].split(" ")
|
148 |
coords_test = []
|
149 |
+
#print(unmatched_list)
|
150 |
+
#print(words)
|
151 |
+
#print("NOW")
|
152 |
+
direction_current = "rtl"
|
153 |
+
if unmatched_list["cur_word_index"] < unmatched_list["target_word_index"]:
|
154 |
+
min_index = unmatched_list["cur_word_index"]
|
155 |
+
max_index = unmatched_list["target_word_index"]
|
156 |
+
direction_current = "left"
|
157 |
+
else:
|
158 |
+
max_index = unmatched_list["cur_word_index"]
|
159 |
+
min_index = unmatched_list["target_word_index"]
|
160 |
+
for i, token in enumerate(doc):
|
161 |
+
word = str(token)
|
162 |
word = word + " "
|
163 |
pixel_x_length = get_pil_text_size(word, 16, 'arial.ttf')[0]
|
164 |
svg_words.append(TPL_DEP_WORDS.format(text=word, tag="", x=x_value_counter, y=70))
|
165 |
+
if min_index <= index_counter <= max_index:
|
166 |
coords_test.append(x_value_counter)
|
167 |
+
if index_counter < max_index - 1:
|
168 |
+
x_value_counter += 50
|
169 |
index_counter += 1
|
170 |
x_value_counter += pixel_x_length + 4
|
171 |
+
|
172 |
+
# TODO: DYNAMIC DIRECTION MAKING (SHOULD GIVE WITH DICT I THINK)
|
173 |
+
#print(coords_test)
|
174 |
+
arcs_svg.append(render_arrow(unmatched_list['dep'], coords_test[0], coords_test[-1], direction_current, i))
|
175 |
|
176 |
content = "".join(svg_words) + "".join(arcs_svg)
|
177 |
|
178 |
full_svg = TPL_DEP_SVG.format(
|
179 |
id=0,
|
180 |
+
width=1200, # 600
|
181 |
+
height=75, # 125
|
182 |
color="#00000",
|
183 |
bg="#ffffff",
|
184 |
font="Arial",
|
|
|
186 |
dir="ltr",
|
187 |
lang="en",
|
188 |
)
|
|
|
189 |
return full_svg
|
190 |
|
191 |
+
|
192 |
def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
|
193 |
"""Generate dependency parse in {'words': [], 'arcs': []} format.
|
194 |
|
|
|
249 |
)
|
250 |
return {"words": words, "arcs": arcs, "settings": get_doc_settings(orig_doc)}
|
251 |
|
252 |
+
|
253 |
def get_doc_settings(doc: Doc) -> Dict[str, Any]:
    """Collect the render settings for ``doc``.

    Returns a dict with ``"lang"`` (taken from ``doc.lang_``) and
    ``"direction"`` (the ``"direction"`` entry of ``doc.vocab.writing_system``,
    falling back to ``"ltr"`` when that key is absent).
    """
    lang = doc.lang_
    direction = doc.vocab.writing_system.get("direction", "ltr")
    return {"lang": lang, "direction": direction}
|
dependency-specific-text/article11.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
One of the dependencies that, when found in the summary but not in the article, indicates a possible error is the "pobj" (object of preposition) dependency.
|
2 |
+
In the image above, you can see the unmatched dependency that is found in the summary but not present in the article. For the "pobj" dependency, we only check matches when the target word is "in", as it is here. U.S. is the entity here.
|
3 |
+
For this specific example, it's obvious that the dependency of "in U.S." is not found in the article, as you can already see in the entity matching paragraph that U.S. is a hallucinated entity and doesn't occur in the article itself,
|
4 |
+
so technically we don't need dependency comparison here to spot this particular error.
|
dependency-specific-text/article13.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
One of the dependencies that, when found in the summary but not in the article, indicates a possible error is the "amod" (adjectival modifier) dependency.
|
2 |
+
In the image above, you can see the unmatched dependency that is found in the summary but not present in the article. "First" is the entity here, and it's the adjectival modifier of the word "phone".
|
3 |
+
However, this sentence is not factual, since the article talks about a **new** type of flagship phone, and not at all the **first** flagship phone. This is wrong, and the error was found by filtering on this specific kind of dependency.
|
dependency-specific-text/example.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
This is example explanation.
|
entity-specific-text/article11.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
For this summary, there are 2 unmatched entities: "The Mark Levinson" and "U.S.". The first one
|
2 |
+
is not a real error per se: the only mismatch is the leading "The" that was included in the entity before "Mark Levinson" (TODO: explain this a bit better).
|
3 |
+
The "U.S." however is a hallucinated entity not present in the article, and via this method this can be found.
|
entity-specific-text/article13.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
For this summary, there are 2 unmatched entities: "January 18" and "U.S.". January 18 is indeed a hallucinated entity, as there is no sentence containing this exact date. U.S. does occur in the article, but as "US" instead of "U.S.". This can be solved
|
2 |
+
by comparing against a list of abbreviations (or by comparing embeddings; TODO?).
|
sample-articles/article11.txt
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Not so long ago, the internet was debating what the folks at Apple were thinking charging $549 for the AirPods Max. At CES 2022, luxury audio brand Mark Levinson would like a word. The Harman-owned company (which also owns AKG, JBL and Harman Kardon, and is itself a Samsung subsidiary) mostly known for its high-end home and car audio systems has announced its first wireless headphones: the No. 5909. While they offer everything you'd expect from a premium set, they have a nearly $1,000 price tag that only a select few might commit to.
|
2 |
+
|
3 |
+
The over-ear No. 5909 packs 40mm Beryllium drivers "expertly tuned to the Harman curve." The company explains that "the Harman curve" is acoustic response that it says has taken decades of research to construct. The result here is "incredible acoustic performance" in a set of "reference class" wireless headphones. Mark Levinson says that audio performance meets the guidelines for Hi-Res Audio certification thanks to 24-bit/96kHz signal processing and 40kHz acoustic response. The No. 5909 supports LDAC, AAC and aptX Adaptive wireless codecs via Bluetooth 5.1.
|
4 |
+
|
5 |
+
Mark Levinson promises you'll hear details you haven't before, like "the slightest breath an artist takes" or "a hidden harmony." The company explains that the same "world-class sound engineers" that built the luxury brand's amps, turntables and streaming players are behind the tuning of the ultra pricey No. 5909.
|
6 |
+
|
7 |
+
Mark Levinson/Harman
|
8 |
+
|
9 |
+
Sound quality isn't the only consideration though. The No. 5909 has adaptive active noise cancellation (ANC) with three modes "for premium sound isolation" and an Ambient Aware feature that lets you tune into your surroundings as needed. The company also packed in four microphones for calls that are equipped with a so-called Smart Wind Adaption feature. The materials used to make the headphones are also better than the mostly plastic sets we typically see. The No. 5909 is built with an aluminum frame, painted metallic earcups, leather headband and replaceable leather ear cushions. An included hard shell travel case comes stocked with a USB-C charging cable, USB-C to USB-A adaptor, two USB-C to 3.5mm cables, 3.5mm to 6.3mm adaptor, airplane adaptor and a polishing cloth. Basically, it's everything you'd need to use the headphones on any setup — wired, wireless or while traveling.
|
10 |
+
|
11 |
+
Mark Levinson says you can expect up to 30 hours of use with adaptive ANC active and up to 34 hours with the feature disabled. A quick-charge feature will give you up to six hours of play time in 15 minutes. Via an app for Android and iOS, you'll get some control over the headphones, but the company didn't go into specifics there.
|
12 |
+
|
13 |
+
The No. 5909 will be available in black, pewter and red color options starting today for $999.
|
sample-summaries/article11.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
The Mark Levinson No. 5909 is the company's first wireless headphones. It's "reference class" and comes in black, pewter and red color options.. The headphones start at $999 and will be available starting today in the U.S. A quick-charge feature will give you up to six hours of play time in 15 minutes, the company says, via an app for Android and iOS.The company also packed in four microphones for calls that are equipped with a so-called Smart Wind Adaption feature., via Bluetooth 5.1.
|
sample-summaries/article13.txt
CHANGED
@@ -1 +1 @@
|
|
1 |
-
The OnePlus 10 Pro is the company's first flagship phone. It's the result of a merger between OnePlus and Oppo, which will be called "SuperVOOC" The phone is launching in China first on January 11. There's also no word on a US release date yet. The 10 Pro will have a 6.7-inch display and three cameras on the back. We don't have a price yet, but OnePlus' flagship prices have gone up every year so far, and the 9 Pro was $969.The phone will go on sale January 11 in China and January 18 in the U.S.
|
|
|
1 |
+
The OnePlus 10 Pro is the company's first flagship phone. It's the result of a merger between OnePlus and Oppo, which will be called "SuperVOOC" The phone is launching in China first on January 11. There's also no word on a US release date yet. The 10 Pro will have a 6.7-inch display and three cameras on the back. We don't have a price yet, but OnePlus' flagship prices have gone up every year so far, and the 9 Pro was $969. The phone will go on sale January 11 in China and January 18 in the U.S.
|