Update app.py
app.py
CHANGED
@@ -60,11 +60,8 @@ def article_text_extractor(url: str):
 
     article_header = ''
 
-    article = " ".join(article_text)
-
-    article = article.replace("!", "!<eos>")
-    article = article.replace("?", "?<eos>")
-    sentences = article.split("<eos>")
+    article = nlp(" ".join(article_text))
+    sentences = [i.text for i in list(article.sents)]
 
     current_chunk = 0
     chunks = []
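This hunk swaps the hand-rolled <eos> punctuation splitting for spaCy sentence segmentation. A minimal sketch of the new behavior, assuming nlp is a spaCy pipeline loaded elsewhere in app.py (the model name below is an assumption; the diff only shows nlp being used):

import spacy

# Assumption: app.py loads the pipeline roughly like this; only the
# nlp(...) call itself is visible in the diff.
nlp = spacy.load("en_core_web_sm")

article = nlp("U.S. stocks fell! Why? Nobody knows.")
# Doc.sents yields Span objects; .text recovers the plain strings,
# typically ['U.S. stocks fell!', 'Why?', 'Nobody knows.']
sentences = [s.text for s in article.sents]

Unlike the removed replace/split pair, which only broke on "!" and "?", the parser-based segmentation also splits on periods without tripping over abbreviations like "U.S.".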
@@ -77,7 +74,6 @@ def article_text_extractor(url: str):
                 current_chunk += 1
                 chunks.append(sentence.split(" "))
         else:
-            print(current_chunk)
             chunks.append(sentence.split(" "))
 
     for chunk_id in range(len(chunks)):
@@ -86,8 +82,12 @@ def article_text_extractor(url: str):
     return article_header, chunks
 
 def chunk_clean_text(text):
-
-
+
+    """Chunk text longer than 500 tokens"""
+
+    article = nlp(" ".join(text))
+    sentences = [i.text for i in list(article.sents)]
+
     current_chunk = 0
     chunks = []
 
@@ -99,9 +99,8 @@ def chunk_clean_text(text):
                 current_chunk += 1
                 chunks.append(sentence.split(" "))
         else:
-            print(current_chunk)
             chunks.append(sentence.split(" "))
-
+
     for chunk_id in range(len(chunks)):
         chunks[chunk_id] = " ".join(chunks[chunk_id])
 
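Both functions share the same greedy word-count chunking loop; the hunks above only show its edges. A self-contained sketch of the whole loop, with the 500-word limit taken from the new docstring and the boundary test assumed, since it sits outside the visible context lines:

def chunk_sentences(sentences, max_words=500):
    # Greedily pack whole sentences into word-count-bounded chunks.
    current_chunk = 0
    chunks = []
    for sentence in sentences:
        if chunks:
            # Assumed condition: does this sentence still fit in the
            # current chunk without exceeding max_words?
            if len(chunks[current_chunk]) + len(sentence.split(" ")) <= max_words:
                chunks[current_chunk].extend(sentence.split(" "))
            else:
                current_chunk += 1
                chunks.append(sentence.split(" "))
        else:
            chunks.append(sentence.split(" "))
    # Rejoin each chunk's words, as the final loop in the diff does.
    return [" ".join(chunk) for chunk in chunks]

print(chunk_sentences(["one two three", "four five"], max_words=4))
# ['one two three', 'four five']

The removed print(current_chunk) calls in both loops were debugging output; dropping them does not change the chunking.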
@@ -259,10 +258,10 @@ def highlight_entities(article_content,summary_output):
     print(summary_output)
 
     for entity in matched_entities:
-        summary_output =
+        summary_output = re.sub(f'({entity})(?![^rgb\(]*\))',markdown_start_green + entity + markdown_end,summary_output)
 
     for entity in unmatched_entities:
-        summary_output = summary_output.
+        summary_output = re.sub(f'({entity})(?![^rgb\(]*\))',markdown_start_red + entity + markdown_end,summary_output)
 
     print("")
     print(summary_output)
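The final hunk rewrites the two entity-highlighting assignments: matched entities are wrapped in a green marker, unmatched ones in red, via re.sub with a negative lookahead. The lookahead (?![^rgb\(]*\)) appears intended to skip occurrences that already sit inside a previously inserted rgb(...) style value. A runnable sketch with assumed marker strings (markdown_start_green, markdown_start_red and markdown_end are defined elsewhere in app.py and not shown here):

import re

# Assumed marker values; only the names appear in the diff.
markdown_start_green = '<span style="background-color:rgb(217,255,217)">'
markdown_start_red = '<span style="background-color:rgb(255,217,217)">'
markdown_end = '</span>'

summary_output = "Paris hosted the talks. Berlin declined."
matched_entities = ["Paris"]
unmatched_entities = ["Berlin"]

for entity in matched_entities:
    # Negative lookahead: skip a hit if a ')' is reachable without first
    # crossing 'r', 'g', 'b' or '(', i.e. the occurrence is
    # (heuristically) already inside an rgb(...) value.
    summary_output = re.sub(
        f"({entity})" + r"(?![^rgb(]*\))",
        markdown_start_green + entity + markdown_end,
        summary_output,
    )

for entity in unmatched_entities:
    summary_output = re.sub(
        f"({entity})" + r"(?![^rgb(]*\))",
        markdown_start_red + entity + markdown_end,
        summary_output,
    )

print(summary_output)

The committed pattern interpolates entity into the regex raw; running it through re.escape first would guard against entity names that contain regex metacharacters.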