nickmuchi committed
Commit 7b6e772
1 parent: c8a1518

Update app.py

Files changed (1):
  app.py  +11 -12
app.py CHANGED
@@ -60,11 +60,8 @@ def article_text_extractor(url: str):
 
     article_header = ''
 
-    article = " ".join(article_text)
-    article = article.replace(".", ".<eos>")
-    article = article.replace("!", "!<eos>")
-    article = article.replace("?", "?<eos>")
-    sentences = article.split("<eos>")
+    article = nlp(" ".join(article_text))
+    sentences = [i.text for i in list(article.sents)]
 
     current_chunk = 0
     chunks = []
@@ -77,7 +74,6 @@ def article_text_extractor(url: str):
                 current_chunk += 1
                 chunks.append(sentence.split(" "))
         else:
-            print(current_chunk)
             chunks.append(sentence.split(" "))
 
     for chunk_id in range(len(chunks)):
@@ -86,8 +82,12 @@ def article_text_extractor(url: str):
     return article_header, chunks
 
 def chunk_clean_text(text):
-
-    sentences = sent_tokenize(text)
+
+    """Chunk text longer than 500 tokens"""
+
+    article = nlp(" ".join(text))
+    sentences = [i.text for i in list(article.sents)]
+
     current_chunk = 0
     chunks = []
 
@@ -99,9 +99,8 @@ def chunk_clean_text(text):
                 current_chunk += 1
                 chunks.append(sentence.split(" "))
         else:
-            print(current_chunk)
             chunks.append(sentence.split(" "))
-
+
     for chunk_id in range(len(chunks)):
         chunks[chunk_id] = " ".join(chunks[chunk_id])
 
@@ -259,10 +258,10 @@ def highlight_entities(article_content,summary_output):
     print(summary_output)
 
     for entity in matched_entities:
-        summary_output = summary_output.replace(entity, markdown_start_green + entity + markdown_end)
+        summary_output = re.sub(f'({entity})(?![^rgb\(]*\))',markdown_start_green + entity + markdown_end,summary_output)
 
     for entity in unmatched_entities:
-        summary_output = summary_output.replace(entity, markdown_start_red + entity + markdown_end)
+        summary_output = re.sub(f'({entity})(?![^rgb\(]*\))',markdown_start_red + entity + markdown_end,summary_output)
 
     print("")
     print(summary_output)
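The first hunks swap the manual "<eos>" punctuation markers (and sent_tokenize in chunk_clean_text) for spaCy sentence segmentation, then pack sentences into chunks of at most ~500 words. A minimal standalone sketch of that flow; the real app.py presumably loads its spaCy pipeline nlp elsewhere, and the exact chunking condition around the lines shown in this diff is not part of the commit, so both are assumptions here:

import spacy

# Standalone stand-in for the pipeline the app loads elsewhere; a blank English
# pipeline with the rule-based sentencizer is enough to make doc.sents work.
nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")

def chunk_text(text, max_words=500):
    # Split into sentences with spaCy (as the new code does), then greedily pack
    # whole sentences into chunks of at most ~max_words whitespace-separated words.
    article = nlp(text)
    sentences = [sent.text for sent in article.sents]

    chunks = []
    current_chunk = 0
    for sentence in sentences:
        words = sentence.split(" ")
        if chunks and len(chunks[current_chunk]) + len(words) <= max_words:
            chunks[current_chunk].extend(words)
        else:
            if chunks:
                current_chunk += 1
            chunks.append(words)

    return [" ".join(chunk) for chunk in chunks]

print(len(chunk_text("First sentence. Second sentence. Third sentence.")))  # -> 1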
 
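The last hunk replaces str.replace with re.sub plus a negative lookahead, (?![^rgb\(]*\)), so an entity that also occurs inside an already-inserted rgb(...) colour value (for example a numeric entity matching one of the RGB components) is not wrapped a second time. A small demonstration of that behaviour; markdown_start_green and markdown_end below are placeholders modelled on inline-HTML highlighting, not the app's exact strings, and re.escape is an added precaution the commit itself does not apply:

import re

markdown_start_green = '<span style="background-color:rgb(211,255,204)">'  # placeholder
markdown_end = '</span>'

def highlight(summary_output, entity):
    # Match the entity only when no ")" can be reached without first crossing an
    # "r", "g", "b" or "(", i.e. skip occurrences that sit inside rgb(...).
    pattern = rf'({re.escape(entity)})(?![^rgb(]*\))'
    return re.sub(pattern, markdown_start_green + entity + markdown_end, summary_output)

summary = highlight("Apple sold 255 phones.", "Apple")
summary = highlight(summary, "255")
# The "255" inside rgb(211,255,204) is left intact, whereas a plain str.replace
# would have corrupted the style attribute; only the free-standing "255" is wrapped.
print(summary)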