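"""Utilities for turning a web article into a tree of sections.

The pipeline downloads an article, extracts its main content container,
converts it to Markdown to simplify splitting on headings, fixes relative
links, converts it back to HTML, and wraps the result in a ``DocTree``.
"""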
import re
from urllib.parse import urljoin

import markdown
from bs4 import BeautifulSoup as BS
from markdownify import markdownify as md
from newspaper import Article

def clean(s):
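    """Escape literal tab and newline characters in ``s``."""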
s = s.replace("\t", "\\t")
s = s.replace("\n", "\\n")
return s
class DocTree:
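    """A nested-list representation of a document, split by heading depth."""
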
def __init__(self, content):
self.content = content
self.max_depth = 6
def get_sections(self, *location_ids):
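        """Walk down the nested content, indexing by each location id in turn."""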
out = self.content
for id_ in location_ids:
out = out[id_]
return out
def merge_sections(self, elems):
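        """Recursively join nested sections into a single string."""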
if not isinstance(elems[0], list):
return '\n\n '.join(elems)
out = []
for e in elems:
out.append(self.merge_sections(e))
return '\n\n '.join(map(clean, out))
def get_merged_sections(self, *location_ids):
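        """Merge every sub-section at the given location into one string each."""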
return [self.merge_sections(s) for s in self.get_sections(*location_ids)]
def as_markdown(self, content):
return md(content)
def get_sections_by_depth(self, depth):
return self._get_sections_by_depth(self.content, depth)
@staticmethod
def _get_sections_by_depth(content, depth):
"""Returns a list of merged sections at a specific depth"""
if depth == 0:
return content
        out = []
        for elem in content:
            res = DocTree._get_sections_by_depth(elem, depth - 1)
            # Extend with lists, but append leaf strings whole so ``+=`` does
            # not splatter them into individual characters.
            if isinstance(res, list):
                out += res
            else:
                out.append(res)
        return out
def fix_relative_links(url, article_content):
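    """Rewrite relative Markdown links in ``article_content`` as absolute
    URLs derived from ``url``.
    """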
if 'http' in url:
base_url = '/'.join(url.split('/')[:3])
else:
        # No scheme present: fall back to the host-like first path segment.
        base_url = url.split('/')[0]
pat = re.compile(r'\[(.*?)\]\((.*?)\)', flags=re.IGNORECASE)
res = pat.findall(article_content)
if res:
for g in res:
            # Avoid shadowing the ``url`` parameter inside the loop.
            fixed_url = urljoin(base_url, g[1]) if g[1].startswith('/') else g[1]
            article_content = article_content.replace(f'[{g[0]}]({g[1]})', f'[{g[0]}]({fixed_url})')
    else:
        print('no markdown links found')
return article_content
def extract_article(url):
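    """Download and parse the page at ``url`` with newspaper."""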
article = Article(url)
article.download()
article.parse()
return article
def select_content(html_code, elem_class, class_name):
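    """Return the first ``elem_class`` element matching a ``.class`` or
    ``#id`` selector in ``html_code``, converted to Markdown.
    """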
print(f'Calling select_content with {elem_class}, {class_name}')
kwargs = {}
if class_name.startswith('.'):
class_name = class_name[1:]
kwargs = {'class_': class_name}
elif class_name.startswith('#'):
kwargs = {'id': class_name[1:]}
    return md(str(BS(html_code, features="lxml").find(elem_class, **kwargs)))
def split_by_heading(html_content, _i):
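    """Recursively split HTML into nested lists on <h1>..<h6> tags."""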
if _i >= 7:
return html_content
elems = []
for idx, elem in enumerate([i for i in html_content.split(f'<h{_i}') if i]):
if idx > 0 or elem.startswith('>'):
elem = f'<h{_i}{elem}'
elems.append(split_by_heading(elem, _i+1))
return elems
def doctree_from_url(url, elem_class='div', class_name='article-body'):
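    """Fetch ``url``, extract the main content, and return it as a DocTree."""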
article = extract_article(url)
# convert to MD to handle splitting better
article_content = select_content(article.html, elem_class, class_name)
    requires_title = not any(line.strip().startswith('# ') for line in article_content.split('\n'))
if requires_title:
print('Didn\'t find title, will add it manually...')
article_content = f"# {article.title}\n\n{article_content}"
    # Shield '#' characters during the markdown round-trip below; they are
    # restored after the conversion back to HTML.
    article_content = article_content.replace('\n\n', '\n').replace('#', '%%@@%%')
# fix relative website links
article_content = fix_relative_links(url, article_content)
# convert back to HTML
html_content = markdown.markdown(article_content).replace('%%@@%%', '#')
doc_tree = DocTree(split_by_heading(html_content, 1))
#assert doc_tree.merge_sections(doc_tree.get_sections(0)).replace('\n', '').replace(html_content.replace('\n', ''), '') == '', 'Document inconsistent. Manual adjustments required.'
return doc_tree
def get_selectors_for_class(url, elem_class):
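    """Collect the CSS class and id selectors used by ``elem_class`` tags
    on the page at ``url``.
    """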
article = extract_article(url)
html_content = article.html
soup = BS(html_content, features="lxml")
classes = set()
ids = set()
for elem in soup.find_all(elem_class):
if elem.get('class'):
for c in elem.get('class'):
classes |= {f".{c}"}
if elem.get('id'):
ids |= {f"#{elem.get('id')}"}
return ids | classes
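

if __name__ == '__main__':
    # Minimal usage sketch. The URL and the '.article-body' selector are
    # hypothetical placeholders, not values taken from a real deployment.
    example_url = 'https://example.com/blog/some-article'
    # Inspect which class/id selectors exist for <div> elements on the page:
    print(get_selectors_for_class(example_url, 'div'))
    # Build the section tree from the assumed content container and print
    # the first top-level section merged into a single string:
    tree = doctree_from_url(example_url, elem_class='div', class_name='.article-body')
    print(tree.merge_sections(tree.get_sections(0)))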