Spaces:
Runtime error
Runtime error
File size: 4,242 Bytes
cc3e8a0 db59f68 cc3e8a0 db59f68 cc3e8a0 db59f68 5362638 cc3e8a0 5fce8f8 5e38ce5 941ea0e 5e38ce5 cc3e8a0 6e658c0 cc3e8a0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
from markdownify import markdownify as md
from bs4 import BeautifulSoup as BS
from urllib.parse import urljoin
from newspaper import Article
import re
import markdown
def clean(s):
s = s.replace("\t", "\\t")
s = s.replace("\n", "\\n")
return s
class DocTree:
def __init__(self, content):
self.content = content
self.max_depth = 6
def get_sections(self, *location_ids):
out = self.content
for id_ in location_ids:
out = out[id_]
return out
def merge_sections(self, elems):
if not isinstance(elems[0], list):
return '\n\n '.join(elems)
out = []
for e in elems:
out.append(self.merge_sections(e))
return '\n\n '.join(map(clean, out))
def get_merged_sections(self, *location_ids):
return [self.merge_sections(s) for s in self.get_sections(*location_ids)]
def as_markdown(self, content):
return md(content)
def get_sections_by_depth(self, depth):
return self._get_sections_by_depth(self.content, depth)
@staticmethod
def _get_sections_by_depth(content, depth):
"""Returns a list of merged sections at a specific depth"""
if depth == 0:
return content
out = []
for elem in content:
out += DocTree._get_sections_by_depth(elem, depth - 1)
return out
def fix_relative_links(url, article_content):
if 'http' in url:
base_url = '/'.join(url.split('/')[:3])
else:
base_url = url.split('/')
pat = re.compile(r'\[(.*?)\]\((.*?)\)', flags=re.IGNORECASE)
res = pat.findall(article_content)
if res:
for g in res:
url = urljoin(base_url, g[1]) if g[1].startswith('/') else g[1]
article_content = article_content.replace(f'[{g[0]}]({g[1]})', f'[{g[0]}]({url})')
else:print('not found')
return article_content
def extract_article(url):
article = Article(url)
article.download()
article.parse()
return article
def select_content(html_code, elem_class, class_name):
print(f'Calling select_content with {elem_class}, {class_name}')
kwargs = {}
if class_name.startswith('.'):
class_name = class_name[1:]
kwargs = {'class_': class_name}
elif class_name.startswith('#'):
kwargs = {'id': class_name[1:]}
return md(str(BS(html_code, features="lxml").find(**kwargs)))
def split_by_heading(html_content, _i):
if _i >= 7:
return html_content
elems = []
for idx, elem in enumerate([i for i in html_content.split(f'<h{_i}') if i]):
if idx > 0 or elem.startswith('>'):
elem = f'<h{_i}{elem}'
elems.append(split_by_heading(elem, _i+1))
return elems
def doctree_from_url(url, elem_class='div', class_name='article-body'):
article = extract_article(url)
# convert to MD to handle splitting better
article_content = select_content(article.html, elem_class, class_name)
requires_title = list(filter(lambda x: x.strip().startswith('# '), article_content.split('\n'))) != []
if requires_title:
print('Didn\'t find title, will add it manually...')
article_content = f"# {article.title}\n\n{article_content}"
article_content = article_content.replace('\n\n', '\n').replace('#', '%%@@%%')
# fix relative website links
article_content = fix_relative_links(url, article_content)
# convert back to HTML
html_content = markdown.markdown(article_content).replace('%%@@%%', '#')
doc_tree = DocTree(split_by_heading(html_content, 1))
#assert doc_tree.merge_sections(doc_tree.get_sections(0)).replace('\n', '').replace(html_content.replace('\n', ''), '') == '', 'Document inconsistent. Manual adjustments required.'
return doc_tree
def get_selectors_for_class(url, elem_class):
article = extract_article(url)
html_content = article.html
soup = BS(html_content, features="lxml")
classes = set()
ids = set()
for elem in soup.find_all(elem_class):
if elem.get('class'):
for c in elem.get('class'):
classes |= {f".{c}"}
if elem.get('id'):
ids |= {f"#{elem.get('id')}"}
return ids | classes
|