from markdownify import markdownify as md
from bs4 import BeautifulSoup as BS
from urllib.parse import urljoin
from newspaper import Article
import re
import markdown


def clean(s):
    """Escape tab and newline characters so a section fits on a single line."""
    s = s.replace("\t", "\\t")
    s = s.replace("\n", "\\n")
    return s

class DocTree:
    """Nested representation of a document, split by heading level (h1..h6)."""

    def __init__(self, content):
        self.content = content
        self.max_depth = 6

    def get_sections(self, *location_ids):
        """Walk the nested section lists following the given indices."""
        out = self.content
        for id_ in location_ids:
            out = out[id_]
        return out

    def merge_sections(self, elems):
        """Recursively join a section subtree back into a single string."""
        if not isinstance(elems[0], list):
            return '\n\n '.join(elems)
        out = []
        for e in elems:
            out.append(self.merge_sections(e))
        return '\n\n '.join(map(clean, out))

    def get_merged_sections(self, *location_ids):
        return [self.merge_sections(s) for s in self.get_sections(*location_ids)]

    def as_markdown(self, content):
        return md(content)

    def get_sections_by_depth(self, depth):
        return self._get_sections_by_depth(self.content, depth)

    @staticmethod
    def _get_sections_by_depth(content, depth):
        """Return a flat list of the (unmerged) sections found at the given depth."""
        if depth == 0:
            return content
        out = []
        for elem in content:
            out += DocTree._get_sections_by_depth(elem, depth - 1)
        return out


def fix_relative_links(url, article_content):
    """Rewrite root-relative markdown links in article_content as absolute URLs."""
    if 'http' in url:
        base_url = '/'.join(url.split('/')[:3])  # scheme + host, e.g. 'https://example.com'
    else:
        base_url = url.split('/')[0]
    pat = re.compile(r'\[(.*?)\]\((.*?)\)', flags=re.IGNORECASE)
    res = pat.findall(article_content)
    if res:
        for text, link in res:
            absolute_url = urljoin(base_url, link) if link.startswith('/') else link
            article_content = article_content.replace(f'[{text}]({link})', f'[{text}]({absolute_url})')
    else:
        print('not found')
    return article_content


def extract_article(url):
    """Download and parse the page at url with newspaper's Article."""
    article = Article(url)
    article.download()
    article.parse()
    return article


def select_content(html_code, elem_class, class_name):
    """Find the requested element in html_code and return its contents as markdown."""
    print(f'Calling select_content with {elem_class}, {class_name}')
    if class_name.startswith('.'):
        kwargs = {'class_': class_name[1:]}
    elif class_name.startswith('#'):
        kwargs = {'id': class_name[1:]}
    else:
        # bare names (e.g. the default 'article-body') are treated as class names
        kwargs = {'class_': class_name}
    return md(str(BS(html_code, features="lxml").find(elem_class, **kwargs)))


def split_by_heading(html_content, _i):
    """Recursively split HTML into nested lists, one list level per heading depth h1..h6."""
    if _i >= 7:
        return html_content
    elems = []
    for idx, elem in enumerate([i for i in html_content.split(f'<h{_i}') if i]):
        # re-attach the opening '<hN' that str.split consumed
        if idx > 0 or elem.startswith('>'):
            elem = f'<h{_i}{elem}'
        elems.append(split_by_heading(elem, _i + 1))
    return elems

def doctree_from_url(url, elem_class='div', class_name='article-body'):
    article = extract_article(url)
    # convert to MD to handle splitting better
    article_content = select_content(article.html, elem_class, class_name)
    # add the article title as an h1 heading if the content doesn't already have one
    requires_title = not any(line.strip().startswith('# ') for line in article_content.split('\n'))

    if requires_title:
        print("Didn't find title, will add it manually...")
        article_content = f"# {article.title}\n\n{article_content}"
    # collapse blank lines and shield '#' characters through the markdown round-trip
    article_content = article_content.replace('\n\n', '\n').replace('#', '%%@@%%')
    # fix relative website links
    article_content = fix_relative_links(url, article_content)
    # convert back to HTML
    html_content = markdown.markdown(article_content).replace('%%@@%%', '#')
    doc_tree = DocTree(split_by_heading(html_content, 1))

    # assert doc_tree.merge_sections(doc_tree.get_sections(0)).replace('\n', '').replace(html_content.replace('\n', ''), '') == '', 'Document inconsistent. Manual adjustments required.'
    return doc_tree


def get_selectors_for_class(url, elem_class):
    """Collect the CSS class and id selectors used on elem_class tags at the given URL."""
    article = extract_article(url)

    html_content = article.html
    soup = BS(html_content, features="lxml")
    classes = set()
    ids = set()
    for elem in soup.find_all(elem_class):
        for c in elem.get('class') or []:
            classes.add(f".{c}")
        if elem.get('id'):
            ids.add(f"#{elem.get('id')}")

    return ids | classes
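

# Example usage: a minimal sketch with a made-up URL and container selector
# ('.post-content'); swap in a real page and one of the selectors reported by
# get_selectors_for_class before running.
if __name__ == '__main__':
    page_url = 'https://example.com/blog/some-post'  # placeholder URL
    # list the class/id selectors found on <div> elements to pick a container
    print(get_selectors_for_class(page_url, 'div'))
    # build the heading tree from the chosen container
    tree = doctree_from_url(page_url, elem_class='div', class_name='.post-content')
    # merge the first top-level section back into a single string (the same call
    # used by the consistency check commented out in doctree_from_url)
    print(tree.merge_sections(tree.get_sections(0))[:200])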