# web-search-api / documents / webpage_content_extractor.py
# [Feature] WebpageContentExtractor: extract webpage content as clean markdown
# (commit e773696)
import re
from pathlib import Path
from pprint import pprint
from bs4 import BeautifulSoup, Comment, NavigableString, Tag
from tiktoken import get_encoding as tiktoken_get_encoding
from utils.logger import logger
from markdownify import markdownify
# from trafilatura import extract as extract_text_from_html
# from inscriptis import get_text as extract_text_from_html
# from html_text import extract_text as extract_text_from_html
# from readabilipy import simple_json_from_html_string as extract_text_from_html
class WebpageContentExtractor:
    """Extract the main content of a saved webpage as cleaned-up markdown.

    Workflow: strip boilerplate elements from the raw HTML
    (`filter_html_str`), convert the remainder to markdown with
    `markdownify`, collapse excess blank lines, and report the token
    count of the result.
    """

    def __init__(self):
        # cl100k_base is the encoding used by OpenAI GPT-3.5/4 models,
        # so the reported token count matches those APIs.
        self.tokenizer = tiktoken_get_encoding("cl100k_base")

    def count_tokens(self, text):
        """Return the number of cl100k_base tokens in `text`."""
        return len(self.tokenizer.encode(text))

    def filter_html_str(self, html_str):
        """Remove boilerplate elements from an HTML string.

        An element is removed when it has no visible text, is one of
        `ignore_tags`, or its class/id contains any keyword from
        `ignore_classes` (case-insensitive substring match).
        Returns the filtered HTML as a string.
        """
        soup = BeautifulSoup(html_str, "html.parser")
        ignore_tags = ["script", "style", "button"]
        ignore_classes = [
            "sidebar",
            "footer",
            "related",
            "comment",
            "topbar",
            "menu",
            "offcanvas",
            "navbar",
        ]
        # Compile once instead of re-searching the raw pattern per element.
        # Substring semantics: e.g. class "sidebar-wrapper" matches "sidebar".
        ignore_classes_pattern = re.compile(
            "|".join(ignore_classes), flags=re.IGNORECASE
        )
        removed_element_counts = 0
        for element in soup.find_all():
            try:
                # BUGFIX: read the id *before* combining it with the class
                # string. The original read it afterwards, so the id never
                # contributed to the combined class/id check.
                class_attr = element.get("class", []) or []
                id_str = element.get("id", "") or ""
                class_str = " ".join(class_attr)
                if id_str:
                    class_str = f"{class_str} {id_str}"
                should_remove = (
                    (not element.text.strip())
                    or (element.name in ignore_tags)
                    or bool(ignore_classes_pattern.search(class_str))
                    or bool(ignore_classes_pattern.search(id_str))
                )
            except AttributeError:
                # Element was already destroyed by decomposing an ancestor
                # earlier in this loop (find_all snapshots the tree up
                # front); nothing left to inspect or remove.
                continue
            if should_remove:
                element.decompose()
                removed_element_counts += 1
        logger.note(
            f"Elements Removed/Remained: {removed_element_counts}/{len(soup.find_all())}"
        )
        return str(soup)

    def extract(self, html_path):
        """Read the HTML file at `html_path` and return its main content
        as markdown.

        Side effects: stores the result on `self.main_content`, and logs
        the content plus its token count.
        """
        logger.note(f"Extracting content from: {html_path}")
        with open(html_path, "r", encoding="utf-8") as f:
            html_str = f.read()
        html_str = self.filter_html_str(html_str)
        # strip="a" drops <a> tags but keeps their inner text,
        # removing link markup noise from the markdown.
        self.main_content = markdownify(html_str, strip="a")
        # Collapse runs of 3+ newlines into a single blank line.
        self.main_content = re.sub(r"\n{3,}", "\n\n", self.main_content)
        logger.line(self.main_content)
        token_count = self.count_tokens(self.main_content)
        logger.note(f"Token Count: {token_count}")
        return self.main_content
if __name__ == "__main__":
    # Sample pages live under <project-root>/files/urls; swap the filename
    # to try a different saved page.
    sample_name = "www.liaoxuefeng.com_wiki_1016959663602400_1017495723838528.html"
    html_path = Path(__file__).parents[1] / "files" / "urls" / sample_name
    extractor = WebpageContentExtractor()
    main_content = extractor.extract(html_path)