|
import re |
|
|
|
import gradio as gr |
|
import requests |
|
from inscriptis import get_text |
|
from inscriptis.css_profiles import CSS_PROFILES |
|
from inscriptis.model.config import ParserConfig |
|
from readability import Document |
|
|
|
# Parser configuration handed to every inscriptis `get_text` call in this
# module; it selects the "strict" entry of inscriptis' bundled CSS profiles.
INSCRIPTIS_CONFIG = ParserConfig(
    css=CSS_PROFILES["strict"],
)
|
|
|
|
|
# Seconds to wait for the remote server before giving up. `requests` applies
# no timeout by default, so a stalled server would otherwise block forever.
_REQUEST_TIMEOUT_SECONDS = 30


def extract_text(url: str):
    """Download *url* and extract its title and main text.

    The response body is decoded to text, reduced to its main content with
    ``readability`` (``Document.summary``), and that cleaned HTML is rendered
    to plain text with ``inscriptis`` using the module-level
    ``INSCRIPTIS_CONFIG``.

    Args:
        url: Address of the page to download.

    Returns:
        A 4-tuple of strings ``(title, text, clean_html, html)``:

        * ``title`` -- short title reported by ``readability``.
        * ``text`` -- plain-text rendering of ``clean_html`` with runs of
          blank lines collapsed; ``""`` when the rendering contains no word
          characters.
        * ``clean_html`` -- the ``readability`` summary of the page.
        * ``html`` -- the decoded response body.

        Four empty strings are returned when the response body is empty or
        whitespace-only.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
    body = response.content

    # UTF-8 is tried first (the original behavior). Pages in another encoding
    # used to crash with UnicodeDecodeError; fall back to the encoding that
    # requests detects from the body, replacing undecodable bytes.
    try:
        html = body.decode("utf-8")
    except UnicodeDecodeError:
        detected = response.apparent_encoding or "utf-8"
        try:
            html = body.decode(detected, errors="replace")
        except LookupError:
            # The detected name is not a codec Python knows about.
            html = body.decode("utf-8", errors="replace")

    if not html.strip():
        return "", "", "", ""

    parsed_doc = Document(html)
    title = parsed_doc.short_title()
    clean_html = parsed_doc.summary(html_partial=True)
    # The parsed document is not needed past this point; drop the reference
    # before running the text conversion.
    del parsed_doc

    text = get_text(clean_html, INSCRIPTIS_CONFIG).strip()

    # Without a single word character there is nothing worth showing as text.
    if not re.search(r"\w+", text):
        return title, "", clean_html, html

    # Collapse any run of blank or whitespace-only lines into one blank line.
    text = re.sub(r"\n\s*\n", "\n\n", text)

    return title, text, clean_html, html
|
|
|
|
|
# Output widgets, declared in the same order as the tuple returned by
# `extract_text`: title, extracted text, cleaned HTML, raw HTML.
title = gr.Textbox(label="Title")
text = gr.Textbox(label="Text (`inscriptis` output)", lines=10)
clean_html = gr.Textbox(label="Clean HTML (`readability-lxml` output)", lines=10)
html = gr.Textbox(label="Raw HTML response", lines=10)

# One URL textbox feeds `extract_text`; its four results fill the widgets
# above. Each example row supplies a value for the single URL input.
demo = gr.Interface(
    fn=extract_text,
    inputs=gr.Textbox(placeholder="https://hf.co/", label="URL"),
    outputs=[title, text, clean_html, html],
    examples=[
        ["https://huggingface.co/blog/peft"],
        [
            "https://www.nytimes.com/2023/03/08/technology/chatbots-disrupt-internet-industry.html"
        ],
    ],
)

# Start the web UI as soon as the script is executed.
demo.launch()
|
|