Spaces:

anton-l
/

html-viz

Runtime error

html-viz / app.py

anton-l HF staff

titles

618b24a over 1 year ago

1.57 kB

	import re

	import gradio as gr
	import requests
	from inscriptis import get_text
	from inscriptis.css_profiles import CSS_PROFILES
	from inscriptis.model.config import ParserConfig
	from readability import Document

	# Shared inscriptis parser configuration, built once at import time from the
	# library's "strict" CSS profile and passed to get_text() in extract_text.
	INSCRIPTIS_CONFIG = ParserConfig(css=CSS_PROFILES["strict"])


	def extract_text(url: str):
	    """Download *url* and extract its article title and text.

	    The page is fetched with ``requests``, the article body is isolated
	    with readability-lxml, and the body is rendered to plain text with
	    inscriptis.

	    Args:
	        url: Address of the page to fetch.

	    Returns:
	        A 4-tuple of strings ``(title, text, clean_html, html)``:
	        the short title, the formatted plain text, the cleaned article
	        HTML, and the raw HTML of the response. All four are empty when
	        the URL is blank or the response body is empty; ``text`` alone is
	        empty when the extracted text contains no word characters.

	    Raises:
	        requests.RequestException: If the request fails or times out.
	    """
	    # A blank URL cannot be fetched; treat it like an empty page instead of
	    # letting requests fail with a schema error.
	    if not url or not url.strip():
	        return "", "", "", ""

	    # Always bound the request so an unresponsive server cannot hang the app.
	    response = requests.get(url.strip(), timeout=30)
	    raw = response.content

	    # Strict UTF-8 first (the common case); pages in another encoding would
	    # otherwise raise UnicodeDecodeError, so fall back to the detected
	    # encoding and replace any bytes that still cannot be decoded.
	    try:
	        html = raw.decode("utf-8")
	    except UnicodeDecodeError:
	        fallback = response.apparent_encoding or "utf-8"
	        try:
	            html = raw.decode(fallback, errors="replace")
	        except LookupError:
	            # Detected encoding name is unknown to Python.
	            html = raw.decode("utf-8", errors="replace")

	    if not html.strip():
	        return "", "", "", ""

	    parsed_doc = Document(html)

	    # get the body of the article with readability-lxml
	    title = parsed_doc.short_title()
	    clean_html = parsed_doc.summary(html_partial=True)
	    # the parsed document is no longer needed; drop it before rendering text
	    del parsed_doc

	    # get the formatted plaintext with inscriptis
	    text = get_text(clean_html, INSCRIPTIS_CONFIG).strip()

	    if not re.search(r"\w+", text):
	        # no words found, only whitespace and punctuation
	        return title, "", clean_html, html

	    # remove excessive empty lines
	    text = re.sub(r"\n\s*\n", "\n\n", text)

	    return title, text, clean_html, html


	# Output widgets, listed in the same order as the values returned by
	# extract_text: title, plain text, cleaned HTML, raw HTML.
	title = gr.Textbox(label="Title")
	text = gr.Textbox(label="Text (`inscriptis` output)", lines=10)
	clean_html = gr.Textbox(label="Clean HTML (`readability-lxml` output)", lines=10)
	html = gr.Textbox(label="Raw HTML response", lines=10)

	# Single input widget: the URL of the page to process.
	_url_box = gr.Textbox(placeholder="https://hf.co/", label="URL")

	# Sample URLs offered in the UI; each entry is one row of inputs.
	_example_rows = [
	    ["https://huggingface.co/blog/peft"],
	    [
	        "https://www.nytimes.com/2023/03/08/technology/chatbots-disrupt-internet-industry.html"
	    ],
	]

	# Wire the extraction function to the widgets and start the app.
	demo = gr.Interface(
	    fn=extract_text,
	    inputs=_url_box,
	    outputs=[title, text, clean_html, html],
	    examples=_example_rows,
	)

	demo.launch()