# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/02_app_gradio.ipynb.

# %% auto 0
__all__ = ['categories', 'k', 'min_words', 'max_words', 'ignore_text', 'ignore_common', 'learn', 'text', 'label', 'examples',
           'intf', 'predict']

# %% ../nbs/02_app_gradio.ipynb 4
import warnings
warnings.filterwarnings('ignore')

from fastai.text.all import *
import gradio as gr
import requests
from bs4 import BeautifulSoup
import enchant
import re
import random
from collections import Counter
import hashlib
import pickle
from wordcloud import WordCloud
# os and matplotlib.pyplot are used below (file caching and the wordcloud figure);
# fastai's star import happens to re-export them, but declare them explicitly.
import os
import matplotlib.pyplot as plt

# %% ../nbs/01_data.ipynb 8
class Webpage:
    "Fetch a single web page and hold its HTML, extracted text, links, and word counts."
    def __init__(self, url):
        self.url = url
        self.hash = self.get_hash_str()
        self.requested = False
        self.page_text = ""
        self.html = ""
        self.links = []
        self.text = []
        self.cleaned_text = []
        self.most_common_words = []
    def get_page(self, headers, min_size, max_size):
        # Stream the download so oversized pages can be abandoned early; reject
        # responses larger than max_size or smaller than min_size characters.
        r = requests.get(self.url, stream=True, headers=headers)
        content_length = int(r.headers.get('Content-Length', 0))
        data = []
        length = 0
        if content_length > max_size:
            return None
        for chunk in r.iter_content(1024):
            data.append(chunk)
            length += len(chunk)
            if length > max_size:
                return None
        r._content = b''.join(data)
        if len(r.text) < min_size: return None
        return r.text
    def get_page_html(self, min_size=1000, max_size=2000000):
        user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
            'Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148',
            'Mozilla/5.0 (Linux; Android 11; SM-G960U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.72 Mobile Safari/537.36'
        ]
        # Rotate through a few common user agents to reduce the chance of being blocked.
        user_agent = random.choice(user_agents)
        headers = {'User-Agent': user_agent}
        self.page_text = self.get_page(headers, min_size, max_size)
        # get_page returns None for pages outside the size limits; fall back to an
        # empty string so BeautifulSoup still receives valid input.
        if self.page_text is None:
            self.page_text = ""
        self.html = BeautifulSoup(self.page_text, "html.parser")
        self.requested = True
    def get_hash_str(self, inp=""):
        return hashlib.sha3_256((self.url+inp).encode()).hexdigest()
    def get_html_anchors(self, keyword="http"):
        for anchor in self.html.findAll('a'):
            link = anchor.get('href')
            if not link:
                continue
            if keyword in link:
                self.links.append(link)
    def get_html_text(self, tags=["p"]):
        for tag in tags:
            for p in self.html.findAll(tag):
                p_text = p.getText().strip()
                if not p_text:
                    continue
                self.text.append(p_text)
    def clean_html_text(self, max_words, enchant_dict="en_US", ignore=[], rx="[^a-zA-Z ]+", min_word_len=2):
        # Lowercase everything, strip non-letter characters, and keep at most
        # max_words words that pass the length, ignore-list, and spell-check filters.
        all_text = ' '.join(self.text).lower()
        regex_text = re.sub(rx, '', all_text).strip()
        split = regex_text.split()
        split = [word for word in split if word not in ignore]
        if enchant_dict != "": d = enchant.Dict(enchant_dict)
        for word in split:
            if len(self.cleaned_text) >= max_words: break
            if len(word) >= min_word_len:
                if enchant_dict == "":
                    self.cleaned_text.append(word)
                elif d.check(word):
                    self.cleaned_text.append(word)
    def k_common_words(self, k=10, ignore=[]):
        # cleaned_text is a list (not a string), so test for emptiness rather than == "".
        if not self.cleaned_text:
            text = self.text
        else:
            text = self.cleaned_text
        all_text = ' '.join(text).lower()
        split = all_text.split()
        split_ignore = [word for word in split if word not in ignore]
        counts = Counter(split_ignore)
        k_most_common = counts.most_common(k)
        self.most_common_words = k_most_common
    def save_text(self, path, fname):
        with open(path+fname, 'wb') as file:
            pickle.dump(self.text, file)
    def load_text(self, path, fname):
        with open(path+fname, 'rb') as file:
            self.text = pickle.load(file)
    def save_links(self, path, fname):
        with open(path+fname, 'wb') as file:
            pickle.dump(self.links, file)
    def load_links(self, path, fname):
        with open(path+fname, 'rb') as file:
            self.links = pickle.load(file)
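
# A minimal usage sketch of the Webpage class (illustration only, not part of the
# exported API). The URL and parameter values below are placeholders chosen for the
# example, not values taken from the notebooks.
def _webpage_example(url="https://example.com"):
    page = Webpage(url)
    page.get_page_html()                    # download and parse the page
    page.get_html_text(tags=["p", "h1"])    # collect visible text from the chosen tags
    page.clean_html_text(max_words=100)     # lowercase, strip non-letters, spell-check
    page.k_common_words(k=5)                # tally the most frequent cleaned words
    return page.most_common_words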

# %% ../nbs/01_data.ipynb 14
def get_page_all(url, k, max_words, ignore_text, ignore_common, path=None):
    "Fetch `url` (or load it from the cache at `path`), clean its text, and count its `k` most common words."
    page = Webpage(url)
    fname_text = page.hash+'.text'
    fname_links = page.hash+'.links'
    if path is None:
        page.get_page_html()
        page.get_html_text(tags=["p","h1","h2","h3","span"])
        page.get_html_anchors()
    else:
        # Cache text and links on disk, keyed by a hash of the URL.
        if os.path.isfile(path+fname_text):
            page.load_text(path, fname_text)
        else:
            page.get_page_html()
            page.get_html_text(tags=["p","h1","h2","h3","span"])
            page.save_text(path, fname_text)
        if os.path.isfile(path+fname_links):
            page.load_links(path, fname_links)
        else:
            if page.html == "": page.get_page_html()
            page.get_html_anchors()
            page.save_links(path, fname_links)
    if page.text:
        page.clean_html_text(max_words, ignore=ignore_text, rx="[^a-zA-Z ]+")
        page.k_common_words(k=k, ignore=ignore_common)
    return page
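
# Hypothetical call illustrating get_page_all's two modes (commented out so importing
# this module does not trigger a download). With path=None the page is fetched fresh;
# with a directory path, text and links are pickled under hash-based filenames and
# reused on later calls. The URL and values here are placeholders.
# page = get_page_all("https://example.com", k=10, max_words=100,
#                     ignore_text=['the', 'and'], ignore_common=['the', 'and'],
#                     path="./cache/")
# print(page.most_common_words)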

# %% ../nbs/02_app_gradio.ipynb 6
categories = ('pseudoscience','science')
k = 30
min_words = 20
max_words = 450
ignore_text = ['the', 'of', 'to', 'and', 'a', 'in', 'it', 'that', 'for', 'on']
ignore_common = ignore_text
learn = load_learner('model.pkl', cpu=True)

def predict(url):
    page = get_page_all(url, k, max_words, ignore_text, ignore_common)
    length = len(page.cleaned_text)
    if length < min_words:
        # The click handler expects two outputs (label, plot), so return a pair.
        return ("ERROR: Returned "+str(length)+" words", None)
    else:
        text = ' '.join(page.cleaned_text)
        with learn.no_bar(), learn.no_logging():
            pred, idx, probs = learn.predict(text)
        wordcloud = WordCloud(width=800, height=800,
                              background_color='white',
                              min_font_size=10).generate(text)
        # Plot the WordCloud image.
        fig = plt.figure(figsize=(8, 8), facecolor=None)
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.tight_layout(pad=0)
        return (dict(zip(categories, map(float, probs))), fig)

# %% ../nbs/02_app_gradio.ipynb 8
examples = ['https://www.theskepticsguide.org/about','https://www.foxnews.com/opinion']
pseudo_sources = ["http://www.ageofautism.com/",
                  "http://www.naturalnews.com",
                  "https://foodbabe.com/starthere/",
                  "http://www.chopra.com",
                  "https://www.mercola.com/",
                  "https://www.history.com/",
                  "https://doctoroz.com/",
                  "https://www.disclose.tv/",
                  "https://nationalreport.net/",
                  "https://heartland.org/",
                  "https://www.dailymail.co.uk/",
                  "https://www.motherjones.com/"]
science_sources = ["https://sciencebasedmedicine.org/",
                   "https://www.hopkinsmedicine.org/gim/research/method/ebm.html",
                   "https://www.bbc.com/news/science_and_environment",
                   "https://www.nature.com/",
                   "https://www.science.org/",
                   "https://www.snopes.com/top/",
                   "https://quackwatch.org/",
                   "https://www.skepdic.com/",
                   "http://scibabe.com/",
                   "http://pandasthumb.org/",
                   "https://skepticalscience.com/",
                   "https://www.cdc.gov/",
                   "https://apnews.com/"]

with gr.Blocks() as blocks:
    gr.Markdown("# Pseudometer")
    gr.Markdown("Prototype machine learning pseudoscience detector for websites!")
    text = gr.Textbox(label="Input URL (http format):")
    label = gr.Label()
    btn = gr.Button("Analyze!")
    with gr.Accordion("Pseudoscience Primary Training Sources"):
        gr.Markdown(', '.join(pseudo_sources))
    with gr.Accordion("Science Primary Training Sources"):
        gr.Markdown(', '.join(science_sources))
    example = gr.Examples(examples=examples, inputs=text)
    btn.click(fn=predict, inputs=text, outputs=[label, gr.Plot(label="Wordcloud")])

blocks.launch()