web-search-api / documents /search_results_extractor.py
Hansimov's picture
:gem: [Feature] SearchResultsExtractor: related questions
f150f6b
raw
history blame
1.82 kB
from bs4 import BeautifulSoup
from pathlib import Path
class SearchResultsExtractor:
def __init__(self) -> None:
pass
def load_html(self, html_path):
with open(html_path, "r", encoding="utf-8") as f:
html = f.read()
self.soup = BeautifulSoup(html, "html.parser")
def extract_search_results(self):
search_result_elements = self.soup.find_all("div", class_="g")
for result in search_result_elements:
site = result.find("cite").find_previous("span").text
link = result.find("a")["href"]
title = result.find("h3").text
abstract_element = result.find("div", {"data-sncf": "1"})
if abstract_element is None:
abstract_element = result.find("div", class_="ITZIwc")
abstract = abstract_element.text.strip()
print(
f"{title}\n" f" - {site}\n" f" - {link}\n" f" - {abstract}\n" f"\n"
)
print(len(search_result_elements))
def extract_related_questions(self):
related_question_elements = self.soup.find_all(
"div", class_="related-question-pair"
)
for question_element in related_question_elements:
question = question_element.find("span").text.strip()
print(question)
print(len(related_question_elements))
def extract(self, html_path):
self.load_html(html_path)
self.extract_search_results()
self.extract_related_questions()
if __name__ == "__main__":
html_path_root = Path(__file__).parents[1] / "files"
# html_filename = "python教程"
html_filename = "python_tutorials"
html_path = html_path_root / f"{html_filename}.html"
extractor = SearchResultsExtractor()
extractor.extract(html_path)