import re

import requests
from bs4 import BeautifulSoup


class SearchResult:
    def __init__(
        self,
        results: list[str],
        user_agent: str,
        did_you_mean: str = "",
        tailored_query: str = "",
    ):
        self.results = results
        self.user_agent = user_agent
        self.suggestion_query = did_you_mean
        self.tailored_query = tailored_query

    def parse_results(self) -> list[dict[str, str]]:
        """Fetch the first usable result link and return its title and text content."""
        results = self.results[1:]
        headers = {"User-Agent": self.user_agent}
        stripped_pages: list[dict[str, str]] = []
        twitter_pattern = re.compile(r"twitter", re.IGNORECASE)

        for link_entry in results:
            # Stop once one page has been scraped successfully.
            if stripped_pages:
                break
            # Skip Twitter links, which cannot be scraped as plain HTML.
            if twitter_pattern.search(link_entry):
                continue

            response = requests.get(link_entry, headers=headers)
            soup = BeautifulSoup(response.text, "html.parser")
            title = soup.title.string if soup.title and soup.title.string else "No title provided"

            # Collect visible text from headings, paragraphs, and list items,
            # excluding elements whose class marks them as ads, header, or footer.
            text_content = ""
            relevant_tags = ["p", "li", "h1", "h2", "h3", "h4", "h5", "h6"]
            for tag in relevant_tags:
                elements = soup.find_all(
                    tag, class_=lambda c: c not in ("ads", "header", "footer")
                )
                for element in elements:
                    text = element.get_text().strip()
                    if text:
                        text_content += text + "\n"

            stripped_pages.append({"page_title": title, "text_content": text_content})

        return stripped_pages
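

# A minimal usage sketch. It assumes the first element of `results` is a
# non-link entry (parse_results skips it via results[1:]) and the remaining
# entries are plain result URLs; the URL and user-agent string below are
# placeholders, not values from any real search backend.
if __name__ == "__main__":
    search = SearchResult(
        results=[
            "example query",
            "https://example.com/article",
        ],
        user_agent="Mozilla/5.0 (compatible; ExampleBot/1.0)",
    )
    for page in search.parse_results():
        print(page["page_title"])
        print(page["text_content"][:200])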