Spaces:
Runtime error
Runtime error
Kajise
committed on
Commit
•
7f01d00
1
Parent(s):
04cc455
Delete SearchResult.py
Browse files- SearchResult.py +0 -45
SearchResult.py
DELETED
@@ -1,45 +0,0 @@
|
|
1 |
-
import re
|
2 |
-
import requests
|
3 |
-
from bs4 import BeautifulSoup
|
4 |
-
|
5 |
-
class SearchResult:
    """Hold the links returned for a search query and scrape the first usable page.

    Attributes:
        results: Result links as supplied by the caller. ``parse_results``
            ignores the first entry and treats the rest as URLs to fetch.
        user_agent: Value sent as the ``User-Agent`` header when fetching pages.
        suggestion_query: The ``did_you_mean`` value passed to the constructor.
        tailored_query: The ``tailored_query`` value passed to the constructor.
    """

    # Links matching this pattern (case-insensitive "twitter" anywhere in the
    # URL) are never fetched. Compiled once instead of on every loop iteration.
    _TWITTER_PATTERN = re.compile(r".*twitter.*", re.IGNORECASE)

    # Tags whose text is collected, in collection order: all <p> text comes
    # first, then all <li> text, and so on (not document order).
    _RELEVANT_TAGS = ("p", "li", "h1", "h2", "h3", "h4", "h5", "h6")

    # CSS class values that the tag search filters against.
    _EXCLUDED_CLASSES = ("ads", "header", "footer")

    def __init__(self, results: list[str], user_agent: str, did_you_mean: str = "", tailored_query: str = ""):
        """Store the raw result links and the query metadata unchanged."""
        self.results = results
        self.user_agent = user_agent
        self.suggestion_query = did_you_mean
        self.tailored_query = tailored_query

    def parse_results(self, timeout: float = 10.0) -> list[dict[str, str]]:
        """Fetch the first non-Twitter result link and extract its readable text.

        The first entry of ``self.results`` is skipped. The remaining links are
        tried in order; the first one that does not match the Twitter pattern
        is downloaded and parsed, and no further links are visited.

        Args:
            timeout: Seconds passed to ``requests.get`` so that an unresponsive
                host cannot block the call forever.

        Returns:
            A list with at most one dict of the form
            ``{"page_title": ..., "text_content": ...}``; an empty list when
            there is no eligible link.

        Exceptions raised by ``requests.get`` are not caught here and propagate
        to the caller.
        """
        headers = {"User-Agent": self.user_agent}
        excluded = self._EXCLUDED_CLASSES

        def class_allowed(css_class):
            # NOTE(review): BeautifulSoup evaluates this per class value, so an
            # element carrying an excluded class alongside another class may
            # still be selected -- confirm whether that is intended.
            return css_class not in excluded

        for link_entry in self.results[1:]:
            if self._TWITTER_PATTERN.search(link_entry):
                continue

            response = requests.get(link_entry, headers=headers, timeout=timeout)
            soup = BeautifulSoup(response.text, "html.parser")

            # soup.title is None for pages without a <title> tag; guard it so
            # such pages fall back to the placeholder instead of raising.
            title_tag = soup.title
            title = (title_tag.string if title_tag is not None else None) or "No title provided"

            text_lines = []
            for tag in self._RELEVANT_TAGS:
                for element in soup.find_all(tag, class_=class_allowed):
                    text = element.text.strip()
                    if text:
                        text_lines.append(text + "\n")

            # Only the first successfully parsed page is wanted, so stop here
            # rather than looping over the remaining links.
            return [{"page_title": title, "text_content": "".join(text_lines)}]

        return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|