Spaces:
Runtime error
Runtime error
Kajise Org
committed on
Commit
•
bd6cfa0
1
Parent(s):
db468f2
Only get 1 page result for the time being.
Browse files
- SearchResult.py +21 -18
SearchResult.py
CHANGED
@@ -8,7 +8,7 @@ class SearchResult:
|
|
8 |
self.user_agent = user_agent
|
9 |
self.suggestion_query = did_you_mean
|
10 |
self.tailored_query = tailored_query
|
11 |
-
|
12 |
def parse_results(self):
|
13 |
results = self.results[1:]
|
14 |
headers = {
|
@@ -21,22 +21,25 @@ class SearchResult:
|
|
21 |
}] = [] # type: ignore
|
22 |
|
23 |
for link_entry in results:
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
|
|
|
|
|
|
41 |
|
42 |
return stripped_pages
|
|
|
8 |
self.user_agent = user_agent
|
9 |
self.suggestion_query = did_you_mean
|
10 |
self.tailored_query = tailored_query
|
11 |
+
|
12 |
def parse_results(self):
|
13 |
results = self.results[1:]
|
14 |
headers = {
|
|
|
21 |
}] = [] # type: ignore
|
22 |
|
23 |
for link_entry in results:
|
24 |
+
if len(stripped_pages) < 1:
|
25 |
+
twitter_pattern = re.compile(r".*twitter.*", re.IGNORECASE)
|
26 |
+
|
27 |
+
if not re.search(twitter_pattern, link_entry):
|
28 |
+
text_content = ""
|
29 |
+
response = requests.get(link_entry, headers=headers)
|
30 |
+
soup = BeautifulSoup(response.text, "html.parser")
|
31 |
+
|
32 |
+
title = soup.title.string or "No title provided" # type: ignore
|
33 |
+
relevant_tags = ["p", "li", "h1", "h2", "h3", "h4", "h5", "h6"]
|
34 |
+
|
35 |
+
for tag in relevant_tags:
|
36 |
+
elements = soup.find_all(tag, class_=lambda c: c != 'ads' and c != 'header' and c != 'footer')
|
37 |
+
for element in elements:
|
38 |
+
if element.text.strip().lower():
|
39 |
+
text_content += element.text.strip() + '\n'
|
40 |
+
|
41 |
+
stripped_pages.append({ "page_title": title, "text_content": text_content })
|
42 |
+
else:
|
43 |
+
continue
|
44 |
|
45 |
return stripped_pages
|