Kajise Org committed on
Commit
bd6cfa0
1 Parent(s): db468f2

Only get 1 for the time being.

Browse files
Files changed (1) hide show
  1. SearchResult.py +21 -18
SearchResult.py CHANGED
@@ -8,7 +8,7 @@ class SearchResult:
8
  self.user_agent = user_agent
9
  self.suggestion_query = did_you_mean
10
  self.tailored_query = tailored_query
11
-
12
  def parse_results(self):
13
  results = self.results[1:]
14
  headers = {
@@ -21,22 +21,25 @@ class SearchResult:
21
  }] = [] # type: ignore
22
 
23
  for link_entry in results:
24
- twitter_pattern = re.compile(r".*twitter.*", re.IGNORECASE)
25
-
26
- if not re.search(twitter_pattern, link_entry):
27
- text_content = ""
28
- response = requests.get(link_entry, headers=headers)
29
- soup = BeautifulSoup(response.text, "html.parser")
30
-
31
- title = soup.title.string or "No title provided" # type: ignore
32
- relevant_tags = ["p", "li", "h1", "h2", "h3", "h4", "h5", "h6"]
33
-
34
- for tag in relevant_tags:
35
- elements = soup.find_all(tag, class_=lambda c: c != 'ads' and c != 'header' and c != 'footer')
36
- for element in elements:
37
- if element.text.strip().lower():
38
- text_content += element.text.strip() + '\n'
39
-
40
- stripped_pages.append({ "page_title": title, "text_content": text_content })
 
 
 
41
 
42
  return stripped_pages
 
8
  self.user_agent = user_agent
9
  self.suggestion_query = did_you_mean
10
  self.tailored_query = tailored_query
11
+
12
  def parse_results(self):
13
  results = self.results[1:]
14
  headers = {
 
21
  }] = [] # type: ignore
22
 
23
  for link_entry in results:
24
+ if len(stripped_pages) < 1:
25
+ twitter_pattern = re.compile(r".*twitter.*", re.IGNORECASE)
26
+
27
+ if not re.search(twitter_pattern, link_entry):
28
+ text_content = ""
29
+ response = requests.get(link_entry, headers=headers)
30
+ soup = BeautifulSoup(response.text, "html.parser")
31
+
32
+ title = soup.title.string or "No title provided" # type: ignore
33
+ relevant_tags = ["p", "li", "h1", "h2", "h3", "h4", "h5", "h6"]
34
+
35
+ for tag in relevant_tags:
36
+ elements = soup.find_all(tag, class_=lambda c: c != 'ads' and c != 'header' and c != 'footer')
37
+ for element in elements:
38
+ if element.text.strip().lower():
39
+ text_content += element.text.strip() + '\n'
40
+
41
+ stripped_pages.append({ "page_title": title, "text_content": text_content })
42
+ else:
43
+ continue
44
 
45
  return stripped_pages