Spaces:

twimbit-ai
/

project_ABM

Sleeping

App Files Files Community

twimbit-ai committed on Apr 17

Commit

c6f8a21

•

1 Parent(s): 3b44354

Create test_web_rag.py

Browse files

Files changed (1) hide show

test_web_rag.py +263 -0

test_web_rag.py ADDED Viewed

	@@ -0,0 +1,263 @@

+import urllib.request
+from urllib.parse import quote
+from seleniumbase import SB
+import markdownify
+from bs4 import BeautifulSoup
+from requests_html import HTMLSession
+import html2text
+import re
+from openai import OpenAI
+import tiktoken
+from zenrows import ZenRowsClient
+import requests
+import os
+from dotenv import load_dotenv
# Load variables from a .env file into the process environment before they are read below.
load_dotenv()
# API key handed to ZenRowsClient in zenrows_scrapper; None when the variable is not set.
ZENROWS_KEY = os.getenv('ZENROWS_KEY')
# Shared OpenAI client used by call_open_ai. It is built with no arguments at import time —
# NOTE(review): presumably picks its credentials up from the environment; confirm.
client = OpenAI()
def get_fast_url_source(url, timeout=30):
    """Download ``url`` with a plain HTTP GET and return the response body as text.

    No JavaScript is rendered; the page is returned exactly as the server sent it.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before the request is aborted.
            Without a timeout a stalled server would block the caller forever.

    Returns:
        The decoded response body (``response.text``). The HTTP status code is
        not checked, so error pages are returned like any other page.
    """
    # The context manager closes the session (and its pooled connections) on
    # exit; previously a new session was leaked on every call.
    with HTMLSession() as session:
        response = session.get(url, timeout=timeout)
        return response.text
def convert_html_to_text(html):
    """Convert an HTML string into compact, single-line text using html2text.

    Args:
        html: HTML markup to convert.

    Returns:
        The converted text with newlines removed, ``* \\`` sequences stripped,
        and every run of whitespace collapsed to a single space.
    """
    converter = html2text.HTML2Text()
    converter.body_width = 0  # Disable line wrapping
    text = converter.handle(html)
    # Drop each newline together with any whitespace that follows it.
    text = re.sub(r'\n\s*', '', text)
    # Remove literal "* \" sequences left in the converted text.
    text = re.sub(r'\* \\', '', text)
    # The normalised string used to be computed and then thrown away; return it
    # so that whitespace is actually collapsed.
    return " ".join(text.split())
def get_google_search_url(query):
    """Run a Google search for ``query`` and return the result links in page order.

    Args:
        query: Free-text search query; it is URL-quoted before being sent.

    Returns:
        A list with the ``href`` of the first link found in each
        ``#search div.g`` result container. Containers without a usable link
        are skipped. The list is empty when no container matches.

    Raises:
        urllib.error.URLError: If the search request fails.
    """
    search_url = 'https://www.google.com/search?q=' + quote(query)
    request = urllib.request.Request(search_url)
    # Set a normal User Agent header, otherwise Google will block the request.
    request.add_header('User-Agent',
                       'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36')
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(request) as response:
        # A stray undecodable byte should not abort the whole search.
        html = response.read().decode("utf-8", errors="replace")
    soup = BeautifulSoup(html, 'html.parser')
    result_urls = []
    # Each search result lives in its own div.g container.
    for div in soup.select("#search div.g"):
        # Indexing the first <a> blindly raised IndexError/KeyError when a
        # container had no anchor or the anchor had no href.
        link = div.select_one('a[href]')
        if link is not None:
            result_urls.append(link['href'])
    return result_urls
def format_text(text):
    """Keep only the paragraph, heading and span elements of an HTML document.

    Args:
        text: HTML markup to filter.

    Returns:
        A string made of every ``<p>``, ``<h1>``, ``<h2>`` and ``<span>``
        element (serialised as HTML, in document order), each followed by
        ``&nbsp;&nbsp;``. Returns ``''`` when nothing matches.
    """
    soup = BeautifulSoup(text, 'html.parser')
    elements = soup.find_all(['p', 'h1', 'h2', 'span'])
    # The former even/odd branches appended exactly the same text, so a single
    # join is equivalent and avoids repeated string concatenation.
    # NOTE(review): find_all also matches tags nested inside other matches
    # (e.g. a <span> inside a <p>), so such content appears more than once.
    return ''.join(str(element) + '&nbsp;&nbsp;' for element in elements)
def get_page_source_selenium_base(url, wait_seconds=5):
    """Open ``url`` in a headless SeleniumBase browser and return the page source.

    Args:
        url: Address of the page to load.
        wait_seconds: Seconds to pause after opening the page before the source
            is read. Defaults to 5, the value that used to be hard-coded.

    Returns:
        The page source as reported by the browser driver.
    """
    # The browser is shut down automatically when the ``with`` block exits.
    with SB(uc_cdp=True, guest_mode=True, headless=True) as sb:
        sb.open(url)
        # Fixed pause before reading the page — presumably to let client-side
        # scripts finish rendering; confirm.
        sb.sleep(wait_seconds)
        page_source = sb.driver.get_page_source()
    return page_source
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens ``string`` produces under the tiktoken encoding ``encoding_name``."""
    token_ids = tiktoken.get_encoding(encoding_name).encode(string)
    return len(token_ids)
def encoding_getter(encoding_type: str):
    """
    Resolve ``encoding_type`` to a tiktoken encoding object.

    A value containing "k_base" is looked up as an encoding name; anything
    else is looked up as a model name.
    """
    names_an_encoding = "k_base" in encoding_type
    lookup = tiktoken.get_encoding if names_an_encoding else tiktoken.encoding_for_model
    return lookup(encoding_type)
def tokenizer(string: str, encoding_type: str) -> list:
    """
    Encode ``string`` with the encoding resolved from ``encoding_type`` and return the tokens.
    """
    return encoding_getter(encoding_type).encode(string)
def token_counter(string: str, encoding_type: str) -> int:
    """
    Return how many tokens ``string`` occupies under the encoding resolved from ``encoding_type``.
    """
    tokens = tokenizer(string, encoding_type)
    return len(tokens)
def format_output(text):
    """Reduce an HTML page to a single line of Markdown text.

    The page is filtered with ``format_text``, converted to Markdown with
    markdownify, and all whitespace runs are collapsed to single spaces.
    """
    filtered_html = format_text(text)
    markdown = markdownify.markdownify(filtered_html)
    words = markdown.split()
    return " ".join(words)
def clean_text(text):
    """Strip URLs and unwanted characters from ``text`` and tidy its whitespace.

    The substitutions run in order: http/https URLs are deleted, then every
    character other than ASCII letters, digits, whitespace and ``,.!?-`` is
    deleted, then each whitespace run becomes one space. Leading and trailing
    whitespace is removed from the result.
    """
    substitutions = (
        (r'http[s]?://\S+', ''),          # URLs
        (r'[^a-zA-Z0-9\s,.!?-]', ''),     # anything outside the allowed set
        (r'\s+', ' '),                    # whitespace runs
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
def call_open_ai(system_prompt, max_tokens=800, stream=False):
    """Send ``system_prompt`` to gpt-3.5-turbo and return the model's reply.

    Args:
        system_prompt: Prompt text. Despite the name it is sent as a single
            message with the "user" role.
        max_tokens: Upper bound on the number of tokens in the reply.
        stream: When False (default) the full reply is returned as one string.
            When True a generator of text fragments is returned instead.

    Returns:
        The reply text, or a generator yielding reply fragments when
        ``stream`` is True.
    """
    messages = [
        {
            "role": "user",
            "content": system_prompt
        }
    ]
    # Keep the response in its own name: it used to overwrite the ``stream``
    # flag, and with stream=True the ``.choices`` access below then failed.
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0,
        max_tokens=max_tokens,
        top_p=0,
        frequency_penalty=0,
        presence_penalty=0,
        stream=stream
    )
    if stream:
        # Streamed chunks carry incremental text in ``delta.content``; chunks
        # without choices or without text are skipped. A generator expression
        # is used so that the non-streaming path still returns a plain string.
        return (
            chunk.choices[0].delta.content
            for chunk in response
            if chunk.choices and chunk.choices[0].delta.content
        )
    return response.choices[0].message.content
def url_summary(text, question):
    """Ask the model for a summary of ``text`` that keeps ``question`` in mind.

    Returns the reply from ``call_open_ai``, limited to 800 tokens.
    """
    # Assembled line by line; the resulting prompt is identical to the
    # original triple-quoted template, including its leading/trailing padding.
    prompt = (
        "\n"
        "        Summarize the given text, please add all the important topics and numerical data.\n"
        "While summarizing please keep this question in mind.\n"
        f"question:- {question}\n"
        "text:\n"
        f"{text}\n"
        "        "
    )
    return call_open_ai(system_prompt=prompt, max_tokens=800)
def get_google_search_query(question):
    """Ask the model to rewrite ``question`` as a Google search query.

    Returns the reply from ``call_open_ai``, limited to 50 tokens.
    """
    # Same prompt text as the original template, built from explicit pieces.
    prompt = (
        "\n"
        "        convert this question to the Google search query and return only query.\n"
        f"        question:- {question}\n"
        "        "
    )
    return call_open_ai(system_prompt=prompt, max_tokens=50)
def is_urlfile(url):
    """Report whether ``url`` can be opened and answers with status code 200.

    Args:
        url: Address to probe.

    Returns:
        True when the resource opens and its status code is 200. False when
        the status differs, the request fails (HTTP error, DNS or connection
        failure, timeout), or ``url`` is not a valid URL.
    """
    try:
        # Close the response as soon as the status has been read.
        with urllib.request.urlopen(url) as response:
            return response.getcode() == 200
    # HTTPError and URLError are both OSError subclasses, so this also covers
    # unreachable hosts, which previously escaped as uncaught exceptions.
    # ValueError is what urlopen raises for a malformed URL such as "".
    except (OSError, ValueError):
        return False
def check_url_pdf_file(url, timeout=30):
    """Report whether the server declares the resource at ``url`` to be a PDF.

    Args:
        url: Address to probe.
        timeout: Seconds to wait for the server before the request is aborted.

    Returns:
        True when the response's Content-Type header contains
        ``application/pdf`` (case-insensitive), otherwise False. A response
        without a Content-Type header yields False.
    """
    # stream=True fetches only the headers; the body (possibly a large PDF) is
    # never downloaded, and the context manager releases the connection.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # The header may be absent; testing ``in None`` raised TypeError.
        content_type = response.headers.get('content-type') or ''
    return 'application/pdf' in content_type.lower()
def zenrows_scrapper(url):
    """Fetch ``url`` through the ZenRows client with ``js_render`` enabled and return the body text."""
    request_options = {"js_render": "true"}
    scraper = ZenRowsClient(ZENROWS_KEY)
    return scraper.get(url, params=request_options).text
def get_new_question_from_history(pre_question, new_question, answer):
    """Ask the model for a search query that combines a follow-up with the previous exchange.

    The prompt contains the previous question, its answer and the new
    question. Returns the reply from ``call_open_ai``, limited to 50 tokens.
    """
    indent = " " * 12
    # Joining with newlines reproduces the original template exactly: it starts
    # with a newline and ends with an indented, otherwise empty line.
    prompt_lines = [
        "",
        indent + "Generate a new Google search query using the previous question and answer. And return only the query.",
        indent + f"previous question:- {pre_question}",
        indent + f"answer:- {answer}",
        indent + f"new question:- {new_question}",
        indent,
    ]
    return call_open_ai(system_prompt="\n".join(prompt_lines), max_tokens=50)
def get_docs_from_web(question, history, n_web_search, strategy):
    """Search the web for ``question``, summarise each hit and stream progress.

    Args:
        question: The user's question.
        history: Earlier exchanges. When non-empty, the first entry's question
            (``history[0][0]``) and answer (``history[0][1]``) are used to
            rewrite ``question`` before searching.
        n_web_search: Maximum number of distinct result URLs to process.
        strategy: ``'Deep'`` scrapes through ZenRows; any other value uses a
            plain HTTP fetch.

    Yields:
        Progress messages (``str``) while scraping, then finally a dict
        ``{"data": docs}`` where ``docs`` is every summary followed by its
        source URL.
    """
    if history:
        question = get_new_question_from_history(history[0][0], question, history[0][1])
    search_results = get_google_search_url(get_google_search_query(question))
    # De-duplicate BEFORE truncating and keep the search ranking:
    # list(set(...)) shuffled the order, and slicing first could leave fewer
    # than n_web_search distinct URLs.
    urls = list(dict.fromkeys(search_results))[:n_web_search]
    total = len(urls)
    # Pages at or above this token count are skipped.
    # NOTE(review): presumably a 16385-token context minus the 800 tokens
    # url_summary requests for the reply — confirm.
    token_limit = 15585
    summaries = []
    yield f"Scraping started for {total} urls:-\n\n"
    for position, url in enumerate(urls, start=1):
        if '.pdf' in url:
            yield f"Scraping skipped pdf detected. {position}/{total} - {url} ❌\n"
            continue
        # Only the fetch differs between strategies; the post-processing used
        # to be duplicated in both branches.
        if strategy == 'Deep':
            page_source = zenrows_scrapper(url)
        else:
            page_source = get_fast_url_source(url)
        formatted_page_source = clean_text(format_output(page_source))
        tokens = token_counter(formatted_page_source, 'gpt-3.5-turbo')
        if tokens >= token_limit:
            yield f"Scraping skipped as token limit exceeded. {position}/{total} - {url} ❌\n"
            continue
        summary = url_summary(formatted_page_source, question)
        summaries.append(summary + '\n Source:-' + url + '\n\n')
        yield f"Scraping Done {position}/{total} - {url} ✅\n"
    yield {"data": ''.join(summaries)}