from pydantic import NoneStr import os from langchain.document_loaders import UnstructuredFileLoader import mimetypes import validators import requests import tempfile import gradio as gr import openai import re import urllib.parse class WebpageSummarizer: """ A class to summarize webpages using OpenAI API. """ def __init__(self,): """ Set OpeanApi key """ openai.api_key = os.getenv("OPENAI_API_KEY") def upload_via_url(self, url: str) -> NoneStr: """ Uploads a webpage content via URL and returns the document. Args: url (str): The URL of the webpage. Returns: NoneStr: The document content. """ # Check if the URL is valid if validators.url(url): headers = { 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36', } # Send a GET request to retrieve the webpage content retrieve = requests.get(url, headers=headers) # Get the content type of the response content_type = retrieve.headers.get("content-type") # Guess the file extension based on the content type file_extension = mimetypes.guess_extension(content_type) # Save the webpage content to a temporary file temp_file = tempfile.NamedTemporaryFile(suffix=file_extension, delete=False) temp_file.write(retrieve.content) file_path = temp_file.name # Return the file path of the temporary file return file_path else: # If the URL is not valid, do nothing and continue pass def save_content(self, file_path: str) -> NoneStr: """ Saves the content of a file at the specified file path. Args: file_path (str): The path of the file to be saved. Returns: NoneStr: The document content. """ # Load the temporary file as a document using the UnstructuredFileLoader # strategy set to "fast" for faster processing loader = UnstructuredFileLoader(file_path, strategy="fast") # Load the document from the file document = loader.load() # Return the loaded document content return document def generate_summary(self, text: str) -> str: """ Generates a summary using OpenAI API. Args: text (str): The text to be summarized. Returns: str: The generated summary. """ prompt = f"Summarize the chemical related parts from given text. if text has other language return the summary as english. text: {text}" # Make an API call to generate a summary using OpenAI API response = openai.Completion.create( model="text-davinci-003", prompt=prompt, temperature=0, max_tokens=500, top_p=1, frequency_penalty=0, presence_penalty=0.6, ) message = response.choices[0].text.strip() return message def summarize_webpage(self, url: str) -> str: """ Summarizes a webpage using OpenAI API. Args: url (str): The URL of the webpage. Returns: str: The generated summary. """ try: # Upload the webpage content and retrieve the temporary file path temporary_file_path = self.upload_via_url(url) # Save the content of the temporary file document_content = self.save_content(temporary_file_path) # Generate a summary using the document content summary = self.generate_summary(document_content) # Return the generated summary return summary except: # If an exception occurs (e.g., invalid URL), return an error message return "Please enter a valid URL." def gradio_interface(self): # Create a Gradio interface for the webpage summarization with gr.Blocks(css="style.css", theme=gr.themes.Soft()) as demo: gr.HTML(""" """) with gr.Row(): with gr.Column(elem_id="col-container"): gr.HTML("""