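# Spaces: Runtime error
# (Apparently a status banner captured from the hosting page along with the
# file; it is not part of the program.)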
import logging
import mimetypes
import os
import re
import tempfile
import urllib.parse
from typing import Optional

import gradio as gr
import openai
import requests
import validators
from langchain.document_loaders import UnstructuredFileLoader
# NOTE(review): NoneStr is a pydantic v1 alias that was removed in pydantic v2;
# confirm the pinned pydantic version, since this import fails on v2.
from pydantic import NoneStr
class WebpageSummarizer:
    """Summarize the chemistry-related content of a webpage with the OpenAI API.

    The pipeline is: download the URL into a temporary file
    (``upload_via_url``), parse that file into documents (``save_content``),
    and ask the completion model for a summary (``generate_summary``).
    ``gradio_interface`` wires the pipeline to a small web UI.
    """

    # Seconds to wait for the remote server. ``requests`` applies no timeout
    # by default, so a stalled server would block the UI callback forever.
    REQUEST_TIMEOUT_SECONDS = 30

    # Message returned to the UI whenever a page cannot be fetched or summarized.
    INVALID_URL_MESSAGE = "Please enter a valid URL."

    _logger = logging.getLogger(__name__)

    def __init__(self):
        """Set the OpenAI API key from the ``OPENAI_API_KEY`` environment variable."""
        openai.api_key = os.getenv("OPENAI_API_KEY")

    @staticmethod
    def _suffix_for_content_type(content_type: Optional[str]) -> str:
        """Return a file suffix (e.g. ``".html"``) for a Content-Type header value.

        Args:
            content_type: Raw header value, possibly ``None`` or carrying
                parameters such as ``"text/html; charset=utf-8"``.

        Returns:
            The guessed extension including the leading dot, or ``""`` when
            the header is missing or the media type is unknown.
        """
        if not content_type:
            return ""
        # mimetypes only recognises the bare media type, so drop any
        # "; charset=..." style parameters before the lookup.
        media_type = content_type.split(";", 1)[0].strip().lower()
        return mimetypes.guess_extension(media_type) or ""

    @staticmethod
    def _documents_to_text(documents) -> str:
        """Join the ``page_content`` of loaded documents into one plain string.

        Args:
            documents: Either a string (returned unchanged) or an iterable of
                objects; each object's ``page_content`` attribute is used,
                falling back to ``str(obj)`` when it has none.

        Returns:
            The texts separated by blank lines (``""`` for an empty iterable).
        """
        if isinstance(documents, str):
            return documents
        return "\n\n".join(
            str(getattr(doc, "page_content", doc)) for doc in documents
        )

    def upload_via_url(self, url: str) -> Optional[str]:
        """Download a webpage into a temporary file.

        Args:
            url (str): The URL of the webpage.

        Returns:
            Path of the temporary file holding the response body, or ``None``
            when ``url`` does not pass validation. The file is created with
            ``delete=False``; the caller is responsible for removing it.

        Raises:
            requests.RequestException: If the request fails, times out, or
                the server answers with an HTTP error status.
        """
        # Guard clause: an invalid URL yields None, as before.
        if not validators.url(url):
            return None

        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
        }
        response = requests.get(
            url, headers=headers, timeout=self.REQUEST_TIMEOUT_SECONDS
        )
        # Do not go on to summarize the body of a 4xx/5xx error page.
        response.raise_for_status()

        suffix = self._suffix_for_content_type(
            response.headers.get("content-type")
        )
        # The context manager closes (and therefore flushes) the file before
        # its path is handed to the loader.
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as temp_file:
            temp_file.write(response.content)
        return temp_file.name

    def save_content(self, file_path: str) -> list:
        """Load the file at ``file_path`` with ``UnstructuredFileLoader``.

        Args:
            file_path (str): The path of the file to load.

        Returns:
            Whatever ``loader.load()`` returns for the file (the loaded
            documents), using the loader's "fast" strategy.
        """
        loader = UnstructuredFileLoader(file_path, strategy="fast")
        return loader.load()

    def generate_summary(self, text: str) -> str:
        """Generate a summary of ``text`` using the OpenAI completion API.

        Args:
            text (str): The text to be summarized.

        Returns:
            str: The first completion choice, stripped of surrounding whitespace.
        """
        prompt = f"Summarize the chemical related parts from given text. if text has other language return the summary as english. text: {text}"
        # NOTE(review): ``openai.Completion`` and ``text-davinci-003`` belong
        # to the legacy (pre-1.0) OpenAI SDK; confirm the pinned SDK version
        # and model availability before deploying.
        response = openai.Completion.create(
            model="text-davinci-003",
            prompt=prompt,
            temperature=0,
            max_tokens=500,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0.6,
        )
        return response.choices[0].text.strip()

    def summarize_webpage(self, url: str) -> str:
        """Fetch ``url`` and return a summary of the page.

        Args:
            url (str): The URL of the webpage.

        Returns:
            str: The generated summary, or ``INVALID_URL_MESSAGE`` when the
            URL is invalid or any step of the pipeline fails.
        """
        temporary_file_path = None
        try:
            temporary_file_path = self.upload_via_url(url)
            if temporary_file_path is None:
                return self.INVALID_URL_MESSAGE
            documents = self.save_content(temporary_file_path)
            # Send only the page text to the model, not the repr of the
            # document objects (which would include metadata and escapes).
            page_text = self._documents_to_text(documents)
            return self.generate_summary(page_text)
        except Exception:
            # UI boundary: keep the interface responsive, but record the
            # real cause instead of silently discarding it.
            self._logger.exception("Failed to summarize %r", url)
            return self.INVALID_URL_MESSAGE
        finally:
            # The download was created with delete=False; remove it here so
            # repeated requests do not accumulate temporary files.
            if temporary_file_path:
                try:
                    os.remove(temporary_file_path)
                except OSError:
                    pass

    def gradio_interface(self):
        """Build the Gradio UI for webpage summarization and launch it."""
        with gr.Blocks(css="style.css", theme=gr.themes.Soft()) as demo:
            gr.HTML(
                '<img class="leftimage" align="left" src="https://templates.images.credential.net/1612472097627370951721412474196.png" alt="Image" width="210" height="210">\n'
                '<img class="rightimage" align="right" src="https://logos-download.com/wp-content/uploads/2016/06/Syngenta_logo.png" alt="Image" width="150" height="140">'
            )
            with gr.Row():
                with gr.Column(elem_id="col-container"):
                    gr.HTML("""<center><h1>Syngenta Chemical Identifier</h1></center>""")
                    inputs = gr.Textbox(label="URL")
                    btn = gr.Button(label="Submit", value="Analyse")
                    outputs = gr.Textbox(label="Summary", lines=6)
                    # Clicking the button runs the full fetch/summarize pipeline.
                    btn.click(fn=self.summarize_webpage, inputs=inputs, outputs=outputs)
        # Launch the Gradio interface
        demo.launch()
if __name__ == "__main__":
    # Script entry point: build the summarizer and serve its Gradio UI.
    WebpageSummarizer().gradio_interface()