# Repository listing header (not part of the program):
#   author: robertselvam
#   last commit: "Update app.py" (a04b1a4)
import logging
import mimetypes
import os
import re
import tempfile
import urllib.parse
from typing import Optional

import gradio as gr
import openai
import requests
import validators
from langchain.document_loaders import UnstructuredFileLoader
# NOTE(review): NoneStr was removed in pydantic v2; this import needs pydantic<2.
from pydantic import NoneStr
class WebpageSummarizer:
    """
    Summarize the chemical-related parts of a webpage using the OpenAI API.

    The pipeline is: download the page to a temporary file, parse that file
    with ``UnstructuredFileLoader``, send the extracted text to the OpenAI
    completion endpoint, and expose the whole flow through a Gradio UI.
    """

    # Seconds to wait on the remote server before abandoning a download, so a
    # stalled host cannot hang the UI request forever.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """
        Set the OpenAI API key from the ``OPENAI_API_KEY`` environment variable.
        """
        openai.api_key = os.getenv("OPENAI_API_KEY")

    @staticmethod
    def _extension_for(content_type: Optional[str]) -> Optional[str]:
        """
        Map a ``Content-Type`` header value to a file extension.

        Args:
            content_type (Optional[str]): Raw header value, possibly carrying
                parameters (e.g. ``"text/html; charset=utf-8"``) or missing.

        Returns:
            Optional[str]: The guessed extension including the leading dot,
            or None when the header is absent or the type is unknown.
        """
        if not content_type:
            return None
        # mimetypes looks up the bare "type/subtype" only, so parameters such
        # as "; charset=utf-8" must be removed or the lookup always misses.
        mime_type = content_type.split(";", 1)[0].strip().lower()
        if not mime_type:
            return None
        return mimetypes.guess_extension(mime_type)

    @staticmethod
    def _documents_to_text(documents) -> str:
        """
        Flatten loader output into plain text suitable for a prompt.

        Args:
            documents: Either a string (returned unchanged) or an iterable of
                objects exposing ``page_content``; items without that
                attribute are converted with ``str()``.

        Returns:
            str: The page contents joined by blank lines.
        """
        if isinstance(documents, str):
            return documents
        return "\n\n".join(
            getattr(document, "page_content", str(document))
            for document in documents
        )

    def upload_via_url(self, url: str) -> Optional[str]:
        """
        Download a webpage and store its body in a temporary file.

        The caller owns the returned file and is responsible for deleting it.

        Args:
            url (str): The URL of the webpage.

        Returns:
            Optional[str]: Path of the temporary file holding the response
            body, or None when ``url`` is not a valid URL.

        Raises:
            requests.RequestException: If the download fails, times out, or
                the server answers with an HTTP error status.
        """
        # Reject malformed URLs up front instead of letting a later step fail.
        if not validators.url(url):
            return None

        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
        }
        # Send a GET request to retrieve the webpage content
        retrieve = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        # Do not go on to summarize the body of a 4xx/5xx error page.
        retrieve.raise_for_status()

        # Give the temporary file an extension matching the response type.
        file_extension = self._extension_for(retrieve.headers.get("content-type"))

        # The context manager flushes and closes the handle before the path is
        # handed to the loader; delete=False keeps the file on disk afterwards.
        with tempfile.NamedTemporaryFile(suffix=file_extension, delete=False) as temp_file:
            temp_file.write(retrieve.content)
        return temp_file.name

    def save_content(self, file_path: str) -> list:
        """
        Load the file at ``file_path`` as document(s).

        Args:
            file_path (str): The path of the file to be loaded.

        Returns:
            list: Whatever ``UnstructuredFileLoader.load()`` returns for the
            file (the loaded document objects).
        """
        # strategy="fast" is passed through to the loader for faster processing
        loader = UnstructuredFileLoader(file_path, strategy="fast")
        return loader.load()

    def generate_summary(self, text: str) -> str:
        """
        Generate a summary of ``text`` using the OpenAI completion API.

        Args:
            text (str): The text to be summarized.

        Returns:
            str: The generated summary with surrounding whitespace stripped.
        """
        prompt = f"Summarize the chemical related parts from given text. if text has other language return the summary as english. text: {text}"
        # Make an API call to generate a summary using OpenAI API
        response = openai.Completion.create(
            model="text-davinci-003",
            prompt=prompt,
            temperature=0,
            max_tokens=500,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0.6,
        )
        return response.choices[0].text.strip()

    def summarize_webpage(self, url: str) -> str:
        """
        Summarize the webpage at ``url``.

        This is the UI entry point, so it never raises: any failure is logged
        and reported to the user as a message instead.

        Args:
            url (str): The URL of the webpage.

        Returns:
            str: The generated summary, or "Please enter a valid URL." when
            the URL is invalid or any step of the pipeline fails.
        """
        temporary_file_path = None
        try:
            # Download the page; None signals that the URL failed validation.
            temporary_file_path = self.upload_via_url(url)
            if temporary_file_path is None:
                return "Please enter a valid URL."
            # Parse the downloaded file and reduce it to plain text so the
            # prompt carries the page content rather than object reprs.
            documents = self.save_content(temporary_file_path)
            return self.generate_summary(self._documents_to_text(documents))
        except Exception:
            # Top-level UI boundary: keep the traceback in the logs, show the
            # user the same message as before.
            logging.getLogger(__name__).exception("Failed to summarize %r", url)
            return "Please enter a valid URL."
        finally:
            # The download was created with delete=False; remove it here so
            # each request does not leave a file behind.
            if temporary_file_path is not None:
                try:
                    os.remove(temporary_file_path)
                except OSError:
                    # Best-effort cleanup; a leftover temp file is harmless.
                    pass

    def gradio_interface(self):
        """
        Build the Gradio UI for webpage summarization and launch it.

        The page shows two logo images, a URL textbox, an "Analyse" button and
        a summary textbox; clicking the button runs ``summarize_webpage``.
        """
        with gr.Blocks(css="style.css", theme=gr.themes.Soft()) as demo:
            gr.HTML(
                '<img class="leftimage" align="left" src="https://templates.images.credential.net/1612472097627370951721412474196.png" alt="Image" width="210" height="210">\n'
                '<img class="rightimage" align="right" src="https://logos-download.com/wp-content/uploads/2016/06/Syngenta_logo.png" alt="Image" width="150" height="140">'
            )
            with gr.Row():
                with gr.Column(elem_id="col-container"):
                    gr.HTML("""<center><h1>Syngenta Chemical Identifier</h1></center>""")
                    inputs = gr.Textbox(label="URL")
                    btn = gr.Button(label="Submit",value = "Analyse")
                    outputs = gr.Textbox(label="Summary", lines=6)
            # Event wiring has to happen while the Blocks context is open.
            btn.click(fn=self.summarize_webpage, inputs=inputs, outputs=outputs)
        # Launch the Gradio interface
        demo.launch()
if __name__ == "__main__":
    # When run as a script, build the summarizer and start its Gradio UI.
    WebpageSummarizer().gradio_interface()