# Spaces: Sleeping -- hosting-page status text captured together with the file; not part of the program.
import tiktoken | |
import os | |
from bs4 import BeautifulSoup | |
import gradio as gr | |
from langchain import OpenAI, ConversationChain, LLMChain, PromptTemplate | |
from langchain.memory import ConversationBufferWindowMemory | |
import openai | |
import requests | |
from langchain.chat_models import ChatOpenAI | |
import ast | |
import re | |
import json | |
import tempfile | |
import collections | |
# OpenAI credential, read once when the module is imported; a missing
# environment variable raises KeyError right here. It is passed to every
# ChatOpenAI instance created below.
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
def save_webpage_as_html(url, timeout=30):
    """Download *url* and collect the text of its ``<li>`` and ``<ol>`` elements.

    Despite the name, nothing is written to disk; the page is parsed in memory.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a stalled
            host cannot block the caller indefinitely.

    Returns:
        A ``collections.defaultdict(list)`` keyed by tag name (``'li'`` /
        ``'ol'``) whose values are the stripped texts of the matching elements
        in the order ``find_all`` yields them, or ``None`` when the request
        fails or the response status is not 200.
    """
    headers = {
        'authority': 'ms-mt--api-web.spain.advgo.net',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'accept': 'application/json, text/plain, */*',
        'x-adevinta-channel': 'web-desktop',
        'x-schibsted-tenant': 'coches',
        'sec-ch-ua-mobile': '?0',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
        'content-type': 'application/json;charset=UTF-8',
        'origin': 'https://www.coches.net',
        'sec-fetch-site': 'cross-site',
        'sec-fetch-mode': 'cors',
        'sec-fetch-dest': 'empty',
        'referer': 'https://www.coches.net/',
        'accept-language': 'en-US,en;q=0.9,es;q=0.8',
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors, timeouts and malformed URLs are reported the same
        # way as a non-200 answer so callers only have one failure shape.
        print(f"Failed to get the webpage: {url} ({exc})")
        return None
    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to get the webpage: {url}")
        return None
    # Create a BeautifulSoup object and specify the parser
    soup = BeautifulSoup(response.text, 'html.parser')
    # Group the text of every list-like tag under its tag name
    result = collections.defaultdict(list)
    for tag in soup.find_all(['li', 'ol']):
        result[tag.name].append(tag.get_text(strip=True))
    return result
# Skeleton of the nested category/subcategories tree, interpolated verbatim
# into the prompts built by get_taxanomy_from_url and expand_taxanomy as the
# "output json format". It is an illustration for the model, not valid JSON
# (the leaf lists contain bare placeholder names).
output_json_format = '''
{
"category": "root_category",
"subcategories": [
{
"category": "node_category",
"subcategories": [
{
"category": "node_category",
"subcategories": [category1, category2, ...]
},
{
"category": "node_category",
"subcategories": [category1, category2, ...]
}
]
},
{
"category": "node_category",
"subcategories": [category1, category2, ...]
}
]
}
'''
# Seed taxonomy (a root with no children) that modify_json starts from when
# no JSON file has been uploaded.
empty_json = {
    "category": "root_category",
    "subcategories": [
    ]
}
def get_taxanomy_from_url(url, max_attempts=3):
    """Scrape *url*, ask GPT-4 for a taxonomy of its list items, save it as JSON.

    The ``li``/``ol`` texts returned by ``save_webpage_as_html`` are pasted
    into a prompt; the model reply is parsed with ``ast.literal_eval`` and the
    resulting dictionary is written to ``url_temp.json``.

    Args:
        url: Page to analyse.
        max_attempts: How many model calls to make before giving up when the
            reply cannot be parsed into a dictionary.

    Returns:
        The name of the JSON file that was written (``"url_temp.json"``).

    Raises:
        ValueError: If the page could not be downloaded, or if no reply within
            ``max_attempts`` parsed into a ``dict``.
    """
    url_dict = save_webpage_as_html(url)
    if url_dict is None:
        # Without page content the model would only be shown the text "None".
        raise ValueError(f"Could not download a page to analyse from: {url}")
    # Plain dict repr: the prompt announces a "python dictionary", so keep the
    # defaultdict(<class 'list'>, ...) wrapper out of it.
    json_input = str(dict(url_dict))
    template = '''
{history}
{human_input}
'''
    prompt = PromptTemplate(
        input_variables=["history", "human_input"],
        template=template
    )
    chatgpt_chain = LLMChain(
        llm=ChatOpenAI(model="gpt-4", temperature=0, openai_api_key=OPENAI_API_KEY),
        prompt=prompt,
        verbose=True,
        memory=ConversationBufferWindowMemory(k=10),
    )
    prompt_input2 = f'''
You are an expert ecommerce product taxanomy analyst.
You are equiped with vast knowledge of taxanomy, ontology and everything related to it.
You fit have deep expertise in the domain of: "An ontology identifies and distinguishes concepts and their relationships;
it describes content and relationships.
A taxonomy formalizes the hierarchical relationships among concepts and specifies the term to be used to refer to each;
it prescribes structure and terminology."
You have a task to extract taxanomy from a python dictionary of an extracted html page of an ecommerce website.
Here is the input python dictionary:
{json_input}
Here is the output json format:
{output_json_format}
From the input python dictionary, extract all available products under the li and ol key and create the output json taxanomy.
Think step by step.
Place the products in categories and subcategories accordingly.
Organize all the products to fit the output json format.
The output should follow a python dictionary..
Do not declare a new variable, output the python dictionary json object only.
Do not output "The taxonomy extracted from the given python list can be represented as follows:"
Do not provide extra information. Directly output the python dictionary only.
Do not insert any string before or after the python dictionary.
Output python dictionary only.
'''
    # Cap the prompt at 8000 tokens.
    # NOTE(review): the cut removes the END of the prompt, i.e. the output
    # format and instructions, when the page data is large -- consider
    # trimming json_input instead.
    encoding = tiktoken.encoding_for_model("gpt-4")
    encoded_prompt2 = encoding.encode(prompt_input2)[:8000]
    prompt_input2 = encoding.decode(encoded_prompt2)

    json_dict = None
    last_error = None
    for _ in range(max_attempts):
        json_taxanomy_output = chatgpt_chain.predict(human_input=prompt_input2)
        try:
            candidate = ast.literal_eval(json_taxanomy_output)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as exc:
            # Reply was not a Python literal; ask again.
            last_error = exc
            continue
        if isinstance(candidate, dict):
            json_dict = candidate
            break
    if json_dict is None:
        raise ValueError(
            f"Model did not return a python dictionary after {max_attempts} attempts"
        ) from last_error

    file_name = "url_temp.json"
    # Save the taxonomy so the UI can offer it as a download
    with open(file_name, 'w') as json_file:
        json.dump(json_dict, json_file, indent=4)  # 'indent' parameter makes the output more readable
    return file_name
def expand_taxanomy(json_dict, num_layers, num_items, category_type):
    """Ask GPT-4 to grow an existing taxonomy and return the parsed reply.

    Args:
        json_dict: Current taxonomy; its ``str()`` form is pasted into the prompt.
        num_layers: Requested tree depth; coerced with ``int()``.
        num_items: Requested total number of items; coerced with ``int()``.
        category_type: Category name inserted into the prompt.

    Returns:
        Whatever ``ast.literal_eval`` produces from the model reply.
    """
    depth_text = str(int(num_layers))
    size_text = str(int(num_items))
    taxonomy_text = str(json_dict)

    # Conversation chain: history + latest human message, GPT-4 at temperature 0.
    conversation = LLMChain(
        llm=ChatOpenAI(model="gpt-4", temperature=0, openai_api_key=OPENAI_API_KEY),
        prompt=PromptTemplate(
            template='\n{history}\n{human_input}\n',
            input_variables=["history", "human_input"],
        ),
        verbose=True,
        memory=ConversationBufferWindowMemory(k=10),
    )

    request_text = f'''
You are an expert ecommerce product taxanomy analyst.
You are equiped with vast knowledge of taxanomy, ontology and everything related to it.
You fit have deep expertise in the domain of: "An ontology identifies and distinguishes concepts and their relationships;
it describes content and relationships.
A taxonomy formalizes the hierarchical relationships among concepts and specifies the term to be used to refer to each;
it prescribes structure and terminology."
You have a task to expand a taxanomy that is formatted in a json file.
The taxanomy tree should be {depth_text} layer deep with a total of {size_text} items.
The category type is {category_type}.
Here is the input json file:
{taxonomy_text}
Here is the output json format:
{output_json_format}
Expand the taxanomy of the input json file.
Find subcategories that fits each category.
Expand the leafs of the taxanomy tree.
Go deeper. Think step by step.
Find all subcategories and output it as a json object.
The output should follow a python dictionary..
Do not declare a new variable, output the python dictionary json object only.
Do not provide extra information. Directly output the python dictionary only.
'''
    # Keep only the first 8000 tokens of the request.
    tokenizer = tiktoken.encoding_for_model("gpt-4")
    request_text = tokenizer.decode(tokenizer.encode(request_text)[:8000])

    reply = conversation.predict(human_input=request_text)
    return ast.literal_eval(reply)
def add_nodes_edges(graph, data, parent=None):
    """Recursively add a taxonomy dictionary to *graph* as nodes and edges.

    Args:
        graph: Object exposing ``node(name)`` and ``edge(tail, head)``.
        data: Mapping with a ``'category'`` name and an optional
            ``'subcategories'`` list whose items are either plain strings
            (leaves) or nested mappings of the same shape.
        parent: Name of the enclosing category, or ``None`` for the root.

    Raises:
        KeyError: If *data* has no ``'category'`` key.
    """
    name = data['category']
    # create node
    graph.node(name)
    # Compare against None rather than truthiness so a parent whose name is
    # the empty string is still linked, just as string leaves are below.
    if parent is not None:
        graph.edge(parent, name)
    # iterate over subcategories (if they exist)
    for subcat in data.get('subcategories', []):
        if isinstance(subcat, str):
            # leaf: its own node plus an edge from the current category
            graph.node(subcat)
            graph.edge(name, subcat)
        else:
            # nested mapping: recurse with the current category as parent
            add_nodes_edges(graph, subcat, name)
def visualize_json(data):
    """Build a left-to-right directed graph of the taxonomy in *data*.

    Args:
        data: Taxonomy mapping in the shape ``add_nodes_edges`` accepts.

    Returns:
        The populated ``graphviz.Digraph`` (not yet rendered).
    """
    # Imported here because the module's import block never brings graphviz
    # into scope, which made the Digraph call below fail with NameError.
    import graphviz

    graph = graphviz.Digraph(graph_attr={'rankdir': 'LR'})  # 'LR' for left to right graph
    # Add nodes and edges
    add_nodes_edges(graph, data)
    return graph
def get_file(json_file):
    """Render a taxonomy JSON file as ``graph.png`` and return the render result.

    Args:
        json_file: Uploaded file object exposing a ``.name`` path, or ``None``.
            When it is missing, unreadable or not valid JSON, ``temp.json``
            in the working directory is used instead.

    Returns:
        The value returned by ``graph.render`` (per the graphviz docs, the
        path of the rendered file -- confirm against the installed version).

    Raises:
        OSError, ValueError: If the ``temp.json`` fallback cannot be read or
            parsed either.
    """
    try:
        print("loading json file")
        print("temp_file", json_file.name)
        file_path = json_file.name
        with open(file_path, 'r') as handle:
            data = json.load(handle)
    except (AttributeError, TypeError, OSError, ValueError):
        # No upload (None has no .name), unreadable path, or invalid JSON:
        # fall back to the last generated taxonomy.
        print("using temp json")
        with open('temp.json', 'r') as handle:
            data = json.load(handle)
    try:
        os.remove('graph.png')
        print("graph removed")
    except OSError:
        print("no existing graph")
    graph = visualize_json(data)
    # Render the graph as a PNG file
    graph.format = 'png'
    return graph.render(filename='graph', cleanup=True)
def modify_json(json_input, num_layers, num_items, category_type):
    """Expand a taxonomy with the LLM and write the result to a JSON file.

    Args:
        json_input: Uploaded file object exposing ``.name``, or ``None`` to
            start from the empty seed taxonomy.
        num_layers: Requested tree depth, forwarded to ``expand_taxanomy``.
        num_items: Requested item count, forwarded to ``expand_taxanomy``.
        category_type: Category name; also becomes the root category when
            starting from the seed.

    Returns:
        Path of the file holding the expanded taxonomy: the uploaded file's
        path when one was given, otherwise ``'temp.json'``.
    """
    print("json_input first", json_input)
    if json_input is not None:
        file_path = json_input.name
        # Open the file and load the JSON data
        with open(file_path, 'r') as json_file:
            data = json.load(json_file)
    else:
        # NOTE(review): this branch's extent was reconstructed from source
        # that had lost its indentation -- confirm the seed write belongs here.
        # Work on a copy so the module-level seed is not permanently renamed
        # by the first request.
        data = dict(empty_json)
        data["subcategories"] = list(empty_json["subcategories"])
        data["category"] = category_type
        # Directly from dictionary
        file_path = 'temp.json'
        with open(file_path, 'w') as outfile:
            json.dump(data, outfile)
    json_dict = expand_taxanomy(data, num_layers, num_items, category_type)
    print("json_dict", json_dict)
    # Save the modified data back to the file
    with open(file_path, 'w') as json_file:
        json.dump(json_dict, json_file, indent=4)  # 'indent' parameter makes the output more readable
    return file_path
def print_num(a, b):
    """Return ``(int(a), int(b))``; nothing is printed despite the name."""
    first = int(a)
    second = int(b)
    return first, second
# Gradio UI: declares the widgets, wires the buttons to the functions above,
# then starts the app.
# NOTE(review): the widget nesting below was reconstructed from source that
# had lost its indentation -- confirm which container the two output widgets
# belong to.
with gr.Blocks() as demo:
    gr.Markdown(
"""
# Auto Taxanomy App
Upload a JSON taxanomy file or generate from scratch.
""")
    with gr.Row():
        with gr.Column():
            # Inputs for generating / rendering a taxonomy
            json_file = gr.File(label="Upload JSON here.")
            num_layers = gr.Number(label="Number of layers")
            num_items = gr.Number(label="Number of items")
            category_type = gr.Text(label="Category type")
            modify_btn = gr.Button(value="Generate")
            render_btn = gr.Button(value="Render")
            # No click handler is attached to this button (see the
            # commented-out wiring further down).
            print_btn = gr.Button(value="Print")
        with gr.Column():
            # Inputs for extracting a taxonomy from a web page
            input_url = gr.Text(label="Insert URL")
            geturl_btn = gr.Button(value="Get JSON Taxanomy")
            #url_json_file = gr.File(label="URL JSON file.")
            rendered_tree = gr.Image(label="Taxanomy Tree.")
            output_file = gr.File(label="Ouput JSON file.")
            #print_text = gr.Text(label="Printing")
    # Generate: expand the uploaded (or seed) taxonomy and offer the JSON file
    modify_btn.click(modify_json, inputs=[json_file, num_layers, num_items, category_type], outputs=output_file)
    # Render: draw the taxonomy tree as an image
    render_btn.click(get_file, inputs=json_file, outputs=rendered_tree)
    #print_btn.click(print_num, inputs=[num_layers,num_items], outputs=print_text)
    # Get JSON Taxanomy: scrape the URL and offer the extracted taxonomy file
    geturl_btn.click(get_taxanomy_from_url, inputs=input_url, outputs=output_file)
demo.launch()