Spaces:

mr-dee
/

auto-taxanomy

Sleeping

App Files Files Community

Dy committed on Jul 24, 2023

Commit

4b05aaa

•

1 Parent(s): 160dabe

Create app.py

Browse files

Files changed (1) hide show

app.py +341 -0

app.py ADDED Viewed

	@@ -0,0 +1,341 @@

import ast
import collections
import json
import os
import re
import tempfile

import gradio as gr
import openai
import requests
import tiktoken
from bs4 import BeautifulSoup
from langchain import OpenAI, ConversationChain, LLMChain, PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferWindowMemory
# The OpenAI key is read once at import time; a missing variable raises
# KeyError immediately rather than failing later inside a UI callback.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
def save_webpage_as_html(url):
    """Fetch *url* and collect the text of every ``<li>`` and ``<ol>`` element.

    Despite the name, nothing is written to disk: the page is downloaded,
    parsed with BeautifulSoup and reduced to a mapping from tag name to the
    whitespace-stripped text of each matching element.

    Args:
        url: Address of the page to download.

    Returns:
        A ``collections.defaultdict(list)`` keyed by tag name (``'li'`` and/or
        ``'ol'``), or ``None`` when the request raises or the server answers
        with a status code other than 200.
    """
    # Browser-like request headers; the origin/referer values point at
    # coches.net and are sent unchanged for every URL.
    headers = {
        'authority': 'ms-mt--api-web.spain.advgo.net',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'accept': 'application/json, text/plain, */*',
        'x-adevinta-channel': 'web-desktop',
        'x-schibsted-tenant': 'coches',
        'sec-ch-ua-mobile': '?0',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
        'content-type': 'application/json;charset=UTF-8',
        'origin': 'https://www.coches.net',
        'sec-fetch-site': 'cross-site',
        'sec-fetch-mode': 'cors',
        'sec-fetch-dest': 'empty',
        'referer': 'https://www.coches.net/',
        'accept-language': 'en-US,en;q=0.9,es;q=0.8',
    }
    try:
        # Without a timeout a stalled server would block the caller forever.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        # Covers malformed URLs, DNS failures, refused connections, timeouts.
        print(f"Failed to get the webpage: {url} ({exc})")
        return None

    # Anything but a plain 200 is treated as a failed download.
    if response.status_code != 200:
        print(f"Failed to get the webpage: {url}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Group the text of each list-related tag under its tag name.
    result = collections.defaultdict(list)
    for tag in soup.find_all(['li', 'ol']):
        result[tag.name].append(tag.get_text(strip=True))
    return result
# Schema example pasted verbatim into the LLM prompts. It is illustrative
# text, not valid JSON (the leaf lists use bare placeholder names).
output_json_format = """
{
    "category": "root_category",
    "subcategories": [
        {
            "category": "node_category",
            "subcategories": [
                {
                    "category": "node_category",
                    "subcategories": [category1, category2, ...]
                },
                {
                    "category": "node_category",
                    "subcategories": [category1, category2, ...]
                }
            ]
        },
        {
            "category": "node_category",
            "subcategories": [category1, category2, ...]
        }
    ]
}
"""
# Seed taxonomy used when no JSON file is supplied: a root with no children.
empty_json = dict(category="root_category", subcategories=[])
def get_taxanomy_from_url(url):
    """Ask GPT-4 to turn the list items of a web page into a taxonomy file.

    The page is fetched with ``save_webpage_as_html``, its ``li``/``ol`` text
    is embedded in a prompt, and the model's reply is parsed into a
    dictionary that is written to ``url_temp.json``.

    Args:
        url: Address of the e-commerce page to analyse.

    Returns:
        The file name ``"url_temp.json"`` containing the taxonomy as JSON.

    Raises:
        ValueError: If nothing could be extracted from the page, or if the
            model never returns something that parses as a dictionary.
    """
    max_prompt_tokens = 8000
    max_attempts = 3

    url_dict = save_webpage_as_html(url)
    if not url_dict:
        # Covers both a failed download (None) and a page without list items;
        # sending "None" to the model would only produce an invented taxonomy.
        raise ValueError(f"No list items could be extracted from: {url}")
    # Show the model a plain dict rather than the defaultdict repr.
    json_input = str(dict(url_dict))

    template = '''
    {history}
    {human_input}
    '''
    prompt = PromptTemplate(
        input_variables=["history", "human_input"],
        template=template
    )
    llm = ChatOpenAI(model="gpt-4", temperature=0, openai_api_key=OPENAI_API_KEY)

    # The prompt is split around the page data so that only the data is
    # truncated; head + json_input + tail is the complete prompt text.
    prompt_head = '''
    You are an expert ecommerce product taxanomy analyst.
    You are equiped with vast knowledge of taxanomy, ontology and everything related to it.
    You fit have deep expertise in the domain of: "An ontology identifies and distinguishes concepts and their relationships;
    it describes content and relationships.
    A taxonomy formalizes the hierarchical relationships among concepts and specifies the term to be used to refer to each;
    it prescribes structure and terminology."
    You have a task to extract taxanomy from a python dictionary of an extracted html page of an ecommerce website.
    Here is the input python dictionary:
    '''
    prompt_tail = f'''
    Here is the output json format:
    {output_json_format}
    From the input python dictionary, extract all available products under the li and ol key and create the output json taxanomy.
    Think step by step.
    Place the products in categories and subcategories accordingly.
    Organize all the products to fit the output json format.
    The output should follow a python dictionary..
    Do not declare a new variable, output the python dictionary json object only.
    Do not output "The taxonomy extracted from the given python list can be represented as follows:"
    Do not provide extra information.  Directly output the python dictionary only.
    Do not insert any string before or after the python dictionary.
    Output python dictionary only.
    '''

    # Cutting the whole prompt at the token limit would drop the trailing
    # instructions on large pages, so the budget is applied to the data only.
    # NOTE(review): the model's context window must also hold the reply;
    # consider lowering max_prompt_tokens.
    encoding = tiktoken.encoding_for_model("gpt-4")
    fixed_tokens = len(encoding.encode(prompt_head + prompt_tail))
    data_budget = max(0, max_prompt_tokens - fixed_tokens)
    data_tokens = encoding.encode(json_input)
    if len(data_tokens) > data_budget:
        json_input = encoding.decode(data_tokens[:data_budget])
    prompt_input2 = prompt_head + json_input + prompt_tail

    json_dict = None
    last_error = None
    for _ in range(max_attempts):
        # A fresh chain per attempt keeps a rejected reply (and the large
        # prompt) out of the conversation history of the next attempt.
        chatgpt_chain = LLMChain(
            llm=llm,
            prompt=prompt,
            verbose=True,
            memory=ConversationBufferWindowMemory(k=10),
        )
        reply = chatgpt_chain.predict(human_input=prompt_input2).strip()
        try:
            parsed = ast.literal_eval(reply)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as exc:
            # The model may answer with strict JSON (true/false/null).
            try:
                parsed = json.loads(reply)
            except ValueError:
                last_error = exc
                continue
        if isinstance(parsed, dict):
            json_dict = parsed
            break

    if json_dict is None:
        raise ValueError(
            f"The model did not return a dictionary after {max_attempts} attempts"
        ) from last_error

    file_name = "url_temp.json"
    with open(file_name, 'w') as json_file:
        json.dump(json_dict, json_file, indent=4)  # indent for readability
    return file_name
def expand_taxanomy(json_dict, num_layers, num_items, category_type):
    """Ask GPT-4 to expand a taxonomy dictionary to a requested size.

    Args:
        json_dict: Existing taxonomy; its ``str()`` form is placed in the prompt.
        num_layers: Requested depth of the tree (converted with ``int``).
        num_items: Requested total number of items (converted with ``int``).
        category_type: Free-text description of the category domain.

    Returns:
        The expanded taxonomy as a dictionary parsed from the model's reply.

    Raises:
        ValueError: If ``num_layers``/``num_items`` are missing, or the reply
            cannot be parsed into a dictionary.
    """
    max_prompt_tokens = 8000

    if num_layers is None or num_items is None:
        raise ValueError("Both the number of layers and the number of items are required")
    num_layers = str(int(num_layers))
    num_items = str(int(num_items))
    json_input = str(json_dict)

    template = '''
    {history}
    {human_input}
    '''
    prompt = PromptTemplate(
        input_variables=["history", "human_input"],
        template=template
    )
    chatgpt_chain = LLMChain(
        llm=ChatOpenAI(model="gpt-4", temperature=0, openai_api_key=OPENAI_API_KEY),
        prompt=prompt,
        verbose=True,
        memory=ConversationBufferWindowMemory(k=10),
    )

    # The prompt is split around the taxonomy so that only the taxonomy is
    # truncated; head + json_input + tail is the complete prompt text.
    prompt_head = f'''
    You are an expert ecommerce product taxanomy analyst.
    You are equiped with vast knowledge of taxanomy, ontology and everything related to it.
    You fit have deep expertise in the domain of: "An ontology identifies and distinguishes concepts and their relationships;
    it describes content and relationships.
    A taxonomy formalizes the hierarchical relationships among concepts and specifies the term to be used to refer to each;
    it prescribes structure and terminology."
    You have a task to expand a taxanomy that is formatted in a json file.
    The taxanomy tree should be {num_layers} layer deep with a total of {num_items} items.
    The category type is {category_type}.
    Here is the input json file:
    '''
    prompt_tail = f'''
    Here is the output json format:
    {output_json_format}
    Expand the taxanomy of the input json file.
    Find subcategories that fits each category.
    Expand the leafs of the taxanomy tree.
    Go deeper. Think step by step.
    Find all subcategories and output it as a json object.
    The output should follow a python dictionary..
    Do not declare a new variable, output the python dictionary json object only.
    Do not provide extra information.  Directly output the python dictionary only.
    '''

    # Cutting the whole prompt at the token limit would drop the trailing
    # instructions for large inputs, so the budget is applied to the data only.
    encoding = tiktoken.encoding_for_model("gpt-4")
    fixed_tokens = len(encoding.encode(prompt_head + prompt_tail))
    data_budget = max(0, max_prompt_tokens - fixed_tokens)
    data_tokens = encoding.encode(json_input)
    if len(data_tokens) > data_budget:
        json_input = encoding.decode(data_tokens[:data_budget])
    prompt_input1 = prompt_head + json_input + prompt_tail

    reply = chatgpt_chain.predict(human_input=prompt_input1).strip()
    try:
        expanded = ast.literal_eval(reply)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as exc:
        # The model may answer with strict JSON (true/false/null).
        try:
            expanded = json.loads(reply)
        except ValueError:
            raise ValueError("The model reply could not be parsed as a dictionary") from exc
    if not isinstance(expanded, dict):
        raise ValueError(
            f"Expected a dictionary from the model, got {type(expanded).__name__}"
        )
    return expanded
def add_nodes_edges(graph, data, parent=None):
    """Recursively add a taxonomy dictionary to *graph* as nodes and edges.

    Args:
        graph: Object exposing ``node(name)`` and ``edge(tail, head)``.
        data: Mapping with a ``'category'`` entry and an optional
            ``'subcategories'`` list whose items are nested mappings of the
            same shape or leaf values.
        parent: Name of the node *data* hangs under, or ``None`` for the root.

    Raises:
        KeyError: If a mapping has no ``'category'`` entry.

    Category names are used directly as node identifiers, so two categories
    with the same name are passed to ``graph.node`` under the same identifier.
    NOTE(review): graphviz appears to treat ':' in edge endpoint names as a
    port separator - confirm before feeding names that contain colons.
    """
    # Names are coerced to str so numeric categories from the model still work.
    new_name = str(data['category'])
    graph.node(new_name)
    # `is not None` so an empty-string parent still gets its edge.
    if parent is not None:
        graph.edge(parent, new_name)

    # A missing key and an explicit None are both "no children".
    for subcat in data.get('subcategories') or []:
        if subcat is None:
            continue
        if isinstance(subcat, dict):
            # Nested category: recurse with the current node as its parent.
            add_nodes_edges(graph, subcat, new_name)
        else:
            # Leaf value (normally a string): one node plus one edge.
            leaf_name = str(subcat)
            graph.node(leaf_name)
            graph.edge(new_name, leaf_name)
def visualize_json(data):
    """Build a left-to-right Graphviz digraph from a taxonomy dictionary.

    Args:
        data: Taxonomy mapping accepted by ``add_nodes_edges``.

    Returns:
        The populated ``graphviz.Digraph``; rendering is left to the caller.
    """
    # Imported locally: the name `graphviz` is used here but is not brought
    # into scope by the module's import block, which caused a NameError.
    import graphviz

    # rankdir=LR lays the tree out from left to right.
    graph = graphviz.Digraph(graph_attr={'rankdir': 'LR'})
    add_nodes_edges(graph, data)
    return graph
def get_file(json_file):
    """Render the taxonomy stored in a JSON file as ``graph.png``.

    Args:
        json_file: Uploaded file object exposing a ``.name`` path, or ``None``
            when nothing was uploaded.

    Returns:
        Whatever ``graph.render(filename='graph', cleanup=True)`` returns
        (the path of the rendered PNG).

    Raises:
        OSError: If the upload cannot be used and ``temp.json`` cannot be read.
        ValueError: If ``temp.json`` does not contain valid JSON.
    """
    data = None
    loaded = False
    if json_file is not None:
        try:
            print("loading json file")
            print("temp_file", json_file.name)
            with open(json_file.name, 'r') as handle:
                data = json.load(handle)
            loaded = True
        except (AttributeError, TypeError, OSError, ValueError) as exc:
            # Unreadable or malformed upload: fall back to temp.json below.
            print(f"could not load uploaded file: {exc}")

    if not loaded:
        print("using temp json")
        with open('temp.json', 'r') as handle:
            data = json.load(handle)

    # Best-effort removal of a previous render so a stale image is not reused.
    try:
        os.remove('graph.png')
        print("graph removed")
    except FileNotFoundError:
        print("no existing graph")
    except OSError as exc:
        print(f"could not remove existing graph: {exc}")

    graph = visualize_json(data)
    # Render the graph as a PNG file; cleanup=True is passed so only the
    # image is kept.
    graph.format = 'png'
    rendered_path = graph.render(filename='graph', cleanup=True)
    return rendered_path
def modify_json(json_input, num_layers, num_items, category_type):
    """Expand a taxonomy with the LLM and write the result back to disk.

    Args:
        json_input: Uploaded file object exposing a ``.name`` path, or ``None``
            to start from an empty taxonomy rooted at *category_type*.
        num_layers: Requested depth, forwarded to ``expand_taxanomy``.
        num_items: Requested item count, forwarded to ``expand_taxanomy``.
        category_type: Category description; also the root name when starting
            from scratch.

    Returns:
        Path of the JSON file holding the expanded taxonomy: the uploaded
        file's path, or ``'temp.json'`` when nothing was uploaded.
    """
    print("json_input first", json_input)
    if json_input is not None:
        file_path = json_input.name
        # Open the file and load the JSON data
        with open(file_path, 'r') as json_file:
            data = json.load(json_file)
    else:
        # Build a fresh seed instead of mutating the shared module-level
        # `empty_json`, which would leak one request's category into the next.
        data = {
            **empty_json,
            "category": category_type,
            "subcategories": list(empty_json["subcategories"]),
        }
        file_path = 'temp.json'
        with open(file_path, 'w') as outfile:
            json.dump(data, outfile)

    json_dict = expand_taxanomy(data, num_layers, num_items, category_type)
    print("json_dict", json_dict)

    # Save the expanded taxonomy over the file it was read from.
    with open(file_path, 'w') as json_file:
        json.dump(json_dict, json_file, indent=4)  # indent for readability
    return file_path
def print_num(a, b):
    """Return both arguments converted with ``int`` as a 2-tuple."""
    first = int(a)
    second = int(b)
    return first, second
# Gradio UI: the left column holds the inputs and action buttons, the right
# column holds the URL input and the outputs.
with gr.Blocks() as demo:
    gr.Markdown(
    """
    # Auto Taxanomy App
    Upload a JSON taxanomy file or generate from scratch.
    """)
    with gr.Row():
        with gr.Column():
            json_file = gr.File(label="Upload JSON here.")
            num_layers = gr.Number(label="Number of layers")
            num_items = gr.Number(label="Number of items")
            category_type = gr.Text(label="Category type")
            modify_btn = gr.Button(value="Generate")
            render_btn = gr.Button(value="Render")
            print_btn = gr.Button(value="Print")
        with gr.Column():
            input_url = gr.Text(label="Insert URL")
            geturl_btn = gr.Button(value="Get JSON Taxanomy")
            #url_json_file = gr.File(label="URL JSON file.")
            rendered_tree =  gr.Image(label="Taxanomy Tree.")
            # Label typo fixed ("Ouput" -> "Output").
            output_file = gr.File(label="Output JSON file.")
            print_text = gr.Text(label="Printing")
    # Generate: expand the uploaded (or empty) taxonomy and offer the JSON file.
    modify_btn.click(modify_json, inputs=[json_file, num_layers, num_items, category_type], outputs=output_file)
    # Render: draw the taxonomy read from the uploaded file (or temp.json).
    render_btn.click(get_file, inputs=json_file, outputs=rendered_tree)
    # Print: echo the two numeric inputs as integers.
    print_btn.click(print_num, inputs=[num_layers,num_items], outputs=print_text)
    # Get JSON Taxanomy: extract a taxonomy from the page at the given URL.
    geturl_btn.click(get_taxanomy_from_url, inputs=input_url, outputs=output_file)
demo.launch()