Dy committed on
Commit
4b05aaa
1 Parent(s): 160dabe

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +341 -0
app.py ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Imports for the taxonomy-extraction Gradio app.
import tiktoken
import os
from bs4 import BeautifulSoup
import gradio as gr
from langchain import OpenAI, ConversationChain, LLMChain, PromptTemplate
from langchain.memory import ConversationBufferWindowMemory
import openai
import requests
from langchain.chat_models import ChatOpenAI
import ast
import re
import json
import tempfile
import collections  # FIX: was "import collectionstions" (typo); collections.defaultdict is used below
import graphviz  # FIX: used by visualize_json() but was never imported (NameError at render time)

# OpenAI key is read from the environment; raises KeyError if unset,
# which fails fast at startup rather than mid-request.
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
17
+
18
+
19
def save_webpage_as_html(url):
    """Download *url* and collect the text of its <li> and <ol> elements.

    Returns a collections.defaultdict mapping tag name -> list of stripped
    text contents, or None when the HTTP response is not 200.
    """
    # Browser-like headers (captured from coches.net traffic) to reduce the
    # chance of the request being blocked by the target site.
    request_headers = {
        'authority': 'ms-mt--api-web.spain.advgo.net',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'accept': 'application/json, text/plain, */*',
        'x-adevinta-channel': 'web-desktop',
        'x-schibsted-tenant': 'coches',
        'sec-ch-ua-mobile': '?0',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
        'content-type': 'application/json;charset=UTF-8',
        'origin': 'https://www.coches.net',
        'sec-fetch-site': 'cross-site',
        'sec-fetch-mode': 'cors',
        'sec-fetch-dest': 'empty',
        'referer': 'https://www.coches.net/',
        'accept-language': 'en-US,en;q=0.9,es;q=0.8',
    }

    response = requests.get(url, headers=request_headers)

    # Anything other than 200 is treated as a failed fetch.
    if response.status_code != 200:
        print(f"Failed to get the webpage: {url}")
        return

    # Parse the page and bucket the text of list-related tags by tag name.
    soup = BeautifulSoup(response.text, 'html.parser')
    texts_by_tag = collections.defaultdict(list)
    for element in soup.find_all(['li', 'ol']):
        texts_by_tag[element.name].append(element.get_text(strip=True))

    return texts_by_tag
55
+
56
# Target JSON shape the model is asked to produce: a recursive tree of
# {"category": ..., "subcategories": [...]} nodes. This text is embedded
# verbatim in the prompts below.
output_json_format = '''
{
    "category": "root_category",
    "subcategories": [
        {
            "category": "node_category",
            "subcategories": [
                {
                    "category": "node_category",
                    "subcategories": [category1, category2, ...]
                },
                {
                    "category": "node_category",
                    "subcategories": [category1, category2, ...]
                }
            ]
        },
        {
            "category": "node_category",
            "subcategories": [category1, category2, ...]
        }
    ]
}

'''

# Minimal seed taxonomy used when the user uploads no JSON file;
# modify_json() fills in the root category name.
empty_json = {
    "category": "root_category",
    "subcategories": [
    ]
}
87
+
88
def get_taxanomy_from_url(url):
    """Scrape *url*, ask GPT-4 to organize its list items into a taxonomy,
    and save the result as "url_temp.json".

    Returns the name of the JSON file written.
    Raises ValueError if the model never produces a valid dictionary.
    """
    url_dict = save_webpage_as_html(url)

    json_input = str(url_dict)

    # The chain simply replays conversation history followed by the new input.
    template = '''
    {history}
    {human_input}
    '''
    prompt = PromptTemplate(
        input_variables=["history", "human_input"],
        template=template
    )

    chatgpt_chain = LLMChain(
        llm=ChatOpenAI(model="gpt-4", temperature=0, openai_api_key=OPENAI_API_KEY),
        prompt=prompt,
        verbose=True,
        memory=ConversationBufferWindowMemory(k=10),
    )

    prompt_input2 = f'''
    You are an expert ecommerce product taxanomy analyst.
    You are equiped with vast knowledge of taxanomy, ontology and everything related to it.
    You fit have deep expertise in the domain of: "An ontology identifies and distinguishes concepts and their relationships;
    it describes content and relationships.
    A taxonomy formalizes the hierarchical relationships among concepts and specifies the term to be used to refer to each;
    it prescribes structure and terminology."

    You have a task to extract taxanomy from a python dictionary of an extracted html page of an ecommerce website.

    Here is the input python dictionary:
    {json_input}

    Here is the output json format:
    {output_json_format}

    From the input python dictionary, extract all available products under the li and ol key and create the output json taxanomy.
    Think step by step.
    Place the products in categories and subcategories accordingly.
    Organize all the products to fit the output json format.

    The output should follow a python dictionary..
    Do not declare a new variable, output the python dictionary json object only.
    Do not output "The taxonomy extracted from the given python list can be represented as follows:"
    Do not provide extra information. Directly output the python dictionary only.
    Do not insert any string before or after the python dictionary.
    Output python dictionary only.
    '''
    # Truncate to the first 8000 tokens so the request fits the model context.
    encoding = tiktoken.encoding_for_model("gpt-4")
    encoded_prompt2 = encoding.encode(prompt_input2)[:8000]
    prompt_input2 = encoding.decode(encoded_prompt2)

    # FIX: the original `while type(json_dict) != dict:` loop called
    # ast.literal_eval unguarded, so one malformed reply crashed the app,
    # and a persistent non-dict literal would loop forever. Use bounded,
    # exception-guarded retries instead.
    json_dict = None
    for _ in range(3):
        json_taxanomy_output = chatgpt_chain.predict(human_input=prompt_input2)
        try:
            candidate = ast.literal_eval(json_taxanomy_output)
        except (ValueError, SyntaxError):
            continue  # reply was not a valid Python literal; retry
        if isinstance(candidate, dict):
            json_dict = candidate
            break
    if json_dict is None:
        raise ValueError("Model did not return a valid taxonomy dictionary")

    file_name = "url_temp.json"

    # Save the extracted taxonomy for the Gradio File output.
    with open(file_name, 'w') as json_file:
        json.dump(json_dict, json_file, indent=4)  # 'indent' makes the output readable

    return(file_name)
154
+
155
+
156
def expand_taxanomy(json_dict, num_layers, num_items, category_type):
    """Ask GPT-4 to expand *json_dict* into a taxonomy tree of roughly
    *num_layers* layers and *num_items* items for the given *category_type*.

    Returns the expanded taxonomy as a Python dict.
    Raises ValueError if the model never produces a valid dictionary.
    """
    # Gradio Number inputs arrive as floats; normalize to integer strings
    # for interpolation into the prompt.
    num_layers = str(int(num_layers))
    num_items = str(int(num_items))
    json_input = str(json_dict)

    # The chain simply replays conversation history followed by the new input.
    template = '''
    {history}
    {human_input}
    '''
    prompt = PromptTemplate(
        input_variables=["history", "human_input"],
        template=template
    )

    chatgpt_chain = LLMChain(
        llm=ChatOpenAI(model="gpt-4", temperature=0, openai_api_key=OPENAI_API_KEY),
        prompt=prompt,
        verbose=True,
        memory=ConversationBufferWindowMemory(k=10),
    )

    prompt_input1 = f'''
    You are an expert ecommerce product taxanomy analyst.
    You are equiped with vast knowledge of taxanomy, ontology and everything related to it.
    You fit have deep expertise in the domain of: "An ontology identifies and distinguishes concepts and their relationships;
    it describes content and relationships.
    A taxonomy formalizes the hierarchical relationships among concepts and specifies the term to be used to refer to each;
    it prescribes structure and terminology."

    You have a task to expand a taxanomy that is formatted in a json file.
    The taxanomy tree should be {num_layers} layer deep with a total of {num_items} items.
    The category type is {category_type}.

    Here is the input json file:
    {json_input}

    Here is the output json format:
    {output_json_format}

    Expand the taxanomy of the input json file.
    Find subcategories that fits each category.
    Expand the leafs of the taxanomy tree.
    Go deeper. Think step by step.
    Find all subcategories and output it as a json object.

    The output should follow a python dictionary..
    Do not declare a new variable, output the python dictionary json object only.
    Do not provide extra information. Directly output the python dictionary only.
    '''

    # Truncate to the first 8000 tokens so the request fits the model context.
    encoding = tiktoken.encoding_for_model("gpt-4")
    encoded_prompt1 = encoding.encode(prompt_input1)[:8000]
    prompt_input1 = encoding.decode(encoded_prompt1)

    # FIX: ast.literal_eval was called unguarded, so one malformed model
    # reply crashed the app. Retry a few times (consistent with
    # get_taxanomy_from_url) before giving up.
    for _ in range(3):
        json_taxanomy_output = chatgpt_chain.predict(human_input=prompt_input1)
        try:
            candidate = ast.literal_eval(json_taxanomy_output)
        except (ValueError, SyntaxError):
            continue  # reply was not a valid Python literal; retry
        if isinstance(candidate, dict):
            return candidate
    raise ValueError("Model did not return a valid taxonomy dictionary")
215
+
216
+
217
def add_nodes_edges(graph, data, parent=None):
    """Recursively add the taxonomy tree in *data* to *graph*.

    *data* is a dict with a 'category' name and an optional 'subcategories'
    list whose items are either leaf strings or nested dicts of the same
    shape. When *parent* is given, an edge parent -> category is drawn.
    """
    category_name = data['category']

    # Node for this category, and an edge from its parent if it has one.
    graph.node(category_name)
    if parent:
        graph.edge(parent, category_name)

    for child in data.get('subcategories', []):
        if isinstance(child, str):
            # Leaf subcategory: a bare string gets its own node and edge.
            graph.node(child)
            graph.edge(category_name, child)
        else:
            # Nested dict: recurse with this category as the parent.
            add_nodes_edges(graph, child, category_name)
238
+
239
def visualize_json(data):
    """Build and return a left-to-right graphviz Digraph of the taxonomy
    dict *data* (not rendered here; callers render/view it themselves)."""
    tree = graphviz.Digraph(graph_attr={'rankdir': 'LR'})  # 'LR' = left-to-right layout
    add_nodes_edges(tree, data)
    #graph.view()
    return tree
249
+
250
def get_file(json_file):
    """Render the taxonomy JSON as a PNG tree image.

    Loads the uploaded Gradio file if available, otherwise falls back to
    the locally written 'temp.json'. Returns the path of the rendered PNG.
    """
    try:
        print("loading json file")
        print("temp_file", json_file.name)
        file_path = json_file.name

        with open(file_path, 'r') as fh:
            data = json.load(fh)
    # FIX: narrowed from a bare `except:` — fall back to temp.json only when
    # no file was uploaded (json_file is None -> AttributeError), the file
    # is unreadable (OSError), or it is not valid JSON (ValueError).
    except (AttributeError, OSError, ValueError):
        print("using temp json")
        file_path = 'temp.json'

        with open(file_path, 'r') as fh:
            data = json.load(fh)

    # Remove any stale rendering before producing a new one.
    try:
        os.remove('graph.png')
        print("graph removed")
    except OSError:  # FIX: narrowed from a bare `except:`
        print("no existing graph")

    graph = visualize_json(data)
    # Render the graph as a PNG file
    graph.format = 'png'
    graph = graph.render(filename='graph', cleanup=True)

    return graph
278
+
279
def modify_json(json_input, num_layers, num_items, category_type):
    """Gradio handler: load the uploaded taxonomy JSON (or seed a fresh one),
    expand it with GPT-4, and write the result to 'temp.json'.

    Returns the path of the written JSON file.
    """
    print("json_input first", json_input)
    if json_input is not None:
        file_path = json_input.name
        # Open the uploaded file and load the JSON data
        with open(file_path, 'r') as json_file:
            data = json.load(json_file)
    else:
        # FIX: copy the module-level template instead of aliasing it —
        # `data = empty_json` made the next line mutate empty_json for
        # every subsequent call in the process.
        data = dict(empty_json)
        data["category"] = category_type

    # Persist the seed taxonomy so get_file() can fall back to it.
    file_path = 'temp.json'
    with open(file_path, 'w') as outfile:
        json.dump(data, outfile)

    json_dict = expand_taxanomy(data, num_layers, num_items, category_type)

    print("json_dict", json_dict)

    # Save the modified data back to the file
    with open(file_path, 'w') as json_file:
        json.dump(json_dict, json_file, indent=4)  # 'indent' parameter makes the output more readable

    return(file_path)
306
+
307
def print_num(a, b):
    """Debug helper: return both inputs truncated to integers as a tuple."""
    return int(a), int(b)
309
+
310
+
311
+
312
# Gradio UI: upload or generate a JSON taxonomy, expand it with GPT-4,
# render it as a tree image, or scrape a taxonomy from a URL.
with gr.Blocks() as demo:

    gr.Markdown(
    """
    # Auto Taxanomy App
    Upload a JSON taxanomy file or generate from scratch.
    """)
    with gr.Row():
        with gr.Column():
            # Left column: taxonomy inputs and action buttons.
            json_file = gr.File(label="Upload JSON here.")
            num_layers = gr.Number(label="Number of layers")
            num_items = gr.Number(label="Number of items")
            category_type = gr.Text(label="Category type")
            modify_btn = gr.Button(value="Generate")
            render_btn = gr.Button(value="Render")
            print_btn = gr.Button(value="Print")
        with gr.Column():
            # Right column: URL scraping input plus all outputs.
            input_url = gr.Text(label="Insert URL")
            geturl_btn = gr.Button(value="Get JSON Taxanomy")
            #url_json_file = gr.File(label="URL JSON file.")
            rendered_tree = gr.Image(label="Taxanomy Tree.")
            output_file = gr.File(label="Ouput JSON file.")
            print_text = gr.Text(label="Printing")

    # Wire each button to its handler function defined above.
    modify_btn.click(modify_json, inputs=[json_file, num_layers, num_items, category_type], outputs=output_file)
    render_btn.click(get_file, inputs=json_file, outputs=rendered_tree)
    print_btn.click(print_num, inputs=[num_layers,num_items], outputs=print_text)
    geturl_btn.click(get_taxanomy_from_url, inputs=input_url, outputs=output_file)

# Launch the Gradio server (blocking call).
demo.launch()