Files changed (8)
  1. README.md +5 -6
  2. app.py +0 -430
  3. optimization.py +0 -66
  4. requirements.txt +0 -10
  5. src/__init__.py +0 -0
  6. src/optimization.py +0 -66
  7. src/text.py +0 -130
  8. text.py +0 -130
README.md CHANGED
@@ -1,14 +1,13 @@
 ---
-title: QA/FAQ Generator
-emoji: 📈
-colorFrom: gray
+title: Qa Generator
+emoji: 👍
+colorFrom: green
 colorTo: green
 sdk: gradio
 sdk_version: 4.36.1
 app_file: app.py
-pinned: true
+pinned: false
 license: apache-2.0
-short_description: Generates Questions and Answers from given text content.
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py DELETED
@@ -1,430 +0,0 @@
-import gradio as gr
-import torch
-import itertools
-import pandas as pd
-import spaces
-import random
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModel
-from sklearn.metrics import pairwise_distances
-from collections import Counter
-from itertools import chain
-from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
-import math
-import markdown
-from src.text import doctree_from_url, get_selectors_for_class, split_by_heading, DocTree
-from src.optimization import ngrams, count_ngrams, self_bleu, dist_n, perplexity, js_divergence
-
-
-model_name = 'philipp-zettl/t5-small-long-qa'
-qa_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-model_name = 'philipp-zettl/t5-small-qg'
-qg_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-small')
-
-embedding_model = AutoModel.from_pretrained('sentence-transformers/paraphrase-MiniLM-L6-v2')
-embedding_tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-MiniLM-L6-v2')
-
-# Move only the student model to GPU if available
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-qa_model = qa_model.to(device)
-qg_model = qg_model.to(device)
-embedding_model = embedding_model.to(device)
-
-max_questions = 1
-max_answers = 1
-max_elem_value = 100
-
-
-def embedding_similarity(inputs, outputs):
-    global embedding_model, embedding_tokenizer, device
-    def embed(texts):
-        inputs = embedding_tokenizer(texts, return_tensors='pt', padding=True, truncation=True).to(device)
-        with torch.no_grad():
-            outputs = embedding_model(**inputs)
-        return outputs.last_hidden_state.mean(dim=1).cpu().numpy()
-
-    input_embeddings = embed(inputs)
-    output_embeddings = embed(outputs)
-
-    similarities = pairwise_distances(input_embeddings, output_embeddings, metric='cosine')
-    return sum(similarities) / len(similarities)
-
-
-def evaluate_model(num_beams, num_beam_groups, model, tokenizer, eval_data, max_length=85):
-    generated_outputs = []
-
-    for input_text in eval_data:
-        input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
-        outputs = model.generate(
-            input_ids,
-            num_beams=num_beams,
-            num_beam_groups=num_beam_groups,
-            diversity_penalty=1.0,
-            max_new_tokens=max_length,
-        )
-        decoded_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        generated_outputs.append(decoded_text.split())
-
-    # Self-BLEU for diversity
-    diversity_score = self_bleu(generated_outputs)
-
-    # Dist-1 and Dist-2 for diversity
-    dist1 = dist_n(generated_outputs, 1)
-    dist2 = dist_n(generated_outputs, 2)
-
-    # Perplexity for fluency and relevance
-    fluency_score = perplexity(model, tokenizer, [" ".join(output) for output in generated_outputs])
-
-    # Embedding similarity for contextual relevance
-    contextual_score = embedding_similarity(eval_data, [" ".join(output) for output in generated_outputs])
-
-    # Jensen-Shannon Divergence for distribution similarity
-    generated_ngrams = count_ngrams(list(chain(*generated_outputs)), 4)
-    reference_ngrams = count_ngrams(list(chain(*[tokenizer.tokenize(text) for text in eval_data])), 4)
-    all_ngrams = set(generated_ngrams.keys()).union(set(reference_ngrams.keys()))
-    p = [generated_ngrams[ngram] for ngram in all_ngrams]
-    q = [reference_ngrams[ngram] for ngram in all_ngrams]
-    jsd_score = js_divergence(p, q)
-
-    return {
-        "diversity_score": diversity_score,
-        "dist1": dist1,
-        "dist2": dist2,
-        "fluency_score": fluency_score,
-        "contextual_score": contextual_score,
-        "jsd_score": jsd_score
-    }
-
-
-def find_best_parameters(eval_data, model, tokenizer, max_length=85):
-
-    # Parameter ranges
-    parameter_map = {
-        2: [2],
-        4: [2],
-        6: [2],  # 6x3 == 4x2
-        8: [2],  # 8x4 == 6x3 == 4x2
-        9: [3],
-        10: [2],  # 10x5 == 8x4 == 6x3 == 4x2
-    }
-
-    # Find the best parameters
-    best_score = -float('inf')
-    best_params = None
-
-    for num_beams in parameter_map.keys():
-        for num_beam_groups in parameter_map[num_beams]:
-            if num_beam_groups > num_beams:
-                continue  # num_beam_groups should not be greater than num_beams
-
-            scores = evaluate_model(num_beams, num_beam_groups, model, tokenizer, eval_data, max_length=max_length)
-            # Combine scores to determine the best parameters
-            combined_score = (scores['dist1'] + scores['dist2'] - scores['fluency_score'] + scores['contextual_score'] - scores['jsd_score']).mean()
-            print(f"num_beams={num_beams}, num_beam_groups={num_beam_groups}, avg combined score={combined_score}")
-
-            if combined_score > best_score:
-                best_score = combined_score
-                best_params = (num_beams, num_beam_groups)
-
-    print(f"Best parameters: num_beams={best_params[0]}, num_beam_groups={best_params[1]} with combined score={best_score}")
-    return best_params
-
-
-def run_model(inputs, tokenizer, model, num_beams=2, num_beam_groups=2, temperature=0.5, num_return_sequences=1, max_length=85, seed=42069):
-    all_outputs = []
-    torch.manual_seed(seed)
-    for input_text in inputs:
-        model_inputs = tokenizer([input_text], max_length=512, padding=True, truncation=True)
-        input_ids = torch.tensor(model_inputs['input_ids']).to(device)
-        for sample in input_ids:
-            sample_outputs = []
-            with torch.no_grad():
-                sample_output = model.generate(
-                    input_ids[:1],
-                    max_length=max_length,
-                    num_return_sequences=num_return_sequences,
-                    low_memory=True,
-                    use_cache=True,
-                    # Diverse Beam search decoding
-                    num_beams=max(2, num_return_sequences),
-                    num_beam_groups=max(2, num_return_sequences),
-                    diversity_penalty=temperature,
-
-                )
-            for i, sample_output in enumerate(sample_output):
-                sample_output = sample_output.unsqueeze(0)
-                sample_output = tokenizer.decode(sample_output[0], skip_special_tokens=True)
-                sample_outputs.append(sample_output)
-
-            all_outputs.append(sample_outputs)
-    return all_outputs
-
-
-@spaces.GPU
-def gen(content, temperature_qg=0.5, temperature_qa=0.75, num_return_sequences_qg=1, num_return_sequences_qa=1, max_length=85, seed=42069, optimize_questions=False):
-    inputs = [
-        f'context: {content}'
-    ]
-    question = run_model(
-        inputs,
-        tokenizer,
-        qg_model,
-        num_beams=num_return_sequences_qg,
-        num_beam_groups=num_return_sequences_qg,
-        temperature=temperature_qg,
-        num_return_sequences=num_return_sequences_qg,
-        max_length=max_length,
-        seed=seed
-    )
-
-    if optimize_questions:
-        q_params = find_best_parameters(
-            list(chain.from_iterable(question)), qg_model, tokenizer, max_length=max_length
-        )
-
-        question = run_model(
-            inputs,
-            tokenizer,
-            qg_model,
-            num_beams=q_params[0],
-            num_beam_groups=q_params[1],
-            temperature=temperature_qg,
-            num_return_sequences=num_return_sequences_qg,
-            max_length=max_length,
-            seed=seed
-        )
-
-    inputs = list(chain.from_iterable([
-        [f'question: {q} context: {content}' for q in q_set] for q_set in question
-    ]))
-    answer = run_model(
-        inputs,
-        tokenizer,
-        qa_model,
-        num_beams=num_return_sequences_qa,
-        num_beam_groups=num_return_sequences_qa,
-        temperature=temperature_qa,
-        num_return_sequences=num_return_sequences_qa,
-        max_length=max_length,
-        seed=seed
-    )
-
-    questions = list(chain.from_iterable(question))
-    answers = list(chain.from_iterable(answer))
-
-    results = []
-    for idx, ans in enumerate(answers):
-        results.append({'question': questions[idx % num_return_sequences_qg], 'answer': ans})
-    return results
-
-
-def variable_outputs(k, max_elems=10):
-    global max_elem_value
-    k = int(k)
-    return [gr.Text(visible=True)] * k + [gr.Text(visible=False)] * (max(max_elems, max_elem_value) - k)
-
-
-def set_outputs(content, max_elems=10):
-    c = eval(content)
-    print('received content: ', c)
-    return [gr.Text(value=t, visible=True) for t in c] + [gr.Text(visible=False)] * (max(max_elems, 10) - len(c))
-
-
-def create_file_download(qnas):
-    with open('qnas.tsv', 'w') as f:
-        for idx, qna in qnas.iterrows():
-            f.write(qna['Question'] + '\t' + qna['Answer'])
-            if idx < len(qnas) - 1:
-                f.write('\n')
-    return 'qnas.tsv'
-
-
-def main():
-    with gr.Tab(label='QA Generator'):
-        with gr.Tab(label='Explanation'):
-            gr.Markdown(
-                '''
-                # QA Generator
-                This tab allows you to generate questions and answers from a given piece of text content.
-
-                ## How to use
-                1. Enter the text content you want to generate questions and answers from.
-                2. Adjust the diversity penalty for question generation and answer generation.
-                3. Set the maximum length of the generated questions and answers.
-                4. Choose the number of questions and answers you want to generate.
-                5. Click on the "Generate" button.
-
-                The next section will give you insights into the generated questions and answers.
-
-                If you're satisfied with the generated questions and answers, you can download them as a TSV file.
-                '''
-            )
-            with gr.Accordion(label='Optimization', open=False):
-                gr.Markdown("""
-                For optimization of the question generation we apply the following combined score:
-
-                $$\\text{combined} = \\text{dist1} + \\text{dist2} - \\text{fluency} + \\text{contextual} - \\text{jsd}$$
-
-                Here's a brief explanation of each component:
-
-                1. **dist1 and dist2**: These represent the diversity of the generated outputs. dist1 measures the ratio of unique unigrams to total unigrams, and dist2 measures the ratio of unique bigrams to total bigrams. <u>**Higher values indicate more diverse outputs.**</u>
-
-                2. **fluency**: This is the perplexity of the generated outputs, which measures how well the outputs match the language model's expectations. <u>**Lower values indicate better fluency.**</u>
-
-                3. **contextual**: This measures the similarity between the input and generated outputs using embedding similarity. <u>**Higher values indicate better contextual relevance.**</u>
-
-                4. **jsd**: This is the Jensen-Shannon Divergence between the n-gram distributions of the generated outputs and the reference data. <u>**Lower values indicate greater similarity between distributions.**</u>
-                """, latex_delimiters=[{'display': False, 'left': '$$', 'right': '$$'}])
-        with gr.Tab(label='Generate QA'):
-            with gr.Row(equal_height=True):
-                with gr.Group("Content"):
-                    content = gr.Textbox(label='Content', lines=15, placeholder='Enter text here', max_lines=10_000)
-                with gr.Group("Settings"):
-                    temperature_qg = gr.Slider(label='Diversity Penalty QG', value=0.2, minimum=0, maximum=1, step=0.01)
-                    temperature_qa = gr.Slider(label='Diversity Penalty QA', value=0.5, minimum=0, maximum=1, step=0.01)
-                    max_length = gr.Number(label='Max Length', value=85, minimum=1, step=1, maximum=512)
-                    num_return_sequences_qg = gr.Number(label='Number Questions', value=max_questions, minimum=1, step=1, maximum=max(max_questions, max_elem_value))
-                    num_return_sequences_qa = gr.Number(label="Number Answers", value=max_answers, minimum=1, step=1, maximum=max(max_questions, max_elem_value))
-                    seed = gr.Number(label="seed", value=42069)
-                    optimize_questions = gr.Checkbox(label="Optimize questions?", value=False)
-
-            with gr.Row():
-                gen_btn = gr.Button("Generate")
-
-            @gr.render(
-                inputs=[
-                    content, temperature_qg, temperature_qa, num_return_sequences_qg, num_return_sequences_qa,
-                    max_length, seed, optimize_questions
-                ],
-                triggers=[gen_btn.click]
-            )
-            def render_results(content, temperature_qg, temperature_qa, num_return_sequences_qg, num_return_sequences_qa, max_length, seed, optimize_questions):
-                if not content.strip():
-                    raise gr.Error('Please enter some content to generate questions and answers.')
-                qnas = gen(
-                    content, temperature_qg, temperature_qa, num_return_sequences_qg, num_return_sequences_qa,
-                    max_length, seed, optimize_questions
-                )
-                df = gr.Dataframe(
-                    value=[u.values() for u in qnas],
-                    headers=['Question', 'Answer'],
-                    col_count=2,
-                    wrap=True
-                )
-                pd_df = pd.DataFrame([u.values() for u in qnas], columns=['Question', 'Answer'])
-
-                download = gr.DownloadButton(label='Download (without headers)', value=create_file_download(pd_df))
-
-            content.change(lambda x: x.strip(), content)
-
-
-def new_main():
-    with gr.Tab('Content extraction from URL'):
-        with gr.Tab(label='Explanation'):
-            gr.Markdown(
-                '''
-                # Content extraction from URL
-                This tab allows you to extract content from a URL and chunk it into sections.
-
-                ## How to use
-                1. Enter the URL of the webpage you want to extract content from.
-                2. Select the element class and class name of the HTML element you want to extract content from.
-                3. Click on the "Extract content" button.
-
-                The next section will give you insights into the extracted content.
-
-                This was done to give you the possibility to look at the extracted content, as well as manipulate it further.
-
-                Once you extract the content, you can choose the depth level to chunk the content into sections.
-                1. Enter the depth level you want to chunk the content into. **Note: <u>This is based on the HTML structure of the webpage, we're utilizing heading tags for this purpose</u>**
-                2. Click on the "Chunk content" button.
-                '''
-            )
-        with gr.Tab(label='Extract content'):
-            url = gr.Textbox(label='URL', placeholder='Enter URL here', lines=1, max_lines=1)
-            elem_class = gr.Dropdown(label='CSS element class', choices=['div', 'p', 'span', 'main', 'body', 'section', 'main'], value='div')
-            class_name = gr.Dropdown(label='CSS class name', choices=[], allow_custom_value=True)
-
-            extract_btn = gr.Button('Extract content')
-
-            with gr.Group():
-                content_state = gr.State(None)
-                final_content = gr.Textbox(value='', show_copy_button=True, label='Final content', interactive=True)
-                with gr.Accordion('Reveal original input', open=False):
-                    og_content = gr.Textbox(value='', label='OG HTML content')
-
-            with gr.Group(visible=False) as step_2_group:
-                depth_level = gr.Number(label='Depth level', value=1, minimum=0, step=1, maximum=6)
-                continue_btn = gr.Button('Chunk content')
-
-            def render_results(url, elem_class_, class_name_):
-                if not url.strip():
-                    raise gr.Error('Please enter a URL to extract content.')
-                content = doctree_from_url(url, elem_class_, class_name_)
-                return [
-                    content,
-                    content.content,
-                    content.as_markdown(content.merge_sections(content.get_sections(0))),
-                    gr.Group(visible=True)
-                ]
-
-            def get_class_options(url, elem_class):
-                if not url.strip():
-                    raise gr.Error('Please enter a URL to extract content.')
-
-                return gr.Dropdown(label='CSS class name', choices=list(set(get_selectors_for_class(url, elem_class))))
-
-            def update_content_state_on_final_change(final_content):
-                html_content = markdown.markdown(final_content)
-                return DocTree(split_by_heading(html_content, 1))
-
-            @gr.render(inputs=[content_state, depth_level], triggers=[continue_btn.click])
-            def select_content(content, depth_level):
-                if not content:
-                    raise gr.Error('Please extract content first.')
-
-                sections = content.get_sections_by_depth(depth_level)
-                print(f'Found {len(sections)} sections')
-                ds = []
-                for idx, section in enumerate(sections):
-                    ds.append([idx, content.as_markdown(content.merge_sections(section))])
-                gr.Dataframe(value=ds, headers=['Section #', 'Content'], interactive=True, wrap=True)
-
-            elem_class.change(
-                get_class_options,
-                inputs=[url, elem_class],
-                outputs=[class_name]
-            )
-
-            extract_btn.click(
-                render_results,
-                inputs=[
-                    url, elem_class, class_name,
-                ],
-                outputs=[
-                    content_state, og_content, final_content, step_2_group
-                ]
-            )
-            final_content.change(update_content_state_on_final_change, inputs=[final_content], outputs=[content_state])
-
-
-with gr.Blocks() as demo:
-    gr.Markdown(
-        '''
-        # QA-Generator
-        A tool to build FAQs or QnAs from a given piece of text content.

-        ## How to use
-        We provide you two major functionalities:
-        1. **Content extraction from URL**: Extract content from a URL and chunk it into sections.
-        2. **QA Generator**: Generate questions and answers from a given text content.
-
-        Select the tab you want to use and follow the instructions.
-        '''
-    )
-    new_main()
-    main()
-
-
-demo.queue()
-demo.launch()
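
The generation core of the deleted app was diverse beam search over two small T5 heads: a question generator fed `context: ...` prompts and an answer generator fed `question: ... context: ...` prompts, exactly as `gen` wired them above. A minimal sketch of that flow, assuming the two Hub checkpoints remain available; the helper `generate` below is an illustration, not the original `run_model`:

```python
# Hedged sketch of the deleted QG -> QA flow; `generate` is an illustrative
# helper, and the philipp-zettl checkpoints are assumed to still exist on the Hub.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-small')
qg_model = AutoModelForSeq2SeqLM.from_pretrained('philipp-zettl/t5-small-qg')
qa_model = AutoModelForSeq2SeqLM.from_pretrained('philipp-zettl/t5-small-long-qa')

def generate(model, prompt, n=1, diversity_penalty=0.5, max_length=85):
    # Diverse beam search: num_beams must be divisible by num_beam_groups,
    # and diversity_penalty > 0 pushes the beam groups apart.
    input_ids = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=512).input_ids
    outputs = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=max(2, n),
        num_beam_groups=max(2, n),
        diversity_penalty=diversity_penalty,
        num_return_sequences=n,
    )
    return [tokenizer.decode(o, skip_special_tokens=True) for o in outputs]

context = 'The Eiffel Tower was completed in 1889 and is 330 metres tall.'
for question in generate(qg_model, f'context: {context}', n=2):
    answer = generate(qa_model, f'question: {question} context: {context}')[0]
    print(question, '->', answer)
```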
optimization.py DELETED
@@ -1,66 +0,0 @@
-from collections import Counter
-from itertools import chain
-import math
-import torch
-from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
-
-
-def ngrams(sequence, n):
-    return [tuple(sequence[i:i+n]) for i in range(len(sequence)-n+1)]
-
-def count_ngrams(sequence, max_n):
-    counts = Counter()
-    for n in range(1, max_n + 1):
-        counts.update(ngrams(sequence, n))
-    return counts
-
-def self_bleu(outputs):
-    smoothing_function = SmoothingFunction().method1
-    scores = []
-    for i in range(len(outputs)):
-        references = outputs[:i] + outputs[i+1:]
-        # Avoid calculating BLEU score for empty references
-        if references:
-            scores.append(sentence_bleu(references, outputs[i], smoothing_function=smoothing_function))
-    # If all references are empty, return a default value
-    if not scores:
-        return 0
-    return sum(scores) / len(scores)
-
-def dist_n(outputs, n):
-    all_ngrams = list(chain(*[ngrams(output, n) for output in outputs]))
-    unique_ngrams = set(all_ngrams)
-    return len(unique_ngrams) / len(all_ngrams) if all_ngrams else 0
-
-def perplexity(model, tokenizer, texts):
-    encodings = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
-    max_length = model.config.n_positions
-    stride = 512
-    lls = []
-    for i in range(0, encodings.input_ids.size(1), stride):
-        begin_loc = max(i + stride - max_length, 0)
-        end_loc = i + stride
-        trg_len = end_loc - i
-        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(model.device)
-        target_ids = input_ids.clone()
-        target_ids[:, :-trg_len] = -100
-
-        with torch.no_grad():
-            outputs = model(input_ids, labels=target_ids)
-            log_likelihood = outputs.loss * trg_len
-        lls.append(log_likelihood)
-
-    ppl = torch.exp(torch.stack(lls).sum() / end_loc)
-    return ppl.item()
-
-def js_divergence(p, q):
-    def kl_divergence(p, q):
-        return sum(p[i] * math.log(p[i] / q[i]) for i in range(len(p)) if p[i] != 0 and q[i] != 0)
-
-    p_norm = [float(i)/sum(p) for i in p]
-    q_norm = [float(i)/sum(q) for i in q]
-
-    m = [(p_norm[i] + q_norm[i]) / 2 for i in range(len(p_norm))]
-
-    return (kl_divergence(p_norm, m) + kl_divergence(q_norm, m)) / 2
-
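
The least obvious consumer of this module was app.py's Jensen-Shannon score: it aligned the two n-gram `Counter`s over the union of observed n-grams before calling `js_divergence`. A self-contained toy reproduction of that wiring (functions copied from the deleted module; the sentences are invented for the example):

```python
# Align two n-gram count vectors over the union of observed n-grams,
# then compare the distributions with Jensen-Shannon divergence.
from collections import Counter
import math

def ngrams(sequence, n):
    return [tuple(sequence[i:i + n]) for i in range(len(sequence) - n + 1)]

def count_ngrams(sequence, max_n):
    counts = Counter()
    for n in range(1, max_n + 1):
        counts.update(ngrams(sequence, n))
    return counts

def js_divergence(p, q):
    def kl_divergence(p, q):
        return sum(p[i] * math.log(p[i] / q[i]) for i in range(len(p)) if p[i] != 0 and q[i] != 0)
    p_norm = [float(i) / sum(p) for i in p]
    q_norm = [float(i) / sum(q) for i in q]
    m = [(p_norm[i] + q_norm[i]) / 2 for i in range(len(p_norm))]
    return (kl_divergence(p_norm, m) + kl_divergence(q_norm, m)) / 2

generated = count_ngrams('what year was the tower built'.split(), 4)
reference = count_ngrams('in what year was the tower completed'.split(), 4)
all_ngrams = set(generated) | set(reference)
p = [generated[g] for g in all_ngrams]  # Counter returns 0 for unseen n-grams
q = [reference[g] for g in all_ngrams]
print('JSD:', js_divergence(p, q))  # 0 for identical texts, larger as they diverge
```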
requirements.txt DELETED
@@ -1,10 +0,0 @@
-transformers
-torch
-pandas
-scikit-learn
-nltk
-markdownify
-beautifulsoup4
-newspaper3k
-markdown
-lxml[html_clean]
src/__init__.py DELETED
(empty file)
src/optimization.py DELETED
@@ -1,66 +0,0 @@
-from collections import Counter
-from itertools import chain
-import math
-import torch
-from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
-
-
-def ngrams(sequence, n):
-    return [tuple(sequence[i:i+n]) for i in range(len(sequence)-n+1)]
-
-def count_ngrams(sequence, max_n):
-    counts = Counter()
-    for n in range(1, max_n + 1):
-        counts.update(ngrams(sequence, n))
-    return counts
-
-def self_bleu(outputs):
-    smoothing_function = SmoothingFunction().method1
-    scores = []
-    for i in range(len(outputs)):
-        references = outputs[:i] + outputs[i+1:]
-        # Avoid calculating BLEU score for empty references
-        if references:
-            scores.append(sentence_bleu(references, outputs[i], smoothing_function=smoothing_function))
-    # If all references are empty, return a default value
-    if not scores:
-        return 0
-    return sum(scores) / len(scores)
-
-def dist_n(outputs, n):
-    all_ngrams = list(chain(*[ngrams(output, n) for output in outputs]))
-    unique_ngrams = set(all_ngrams)
-    return len(unique_ngrams) / len(all_ngrams) if all_ngrams else 0
-
-def perplexity(model, tokenizer, texts):
-    encodings = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
-    max_length = model.config.n_positions
-    stride = 512
-    lls = []
-    for i in range(0, encodings.input_ids.size(1), stride):
-        begin_loc = max(i + stride - max_length, 0)
-        end_loc = i + stride
-        trg_len = end_loc - i
-        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(model.device)
-        target_ids = input_ids.clone()
-        target_ids[:, :-trg_len] = -100
-
-        with torch.no_grad():
-            outputs = model(input_ids, labels=target_ids)
-            log_likelihood = outputs.loss * trg_len
-        lls.append(log_likelihood)
-
-    ppl = torch.exp(torch.stack(lls).sum() / end_loc)
-    return ppl.item()
-
-def js_divergence(p, q):
-    def kl_divergence(p, q):
-        return sum(p[i] * math.log(p[i] / q[i]) for i in range(len(p)) if p[i] != 0 and q[i] != 0)
-
-    p_norm = [float(i)/sum(p) for i in p]
-    q_norm = [float(i)/sum(q) for i in q]
-
-    m = [(p_norm[i] + q_norm[i]) / 2 for i in range(len(p_norm))]
-
-    return (kl_divergence(p_norm, m) + kl_divergence(q_norm, m)) / 2
-
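
This file is a verbatim copy of optimization.py above. One caveat worth recording: `perplexity` reads `model.config.n_positions`, an attribute GPT-2-family configs expose but recent T5 configs do not, so the function is easiest to exercise with a causal LM. A usage sketch, assuming a checkout that still has this module importable as `src.optimization` (downloads the `gpt2` checkpoint; the sample texts are invented):

```python
# Sliding-window perplexity as the deleted module computes it; gpt2 is used
# because its config exposes n_positions, which the function reads.
from transformers import AutoTokenizer, AutoModelForCausalLM
from src.optimization import perplexity  # assumes the deleted module is still on the path

tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # gpt2 defines no pad token by default
model = AutoModelForCausalLM.from_pretrained('gpt2')

texts = ['The tower was completed in 1889.', 'Colourless green ideas sleep furiously.']
print(perplexity(model, tokenizer, texts))  # lower means more fluent under the LM
```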
src/text.py DELETED
@@ -1,130 +0,0 @@
-from markdownify import markdownify as md
-from bs4 import BeautifulSoup as BS
-from urllib.parse import urljoin
-from newspaper import Article
-import re
-import markdown
-
-
-def clean(s):
-    s = s.replace("\t", "\\t")
-    s = s.replace("\n", "\\n")
-    return s
-
-class DocTree:
-    def __init__(self, content):
-        self.content = content
-        self.max_depth = 6
-
-    def get_sections(self, *location_ids):
-        out = self.content
-        for id_ in location_ids:
-            out = out[id_]
-        return out
-
-    def merge_sections(self, elems):
-        if not isinstance(elems[0], list):
-            return '\n\n '.join(elems)
-        out = []
-        for e in elems:
-            out.append(self.merge_sections(e))
-        return '\n\n '.join(map(clean, out))
-
-    def get_merged_sections(self, *location_ids):
-        return [self.merge_sections(s) for s in self.get_sections(*location_ids)]
-
-    def as_markdown(self, content):
-        return md(content)
-
-    def get_sections_by_depth(self, depth):
-        return self._get_sections_by_depth(self.content, depth)
-
-    @staticmethod
-    def _get_sections_by_depth(content, depth):
-        """Returns a list of merged sections at a specific depth"""
-        if depth == 0:
-            return content
-        out = []
-        for elem in content:
-            out += DocTree._get_sections_by_depth(elem, depth - 1)
-        return out
-
-
-def fix_relative_links(url, article_content):
-    if 'http' in url:
-        base_url = '/'.join(url.split('/')[:3])
-    else:
-        base_url = url.split('/')
-    pat = re.compile(r'\[(.*?)\]\((.*?)\)', flags=re.IGNORECASE)
-    res = pat.findall(article_content)
-    if res:
-        for g in res:
-            url = urljoin(base_url, g[1]) if g[1].startswith('/') else g[1]
-            article_content = article_content.replace(f'[{g[0]}]({g[1]})', f'[{g[0]}]({url})')
-    else: print('not found')
-    return article_content
-
-
-def extract_article(url):
-    article = Article(url)
-    article.download()
-    article.parse()
-    return article
-
-
-def select_content(html_code, elem_class, class_name):
-    print(f'Calling select_content with {elem_class}, {class_name}')
-    kwargs = {}
-    if class_name.startswith('.'):
-        class_name = class_name[1:]
-        kwargs = {'class_': class_name}
-    elif class_name.startswith('#'):
-        kwargs = {'id': class_name[1:]}
-    return md(str(BS(html_code, features="lxml").find(**kwargs)))
-
-
-def split_by_heading(html_content, _i):
-    if _i >= 7:
-        return html_content
-    elems = []
-    for idx, elem in enumerate([i for i in html_content.split(f'<h{_i}') if i]):
-        if idx > 0 or elem.startswith('>'):
-            elem = f'<h{_i}{elem}'
-        elems.append(split_by_heading(elem, _i+1))
-    return elems
-
-def doctree_from_url(url, elem_class='div', class_name='article-body'):
-    article = extract_article(url)
-    # convert to MD to handle splitting better
-    article_content = select_content(article.html, elem_class, class_name)
-    requires_title = list(filter(lambda x: x.strip().startswith('# '), article_content.split('\n'))) != []
-
-    if requires_title:
-        print('Didn\'t find title, will add it manually...')
-        article_content = f"# {article.title}\n\n{article_content}"
-    article_content = article_content.replace('\n\n', '\n').replace('#', '%%@@%%')
-    # fix relative website links
-    article_content = fix_relative_links(url, article_content)
-    # convert back to HTML
-    html_content = markdown.markdown(article_content).replace('%%@@%%', '#')
-    doc_tree = DocTree(split_by_heading(html_content, 1))
-
-    #assert doc_tree.merge_sections(doc_tree.get_sections(0)).replace('\n', '').replace(html_content.replace('\n', ''), '') == '', 'Document inconsistent. Manual adjustments required.'
-    return doc_tree
-
-
-def get_selectors_for_class(url, elem_class):
-    article = extract_article(url)
-
-    html_content = article.html
-    soup = BS(html_content, features="lxml")
-    classes = set()
-    ids = set()
-    for elem in soup.find_all(elem_class):
-        if elem.get('class'):
-            for c in elem.get('class'):
-                classes |= {f".{c}"}
-        if elem.get('id'):
-            ids |= {f"#{elem.get('id')}"}
-
-    return ids | classes
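
The `DocTree` plumbing can be exercised without a network round trip by handing `split_by_heading` rendered HTML directly, the same way app.py rebuilt the tree after the user edited the extracted markdown. A sketch with invented toy markdown, assuming a checkout that still has this module importable as `src.text`:

```python
# Build a DocTree from markdown-rendered HTML and chunk it one heading
# level deep, mirroring app.py's update_content_state_on_final_change.
import markdown
from src.text import DocTree, split_by_heading  # assumes the deleted module is still on the path

md_source = '# Title\n\nIntro paragraph.\n\n## Part A\n\nDetails A.\n\n## Part B\n\nDetails B.'
html = markdown.markdown(md_source)

tree = DocTree(split_by_heading(html, 1))
sections = tree.get_sections_by_depth(1)  # one level below the h1 split
print(f'Found {len(sections)} sections')
for section in sections:
    print(tree.as_markdown(tree.merge_sections(section)))
```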
text.py DELETED
@@ -1,130 +0,0 @@
-from markdownify import markdownify as md
-from bs4 import BeautifulSoup as BS
-from IPython.display import display, Markdown
-from urllib.parse import urljoin
-from newspaper import Article
-import re
-import markdown
-
-
-def clean(s):
-    s = s.replace("\t", "\\t")
-    s = s.replace("\n", "\\n")
-    return s
-
-class DocTree:
-    def __init__(self, content):
-        self.content = content
-        self.max_depth = 6
-
-    def get_sections(self, *location_ids):
-        out = self.content
-        for id_ in location_ids:
-            out = out[id_]
-        return out
-
-    def merge_sections(self, elems):
-        if not isinstance(elems[0], list):
-            return '\n\n '.join(elems)
-        out = []
-        for e in elems:
-            out.append(self.merge_sections(e))
-        return '\n\n '.join(map(clean, out))
-
-    def get_merged_sections(self, *location_ids):
-        return [self.merge_sections(s) for s in self.get_sections(*location_ids)]
-
-    def as_markdown(self, content):
-        return md(content)
-
-    def get_sections_by_depth(self, depth):
-        return self._get_sections_by_depth(self.content, depth)
-
-    @staticmethod
-    def _get_sections_by_depth(content, depth):
-        """Returns a list of merged sections at a specific depth"""
-        if depth == 0:
-            return content
-        out = []
-        for elem in content:
-            out += DocTree._get_sections_by_depth(elem, depth - 1)
-        return out
-
-
-def fix_relative_links(url, article_content):
-    if 'http' in url:
-        base_url = '/'.join(url.split('/')[:3])
-    else:
-        base_url = url.split('/')
-    pat = re.compile(r'\[(.*?)\]\((.*?)\)', flags=re.IGNORECASE)
-    res = pat.findall(article_content)
-    if res:
-        for g in res:
-            url = urljoin(base_url, g[1]) if g[1].startswith('/') else g[1]
-            article_content = article_content.replace(f'[{g[0]}]({g[1]})', f'[{g[0]}]({url})')
-    else: print('not found')
-    return article_content
-
-
-def extract_article(url):
-    article = Article(url)
-    article.download()
-    article.parse()
-    return article
-
-
-def select_content(html_code, elem_class, class_name):
-    print(f'Calling select_content with {elem_class}, {class_name}')
-    if class_name.startswith('.'):
-        class_name = class_name[1:]
-        elem_id = None
-    elif class_name.startswith('#'):
-        elem_id = class_name[1:]
-        class_name = None
-    else:
-        elem_id = None
-        class_name = None
-    return md(str(BS(html_code, features="lxml").find(elem_class, class_=class_name, id=elem_id)))
-
-
-def split_by_heading(html_content, _i):
-    if _i >= 7:
-        return html_content
-    elems = []
-    for idx, elem in enumerate([i for i in html_content.split(f'<h{_i}') if i]):
-        if idx > 0 or elem.startswith('>'):
-            elem = f'<h{_i}{elem}'
-        elems.append(split_by_heading(elem, _i+1))
-    return elems
-
-def doctree_from_url(url, elem_class='div', class_name='article-body'):
-    article = extract_article(url)
-    # convert to MD to handle splitting better
-    article_content = select_content(article.html, elem_class, class_name)
-    article_content = (f"# {article.title}\n\n" + article_content).replace('\n\n', '\n').replace('#', '%%@@%%')
-    # fix relative website links
-    article_content = fix_relative_links(url, article_content)
-    # convert back to HTML
-    html_content = markdown.markdown(article_content).replace('%%@@%%', '#')
-    doc_tree = DocTree(split_by_heading(html_content, 1))
-
-    #assert doc_tree.merge_sections(doc_tree.get_sections(0)).replace('\n', '').replace(html_content.replace('\n', ''), '') == '', 'Document inconsistent. Manual adjustments required.'
-    return doc_tree
-
-
-def get_selectors_for_class(url, elem_class):
-    article = extract_article(url)
-
-    html_content = article.html
-    soup = BS(html_content, features="lxml")
-    classes = set()
-    ids = set()
-    for elem in soup.find_all(elem_class):
-        if elem.get('class'):
-            for c in elem.get('class'):
-                classes |= {f".{c}"}
-        if elem.get('id'):
-            for c in elem.get('id'):
-                ids |= {f"#{c}"}
-
-    return ids | classes
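
This root-level copy is otherwise identical to src/text.py above; the main functional drift is in `get_selectors_for_class`, where iterating `elem.get('id')` walks the id attribute character by character (an id is a string, unlike `class`, which BeautifulSoup returns as a list). A standalone illustration of why the src/ copy uses the whole value instead (plain Python; the id value is invented):

```python
# Iterating a string yields characters, so the per-character version
# fabricates selectors like '#a' and '#r' for id="article".
elem_id = 'article'

per_char = {f'#{c}' for c in elem_id}  # this file's behaviour
whole = {f'#{elem_id}'}                # src/text.py's behaviour

print(sorted(per_char))  # ['#a', '#c', '#e', '#i', '#l', '#r', '#t']
print(whole)             # {'#article'}
```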