ramalMr committed on
Commit
b51bb0e
1 Parent(s): 36d2989

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +103 -0
app.py CHANGED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import InferenceClient
2
+ import gradio as gr
3
+ import random
4
+ import pandas as pd
5
+ from io import BytesIO
6
+ import csv
7
+ import os
8
+ import io
9
+ import tempfile
10
+ import re
11
+ from transformers import M2M100Tokenizer, M2M100ForConditionalGeneration
12
+
13
# Model id handed to the Hugging Face inference client for text generation.
GENERATION_MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1"
# Single M2M100 checkpoint shared by the tokenizer and the translation model.
TRANSLATION_MODEL_ID = "facebook/m2m100_1.2B"

client = InferenceClient(GENERATION_MODEL_ID)

# Loaded once at import time and reused by the translate_* helpers below.
tokenizer = M2M100Tokenizer.from_pretrained(TRANSLATION_MODEL_ID)
model = M2M100ForConditionalGeneration.from_pretrained(TRANSLATION_MODEL_ID)
17
+
18
def translate_to_english(text, source_lang):
    """Translate ``text`` from ``source_lang`` into English with M2M100.

    Args:
        text: The text to translate.
        source_lang: M2M100 language code of ``text`` (for example ``"az"``).
            A falsy value leaves the tokenizer's current source language
            untouched.

    Returns:
        The first decoded sequence produced by the model.
    """
    # The M2M100 tokenizer tags the encoder input with the language stored in
    # ``tokenizer.src_lang``. The previous implementation never set it, so the
    # ``source_lang`` argument was silently ignored.
    previous_src_lang = tokenizer.src_lang
    if source_lang:
        tokenizer.src_lang = source_lang
    try:
        encoded_input = tokenizer(text, return_tensors="pt")
    finally:
        # The tokenizer is shared module-level state: put the source language
        # back so other callers are not affected by this call.
        tokenizer.src_lang = previous_src_lang

    generated_tokens = model.generate(
        **encoded_input,
        forced_bos_token_id=tokenizer.get_lang_id("en"),
    )
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
23
+
24
def translate_to_azerbaijani(text):
    """Translate English ``text`` into Azerbaijani with M2M100.

    Args:
        text: The text to translate; it is encoded as English.

    Returns:
        The first decoded sequence produced by the model.
    """
    # Pin the source language explicitly instead of relying on whatever value
    # the shared tokenizer currently holds.
    tokenizer.src_lang = "en"
    encoded_input = tokenizer(text, return_tensors="pt")
    generated_tokens = model.generate(
        **encoded_input,
        forced_bos_token_id=tokenizer.get_lang_id("az"),
    )
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
29
+
30
def extract_text_from_excel(file):
    """Read the ``'Unnamed: 1'`` column of an Excel file and translate it to English.

    Args:
        file: Anything ``pandas.read_excel`` accepts (path or file-like object).

    Returns:
        The English translations of all non-empty cells, joined with single
        spaces. An empty string when the column holds no text.

    Raises:
        KeyError: If the sheet has no column labelled ``'Unnamed: 1'``.
    """
    df = pd.read_excel(file)
    source_lang = "az"  # Azerbaijani

    # Drop empty cells first: ``astype(str)`` would otherwise turn them into
    # the literal text "nan", which then got translated with everything else.
    cells = df['Unnamed: 1'].dropna().astype(str)

    # Translate cell by cell so a long column is not handed to the model as
    # one oversized sequence.
    translated_cells = [
        translate_to_english(cell, source_lang)
        for cell in cells
        if cell.strip()
    ]
    return ' '.join(translated_cells)
36
+
37
def save_to_csv(sentence, output, filename="synthetic_data.csv"):
    """Append one ``(sentence, Azerbaijani translation of output)`` row to a CSV.

    Args:
        sentence: Value written to the first column unchanged.
        output: Text passed through ``translate_to_azerbaijani`` before being
            written to the second column.
        filename: CSV file to append to.
    """
    # Translate before opening the file, as the row needs the finished value.
    row = [sentence, translate_to_azerbaijani(output)]
    with open(filename, mode='a', newline='', encoding='utf-8') as csv_file:
        csv.writer(csv_file).writerow(row)
42
+
43
def _split_generated_sentences(output):
    """Split generated text into stripped, non-empty sentences."""
    fragments = re.split(r'(?<=[\.\!\?:])[\s\n]+', output)
    stripped = (fragment.strip() for fragment in fragments)
    # Compare the stripped fragment so a lone "." surrounded by whitespace is
    # dropped as well.
    return [fragment for fragment in stripped if fragment and fragment != '.']


def generate(file, temperature, max_new_tokens, top_p, repetition_penalty, num_similar_sentences):
    """Generate synthetic sentences for every sentence found in an Excel file.

    The text returned by ``extract_text_from_excel`` is split on ``'.'``; each
    non-empty sentence is sent to the text-generation client, and up to
    ``num_similar_sentences`` randomly chosen sentences from each response are
    written next to the originating sentence.

    Args:
        file: Uploaded Excel file, forwarded to ``extract_text_from_excel``.
        temperature: Sampling temperature; values below 0.01 are raised to 0.01.
        max_new_tokens: Token budget per request; raised to at least 1.
        top_p: Nucleus-sampling probability mass.
        repetition_penalty: Penalty applied to repeated tokens.
        num_similar_sentences: Maximum number of generated sentences kept per
            original sentence.

    Returns:
        Path of a temporary CSV file with the columns ``Original Sentence``
        and ``Generated Sentence``. The file is not deleted automatically.
    """
    text = extract_text_from_excel(file)
    sentences = [fragment.strip() for fragment in text.split('.')]
    sentences = [sentence for sentence in sentences if sentence]
    random.shuffle(sentences)  # Shuffle sentences

    # UI sliders may deliver floats or boundary values; normalise them once so
    # a bad value does not make every single request fail inside the loop.
    num_similar_sentences = int(num_similar_sentences)
    generate_kwargs = {
        # Sampling with a temperature of 0 is not meaningful; keep it positive.
        "temperature": max(float(temperature), 1e-2),
        "max_new_tokens": max(int(max_new_tokens), 1),
        "top_p": float(top_p),
        "repetition_penalty": float(repetition_penalty),
        "do_sample": True,
        "seed": 42,
    }

    # Explicit UTF-8 so non-ASCII text never depends on the platform's default
    # encoding.
    with tempfile.NamedTemporaryFile(
        mode='w', newline='', delete=False, suffix='.csv', encoding='utf-8'
    ) as tmp:
        fieldnames = ['Original Sentence', 'Generated Sentence']
        writer = csv.DictWriter(tmp, fieldnames=fieldnames)
        writer.writeheader()

        for sentence in sentences:
            try:
                stream = client.text_generation(
                    sentence,
                    **generate_kwargs,
                    stream=True,
                    details=True,
                    return_full_text=False,
                )
                output = "".join(response.token.text for response in stream)

                candidates = _split_generated_sentences(output)
                # Pick without replacement, never more than are available.
                sample_size = max(0, min(num_similar_sentences, len(candidates)))
                for generated_sentence in random.sample(candidates, sample_size):
                    writer.writerow({
                        'Original Sentence': sentence,
                        'Generated Sentence': generated_sentence,
                    })
            except Exception as e:
                # Best effort: one failing sentence must not abort the export.
                print(f"Error generating data for sentence '{sentence}': {e}")

        tmp_path = tmp.name

    return tmp_path
88
+
89
# Input widgets, listed in the positional order of ``generate``'s parameters.
excel_upload = gr.File(label="Upload Excel File", file_count="single", file_types=[".xlsx"])
temperature_slider = gr.Slider(
    label="Temperature",
    value=0.9,
    minimum=0.0,
    maximum=1.0,
    step=0.05,
    interactive=True,
    info="Higher values produce more diverse outputs",
)
max_new_tokens_slider = gr.Slider(
    label="Max new tokens",
    value=256,
    minimum=0,
    maximum=5120,
    step=64,
    interactive=True,
    info="The maximum numbers of new tokens",
)
top_p_slider = gr.Slider(
    label="Top-p (nucleus sampling)",
    value=0.95,
    minimum=0.0,
    maximum=1,
    step=0.05,
    interactive=True,
    info="Higher values sample more low-probability tokens",
)
repetition_penalty_slider = gr.Slider(
    label="Repetition penalty",
    value=1.0,
    minimum=1.0,
    maximum=2.0,
    step=0.1,
    interactive=True,
    info="Penalize repeated tokens",
)
similar_sentences_slider = gr.Slider(
    label="Number of similar sentences",
    value=10,
    minimum=1,
    maximum=20,
    step=1,
    interactive=True,
    info="Number of similar sentences to generate for each original sentence",
)

# Build the UI around ``generate`` and start serving it on import.
demo = gr.Interface(
    fn=generate,
    inputs=[
        excel_upload,
        temperature_slider,
        max_new_tokens_slider,
        top_p_slider,
        repetition_penalty_slider,
        similar_sentences_slider,
    ],
    outputs=gr.File(label="Synthetic Data "),
    title="SDG",
    description="AYE QABIL.",
    allow_flagging="never",
)
demo.launch()