patrol114 committed
Commit 4d231f1
1 Parent(s): b278e10

Update app.py

Files changed (1)
  1. app.py +419 -2
app.py CHANGED
@@ -1,3 +1,420 @@
- import gradio as gr

- gr.load("models/mistralai/Mistral-Nemo-Instruct-2407").launch()
+ import os
+ import pickle
+ import re
+ import nltk
+ import psutil
+ import numpy as np
+ from nltk.tokenize import word_tokenize
+ import tensorflow as tf
+ from tensorflow.keras import regularizers
+ from tensorflow.keras.layers import Layer, Bidirectional, Dense, LayerNormalization, Dropout, Embedding, LSTM, Conv1D, MaxPooling1D, BatchNormalization, GRU, MultiHeadAttention
+ from tensorflow.keras.models import Sequential
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
+ from nltk.corpus import stopwords
+ from nltk.stem import WordNetLemmatizer
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
+ from sklearn.model_selection import train_test_split
+ from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping
+ from sklearn.utils import shuffle
+ from typing import List, Optional, Set
+ from gensim.models import KeyedVectors
+ from pathlib import Path
+ import tempfile
+ import zipfile
+ import requests
+ # TFAutoModelForCausalLM (rather than the bare AutoModel) returns TF outputs
+ # with a .logits field, which generate_text() below relies on.
+ from transformers import AutoTokenizer, TFAutoModelForCausalLM
+ import random
+
+ # Environment configuration
+ gpus = tf.config.list_physical_devices("GPU")
+ if gpus:
+     try:
+         for gpu in gpus:
+             tf.config.experimental.set_memory_growth(gpu, True)
+         print("Dynamic memory growth enabled for all GPUs.")
+     except RuntimeError as e:
+         print(f"Error while enabling dynamic memory growth: {e}")
+
+ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
+ tf.keras.mixed_precision.set_global_policy('float32')
+ nltk.download('punkt')
+ nltk.download('wordnet')
+ nltk.download('stopwords')
+
+ ZAPISZ_KATALOG = "mozgi"
+ KATALOG_LOGOW = "logs"
+ directory = "test"
+ log_dir = Path('logs')
+ tf.keras.backend.clear_session()
+ lemmatizer = WordNetLemmatizer()
+ stop_words = set(stopwords.words('english'))
+
+ class TextProcessor:
+     class PositionalEncoding(Layer):
+         def __init__(self, d_model, **kwargs):
+             super().__init__(**kwargs)
+             self.d_model = d_model
+
+         def get_angles(self, position, i):
+             angles = 1 / np.power(10000, (2 * (i // 2)) / np.float32(self.d_model))
+             return position * angles
+
+         def call(self, inputs):
+             # NumPy is applied to the dynamic sequence length, so this layer
+             # only works in eager execution.
+             position = tf.shape(inputs)[1]
+             angle_rads = self.get_angles(
+                 position=np.arange(position)[:, np.newaxis],
+                 i=np.arange(self.d_model)[np.newaxis, :]
+             )
+             sines = np.sin(angle_rads[:, 0::2])
+             cosines = np.cos(angle_rads[:, 1::2])
+             pos_encoding = np.concatenate([sines, cosines], axis=-1)
+             pos_encoding = tf.cast(pos_encoding, dtype=tf.float32)
+             return inputs + pos_encoding
+
+     class WrappedMultiHeadAttention(Layer):
+         def __init__(self, num_heads, d_model, rate=0.2, **kwargs):
+             super().__init__(**kwargs)
+             self.attention = MultiHeadAttention(num_heads=num_heads, key_dim=d_model, dropout=rate)
+
+         def call(self, inputs):
+             # Self-attention: the same tensor serves as query and as key/value.
+             return self.attention(inputs, inputs)
+
+     class TransformerBlock(Layer):
+         def __init__(self, num_heads, d_model, dff, rate=0.2, **kwargs):
+             super().__init__(**kwargs)
+             self.attention = TextProcessor.WrappedMultiHeadAttention(num_heads, d_model, rate)
+             self.ffn = Sequential([
+                 Dense(dff, activation='relu'),
+                 Dense(d_model)
+             ])
+             self.layernorm1 = LayerNormalization(epsilon=1e-6)
+             self.layernorm2 = LayerNormalization(epsilon=1e-6)
+             self.dropout1 = Dropout(rate)
+             self.dropout2 = Dropout(rate)
+             self.pos_encoding = TextProcessor.PositionalEncoding(d_model)
+
+         def call(self, inputs, training):
+             inputs = self.pos_encoding(inputs)
+             attn_output = self.attention(inputs)
+             attn_output = self.dropout1(attn_output, training=training)
+             out1 = self.layernorm1(inputs + attn_output)
+             ffn_output = self.ffn(out1)
+             ffn_output = self.dropout2(ffn_output, training=training)
+             return self.layernorm2(out1 + ffn_output)
+
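+     # Illustrative sketch only (the block is not wired into the model built
+     # below): applied to a batch of embeddings it could be used as
+     #     block = TextProcessor.TransformerBlock(num_heads=4, d_model=100, dff=256)
+     #     out = block(tf.random.uniform((2, 10, 100)), training=False)
+     # where num_heads=4 and dff=256 are assumed example values and d_model
+     # must match the embedding width.
+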
+     class TextGenerationCallback(tf.keras.callbacks.Callback):
+         def __init__(self, tokenizer, input_sequence_length, model_name, model, temperature=1.0):
+             super().__init__()
+             self.tokenizer = tokenizer
+             self.input_sequence_length = input_sequence_length
+             self.model_name = model_name
+             self.model = model
+             self.temperature = temperature
+             self.generated_text_interval = 5
+             self.seed_texts = ["Why is Python popular?", "What is AI?", "Explain neural networks", "Why is data important?"]
+             self.current_seed_text_index = 0
+
+         def on_epoch_end(self, epoch, logs=None):
+             if epoch % self.generated_text_interval == 0:
+                 seed_text = self.seed_texts[self.current_seed_text_index]
+                 self.current_seed_text_index = (self.current_seed_text_index + 1) % len(self.seed_texts)
+                 generated_text = self.generate_text(seed_text, self.temperature, self.input_sequence_length)
+                 print(f"\nText generated by model '{self.model_name}' after epoch {epoch + 1}:\n{generated_text}\n")
+
+         def generate_text(self, seed_text, temperature=1.0, num_words=50):
+             result = []
+             for _ in range(num_words):
+                 encoded_text = self.tokenizer.encode(seed_text, return_tensors='tf')
+                 predictions = self.model(encoded_text)
+                 # Scale the next-token logits by the temperature before sampling.
+                 predictions = predictions.logits[:, -1, :] / temperature
+                 predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()
+                 seed_text += self.tokenizer.decode([predicted_id])
+                 result.append(self.tokenizer.decode([predicted_id]))
+             return ' '.join(result)
+
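+     # Temperature intuition for the sampler above: dividing logits by T < 1
+     # sharpens the distribution (greedier output), T > 1 flattens it (more
+     # varied output). Self-contained illustration with made-up logits:
+     #     logits = tf.constant([[2.0, 1.0, 0.1]])
+     #     tf.random.categorical(logits / 0.7, num_samples=1)   # sharper
+     #     tf.random.categorical(logits / 1.5, num_samples=1)   # flatter
+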
+     def __init__(
+         self,
+         directory: str,
+         oov_token: str = '<OOV>',
+         glove_file: str = None,
+         gpt2_model_dir: str = 'gpt2',
+         model_name: str = 'gpt2',
+         input_sequence_length: int = 100,
+         output_sequence_length: int = 100,
+         batch_size: int = 32,
+         lowercase: bool = False,
+         handle_numbers: bool = True,
+         handle_special_characters: bool = False,
+         handle_stop_words: bool = True,
+         lemmatize: bool = True,
+         handle_python_code: bool = True,
+         lstm_units: int = 128,
+         dropout_rate: float = 0.2,
+         epochs: int = 100,
+         learning_rate: float = 0.00001,
+         amsgrad: bool = True,
+         kernel_regularizer: float = 0.001,
+         recurrent_regularizer: float = 0.001,
+         bias_regularizer: float = 0.001,
+         num_difficult_sequences: int = 50,
+         stop_words: Optional[Set[str]] = None,
+         log_dir: Optional[str] = 'logs',
+     ):
+         self.oov_token = oov_token
+         self.directory = directory
+         self.glove_file = glove_file
+         self.gpt2_model_dir = Path(gpt2_model_dir)
+         self.model_name = model_name
+         self.input_sequence_length = input_sequence_length
+         self.output_sequence_length = output_sequence_length
+         self.batch_size = batch_size
+         self.lowercase = lowercase
+         self.handle_numbers = handle_numbers
+         self.handle_special_characters = handle_special_characters
+         self.handle_stop_words = handle_stop_words
+         self.lemmatize = lemmatize
+         self.handle_python_code = handle_python_code
+         self.lstm_units = lstm_units
+         self.dropout_rate = dropout_rate
+         self.epochs = epochs
+         self.learning_rate = learning_rate
+         self.amsgrad = amsgrad
+         self.kernel_regularizer = kernel_regularizer
+         self.recurrent_regularizer = recurrent_regularizer
+         self.bias_regularizer = bias_regularizer
+         self.num_difficult_sequences = num_difficult_sequences
+         self.stop_words = set(stopwords.words('english')) if stop_words is None else stop_words
+         self.tokenizer = None
+         self.embedding_matrix = None
+         self.vocab_size = 0
+         self.model = None
+         self.processed_texts = []
+         self.log_dir = log_dir
+         self.glove_model = None
+         self.gpt2_model = None
+         self.gpt2_tokenizer = None
+
+         self.load_models()
+
+     def create_tokenizer(self, texts: List[str]) -> None:
+         if not texts:
+             raise ValueError("The list of texts is empty or None.")
+
+         self.tokenizer = AutoTokenizer.from_pretrained("gpt2")
+         self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+
+         print("Tokenization finished. Number of unique tokens:", len(self.tokenizer.get_vocab()))
+
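+     # Note: add_special_tokens() grows the vocabulary by one, so for stock
+     # GPT-2 the print above reports 50258 tokens (50257 + '[PAD]').
+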
+     def load_models(self):
+         print("Loading GloVe model...")
+         self.glove_model = self.load_glove_model()
+         print("GloVe model loaded.")
+
+         print("Loading GPT-2 model...")
+         if not Path(self.gpt2_model_dir).exists():
+             print(f"The GPT-2 model ({self.model_name}) is not available locally. Downloading...")
+             self.gpt2_model = TFAutoModelForCausalLM.from_pretrained(self.model_name)
+             self.gpt2_tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+             self.gpt2_model.save_pretrained(self.gpt2_model_dir)
+             self.gpt2_tokenizer.save_pretrained(self.gpt2_model_dir)
+         else:
+             self.load_gpt2_model()
+         print("GPT-2 model loaded.")
+
+     def download_file(self, url, save_path):
+         response = requests.get(url, stream=True)
+         total_length = response.headers.get('content-length')
+
+         if total_length is None:
+             # No Content-Length header: stream without a progress bar.
+             with open(save_path, 'wb') as f:
+                 for chunk in response.iter_content(chunk_size=8192):
+                     if chunk:
+                         f.write(chunk)
+         else:
+             dl = 0
+             total_length = int(total_length)
+             with open(save_path, 'wb') as f:
+                 for chunk in response.iter_content(chunk_size=8192):
+                     if chunk:
+                         dl += len(chunk)
+                         f.write(chunk)
+                         # Render a 50-character text progress bar.
+                         done = int(50 * dl / total_length)
+                         print("\r[%s%s]" % ('=' * done, ' ' * (50 - done)), end='')
+
+     def load_glove_model(self):
+         glove_file = "glove.6B.100d.txt"
+         if not os.path.exists(glove_file):
+             print(f"File {glove_file} was not found. Starting download...")
+             try:
+                 url = "http://nlp.stanford.edu/data/glove.6B.zip"
+                 with tempfile.NamedTemporaryFile(delete=False) as tmp_zip:
+                     self.download_file(url, tmp_zip.name)
+                     with zipfile.ZipFile(tmp_zip.name) as zf:
+                         zf.extractall('.')
+                 glove_file = 'glove.6B.100d.txt'
+                 print("GloVe file downloaded and extracted.")
+             except Exception as e:
+                 print(f"Error while downloading or extracting the GloVe file: {e}")
+                 return None
+
+         glove_model = {}
+         with open(glove_file, 'r', encoding='utf-8') as f:
+             for line in f:
+                 split_line = line.split()
+                 word = split_line[0]
+                 embedding = np.array([float(val) for val in split_line[1:]])
+                 glove_model[word] = embedding
+
+         return glove_model
+
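+     # For reference, each line of glove.6B.100d.txt is a token followed by 100
+     # floats, e.g. (abridged):
+     #     the 0.418 0.24968 -0.41242 ...
+     # so glove_model maps str -> np.ndarray of shape (100,).
+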
+     def load_gpt2_model(self):
+         try:
+             self.gpt2_model = TFAutoModelForCausalLM.from_pretrained(self.model_name)
+             self.gpt2_tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+             print("Standard GPT-2 model loaded successfully.")
+         except Exception as e:
+             print(f"Error while loading the standard GPT-2 model: {e}")
+
+     def preprocess_text(self, text_input):
+         if isinstance(text_input, bytes):
+             text = text_input.decode('utf-8')
+         elif isinstance(text_input, tf.Tensor):
+             text = text_input.numpy().decode('utf-8')
+         else:
+             text = text_input
+
+         tokens = word_tokenize(text)
+         if self.lowercase:
+             tokens = [token.lower() for token in tokens]
+         if self.lemmatize:
+             tokens = [lemmatizer.lemmatize(token) for token in tokens]
+         if self.handle_stop_words:
+             tokens = [token for token in tokens if token not in self.stop_words]
+
+         return ' '.join(tokens)
+
+     def create_embedding_matrix(self, vocab_size, embedding_dim=100):
+         embedding_matrix = np.zeros((vocab_size, embedding_dim))
+         missed_embeddings = 0
+
+         all_embeddings = np.stack(list(self.glove_model.values()))
+         mean_embedding = np.mean(all_embeddings, axis=0)
+
+         for word, idx in self.tokenizer.get_vocab().items():
+             embedding_vector = self.glove_model.get(word)
+
+             if embedding_vector is not None:
+                 embedding_matrix[idx] = embedding_vector
+             else:
+                 # Fall back to the mean GloVe vector for unmatched tokens.
+                 missed_embeddings += 1
+                 embedding_matrix[idx] = mean_embedding
+
+         print(f"Number of words without an available embedding vector: {missed_embeddings}")
+
+         return embedding_matrix
+
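+     # Note: GPT-2 uses byte-level BPE, so many vocabulary entries carry a
+     # leading 'Ġ' (an encoded space) and will not match plain GloVe words;
+     # expect a large missed_embeddings count with this pairing.
+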
+     def create_sequences(self):
+         processed_texts, _ = self._load_and_preprocess_files(self.directory, ['.txt'])
+
+         self.create_tokenizer(processed_texts)
+         vocab_size = len(self.tokenizer.get_vocab())
+         embedding_matrix = self.create_embedding_matrix(vocab_size)
+
+         # Expand every text into all of its token prefixes; the last token of
+         # each prefix becomes the prediction target.
+         sequences = []
+         for text in processed_texts:
+             encoded = self.tokenizer.encode(text)
+             for i in range(1, len(encoded)):
+                 input_seq = encoded[:i + 1]
+                 sequences.append(input_seq)
+
+         max_sequence_len = max([len(seq) for seq in sequences])
+         sequences = np.array(pad_sequences(sequences, maxlen=max_sequence_len, padding='pre'))
+
+         X, y = sequences[:, :-1], sequences[:, -1]
+         y = tf.keras.utils.to_categorical(y, num_classes=vocab_size)
+
+         X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
+
+         return X_train, X_val, y_train, y_val, embedding_matrix, vocab_size, max_sequence_len
+
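+     # Worked example of the expansion above: a text encoded as [12, 7, 99, 4]
+     # yields (before pre-padding)
+     #     X = [12]         y = 7
+     #     X = [12, 7]      y = 99
+     #     X = [12, 7, 99]  y = 4
+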
+     def _load_and_preprocess_files(self, directory, file_formats):
+         processed_texts = []
+         word_counts = {}
+
+         if not os.path.isdir(directory):
+             raise FileNotFoundError(f"Error: the given path '{directory}' is not a directory.")
+
+         files = [f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f)) and any(f.endswith(format) for format in file_formats)]
+         if not files:
+             raise FileNotFoundError("No files of the requested format in the directory.")
+
+         for file in files:
+             file_path = os.path.join(directory, file)
+             with open(file_path, "r", encoding='utf-8') as f:
+                 lines = f.readlines()
+                 if not lines:
+                     print(f"File {file} is empty.")
+                     continue
+
+                 for line in lines:
+                     processed_line = self.preprocess_text(line)
+                     processed_texts.append(processed_line)
+                     word_count = len(processed_line.split())
+                     word_counts[file] = word_counts.get(file, 0) + word_count
+             print(f"Processed file: {file}, word count: {word_counts[file]}")
+
+         if not processed_texts:
+             raise ValueError("No processed texts. Please check the directory contents.")
+         else:
+             print(f"Number of processed texts: {len(processed_texts)}")
+
+         return processed_texts, word_counts
+
+     def create_and_train_model(self):
+         X_train, X_val, y_train, y_val, embedding_matrix, vocab_size, max_sequence_len = self.create_sequences()
+
+         model = Sequential()
+         model.add(Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_sequence_len - 1, trainable=False))
+         model.add(Bidirectional(LSTM(self.lstm_units)))
+         model.add(Dropout(self.dropout_rate))
+         model.add(Dense(vocab_size, activation='softmax'))
+
+         model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
+         model.summary()
+
+         log_dir = os.path.join(KATALOG_LOGOW, self.model_name)
+         tensorboard_callback = TensorBoard(log_dir=log_dir)
+
+         early_stopping_callback = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
+
+         model.fit(X_train, y_train, epochs=self.epochs, validation_data=(X_val, y_val), callbacks=[tensorboard_callback, early_stopping_callback])
+
+         self.model = model
+         self.save_model_and_tokenizer()
+
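+     # Shape walk-through for the Sequential model above, with batch size B and
+     # input length L = max_sequence_len - 1:
+     #     Embedding          -> (B, L, 100)
+     #     Bidirectional LSTM -> (B, 2 * lstm_units)   # forward/backward concat
+     #     Dense softmax      -> (B, vocab_size)
+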
+     def save_model_and_tokenizer(self):
+         if not os.path.exists(ZAPISZ_KATALOG):
+             os.makedirs(ZAPISZ_KATALOG)
+         self.model.save(f'{ZAPISZ_KATALOG}/{self.model_name}.h5')
+         with open(f'{ZAPISZ_KATALOG}/{self.model_name}_tokenizer.pkl', 'wb') as handle:
+             pickle.dump(self.tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
+         print("Model and tokenizer saved.")
+
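+ # Hedged reload sketch (not part of this script's flow; 'my_model' is a
+ # hypothetical model name):
+ #     model = tf.keras.models.load_model('mozgi/my_model.h5')
+ #     with open('mozgi/my_model_tokenizer.pkl', 'rb') as handle:
+ #         tokenizer = pickle.load(handle)
+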
+ def main():
+     print("Welcome to AI Code Generator!")
+     directory = "test"
+     model_name = input("Enter a model name: ")
+
+     processor = TextProcessor(
+         directory=directory,
+         model_name=model_name,
+         input_sequence_length=100,
+         output_sequence_length=100,
+         epochs=10,
+     )
+
+     processor.create_and_train_model()
+     print("Model created and trained successfully!")
+
+ if __name__ == "__main__":
+     main()
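
A minimal smoke-test sketch for the updated script (the "test" corpus directory comes from main(); the sample file name and text are illustrative):

    # Prepare a tiny corpus, then run:  python app.py
    from pathlib import Path
    Path("test").mkdir(exist_ok=True)
    Path("test/sample.txt").write_text("Neural networks learn from data.\n", encoding="utf-8")

When run, app.py prompts for a model name, trains on the .txt files found in test/, and saves <name>.h5 plus <name>_tokenizer.pkl under mozgi/.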