smhavens committed on
Commit
052d58f
1 Parent(s): 3835be6

Update model calls to use variable.

Browse files
Files changed (1) hide show
  1. app.py +9 -80
app.py CHANGED
@@ -23,6 +23,7 @@ import random
23
  # !pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
24
  subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl'])
25
  # tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
 
26
  nltk.download('stopwords')
27
  nlp = spacy.load("en_core_web_sm")
28
  stops = stopwords.words("english")
@@ -75,85 +76,11 @@ def compute_metrics(eval_pred):
75
  predictions = np.argmax(logits, axis=-1)
76
  metric = evaluate.load("accuracy")
77
  return metric.compute(predictions=predictions, references=labels)
78
-
79
-
80
- def training():
81
- dataset_id = "ag_news"
82
- dataset = load_dataset(dataset_id)
83
- # dataset = dataset["train"]
84
- # tokenized_datasets = dataset.map(tokenize_function, batched=True)
85
-
86
- print(f"- The {dataset_id} dataset has {dataset['train'].num_rows} examples.")
87
- print(f"- Each example is a {type(dataset['train'][0])} with a {type(dataset['train'][0]['text'])} as value.")
88
- print(f"- Examples look like this: {dataset['train'][0]}")
89
-
90
- # small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
91
- # small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
92
-
93
- # dataset = dataset["train"].map(tokenize_function, batched=True)
94
- # dataset.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "label"])
95
- # dataset.format['type']
96
-
97
- # print(dataset)
98
-
99
- train_examples = []
100
- train_data = dataset["train"]
101
- # For agility we only 1/2 of our available data
102
- n_examples = dataset["train"].num_rows // 2
103
-
104
- for i in range(n_examples):
105
- example = train_data[i]
106
- # example_opposite = dataset_clean[-(i)]
107
- # print(example["text"])
108
- train_examples.append(InputExample(texts=[example['text']], label=example['label']))
109
-
110
- train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=25)
111
-
112
- print("END DATALOADER")
113
-
114
- # print(train_examples)
115
-
116
- embeddings = finetune(train_dataloader)
117
-
118
- return (dataset['train'].num_rows, type(dataset['train'][0]), type(dataset['train'][0]['text']), dataset['train'][0], embeddings)
119
-
120
-
121
- def finetune(train_dataloader):
122
- # model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
123
- model_id = "sentence-transformers/all-MiniLM-L6-v2"
124
- model = SentenceTransformer(model_id)
125
-
126
- # training_args = TrainingArguments(output_dir="test_trainer")
127
-
128
- # USE THIS LINK
129
- # https://huggingface.co/blog/how-to-train-sentence-transformers
130
-
131
- train_loss = losses.BatchHardSoftMarginTripletLoss(model=model)
132
-
133
- print("BEGIN FIT")
134
-
135
- model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10)
136
-
137
- model.save("ag_news_model")
138
-
139
- model.save_to_hub("smhavens/all-MiniLM-agNews")
140
- # accuracy = compute_metrics(eval, metric)
141
-
142
- # training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
143
-
144
- # trainer = Trainer(
145
- # model=model,
146
- # args=training_args,
147
- # train_dataset=train,
148
- # eval_dataset=eval,
149
- # compute_metrics=compute_metrics,
150
- # )
151
-
152
- # trainer.train()
153
 
154
 
155
  def get_model():
156
- model = SentenceTransformer("bert-analogies")
 
157
  gpu_available = torch.cuda.is_available()
158
  device = torch.device("cuda" if gpu_available else "cpu")
159
  model = model.to(device)
@@ -175,9 +102,10 @@ def embeddings(model, sentences):
175
  global word1
176
  global word2
177
  global word3
 
178
 
179
  # Load model from HuggingFace Hub
180
- tokenizer = AutoTokenizer.from_pretrained('bert-analogies')
181
  encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
182
  # token_ids = tokenizer.encode(sentences, return_tensors='pt')
183
  # blank_id = tokenizer.mask_token_id
@@ -199,7 +127,7 @@ def embeddings(model, sentences):
199
  model_output = model(**encoded_input)
200
  # output = model(encoded_input_topk)
201
 
202
- unmasker = pipeline('fill-mask', model='bert-analogies')
203
  guesses = unmasker(sentences)
204
  print(guesses)
205
 
@@ -223,12 +151,13 @@ def embeddings(model, sentences):
223
 
224
 
225
  def random_word():
226
- with open('ag_news_model/vocab.txt', 'r') as file:
 
227
  line = ""
228
  content = file.readlines()
229
  length = len(content)
230
  while line == "":
231
- rand_line = random.randrange(1997, length)
232
 
233
  if content[rand_line][0].isalpha() and content[rand_line][:-1] not in stops and content[rand_line][:-1] not in ROMAN_CONSTANTS:
234
  line = content[rand_line]
 
23
  # !pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
24
  subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl'])
25
  # tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
26
+ model_base = "bert-analogies"
27
  nltk.download('stopwords')
28
  nlp = spacy.load("en_core_web_sm")
29
  stops = stopwords.words("english")
 
76
  predictions = np.argmax(logits, axis=-1)
77
  metric = evaluate.load("accuracy")
78
  return metric.compute(predictions=predictions, references=labels)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
 
81
  def get_model():
82
+ global model_base
83
+ model = SentenceTransformer(model_base)
84
  gpu_available = torch.cuda.is_available()
85
  device = torch.device("cuda" if gpu_available else "cpu")
86
  model = model.to(device)
 
102
  global word1
103
  global word2
104
  global word3
105
+ global model_base
106
 
107
  # Load model from HuggingFace Hub
108
+ tokenizer = AutoTokenizer.from_pretrained(model_base)
109
  encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
110
  # token_ids = tokenizer.encode(sentences, return_tensors='pt')
111
  # blank_id = tokenizer.mask_token_id
 
127
  model_output = model(**encoded_input)
128
  # output = model(encoded_input_topk)
129
 
130
+ unmasker = pipeline('fill-mask', model=model_base)
131
  guesses = unmasker(sentences)
132
  print(guesses)
133
 
 
151
 
152
 
153
  def random_word():
154
+ global model_base
155
+ with open(model_base + '/vocab.txt', 'r') as file:
156
  line = ""
157
  content = file.readlines()
158
  length = len(content)
159
  while line == "":
160
+ rand_line = random.randrange(0, length)
161
 
162
  if content[rand_line][0].isalpha() and content[rand_line][:-1] not in stops and content[rand_line][:-1] not in ROMAN_CONSTANTS:
163
  line = content[rand_line]