Spaces:

smhavens
/

AnalogyArcade

Sleeping

App Files Files Community

smhavens committed on Dec 9, 2023

Commit

3922a86

•

1 Parent(s): a2c6b40

Massive changes, using better dataset and now returning random masks

Browse files

Files changed (13) hide show

analogy_train.py +301 -0
app.py +117 -44
bert-analogies/1_Pooling/config.json +7 -0
bert-analogies/README.md +88 -0
bert-analogies/config.json +26 -0
bert-analogies/config_sentence_transformers.json +7 -0
bert-analogies/model.safetensors +3 -0
bert-analogies/modules.json +20 -0
bert-analogies/sentence_bert_config.json +4 -0
bert-analogies/special_tokens_map.json +7 -0
bert-analogies/tokenizer.json +0 -0
bert-analogies/tokenizer_config.json +64 -0
bert-analogies/vocab.txt +0 -0

analogy_train.py ADDED Viewed

	@@ -0,0 +1,301 @@

+import gradio as gr
+import math
+import spacy
+from datasets import load_dataset
+from sentence_transformers import SentenceTransformer
+from sentence_transformers import InputExample
+from sentence_transformers import losses
+from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
+from transformers import TrainingArguments, Trainer
+import torch
+import torch.nn.functional as F
+from torch.utils.data import DataLoader
+import numpy as np
+import evaluate
+import nltk
+from nltk.corpus import stopwords
+import subprocess
+import sys
+from transformers import DataCollatorWithPadding
+from transformers import TrainingArguments
+from transformers import (
+    BertModel,
+    BertTokenizerFast,
+    Trainer,
+    EvalPrediction
+)
+# !pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
+# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl'])
+# tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+# nltk.download('stopwords')
+# nlp = spacy.load("en_core_web_sm")
+# stops = stopwords.words("english")
+# answer = "Pizza"
+# Guesses submitted so far; check_answer() appends to this list.
+guesses = []
+# Initial target word; main() reassigns it to "Moon" before training.
+answer = "Pizza"
+# BERT tokenizer used by tokenize_function().
+tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
+# Accuracy metric, loaded once at import time.
+metric = evaluate.load("accuracy")
+def tokenize_function(examples):
+    """Tokenize the ``stem`` field of ``examples``, padding to max length and truncating."""
+    stems = examples["stem"]
+    return tokenizer(stems, padding="max_length", truncation=True)
+#Mean Pooling - Take attention mask into account for correct averaging
+def mean_pooling(model_output, attention_mask):
+    """Average token embeddings, counting only positions kept by the attention mask.
+
+    ``model_output[0]`` is taken as the token embeddings. The mask is expanded
+    to the embeddings' size, and the per-row mask sum is clamped to at least
+    1e-9 so a fully masked row does not divide by zero.
+    """
+    token_embeddings = model_output[0]
+    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    masked_sum = torch.sum(token_embeddings * mask, 1)
+    mask_total = torch.clamp(mask.sum(1), min=1e-9)
+    return masked_sum / mask_total
+def compute_metrics(eval_pred):
+    """Compute accuracy for a ``(logits, labels)`` evaluation pair.
+
+    The predicted class for each row is the argmax over the last axis of
+    ``logits``. Returns whatever ``metric.compute`` returns.
+    """
+    logits, labels = eval_pred
+    predictions = np.argmax(logits, axis=-1)
+    # Reuse the module-level accuracy metric rather than reloading it on
+    # every evaluation call.
+    return metric.compute(predictions=predictions, references=labels)
+# def training():
+#     dataset_id = "relbert/analogy_questions"
+#     dataset_sub = "bats"
+#     print("GETTING DATASET")
+#     raw_dataset = load_dataset(dataset_id, dataset_sub)
+#     # data_metric = evaluate.load(dataset_id, dataset_sub)
+#     checkpoint = "bert-base-uncased"
+#     model = BertModel.from_pretrained(checkpoint)
+#     # dataset = dataset["train"]
+#     # tokenized_datasets = dataset.map(tokenize_function, batched=True)
+#     # print(raw_dataset)
+#     test_data = raw_dataset["test"]
+#     # print(test_data["stem"])
+#     all_answers = []
+#     for answer in raw_dataset["answer"]:
+#         answer = raw_dataset["choice"][answer]
+#     raw_dataset = raw_dataset.add_column("label", all_answers)
+#     print(raw_dataset)
+#     print(raw_dataset["label"])
+#     dataset = raw_dataset.map(
+#         lambda x: tokenizer(x["stem"], truncation=True),
+#         batched=True,
+#     )
+#     print(dataset)
+#     dataset = dataset.remove_columns(["stem", "answer", "choice"])
+#     dataset = dataset.rename_column("label", "labels")
+#     dataset = dataset.with_format("torch")
+#     training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
+#     print(dataset)
+#     # print(f"- The {dataset_id} dataset has {dataset.num_rows} examples.")
+#     # print(f"- Each example is a {type(dataset[0])} with a {type(dataset[0]['stem'])} as value.")
+#     # print(f"- Examples look like this: {dataset[0]}")
+#     # small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
+#     # small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
+#     # dataset = dataset["train"].map(tokenize_function, batched=True)
+#     # dataset.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "label"])
+#     # dataset.format['type']
+#     # tokenized_news = dataset.map(tokenize_function, batched=True)
+#     # model = AutoModelForSequenceClassification.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", num_labels=2)
+#     # print(dataset)
+#     # Choose the appropriate device based on availability (CUDA or CPU)
+#     # gpu_available = torch.cuda.is_available()
+#     # device = torch.device("cuda" if gpu_available else "cpu")
+#     # model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')
+#     # tokenized_datasets = dataset.map(tokenize_function, batched=True)
+#     # print(tokenized_datasets)
+#     # # small_train_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
+#     # # small_eval_dataset = tokenized_datasets["validation"].shuffle(seed=42).select(range(1000))
+#     # model = model.to(device)
+#     # model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+#     # training_args = TrainingArguments(output_dir="test_trainer")
+#     trainer = Trainer(
+#         model=model,
+#         args=training_args,
+#         train_dataset=dataset["test"],
+#         eval_dataset=dataset["validation"],
+#         compute_metrics=compute_metrics,
+#     )
+#     output = trainer.train()
+#     # train_examples = []
+#     # train_data = dataset["train"]
+#     # # For agility we only 1/2 of our available data
+#     # n_examples = dataset["train"].num_rows // 2
+#     # for i in range(n_examples):
+#     #     example = train_data[i]
+#     #     # example_opposite = dataset_clean[-(i)]
+#     #     # print(example["text"])
+#     #     train_examples.append(InputExample(texts=[example['stem'], example]))
+#     # train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=25)
+#     # print("END DATALOADER")
+#     # # print(train_examples)
+#     # embeddings = finetune(train_dataloader)
+#     print(output)
+#     model.save("bert-analogies")
+#     model.save_to_hub("smhavens/bert-base-analogies")
+#     return output
+# def finetune(train_dataloader):
+#     # model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+#     model_id = "sentence-transformers/all-MiniLM-L6-v2"
+#     model = SentenceTransformer(model_id)
+#     device = torch.device('cuda:0')
+#     model = model.to(device)
+#     # training_args = TrainingArguments(output_dir="test_trainer")
+#     # USE THIS LINK
+#     # https://huggingface.co/blog/how-to-train-sentence-transformers
+#     train_loss = losses.BatchHardSoftMarginTripletLoss(model=model)
+#     print("BEGIN FIT")
+#     model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10)
+#     model.save("bert-analogies")
+#     model.save_to_hub("smhavens/bert-base-analogies")
+#     return 0
+def training():
+    """Build analogy sentence pairs from the BATS ``test`` split and fine-tune on them.
+
+    Each example contributes one pair: "<stem[0]> to <stem[1]>" and the
+    correct choice rendered the same way. Only the first half of the split
+    is used.
+
+    Returns a 5-tuple: the split's row count, the type of one example, the
+    type of its ``stem`` field, the first example, and the value returned by
+    ``finetune``.
+    """
+    dataset_id = "relbert/analogy_questions"
+    dataset_sub = "bats"
+    print("GETTING DATASET")
+    dataset = load_dataset(dataset_id, dataset_sub)
+    test_split = dataset["test"]
+    first_example = test_split[0]
+    print(f"- The {dataset_id} dataset has {test_split.num_rows} examples.")
+    print(f"- Each example is a {type(first_example)} with a {type(first_example['stem'])} as value.")
+    print(f"- Examples look like this: {first_example}")
+    # For agility we only use half of the available data.
+    n_examples = test_split.num_rows // 2
+    train_examples = []
+    for i in range(n_examples):
+        example = test_split[i]
+        stem = example["stem"]
+        correct_choice = example["choice"][example["answer"]]
+        comp1 = f"{stem[0]} to {stem[1]}"
+        comp2 = f"{correct_choice[0]} to {correct_choice[1]}"
+        train_examples.append(InputExample(texts=[comp1, comp2]))
+    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=25)
+    print("END DATALOADER")
+    embeddings = finetune(train_dataloader)
+    return (test_split.num_rows, type(first_example), type(first_example["stem"]), first_example, embeddings)
+def finetune(train_dataloader):
+    """Fine-tune ``all-MiniLM-L6-v2`` on ``train_dataloader`` and save it locally.
+
+    Trains for 10 epochs with ``MegaBatchMarginLoss`` and writes the result
+    to the ``bert-analogies`` directory. Always returns 0.
+    """
+    model_id = "sentence-transformers/all-MiniLM-L6-v2"
+    model = SentenceTransformer(model_id)
+    # Use the GPU when present, otherwise CPU; a hard-coded 'cuda:0' raises
+    # on hosts without CUDA.
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    model = model.to(device)
+    # Training recipe follows
+    # https://huggingface.co/blog/how-to-train-sentence-transformers
+    train_loss = losses.MegaBatchMarginLoss(model=model)
+    print("BEGIN FIT")
+    # NOTE(review): fit() runs with its default warmup_steps. The saved model
+    # card reports warmup_steps=10000 for 36 batches x 10 epochs, so the
+    # learning rate would never finish warming up - consider passing
+    # warmup_steps explicitly.
+    model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10)
+    model.save("bert-analogies")
+    # model.save_to_hub("smhavens/bert-base-analogies")
+    return 0
+def greet(name):
+    """Return the greeting "Hello <name>!!"."""
+    return "".join(("Hello ", name, "!!"))
+def check_answer(guess:str):
+    """Record ``guess`` and report whether it matches the answer, ignoring case.
+
+    Returns a ``(verdict, history)`` pair. ``history`` lists every guess made
+    so far, one per line, each prefixed with "- ".
+    """
+    global guesses, answer
+    guesses.append(guess)
+    history = "\n".join("- " + previous for previous in guesses)
+    verdict = "Correct!" if guess.lower() == answer.lower() else "Try again!"
+    return verdict, history
+def main():
+    """Entry point of the training script: set the sample analogy and run training().
+
+    The Gradio interface below is commented out, so the values unpacked from
+    training() are not displayed anywhere.
+    """
+    global answer, guesses
+    print("BEGIN")
+    word1, word2, word3 = "Black", "White", "Sun"
+    answer = "Moon"
+    num_rows, data_type, value, example, embeddings = training()
+    # prompt = f"{word1} is to {word2} as {word3} is to ____"
+    # with gr.Blocks() as iface:
+    #     gr.Markdown(prompt)
+    #     with gr.Tab("Guess"):
+    #         text_input = gr.Textbox()
+    #         text_output = gr.Textbox()
+    #         text_button = gr.Button("Submit")
+    #     with gr.Accordion("Open for previous guesses"):
+    #         text_guesses = gr.Textbox()
+    #     with gr.Tab("Testing"):
+    #         gr.Markdown(f"""Number of rows in dataset is {num_rows}, with each having type {data_type} and value {value}.
+    #                     An example is {example}.
+    #                     The Embeddings are {embeddings}.""")
+    #     text_button.click(check_answer, inputs=[text_input], outputs=[text_output, text_guesses])
+    # # iface = gr.Interface(fn=greet, inputs="text", outputs="text")
+    # iface.launch()
+if __name__ == "__main__":
+    main()

app.py CHANGED Viewed

@@ -5,6 +5,8 @@ from datasets import load_dataset
 from sentence_transformers import SentenceTransformer
 from sentence_transformers import InputExample
 from sentence_transformers import losses
 from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
 from transformers import TrainingArguments, Trainer
 import torch
@@ -16,6 +18,7 @@ import nltk
 from nltk.corpus import stopwords
 import subprocess
 import sys
 # !pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
 subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl'])
@@ -23,10 +26,20 @@ subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'https://huggingf
 nltk.download('stopwords')
 nlp = spacy.load("en_core_web_sm")
 stops = stopwords.words("english")
 # answer = "Pizza"
 guesses = []
-answer = "Pizza"
 #Mean Pooling - Take attention mask into account for correct averaging
@@ -134,65 +147,108 @@ def finetune(train_dataloader):
     # trainer.train()
-def embeddings():
-    model = SentenceTransformer("ag_news_model")
     device = torch.device('cuda:0')
     model = model.to(device)
-    sentences = ["This is an example sentence", "Each sentence is converted"]
-    # model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
     embeddings = model.encode(sentences)
-    # print(embeddings)
-    # Sentences we want sentence embeddings for
-    sentences = ['This is an example sentence', 'Each sentence is converted']
     # Load model from HuggingFace Hub
-    tokenizer = AutoTokenizer.from_pretrained('ag_news_model')
-    # model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
-    # Tokenize sentences
     encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
-    # print(model.device)
-    # print(encoded_input["input_ids"].device)
-    # print(encoded_input["attention_mask"].device)
-    # print(encoded_input["token_type_ids"].device)
     encoded_input["input_ids"] = encoded_input["input_ids"].to(device)
     encoded_input["attention_mask"] = encoded_input["attention_mask"].to(device)
     encoded_input['token_type_ids'] = encoded_input['token_type_ids'].to(device)
-    # print(encoded_input)
-    # print(encoded_input["input_ids"].device)
-    # print(encoded_input["attention_mask"].device)
-    # print(encoded_input["token_type_ids"].device)
     encoded_input['input'] = {'input_ids':encoded_input['input_ids'], 'attention_mask':encoded_input['attention_mask']}
-    #  + encoded_input['token_type_ids'] + encoded_input['attention_mask']
     del encoded_input['input_ids']
     del encoded_input['token_type_ids']
     del encoded_input['attention_mask']
-    # print(encoded_input)
-    # encoded_input.to(device)
-    # Compute token embeddings
     with torch.no_grad():
         model_output = model(**encoded_input)
-    print(model_output)
     # Perform pooling
     sentence_embeddings = mean_pooling(model_output, encoded_input['input']["attention_mask"])
     # Normalize embeddings
     sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
-    print("Sentence embeddings:")
-    print(sentence_embeddings)
-    return sentence_embeddings
 def greet(name):
     return "Hello " + name + "!!"
@@ -200,29 +256,46 @@ def greet(name):
 def check_answer(guess:str):
     global guesses
     global answer
-    guesses.append(guess)
     output = ""
-    for guess in guesses:
-        output += ("- " + guess + "\n")
     output = output[:-1]
-    if guess.lower() == answer.lower():
         return "Correct!", output
     else:
         return "Try again!", output
 def main():
-    word1 = "Black"
-    word2 = "White"
-    word3 = "Sun"
     global answer
-    answer = "Moon"
     global guesses
     # num_rows, data_type, value, example, embeddings = training()
-    sent_embeddings = embeddings()
     prompt = f"{word1} is to {word2} as {word3} is to ____"
     with gr.Blocks() as iface:
         gr.Markdown(prompt)
         with gr.Tab("Guess"):
@@ -231,8 +304,8 @@ def main():
             text_button = gr.Button("Submit")
         with gr.Accordion("Open for previous guesses"):
             text_guesses = gr.Textbox()
-        with gr.Tab("Testing"):
-            gr.Markdown(f"""The Embeddings are {sent_embeddings}.""")
         text_button.click(check_answer, inputs=[text_input], outputs=[text_output, text_guesses])
     # iface = gr.Interface(fn=greet, inputs="text", outputs="text")
     iface.launch()

 from sentence_transformers import SentenceTransformer
 from sentence_transformers import InputExample
 from sentence_transformers import losses
+from sentence_transformers import util
+from transformers import pipeline
 from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
 from transformers import TrainingArguments, Trainer
 import torch
 from nltk.corpus import stopwords
 import subprocess
 import sys
+import random
 # !pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
 subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl'])
 nltk.download('stopwords')
 nlp = spacy.load("en_core_web_sm")
 stops = stopwords.words("english")
+# Roman-numeral spellings, one row per decimal place (ones, tens, hundreds,
+# thousands). Index 0 of every row is the empty string.
+# NOTE: this is a tuple of tuples, so `word in ROMAN_CONSTANTS` compares the
+# word against whole rows, not against the individual numerals.
+ROMAN_CONSTANTS = (
+            ( "", "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX" ),
+            ( "", "X", "XX", "XXX", "XL", "L", "LX", "LXX", "LXXX", "XC" ),
+            ( "", "C", "CC", "CCC", "CD", "D", "DC", "DCC", "DCCC", "CM" ),
+            ( "", "M", "MM", "MMM", "",   "",  "-",  "",    "",     ""   ),
+        )
 # answer = "Pizza"
+# Raw guesses submitted by the player.
 guesses = []
+# Formatted guess lines that check_answer() returns to the player.
+return_guesses = []
+# Default answer and prompt words; generate_prompt() overwrites all four.
+answer = "Moon"
+word1 = "Black"
+word2 = "White"
+word3 = "Sun"
 #Mean Pooling - Take attention mask into account for correct averaging
     # trainer.train()
+def get_model():
+    """Load the SentenceTransformer saved in ``bert-analogies`` and move it to the available device."""
+    model = SentenceTransformer("bert-analogies")
+    # Use the GPU when present, otherwise CPU, as embeddings() already does;
+    # a hard-coded 'cuda:0' raises on CPU-only hosts.
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
     model = model.to(device)
+    return model
+def cosine_scores(model, sentence):
+    """Encode ``sentence`` with ``model``; currently a stub.
+
+    The encoded tensor is computed but neither used nor returned, so the
+    function returns None.
+    """
+    global word1
+    global word2
+    global word3
+    # sentence1 = f"{word1} is to {word2} as"
+    embeddings1 = model.encode(sentence, convert_to_tensor=True)
+def embeddings(model, sentences):
+    """Return candidate words for the ``[MASK]`` token in ``sentences``.
+
+    Runs a fill-mask pipeline over ``sentences`` and keeps the predicted
+    tokens that start with a letter, are not English stop words and are not
+    Roman numerals. The sentence embeddings computed along the way are not
+    returned. The result may be an empty list.
+    """
+    gpu_available = torch.cuda.is_available()
+    device = torch.device("cuda" if gpu_available else "cpu")
+    # device = torch.device('cuda:0')
     embeddings = model.encode(sentences)
+    global word1
+    global word2
+    global word3
     # Load model from HuggingFace Hub
+    tokenizer = AutoTokenizer.from_pretrained('bert-analogies')
     encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+    token_ids = tokenizer.encode(sentences, return_tensors='pt')
+    blank_id = tokenizer.mask_token_id
+    blank_id_idx = torch.where(encoded_input["input_ids"] == blank_id)[1]
     encoded_input["input_ids"] = encoded_input["input_ids"].to(device)
     encoded_input["attention_mask"] = encoded_input["attention_mask"].to(device)
     encoded_input['token_type_ids'] = encoded_input['token_type_ids'].to(device)
+    # The model is called with a single ``input`` keyword holding the ids and mask.
     encoded_input['input'] = {'input_ids':encoded_input['input_ids'], 'attention_mask':encoded_input['attention_mask']}
     del encoded_input['input_ids']
     del encoded_input['token_type_ids']
     del encoded_input['attention_mask']
     with torch.no_grad():
+        # output = model(encoded_input)
+        print(encoded_input)
         model_output = model(**encoded_input)
+        # output = model(encoded_input_topk)
+    unmasker = pipeline('fill-mask', model='bert-analogies')
+    guesses = unmasker(sentences)
+    print(guesses)
     # Perform pooling
+    # NOTE(review): mean_pooling indexes its first argument with [0]; confirm
+    # that the object returned by model(...) supports integer indexing.
     sentence_embeddings = mean_pooling(model_output, encoded_input['input']["attention_mask"])
     # Normalize embeddings
     sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
+    # ROMAN_CONSTANTS is a tuple of tuples, so flatten it before testing
+    # membership; a string is never equal to one of the inner tuples, which
+    # made the original Roman-numeral filter a no-op. The table is upper-case,
+    # so compare the upper-cased token.
+    roman_numerals = {numeral for row in ROMAN_CONSTANTS for numeral in row if numeral}
+    potential_words = []
+    for guess in guesses:
+        temp_word = guess['token_str']
+        # Slice instead of indexing so an empty token is skipped, not an IndexError.
+        if temp_word[:1].isalpha() and temp_word not in stops and temp_word.upper() not in roman_numerals:
+            potential_words.append(guess['token_str'])
+    return potential_words
+def random_word():
+    """Pick a random usable word from ``ag_news_model/vocab.txt``.
+
+    Only lines at index 1997 and above are sampled (presumably to skip the
+    special and single-character entries at the top of the vocabulary -
+    TODO confirm). A candidate is accepted when it starts with a letter, is
+    not an English stop word and is not a Roman numeral; rejected candidates
+    are printed and another line is drawn.
+    """
+    # ROMAN_CONSTANTS is a tuple of tuples, so flatten it before testing
+    # membership; a string is never equal to one of the inner tuples, which
+    # made the original Roman-numeral filter a no-op. The table is upper-case,
+    # so compare the upper-cased candidate.
+    roman_numerals = {numeral for row in ROMAN_CONSTANTS for numeral in row if numeral}
+    with open('ag_news_model/vocab.txt', 'r') as file:
+        # Strip the newline explicitly: slicing off the last character would
+        # truncate a final line that has no trailing newline.
+        content = [entry.rstrip("\n") for entry in file]
+    length = len(content)
+    line = ""
+    while line == "":
+        rand_line = random.randrange(1997, length)
+        candidate = content[rand_line]
+        if candidate[:1].isalpha() and candidate not in stops and candidate.upper() not in roman_numerals:
+            line = candidate
+        else:
+            print(f"{candidate} is not alpha or is a stop word")
+    print(line)
+    return line
+def generate_prompt(model):
+    """Draw three random words and store the first fill-mask candidate as the answer.
+
+    Overwrites the module-level ``word1``, ``word2``, ``word3`` and ``answer``.
+    """
+    global word1, word2, word3, answer
+    word1 = random_word()
+    word2 = random_word()
+    word3 = random_word()
+    sentence = f"{word1} is to {word2} as {word3} is to [MASK]"
+    print(sentence)
+    candidates = embeddings(model, sentence)
+    answer = candidates[0]
+    print("ANSWER IS", answer)
+    # cosine_scores(model, sentence)
 def greet(name):
+    """Return the greeting "Hello <name>!!"."""
+    return "".join(("Hello ", name, "!!"))
 def check_answer(guess:str):
+    """Record ``guess``, build the analogy it completes, and report whether it is correct.
+
+    The first fill-mask candidate for "<word1> is to <word2> as [MASK] is to
+    <guess>" is used to render the guess as a full analogy line. Returns a
+    ``(verdict, history)`` pair; ``history`` holds all rendered guesses, one
+    per line. The comparison with the answer ignores case.
+    """
     global guesses
     global answer
+    global return_guesses
+    model = get_model()
+    sentence = f"{word1} is to {word2} as [MASK] is to {guess}"
+    other_word = embeddings(model, sentence)[0]
+    guesses.append(guess)
+    print("GUESS IS", guess)
+    return_guess = f"{guess}: {word1} is to {word2} as {other_word} is to {guess}"
+    print("GUESS IS", guess)
+    return_guesses.append(return_guess)
+    output = "\n".join(return_guesses)
+    print("GUESS IS", guess)
+    is_match = guess.lower() == answer.lower()
+    print("IS", guess, "EQUAL TO", answer, ":", is_match)
+    if is_match:
         return "Correct!", output
     else:
         return "Try again!", output
 def main():
+    global word1
+    global word2
+    global word3
     global answer
+    # answer = "Moon"
     global guesses
     # num_rows, data_type, value, example, embeddings = training()
+    # sent_embeddings = embeddings()
+    model = get_model()
+    generate_prompt(model)
     prompt = f"{word1} is to {word2} as {word3} is to ____"
+    print(prompt)
+    print("TESTING EMBEDDINGS")
     with gr.Blocks() as iface:
         gr.Markdown(prompt)
         with gr.Tab("Guess"):
             text_button = gr.Button("Submit")
         with gr.Accordion("Open for previous guesses"):
             text_guesses = gr.Textbox()
+        # with gr.Tab("Testing"):
+        #     gr.Markdown(f"""The Embeddings are {sent_embeddings}.""")
         text_button.click(check_answer, inputs=[text_input], outputs=[text_output, text_guesses])
     # iface = gr.Interface(fn=greet, inputs="text", outputs="text")
     iface.launch()

bert-analogies/1_Pooling/config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "word_embedding_dimension": 384,
+  "pooling_mode_cls_token": false,
+  "pooling_mode_mean_tokens": true,
+  "pooling_mode_max_tokens": false,
+  "pooling_mode_mean_sqrt_len_tokens": false
+}

bert-analogies/README.md ADDED Viewed

	@@ -0,0 +1,88 @@

+---
+pipeline_tag: sentence-similarity
+tags:
+- sentence-transformers
+- feature-extraction
+- sentence-similarity
+---
+# bert-analogies
+This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search.
+<!--- Describe your model here -->
+## Usage (Sentence-Transformers)
+Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
+```
+pip install -U sentence-transformers
+```
+Then you can use the model like this:
+```python
+from sentence_transformers import SentenceTransformer
+sentences = ["This is an example sentence", "Each sentence is converted"]
+model = SentenceTransformer('bert-analogies')
+embeddings = model.encode(sentences)
+print(embeddings)
+```
+## Evaluation Results
+<!--- Describe how your model was evaluated -->
+For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name={MODEL_NAME})
+## Training
+The model was trained with the parameters:
+**DataLoader**:
+`torch.utils.data.dataloader.DataLoader` of length 36 with parameters:
+```
+{'batch_size': 25, 'sampler': 'torch.utils.data.sampler.RandomSampler', 'batch_sampler': 'torch.utils.data.sampler.BatchSampler'}
+```
+**Loss**:
+`sentence_transformers.losses.MegaBatchMarginLoss.MegaBatchMarginLoss`
+Parameters of the fit()-Method:
+```
+{
+    "epochs": 10,
+    "evaluation_steps": 0,
+    "evaluator": "NoneType",
+    "max_grad_norm": 1,
+    "optimizer_class": "<class 'torch.optim.adamw.AdamW'>",
+    "optimizer_params": {
+        "lr": 2e-05
+    },
+    "scheduler": "WarmupLinear",
+    "steps_per_epoch": null,
+    "warmup_steps": 10000,
+    "weight_decay": 0.01
+}
+```
+## Full Model Architecture
+```
+SentenceTransformer(
+  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel
+  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
+  (2): Normalize()
+)
+```
+## Citing & Authors
+<!--- Describe where people can find more information -->

bert-analogies/config.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "_name_or_path": "/home/smhavens/.cache/torch/sentence_transformers/sentence-transformers_all-MiniLM-L6-v2/",
+  "architectures": [
+    "BertModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 384,
+  "initializer_range": 0.02,
+  "intermediate_size": 1536,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 6,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.35.2",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 30522
+}

bert-analogies/config_sentence_transformers.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "__version__": {
+    "sentence_transformers": "2.0.0",
+    "transformers": "4.6.1",
+    "pytorch": "1.8.1"
+  }
+}

bert-analogies/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a0f7c659ba309023355f651ecbe72279f6caef5fe5f274e59168f1bcedb36368
+size 90864192

bert-analogies/modules.json ADDED Viewed

	@@ -0,0 +1,20 @@

+[
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "sentence_transformers.models.Transformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Pooling",
+    "type": "sentence_transformers.models.Pooling"
+  },
+  {
+    "idx": 2,
+    "name": "2",
+    "path": "2_Normalize",
+    "type": "sentence_transformers.models.Normalize"
+  }
+]

bert-analogies/sentence_bert_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "max_seq_length": 256,
+  "do_lower_case": false
+}

bert-analogies/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

bert-analogies/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

bert-analogies/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,64 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "max_length": 128,
+  "model_max_length": 512,
+  "never_split": null,
+  "pad_to_multiple_of": null,
+  "pad_token": "[PAD]",
+  "pad_token_type_id": 0,
+  "padding_side": "right",
+  "sep_token": "[SEP]",
+  "stride": 0,
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
+  "unk_token": "[UNK]"
+}

bert-analogies/vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff