96abhishekarora committed
Commit 0675367
Parent(s): f0a355a

Updated model with better training and evaluation. Test and val data included as pickle files.
Browse files
- .gitattributes (+3 -37)
- 1_Pooling/config.json (+3 -1)
- LT_training_config.json (+15 -13)
- README.md (+95 -34)
- added_tokens.json (+4 -0)
- config.json (+11 -17)
- config_sentence_transformers.json (+3 -3)
- entity_vocab.json (+6 -0)
- merges.txt (+0 -0)
- pytorch_model.bin → model.safetensors (+2 -2)
- sentence_bert_config.json (+1 -1)
- sentencepiece.bpe.model (+3 -0)
- special_tokens_map.json (+66 -6)
- tokenizer.json (+0 -0)
- tokenizer_config.json (+97 -4)
- vocab.json (+0 -0)
- vocab.txt (+0 -0)
.gitattributes CHANGED
@@ -1,37 +1,3 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
-pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
-.git/lfs/objects/2c/54/2c547cd0200d5e2941a0df1be6f08c9c58aa7909c52edc93b34fd74a26360708 filter=lfs diff=lfs merge=lfs -text
+model.safetensors filter=lfs diff=lfs merge=lfs -text
+.git/lfs/objects/38/03/38038b2d482f03da65b16b695cca791699e9d40235edd0dbe368b855c05ca162 filter=lfs diff=lfs merge=lfs -text
+sentencepiece.bpe.model filter=lfs diff=lfs merge=lfs -text
1_Pooling/config.json CHANGED
@@ -3,5 +3,7 @@
   "pooling_mode_cls_token": false,
   "pooling_mode_mean_tokens": true,
   "pooling_mode_max_tokens": false,
-  "pooling_mode_mean_sqrt_len_tokens": false
+  "pooling_mode_mean_sqrt_len_tokens": false,
+  "pooling_mode_weightedmean_tokens": false,
+  "pooling_mode_lasttoken": false
 }
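The two new flags mirror pooling modes added in recent sentence-transformers releases; only mean pooling is active. As a minimal sketch of how sentence-transformers consumes this config (the base-model id is taken from LT_training_config.json below; exact module wiring may differ from what the package does internally):

```python
from sentence_transformers import SentenceTransformer, models

# Rebuild the module stack 1_Pooling/config.json describes: a transformer
# encoder followed by mean pooling over token embeddings (all other modes off).
word = models.Transformer(
    "oshizo/sbert-jsnli-luke-japanese-base-lite",  # base model per LT_training_config.json
    max_seq_length=128,
)
pooling = models.Pooling(
    word.get_word_embedding_dimension(),  # 768
    pooling_mode_cls_token=False,
    pooling_mode_mean_tokens=True,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_sqrt_len_tokens=False,
    pooling_mode_weightedmean_tokens=False,
    pooling_mode_lasttoken=False,
)
model = SentenceTransformer(modules=[word, pooling])
```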
LT_training_config.json CHANGED
@@ -1,27 +1,29 @@
 {
   "model_save_dir": "models",
-  "model_save_name": "…",
-  "opt_model_description": …,
-  "opt_model_lang": …,
+  "model_save_name": "check",
+  "opt_model_description": "test",
+  "opt_model_lang": "jp",
   "train_batch_size": 64,
   "num_epochs": 1,
   "warm_up_perc": 1,
-  "learning_rate": 2e-…,
+  "learning_rate": 2e-05,
+  "loss_type": "supcon",
   "val_perc": 0.2,
   "wandb_names": {
-    "project": "…",
-    "id": "…",
-    "run": "…",
-    "entity": "…"
+    "project": "linktransformer",
+    "id": "your-id",
+    "run": "run-name",
+    "entity": "your-id"
   },
   "add_pooling_layer": false,
   "large_val": true,
-  "eval_steps_perc": 0.…,
+  "eval_steps_perc": 0.5,
   "test_at_end": true,
   "save_val_test_pickles": true,
   "val_query_prop": 0.5,
-  …
-  …
-  …
-  …
+  "loss_params": {},
+  "eval_type": "classification",
+  "training_dataset": "dataframe",
+  "base_model_path": "oshizo/sbert-jsnli-luke-japanese-base-lite",
+  "best_model_path": "models/check"
 }
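For reference, the new `"loss_type": "supcon"` entry refers to supervised contrastive (SupCon) loss. A standard formulation (following Khosla et al., 2020) is below; the variant actually implemented by the package's `SupConLoss_wandb` (named in the README's training section) may differ in detail:

```latex
\mathcal{L}^{\mathrm{sup}}
  = \sum_{i \in I} \frac{-1}{|P(i)|} \sum_{p \in P(i)}
    \log \frac{\exp\left(z_i \cdot z_p / \tau\right)}
              {\sum_{a \in A(i)} \exp\left(z_i \cdot z_a / \tau\right)}
```

where $z_i$ is the normalized embedding of example $i$, $P(i)$ is the set of in-batch positives sharing $i$'s identifier, $A(i)$ is the set of all other in-batch examples, and $\tau$ is a temperature.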
README.md CHANGED
@@ -1,67 +1,84 @@
 ---
-pipeline_tag: …
+pipeline_tag: sentence-similarity
 language:
-- …
+- jp
 tags:
 - linktransformer
-- transformers
-- …
+- sentence-transformers
+- sentence-similarity
 - tabular-classification
 
 ---
 
 # dell-research-harvard/linktransformer-models-test
 
-This …
-
-The base model for this classifier is: roberta. It is pretrained for the language: - en.
-
-- Neither: 0
-- Protest: 1
-- Riot: 2
-
-…
+This is a [LinkTransformer](https://linktransformer.github.io/) model. At its core it is a [sentence-transformers](https://www.SBERT.net) model; the LinkTransformer class simply wraps around it.
+It is designed for quick and easy record linkage (entity matching) through the LinkTransformer package. Supported tasks include clustering, deduplication, linking, aggregation, and more.
+Notwithstanding that, it can also be used for any sentence-similarity task within the sentence-transformers framework.
+It maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for tasks like clustering or semantic search.
+Take a look at the documentation of [sentence-transformers](https://www.sbert.net/index.html) if you want to use this model for more than what we support in our applications.
+
+This model was fine-tuned from the base model: oshizo/sbert-jsnli-luke-japanese-base-lite. It is pretrained for the language: jp.
+
+test
+
+## Usage (LinkTransformer)
+
+Using this model becomes easy when you have [LinkTransformer](https://github.com/dell-research-harvard/linktransformer) installed:
 
-```python
+```
 pip install -U linktransformer
 ```
 
+Then you can use the model like this:
+
 ```python
 import linktransformer as lt
-
-```
+import pandas as pd
+
+## Load the two dataframes that you want to link. For example, two dataframes with company names that are written differently.
+df1 = pd.read_csv("data/df1.csv")  # the left dataframe, keyed on CompanyName for instance
+df2 = pd.read_csv("data/df2.csv")  # the right dataframe, keyed on CompanyName for instance
+
+## Merge the two dataframes on the key column!
+df_merged = lt.merge(df1, df2, on="CompanyName", how="inner")
+
+## Done! The merged dataframe has a column called "score" that contains the similarity score between the two company names.
+```
 
-## Training
-
-…
-
-With the provided tools, you can train a custom classification model:
+## Training your own LinkTransformer model
+Any sentence-transformers model can be used as a backbone by simply adding a pooling layer. Any other transformer on Hugging Face can also be used by specifying the option add_pooling_layer=True.
+The model was trained using SupCon loss.
+Usage can be found in the package docs.
+The training config can be found in the repo under the name LT_training_config.json.
+To replicate the training, you can download the file and specify the path in the config_path argument of the training function. You can also override the config by specifying the training_args argument.
+Here is an example:
 
 ```python
-…
+## Consider the example in the paper: a dataset of Mexican products and their tariff codes
+## from 1947 and 1948, where we want to train a model to link the two sets of tariff codes.
+saved_model_path = train_model(
+    model_path="hiiamsid/sentence_similarity_spanish_es",
+    dataset_path=dataset_path,
+    left_col_names=["description47"],
+    right_col_names=["description48"],
+    left_id_name=["tariffcode47"],
+    right_id_name=["tariffcode48"],
+    log_wandb=False,
+    config_path=LINKAGE_CONFIG_PATH,
+    training_args={"num_epochs": 1}
+)
+```
 
+You can also use this package for deduplication (it clusters a dataframe on the supplied key column). Merging a fine class (like product) to a coarse class (like HS code) is also possible.
+Read our paper and the documentation for more!
 
@@ -69,7 +86,50 @@ best_model_path, best_metric, label_map = train_clf_model(
 
 <!--- Describe how your model was evaluated -->
 
-…
+You can evaluate the model using the [LinkTransformer](https://github.com/dell-research-harvard/linktransformer) package's inference functions.
+We have provided a few datasets in the package for you to try out. We plan to host more datasets on Hugging Face and on our website (coming soon) that you can take a look at.
+
+## Training
+The model was trained with the parameters:
+
+**DataLoader**:
+
+`torch.utils.data.dataloader.DataLoader` of length 10 with parameters:
+```
+{'batch_size': 64, 'sampler': 'torch.utils.data.dataloader._InfiniteConstantSampler', 'batch_sampler': 'torch.utils.data.sampler.BatchSampler'}
+```
+
+**Loss**:
+
+`linktransformer.modified_sbert.losses.SupConLoss_wandb`
+
+Parameters of the fit()-method:
+```
+{
+    "epochs": 1,
+    "evaluation_steps": 5,
+    "evaluator": "sentence_transformers.evaluation.SequentialEvaluator.SequentialEvaluator",
+    "max_grad_norm": 1,
+    "optimizer_class": "<class 'torch.optim.adamw.AdamW'>",
+    "optimizer_params": {
+        "lr": 2e-05
+    },
+    "scheduler": "WarmupLinear",
+    "steps_per_epoch": null,
+    "warmup_steps": 10,
+    "weight_decay": 0.01
+}
+```
+
+```
+LinkTransformer(
+  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: LukeModel
+  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
+)
+```
 
 ## Citing & Authors
 
@@ -81,5 +141,6 @@ Evaluation is typically based on metrics like accuracy, F1-score, precision, and
   eprint={2309.00789},
   archivePrefix={arXiv},
   primaryClass={cs.CL}
-}
+}
+
 ```
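The updated README mentions deduplication and fine-to-coarse aggregation without showing code. A rough sketch of what deduplication could look like with this checkpoint; the function name `dedup_rows` and its arguments are assumptions to be checked against the LinkTransformer docs:

```python
import linktransformer as lt
import pandas as pd

df = pd.read_csv("data/df1.csv")

# Cluster rows of a single dataframe on the supplied key column and keep one
# representative per cluster. NOTE: the call shape below is an assumption;
# verify it against the installed linktransformer version.
df_deduped = lt.dedup_rows(
    df,
    model="dell-research-harvard/linktransformer-models-test",
    on="CompanyName",
    cluster_type="agglomerative",
    cluster_params={"threshold": 0.7},
)
```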
added_tokens.json ADDED
@@ -0,0 +1,4 @@
+{
+  "<ent2>": 32771,
+  "<ent>": 32770
+}
config.json CHANGED
@@ -1,38 +1,32 @@
 {
-  "_name_or_path": "…",
+  "_name_or_path": "/mnt/122a7683-fa4b-45dd-9f13-b18cc4f4a187/deeprecordlinkage/linktransformer/models/check",
   "architectures": [
-    "…"
+    "LukeModel"
   ],
   "attention_probs_dropout_prob": 0.1,
+  "bert_model_name": "models/luke-japanese/hf_xlm_roberta",
   "bos_token_id": 0,
   "classifier_dropout": null,
+  "cls_entity_prediction": false,
+  "entity_emb_size": 256,
+  "entity_vocab_size": 4,
   "eos_token_id": 2,
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
   "hidden_size": 768,
-  "id2label": {
-    "0": "LABEL_0",
-    "1": "LABEL_1",
-    "2": "LABEL_2"
-  },
   "initializer_range": 0.02,
   "intermediate_size": 3072,
-  "label2id": {
-    "LABEL_0": 0,
-    "LABEL_1": 1,
-    "LABEL_2": 2
-  },
   "layer_norm_eps": 1e-05,
   "max_position_embeddings": 514,
-  "model_type": "roberta",
+  "model_type": "luke",
   "num_attention_heads": 12,
-  "num_hidden_layers": …,
+  "num_hidden_layers": 12,
   "pad_token_id": 1,
   "position_embedding_type": "absolute",
-  "problem_type": "single_label_classification",
   "torch_dtype": "float32",
-  "transformers_version": "4.…",
+  "transformers_version": "4.35.1",
   "type_vocab_size": 1,
   "use_cache": true,
-  …
+  "use_entity_aware_attention": true,
+  "vocab_size": 32772
 }
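This change turns the checkpoint from a RoBERTa sequence classifier into a plain LukeModel encoder with an extended word vocabulary (32,772 entries, including `<ent>` and `<ent2>`) and a small entity-embedding table (4 entries). A minimal loading sketch with vanilla transformers, assuming the repo id shown in the README heading:

```python
from transformers import AutoModel, AutoTokenizer

repo = "dell-research-harvard/linktransformer-models-test"  # repo id per the README
tokenizer = AutoTokenizer.from_pretrained(repo)  # resolves to MLukeTokenizer (see tokenizer_config.json)
model = AutoModel.from_pretrained(repo)          # resolves to LukeModel per "architectures"

print(model.config.model_type)  # "luke"
print(model.config.vocab_size)  # 32772
```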
config_sentence_transformers.json CHANGED
@@ -1,7 +1,7 @@
 {
   "__version__": {
-    "sentence_transformers": "2.…",
-    "transformers": "4.…",
-    "pytorch": "1.…"
+    "sentence_transformers": "2.2.2",
+    "transformers": "4.25.1",
+    "pytorch": "1.13.0+cu116"
   }
 }
entity_vocab.json ADDED
@@ -0,0 +1,6 @@
+{
+  "[MASK2]": 3,
+  "[MASK]": 0,
+  "[PAD]": 2,
+  "[UNK]": 1
+}
merges.txt DELETED
The diff for this file is too large to render. See raw diff.
pytorch_model.bin → model.safetensors RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:…
-size …
+oid sha256:ffff8b1e1118917383c8481ca5631e9b5a05616039a6f0039ad5d0fae975d7ed
+size 532299592
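The rename swaps the pickle-based pytorch_model.bin for the safetensors format; the weights themselves are simply the new checkpoint's. A quick inspection sketch (assumes the safetensors package is installed):

```python
from safetensors.torch import load_file

# Unlike torch.load on a .bin file, this executes no arbitrary pickle code
# and memory-maps tensors lazily.
state_dict = load_file("model.safetensors")
n_params = sum(t.numel() for t in state_dict.values())
print(f"{len(state_dict)} tensors, {n_params:,} parameters")
```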
sentence_bert_config.json CHANGED
@@ -1,4 +1,4 @@
 {
-  "max_seq_length": …,
+  "max_seq_length": 128,
   "do_lower_case": false
 }
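With max_seq_length now pinned to 128, the checkpoint behaves as follows when loaded directly through sentence-transformers (a sketch; the repo id is taken from the README, and the Japanese strings are illustrative):

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("dell-research-harvard/linktransformer-models-test")
print(model.max_seq_length)  # 128, per sentence_bert_config.json

# 768-dimensional embeddings; inputs beyond 128 tokens are truncated.
embeddings = model.encode(["株式会社サンプル", "サンプル株式会社"])
print(embeddings.shape)  # (2, 768)
```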
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d8b73a5e054936c920cf5b7d1ec21ce9c281977078269963beb821c6c86fbff7
+size 841889
special_tokens_map.json CHANGED
@@ -1,15 +1,75 @@
 {
-  …
-  …
-  …
+  "additional_special_tokens": [
+    "<ent>",
+    "<ent2>",
+    "<ent>",
+    "<ent2>",
+    "<ent>",
+    "<ent2>",
+    "<ent>",
+    "<ent2>",
+    {
+      "content": "<ent>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<ent2>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false
+    }
+  ],
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
   "mask_token": {
     "content": "<mask>",
     "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
-  …
-  …
-  …
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
 }
tokenizer.json DELETED
The diff for this file is too large to render. See raw diff.
tokenizer_config.json CHANGED
@@ -1,15 +1,108 @@
 {
-  …
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32769": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32770": {
+      "content": "<ent>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32771": {
+      "content": "<ent2>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<ent>",
+    "<ent2>",
+    "<ent>",
+    "<ent2>",
+    "<ent>",
+    "<ent2>",
+    "<ent>",
+    "<ent2>",
+    "<ent>",
+    "<ent2>"
+  ],
   "bos_token": "<s>",
   "clean_up_tokenization_spaces": true,
   "cls_token": "<s>",
+  "entity_mask2_token": "[MASK2]",
+  "entity_mask_token": "[MASK]",
+  "entity_pad_token": "[PAD]",
+  "entity_token_1": {
+    "__type": "AddedToken",
+    "content": "<ent>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false,
+    "special": false
+  },
+  "entity_token_2": {
+    "__type": "AddedToken",
+    "content": "<ent2>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false,
+    "special": false
+  },
+  "entity_unk_token": "[UNK]",
   "eos_token": "</s>",
-  "errors": "replace",
   "mask_token": "<mask>",
+  "max_entity_length": 32,
+  "max_mention_length": 30,
   "model_max_length": 512,
   "pad_token": "<pad>",
   "sep_token": "</s>",
-  …
-  …
+  "sp_model_kwargs": {},
+  "task": null,
+  "tokenizer_class": "MLukeTokenizer",
   "unk_token": "<unk>"
 }
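After this change the tokenizer loads as an MLukeTokenizer backed by sentencepiece.bpe.model plus the entity vocabulary. A small sanity-check sketch (token ids are taken from added_tokens.json above; the repo id from the README):

```python
from transformers import MLukeTokenizer

tokenizer = MLukeTokenizer.from_pretrained("dell-research-harvard/linktransformer-models-test")
print(tokenizer.convert_tokens_to_ids(["<ent>", "<ent2>"]))  # [32770, 32771]
print(tokenizer.model_max_length)  # 512
```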
vocab.json DELETED
The diff for this file is too large to render. See raw diff.

vocab.txt DELETED
The diff for this file is too large to render. See raw diff.