Initial commit
- .gitattributes +1 -0
- README.md +71 -0
- all_results.json +17 -0
- config.json +51 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +15 -0
- tokenizer.json +3 -0
- tokenizer_config.json +19 -0
- training_args.bin +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,3 +1,74 @@
---
language: es
license: gpl-3.0
tags:
- PyTorch
- Transformers
- Token Classification
- xlm-roberta
- xlm-roberta-large
widget:
- text: "Fue antes de llegar a Sigüeiro, en el Camino de Santiago."
- text: "Si te metes en el Franco desde la Alameda, vas hacia la Catedral."
- text: "Y allí precisamente es Santiago el patrón del pueblo."
model-index:
- name: es_trf_ner_cds_xlm-large
  results: []
---

# Introduction

This model is a fine-tuned version of [xlm-roberta-large](https://huggingface.co/xlm-roberta-large) for Named-Entity Recognition in the tourism domain of the Way of Saint James (the Camino de Santiago). It recognizes four entity types: locations (LOC), organizations (ORG), persons (PER), and miscellaneous (MISC).

## Usage

You can use this model for NER with the Transformers *pipeline*.

```python
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

tokenizer = AutoTokenizer.from_pretrained("es_trf_ner_cds_xlm-large")
model = AutoModelForTokenClassification.from_pretrained("es_trf_ner_cds_xlm-large")

example = "Fue antes de llegar a Sigüeiro, en el Camino de Santiago. Si te metes en el Franco desde la Alameda, vas hacia la Catedral. Y allí precisamente es Santiago el patrón del pueblo."

# "simple" aggregation merges sub-word pieces back into whole-entity spans
ner_pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

for ent in ner_pipe(example):
    print(ent)
```
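
With `aggregation_strategy="simple"`, each printed entry is a dict with `entity_group`, `score`, `word`, `start`, and `end` keys, so a place name like *Sigüeiro* should come back as a single LOC span rather than as separate sub-word pieces.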

## Dataset

ToDo

## Model performance

entity|precision|recall|f1
-|-|-|-
LOC|0.973|0.983|0.978
MISC|0.760|0.788|0.773
ORG|0.885|0.701|0.783
PER|0.937|0.878|0.906
micro avg|0.953|0.958|0.955
macro avg|0.889|0.838|0.860
weighted avg|0.953|0.958|0.955
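
The table follows the layout of a per-entity classification report as produced by a scorer such as seqeval (an assumption on my part; the card does not say which scorer was used). A minimal sketch of how such numbers can be computed, with made-up gold and predicted IOB2 tag sequences:

```python
# Hedged sketch: per-entity NER metrics with seqeval (pip install seqeval).
# The tag sequences below are hypothetical, not from this model's eval set.
from seqeval.metrics import classification_report

# One list of IOB2 tags per sentence; y_true is gold, y_pred is model output.
y_true = [["B-LOC", "I-LOC", "O", "B-PER", "O"]]
y_pred = [["B-LOC", "I-LOC", "O", "B-PER", "O"]]

# Prints precision/recall/F1 per entity type plus micro/macro/weighted averages.
print(classification_report(y_true, y_pred, digits=3))
```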

## Training procedure

### Training hyperparameters

The following hyperparameters were used during training (a hedged `TrainingArguments` sketch follows the list):
- learning_rate: 5e-05
- train_batch_size: 32
- eval_batch_size: 8
- seed: 42
- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
- lr_scheduler_type: linear
- num_epochs: 3.0
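
As referenced above, here is a minimal sketch of a `TrainingArguments` configuration mirroring those values. It is an illustration, not the repo's actual training script (that lives in the binary `training_args.bin`): the `output_dir` is a placeholder, and mapping `train_batch_size` to `per_device_train_batch_size` assumes single-device training.

```python
# Hedged sketch only: mirrors the hyperparameters listed above.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="es_trf_ner_cds_xlm-large",  # hypothetical output path
    learning_rate=5e-5,
    per_device_train_batch_size=32,  # assumes a single training device
    per_device_eval_batch_size=8,
    seed=42,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    lr_scheduler_type="linear",
    num_train_epochs=3.0,
)
```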

### Framework versions

- Transformers 4.28.1
- Pytorch 2.0.1+cu117
- Datasets 2.12.0
- Tokenizers 0.13.3
all_results.json
ADDED
@@ -0,0 +1,17 @@
{
    "epoch": 3.0,
    "eval_accuracy": 0.9979369961144651,
    "eval_f1": 0.9566217926590725,
    "eval_loss": 0.009228814393281937,
    "eval_precision": 0.9547542489664677,
    "eval_recall": 0.9584966566751211,
    "eval_runtime": 38.1835,
    "eval_samples": 15178,
    "eval_samples_per_second": 397.502,
    "eval_steps_per_second": 49.707,
    "train_loss": 0.08099352212526335,
    "train_runtime": 1003.7611,
    "train_samples": 45533,
    "train_samples_per_second": 136.087,
    "train_steps_per_second": 4.253
}
config.json
ADDED
@@ -0,0 +1,51 @@
{
    "_name_or_path": "xlm-roberta-large",
    "architectures": [
        "XLMRobertaForTokenClassification"
    ],
    "attention_probs_dropout_prob": 0.1,
    "bos_token_id": 0,
    "classifier_dropout": null,
    "eos_token_id": 2,
    "finetuning_task": "ner",
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 1024,
    "id2label": {
        "0": "B-LOC",
        "1": "B-MISC",
        "2": "B-ORG",
        "3": "B-PER",
        "4": "I-LOC",
        "5": "I-MISC",
        "6": "I-ORG",
        "7": "I-PER",
        "8": "O"
    },
    "initializer_range": 0.02,
    "intermediate_size": 4096,
    "label2id": {
        "B-LOC": 0,
        "B-MISC": 1,
        "B-ORG": 2,
        "B-PER": 3,
        "I-LOC": 4,
        "I-MISC": 5,
        "I-ORG": 6,
        "I-PER": 7,
        "O": 8
    },
    "layer_norm_eps": 1e-05,
    "max_position_embeddings": 514,
    "model_type": "xlm-roberta",
    "num_attention_heads": 16,
    "num_hidden_layers": 24,
    "output_past": true,
    "pad_token_id": 1,
    "position_embedding_type": "absolute",
    "torch_dtype": "float32",
    "transformers_version": "4.28.1",
    "type_vocab_size": 1,
    "use_cache": true,
    "vocab_size": 250002
}
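The `id2label`/`label2id` maps above encode the IOB2 tagging scheme (B-/I- prefixes plus O for non-entity tokens) over the four entity types from the README. A quick hedged way to inspect them, assuming the model id resolves locally or on the Hub:

```python
# Hedged sketch: inspect the label maps defined in the config above.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("es_trf_ner_cds_xlm-large")
print(config.id2label)    # {0: 'B-LOC', 1: 'B-MISC', ..., 8: 'O'}
print(config.num_labels)  # 9
```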
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0c2b10e42f3d811754eebae3495d6a247824a93bbce5707c5ab0b6f198c99725
size 2235539565
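This is a Git LFS pointer rather than the checkpoint itself; the `size` field indicates the actual `pytorch_model.bin` is about 2.2 GB, consistent with xlm-roberta-large weights stored in float32.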
special_tokens_map.json
ADDED
@@ -0,0 +1,15 @@
{
    "bos_token": "<s>",
    "cls_token": "<s>",
    "eos_token": "</s>",
    "mask_token": {
        "content": "<mask>",
        "lstrip": true,
        "normalized": false,
        "rstrip": false,
        "single_word": false
    },
    "pad_token": "<pad>",
    "sep_token": "</s>",
    "unk_token": "<unk>"
}
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1edb0658cb47689db5cf78194ebe041bba3b6b775d1f1069fc9501b372d4acb0
size 17082758
tokenizer_config.json
ADDED
@@ -0,0 +1,19 @@
{
    "bos_token": "<s>",
    "clean_up_tokenization_spaces": true,
    "cls_token": "<s>",
    "eos_token": "</s>",
    "mask_token": {
        "__type": "AddedToken",
        "content": "<mask>",
        "lstrip": true,
        "normalized": true,
        "rstrip": false,
        "single_word": false
    },
    "model_max_length": 512,
    "pad_token": "<pad>",
    "sep_token": "</s>",
    "tokenizer_class": "XLMRobertaTokenizer",
    "unk_token": "<unk>"
}
training_args.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:81ff6af0468d14857cf0bc6096131bad61a715d6c50507564e63a69aa2380138
size 3579