Duplicate from sshleifer/distilbart-cnn-12-6

Browse files

Co-authored-by: Sam Shleifer <[email protected]>

Files changed (9) hide show

.gitattributes +9 -0
README.md +29 -0
config.json +75 -0
flax_model.msgpack +3 -0
merges.txt +0 -0
pytorch_model.bin +3 -0
rust_model.ot +3 -0
tokenizer_config.json +1 -0
vocab.json +0 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,9 @@

+*.bin.* filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tar.gz filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,29 @@

+---
+language: en
+tags:
+- summarization
+license: apache-2.0
+datasets:
+- cnn_dailymail
+- xsum
+thumbnail: https://huggingface.co/front/thumbnails/distilbart_medium.png
+---
+### Usage
+This checkpoint should be loaded into `BartForConditionalGeneration.from_pretrained`. See the [BART docs](https://huggingface.co/docs/transformers/model_doc/bart#transformers.BartForConditionalGeneration) for more information.
+### Metrics for DistilBART models
+| Model Name                 |  Params (M) |   Inference Time (ms) |   Speedup |   Rouge-2 |   Rouge-L |
+|:---------------------------|------------:|----------------------:|----------:|----------:|----------:|
+| distilbart-xsum-12-1       |         222 |                    90 |      2.54 |     18.31 |     33.37 |
+| distilbart-xsum-6-6        |         230 |                   132 |      1.73 |     20.92 |     35.73 |
+| distilbart-xsum-12-3       |         255 |                   106 |      2.16 |     21.37 |     36.39 |
+| distilbart-xsum-9-6        |         268 |                   136 |      1.68 |     21.72 |     36.61 |
+| bart-large-xsum (baseline) |         406 |                   229 |      1    |     21.85 |     36.50 |
+| distilbart-xsum-12-6       |         306 |                   137 |      1.68 |     22.12 |     36.99 |
+| bart-large-cnn (baseline)  |         406 |                   381 |      1    |     21.06 |     30.63 |
+| distilbart-cnn-12-3        |         255 |                   214 |      1.78 |     20.57 |     30.00 |
+| distilbart-cnn-12-6        |         306 |                   307 |      1.24 |     21.26 |     30.59 |
+| distilbart-cnn-6-6         |         230 |                   182 |      2.09 |     20.17 |     29.70 |

config.json ADDED Viewed

	@@ -0,0 +1,75 @@

+{
+  "_num_labels": 3,
+  "activation_dropout": 0.0,
+  "activation_function": "gelu",
+  "add_bias_logits": false,
+  "add_final_layer_norm": false,
+  "architectures": [
+    "BartForConditionalGeneration"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "classif_dropout": 0.0,
+  "classifier_dropout": 0.0,
+  "d_model": 1024,
+  "decoder_attention_heads": 16,
+  "decoder_ffn_dim": 4096,
+  "decoder_layerdrop": 0.0,
+  "decoder_layers": 6,
+  "decoder_start_token_id": 2,
+  "dropout": 0.1,
+  "early_stopping": true,
+  "encoder_attention_heads": 16,
+  "encoder_ffn_dim": 4096,
+  "encoder_layerdrop": 0.0,
+  "encoder_layers": 12,
+  "eos_token_id": 2,
+  "extra_pos_embeddings": 2,
+  "force_bos_token_to_be_generated": true,
+  "forced_bos_token_id": 0,
+  "forced_eos_token_id": 2,
+  "gradient_checkpointing": false,
+  "id2label": {
+    "0": "LABEL_0",
+    "1": "LABEL_1",
+    "2": "LABEL_2"
+  },
+  "init_std": 0.02,
+  "is_encoder_decoder": true,
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1,
+    "LABEL_2": 2
+  },
+  "length_penalty": 2.0,
+  "max_length": 142,
+  "max_position_embeddings": 1024,
+  "min_length": 56,
+  "model_type": "bart",
+  "no_repeat_ngram_size": 3,
+  "normalize_before": false,
+  "normalize_embedding": true,
+  "num_beams": 4,
+  "num_hidden_layers": 12,
+  "output_past": true,
+  "pad_token_id": 1,
+  "prefix": " ",
+  "replacing_rate": 0,
+  "scale_embedding": false,
+  "static_position_embeddings": false,
+  "student_decoder_layers": null,
+  "student_encoder_layers": null,
+  "task_specific_params": {
+    "summarization": {
+      "early_stopping": true,
+      "length_penalty": 2.0,
+      "max_length": 142,
+      "min_length": 56,
+      "no_repeat_ngram_size": 3,
+      "num_beams": 4
+    }
+  },
+  "transformers_version": "4.7.0.dev0",
+  "use_cache": true,
+  "vocab_size": 50264
+}

flax_model.msgpack ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2e850d264574dac2076ae01ce78afe398ac02ac4b68e144feb9ca108bb5851c0
+size 1222255172

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3bac65d18c99463302d12ca75c2220ea714f9c81ce235f205fa818efe71df6ea
+size 1222317369

rust_model.ot ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8e589ff34942ff07948bbce579cf701cc19e1bfe370f4e4afaf24484ca5d2a2b
+size 1634092538

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"model_max_length": 1024}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff