Initial commit
Browse files- README.md +110 -0
- config.json +43 -0
- pytorch_model.bin +3 -0
- source.spm +0 -0
- special_tokens_map.json +1 -0
- target.spm +0 -0
- tokenizer_config.json +1 -0
- vocab.json +0 -0
README.md
ADDED
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
language:
|
3 |
+
- gmw
|
4 |
+
- gmw
|
5 |
+
|
6 |
+
tags:
|
7 |
+
- translation
|
8 |
+
|
9 |
+
license: cc-by-4.0
|
10 |
+
---
|
11 |
+
# opus-mt-tc-base-gmw-gmw
|
12 |
+
|
13 |
+
Neural machine translation model for translating from West Germanic languages to West Germanic languages.
|
14 |
+
|
15 |
+
This model is part of the [OPUS-MT project](https://github.com/Helsinki-NLP/Opus-MT), an effort to make neural machine translation models widely available and accessible for many languages in the world. All models are originally trained using the amazing framework of [Marian NMT](https://marian-nmt.github.io/), an efficient NMT implementation written in pure C++. The models have been converted to PyTorch using the transformers library by Hugging Face. Training data is taken from [OPUS](https://opus.nlpl.eu/) and training pipelines use the procedures of [OPUS-MT-train](https://github.com/Helsinki-NLP/Opus-MT-train).
|
16 |
+
|
17 |
+
* Publications: [OPUS-MT – Building open translation services for the World](https://aclanthology.org/2020.eamt-1.61/) , [The Tatoeba Translation Challenge – Realistic Data Sets for Low Resource and Multilingual MT](https://aclanthology.org/2020.wmt-1.139/)
|
18 |
+
|
19 |
+
## Model info
|
20 |
+
|
21 |
+
* Release: 2021-02-23
|
22 |
+
* source language(s): afr deu eng fry gos hrx ltz nds nld pdc yid
|
23 |
+
* target language(s): afr deu eng fry nds nld
|
24 |
+
* valid target language labels: >>afr<< >>ang_Latn<< >>deu<< >>eng<< >>fry<< >>ltz<< >>nds<< >>nld<< >>sco<< >>yid<<
|
25 |
+
* model: transformer
|
26 |
+
* data: opus
|
27 |
+
* tokenization: SentencePiece (spm32k,spm32k)
|
28 |
+
* original model: [opus-2021-02-23.zip](https://object.pouta.csc.fi/Tatoeba-MT-models/gmw-gmw/opus-2021-02-23.zip)
|
29 |
+
|
30 |
+
This is a multilingual translation model with multiple target languages. A sentence initial language token is required in the form of `>>id<<` (id = valid target language ID), e.g. `>>afr<<`
|
31 |
+
|
32 |
+
## Usage
|
33 |
+
|
34 |
+
You can use OPUS-MT models with the transformers pipelines, for example:
|
35 |
+
|
36 |
+
```python
|
37 |
+
from transformers import pipeline
|
38 |
+
pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-tc-base-gmw-gmw")
|
39 |
+
print(pipe(">>afr<< Replace this with text in an accepted source language."))
|
40 |
+
```
|
41 |
+
|
42 |
+
## Benchmarks
|
43 |
+
|
44 |
+
| langpair | testset | BLEU | chr-F | #sent | #words | BP |
|
45 |
+
|----------|---------|-------|-------|-------|--------|----|
|
46 |
+
| afr-deu | Tatoeba-test | 48.5 | 0.677 | 1583 | 9105 | 1.000 |
|
47 |
+
| afr-eng | Tatoeba-test | 58.7 | 0.727 | 1374 | 9622 | 0.995 |
|
48 |
+
| afr-nld | Tatoeba-test | 54.7 | 0.713 | 1056 | 6710 | 0.989 |
|
49 |
+
| deu-afr | Tatoeba-test | 52.4 | 0.697 | 1583 | 9507 | 1.000 |
|
50 |
+
| deu-eng | newssyscomb2009 | 25.4 | 0.527 | 502 | 11821 | 0.986 |
|
51 |
+
| deu-eng | news-test2008 | 23.9 | 0.519 | 2051 | 49380 | 0.992 |
|
52 |
+
| deu-eng | newstest2009 | 23.5 | 0.517 | 2525 | 65402 | 0.978 |
|
53 |
+
| deu-eng | newstest2010 | 26.1 | 0.548 | 2489 | 61724 | 1.000 |
|
54 |
+
| deu-eng | newstest2011 | 23.9 | 0.525 | 3003 | 74681 | 1.000 |
|
55 |
+
| deu-eng | newstest2012 | 25.0 | 0.533 | 3003 | 72812 | 1.000 |
|
56 |
+
| deu-eng | newstest2013 | 27.7 | 0.549 | 3000 | 64505 | 1.000 |
|
57 |
+
| deu-eng | newstest2014-deen | 27.4 | 0.549 | 3003 | 67337 | 0.977 |
|
58 |
+
| deu-eng | newstest2015-ende | 28.8 | 0.554 | 2169 | 46443 | 0.973 |
|
59 |
+
| deu-eng | newstest2016-ende | 33.7 | 0.598 | 2999 | 64126 | 1.000 |
|
60 |
+
| deu-eng | newstest2017-ende | 29.6 | 0.562 | 3004 | 64399 | 0.979 |
|
61 |
+
| deu-eng | newstest2018-ende | 36.3 | 0.611 | 2998 | 67013 | 0.977 |
|
62 |
+
| deu-eng | newstest2019-deen | 32.7 | 0.585 | 2000 | 39282 | 0.984 |
|
63 |
+
| deu-eng | Tatoeba-test | 44.7 | 0.629 | 10000 | 81233 | 0.975 |
|
64 |
+
| deu-nds | Tatoeba-test | 18.7 | 0.444 | 10000 | 76144 | 0.988 |
|
65 |
+
| deu-nld | Tatoeba-test | 48.7 | 0.672 | 10000 | 73546 | 0.969 |
|
66 |
+
| eng-afr | Tatoeba-test | 56.5 | 0.735 | 1374 | 10317 | 0.984 |
|
67 |
+
| eng-deu | newssyscomb2009 | 19.4 | 0.503 | 502 | 11271 | 0.991 |
|
68 |
+
| eng-deu | news-test2008 | 19.5 | 0.493 | 2051 | 47427 | 0.996 |
|
69 |
+
| eng-deu | newstest2009 | 18.8 | 0.499 | 2525 | 62816 | 0.993 |
|
70 |
+
| eng-deu | newstest2010 | 20.8 | 0.509 | 2489 | 61511 | 0.958 |
|
71 |
+
| eng-deu | newstest2011 | 19.2 | 0.493 | 3003 | 72981 | 0.980 |
|
72 |
+
| eng-deu | newstest2012 | 19.6 | 0.494 | 3003 | 72886 | 0.960 |
|
73 |
+
| eng-deu | newstest2013 | 22.8 | 0.518 | 3000 | 63737 | 0.974 |
|
74 |
+
| eng-deu | newstest2015-ende | 25.8 | 0.545 | 2169 | 44260 | 1.000 |
|
75 |
+
| eng-deu | newstest2016-ende | 30.3 | 0.581 | 2999 | 62670 | 0.989 |
|
76 |
+
| eng-deu | newstest2017-ende | 24.2 | 0.537 | 3004 | 61291 | 1.000 |
|
77 |
+
| eng-deu | newstest2018-ende | 35.5 | 0.616 | 2998 | 64276 | 1.000 |
|
78 |
+
| eng-deu | newstest2019-ende | 31.6 | 0.586 | 1997 | 48969 | 0.973 |
|
79 |
+
| eng-deu | Tatoeba-test | 37.8 | 0.591 | 10000 | 83347 | 0.991 |
|
80 |
+
| eng-nds | Tatoeba-test | 16.5 | 0.411 | 2500 | 18264 | 0.992 |
|
81 |
+
| eng-nld | Tatoeba-test | 50.3 | 0.677 | 10000 | 71436 | 0.979 |
|
82 |
+
| fry-deu | Tatoeba-test | 28.7 | 0.545 | 66 | 432 | 1.000 |
|
83 |
+
| fry-eng | Tatoeba-test | 31.9 | 0.496 | 205 | 1500 | 1.000 |
|
84 |
+
| fry-nld | Tatoeba-test | 43.0 | 0.634 | 233 | 1672 | 1.000 |
|
85 |
+
| gos-nld | Tatoeba-test | 15.9 | 0.409 | 1852 | 9903 | 0.959 |
|
86 |
+
| hrx-deu | Tatoeba-test | 24.7 | 0.487 | 471 | 2805 | 0.984 |
|
87 |
+
| ltz-deu | Tatoeba-test | 36.6 | 0.552 | 337 | 2144 | 1.000 |
|
88 |
+
| ltz-eng | Tatoeba-test | 31.4 | 0.477 | 283 | 1751 | 1.000 |
|
89 |
+
| ltz-nld | Tatoeba-test | 37.5 | 0.523 | 273 | 1567 | 1.000 |
|
90 |
+
| multi-multi | Tatoeba-test | 37.1 | 0.569 | 10000 | 73153 | 1.000 |
|
91 |
+
| nds-deu | Tatoeba-test | 34.5 | 0.572 | 10000 | 74571 | 1.000 |
|
92 |
+
| nds-eng | Tatoeba-test | 29.6 | 0.492 | 2500 | 17589 | 1.000 |
|
93 |
+
| nds-nld | Tatoeba-test | 42.2 | 0.621 | 1657 | 11490 | 0.994 |
|
94 |
+
| nld-afr | Tatoeba-test | 59.0 | 0.756 | 1056 | 6823 | 1.000 |
|
95 |
+
| nld-deu | Tatoeba-test | 50.6 | 0.688 | 10000 | 72438 | 1.000 |
|
96 |
+
| nld-eng | Tatoeba-test | 54.5 | 0.702 | 10000 | 69848 | 0.975 |
|
97 |
+
| nld-fry | Tatoeba-test | 23.3 | 0.462 | 233 | 1679 | 1.000 |
|
98 |
+
| nld-nds | Tatoeba-test | 21.7 | 0.462 | 1657 | 11711 | 0.998 |
|
99 |
+
| pdc-eng | Tatoeba-test | 24.3 | 0.402 | 53 | 399 | 1.000 |
|
100 |
+
| yid-nld | Tatoeba-test | 21.3 | 0.402 | 55 | 323 | 1.000 |
|
101 |
+
|
102 |
+
* test set translations: [opus-2021-02-23.test.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/gmw-gmw/opus-2021-02-23.test.txt)
|
103 |
+
* test set scores: [opus-2021-02-23.eval.txt](https://object.pouta.csc.fi/Tatoeba-MT-models/gmw-gmw/opus-2021-02-23.eval.txt)
|
104 |
+
|
105 |
+
## Model conversion info
|
106 |
+
|
107 |
+
* transformers version: 4.12.3
|
108 |
+
* OPUS-MT git hash: fc19512
|
109 |
+
* port time: Thu Jan 27 18:04:00 EET 2022
|
110 |
+
* port machine: LM0-400-22516.local
|
config.json
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"activation_dropout": 0.0,
|
3 |
+
"activation_function": "swish",
|
4 |
+
"architectures": [
|
5 |
+
"MarianMTModel"
|
6 |
+
],
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bad_words_ids": [
|
9 |
+
[
|
10 |
+
35451
|
11 |
+
]
|
12 |
+
],
|
13 |
+
"bos_token_id": 0,
|
14 |
+
"classifier_dropout": 0.0,
|
15 |
+
"d_model": 512,
|
16 |
+
"decoder_attention_heads": 8,
|
17 |
+
"decoder_ffn_dim": 2048,
|
18 |
+
"decoder_layerdrop": 0.0,
|
19 |
+
"decoder_layers": 6,
|
20 |
+
"decoder_start_token_id": 35451,
|
21 |
+
"dropout": 0.1,
|
22 |
+
"encoder_attention_heads": 8,
|
23 |
+
"encoder_ffn_dim": 2048,
|
24 |
+
"encoder_layerdrop": 0.0,
|
25 |
+
"encoder_layers": 6,
|
26 |
+
"eos_token_id": 0,
|
27 |
+
"forced_eos_token_id": 0,
|
28 |
+
"init_std": 0.02,
|
29 |
+
"is_encoder_decoder": true,
|
30 |
+
"max_length": 512,
|
31 |
+
"max_position_embeddings": 512,
|
32 |
+
"model_type": "marian",
|
33 |
+
"normalize_embedding": false,
|
34 |
+
"num_beams": 6,
|
35 |
+
"num_hidden_layers": 6,
|
36 |
+
"pad_token_id": 35451,
|
37 |
+
"scale_embedding": true,
|
38 |
+
"static_position_embeddings": true,
|
39 |
+
"torch_dtype": "float16",
|
40 |
+
"transformers_version": "4.12.3",
|
41 |
+
"use_cache": true,
|
42 |
+
"vocab_size": 35452
|
43 |
+
}
|
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a4586f60a829f1bd8331e4f877b3ddf493c333c821afe8bac9b84847681df2c7
|
3 |
+
size 161034627
|
source.spm
ADDED
Binary file (802 kB). View file
|
|
special_tokens_map.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
|
target.spm
ADDED
Binary file (802 kB). View file
|
|
tokenizer_config.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"source_lang": "gmw", "target_lang": "gmw", "unk_token": "<unk>", "eos_token": "</s>", "pad_token": "<pad>", "model_max_length": 512, "sp_model_kwargs": {}, "special_tokens_map_file": null, "tokenizer_file": null, "name_or_path": "marian-models/opus-2021-02-23/gmw-gmw", "tokenizer_class": "MarianTokenizer"}
|
vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|