p208p2002 committed on
Commit
8369309
1 Parent(s): 28b995a
README.md ADDED
@@ -0,0 +1,73 @@
+ # Transformer QG on DRCD
+ The input of the model takes the following form:
+ ```
+ We integrate the context C and the answer A into a new sequence C' of the following form:
+ C' = [c1, c2, ..., [HL], a1, ..., a|A|, [HL], ..., c|C|]
+ ```
+ > Proposed by [Ying-Hong Chan & Yao-Chung Fan. (2019). A Recurrent BERT-based Model for Question Generation.](https://www.aclweb.org/anthology/D19-5821/)
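+
+ As a minimal sketch (not part of the original pipeline), the highlighted input can be built by wrapping the answer span with the `[HL]` token; the helper name and the simple substring search are assumptions for illustration:
+ ```python
+ # Sketch: build the highlighted context C' described above.
+ # Assumes the answer occurs verbatim in the context and [HL] is a literal token.
+ def build_highlighted_context(context: str, answer: str, hl: str = "[HL]") -> str:
+     start = context.find(answer)
+     if start == -1:
+         raise ValueError("answer not found in context")
+     end = start + len(answer)
+     return context[:start] + hl + answer + hl + context[end:]
+
+ print(build_highlighted_context("哈利·波特是英國作家羅琳撰寫的七部幻想小說系列", "羅琳"))
+ # 哈利·波特是英國作家[HL]羅琳[HL]撰寫的七部幻想小說系列
+ ```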
+
+ ## Features
+ - Full pipeline from fine-tuning to evaluation
+ - Supports most state-of-the-art models
+ - Fast deployment as an API server
+
+ ## DRCD dataset
+ The [台達閱讀理解資料集 Delta Reading Comprehension Dataset (DRCD)](https://github.com/DRCKnowledgeTeam/DRCD) is a general-domain Traditional Chinese machine reading comprehension dataset. It collects 10,014 paragraphs from 2,108 Wikipedia articles and annotates more than 30,000 questions over those paragraphs.
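+
+ For orientation, DRCD is distributed in SQuAD-style JSON; the record below is a simplified, hypothetical illustration rather than an actual entry (the real files use the nested `data`/`paragraphs`/`qas` schema):
+ ```json
+ {
+   "context": "哈利·波特是英國作家羅琳撰寫的七部幻想小說系列",
+   "question": "誰撰寫哈利·波特?",
+   "answers": [{"text": "羅琳", "answer_start": 10}]
+ }
+ ```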
+
+ ## Available models
+ - BART (based on **[uer/bart-base-chinese-cluecorpussmall](https://huggingface.co/uer/bart-base-chinese-cluecorpussmall)**); a minimal loading sketch follows
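+
+ The sketch below is an assumption-laden example, not repo code: the checkpoint directory is a placeholder, and per the accompanying config.json the weights load as a `BartForConditionalGeneration` with a `BertTokenizer`:
+ ```python
+ from transformers import BartForConditionalGeneration, BertTokenizer
+
+ # Placeholder path: point at the directory holding config.json and pytorch_model.bin.
+ model_path = "./bart-drcd-qg"
+ tokenizer = BertTokenizer.from_pretrained(model_path)
+ model = BartForConditionalGeneration.from_pretrained(model_path)
+
+ # [HL] marks the answer span, as in the request example further below.
+ context = "哈利·波特是英國作家[HL]羅琳[HL]撰寫的七部幻想小說系列"
+ inputs = tokenizer(context, return_tensors="pt")
+ output_ids = model.generate(inputs["input_ids"], max_length=256)  # max_length per config.json
+ print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
+ ```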
+
+ ## Experiments
+ Model |BLEU 1|BLEU 2|BLEU 3|BLEU 4|METEOR|ROUGE-L|
+ ------------------|------|------|------|------|------|-------|
+ BART-HLSQG |34.25 |27.70 |22.43 |18.13 |23.58 |36.88 |
+
+ ## Environment requirements
+ Development is based on Ubuntu.
+
+ 1. If you don't have PyTorch 1.6+, install or update it first
+ > https://pytorch.org/get-started/locally/
+
+ 2. Install packages: `pip install -r requirements.txt`
+
+ 3. Set up the scorer: `python setup_scorer.py`
+
+ 4. Download the dataset: `python init_dataset.py`
+
+ ## Training
+ ### Seq2Seq LM
+ ```
+ usage: train_seq2seq_lm.py [-h]
+ [--base_model {facebook/bart-base,facebook/bart-large,t5-small,t5-base,t5-large}]
+ [-d {squad,squad-nqg}] [--epoch EPOCH] [--lr LR]
+ [--dev DEV] [--server] [--run_test]
+ [-fc FROM_CHECKPOINT]
+
+ optional arguments:
+ -h, --help show this help message and exit
+ --base_model {facebook/bart-base,facebook/bart-large,t5-small,t5-base,t5-large}
+ -d {squad,squad-nqg}, --dataset {squad,squad-nqg}
+ --epoch EPOCH
+ --lr LR
+ --dev DEV
+ --server
+ --run_test
+ -fc FROM_CHECKPOINT, --from_checkpoint FROM_CHECKPOINT
+ ```
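+
+ For example, an illustrative fine-tuning run (the flag values here are placeholders, using only the options listed above):
+ ```
+ python train_seq2seq_lm.py --base_model facebook/bart-base -d squad --epoch 10 --lr 5e-5
+ ```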
+
+ ## Deploy
+ ### Start up
+ ```
+ python train_seq2seq_lm.py --server --base_model YOUR_BASE_MODEL --from_checkpoint FROM_CHECKPOINT
+ ```
+ ### Request example
+ ```
+ curl --location --request POST 'http://127.0.0.1:5000/' \
+ --header 'Content-Type: application/x-www-form-urlencoded' \
+ --data-urlencode 'context=哈利·波特是英國作家[HL]羅琳[HL]撰寫的七部幻想小說系列'
+ ```
+ ```json
+ {"predict": "誰撰寫哈利·波特?"}
+ ```
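+
+ The same request from Python (a sketch using the `requests` library, mirroring the curl call above):
+ ```python
+ import requests
+
+ # Form-encoded POST, identical to the curl example.
+ resp = requests.post(
+     "http://127.0.0.1:5000/",
+     data={"context": "哈利·波特是英國作家[HL]羅琳[HL]撰寫的七部幻想小說系列"},
+ )
+ print(resp.json())  # e.g. {"predict": "誰撰寫哈利·波特?"}
+ ```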
config.json ADDED
@@ -0,0 +1,49 @@
+ {
+   "_name_or_path": "uer/bart-base-chinese-cluecorpussmall",
+   "activation_dropout": 0.1,
+   "activation_function": "gelu",
+   "architectures": [
+     "BartForConditionalGeneration"
+   ],
+   "attention_dropout": 0.1,
+   "bos_token_id": 0,
+   "classifier_dropout": 0.0,
+   "d_model": 768,
+   "decoder_attention_heads": 12,
+   "decoder_ffn_dim": 3072,
+   "decoder_layerdrop": 0.1,
+   "decoder_layers": 6,
+   "decoder_start_token_id": 101,
+   "dropout": 0.1,
+   "early_stopping": true,
+   "encoder_attention_heads": 12,
+   "encoder_ffn_dim": 3072,
+   "encoder_layerdrop": 0.1,
+   "encoder_layers": 6,
+   "eos_token_id": 0,
+   "forced_eos_token_id": 0,
+   "gradient_checkpointing": false,
+   "id2label": {
+     "0": "LABEL_0",
+     "1": "LABEL_1",
+     "2": "LABEL_2"
+   },
+   "init_std": 0.02,
+   "is_encoder_decoder": true,
+   "label2id": {
+     "LABEL_0": 0,
+     "LABEL_1": 1,
+     "LABEL_2": 2
+   },
+   "max_length": 256,
+   "max_position_embeddings": 1024,
+   "model_type": "bart",
+   "num_hidden_layers": 6,
+   "pad_token_id": 0,
+   "scale_embedding": false,
+   "tie_word_embeddings": 0,
+   "tokenizer_class": "BertTokenizer",
+   "transformers_version": "4.5.0",
+   "use_cache": true,
+   "vocab_size": 21128
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:66ed0b84b7e07533abd9a7323173682f62ae1c24c9c218a1020159aa23790b9f
+ size 533263126
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"do_lower_case": false, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "do_basic_tokenize": true, "never_split": null, "model_max_length": 128, "special_tokens_map_file": "gpt2-base-chinese-cluecorpussmall\\special_tokens_map.json", "name_or_path": "uer/bart-base-chinese-cluecorpussmall"}
vocab.txt ADDED
The diff for this file is too large to render.