Update README.md
README.md (changed):

---
license: apache-2.0
language:
- vi
pipeline_tag: token-classification

## How to use this model
There are just a few steps:
- Step 1: Load the model as a token classification model (`AutoModelForTokenClassification`).
- Step 2: Run the input through the model to obtain the tag index for each input token.
- Step 3: Use the tag indices to retrieve the actual tags from the file `selected_tags_names.txt`. Then,
  apply the conversion indicated by the tag to each token to obtain accented tokens.

### Step 1: Load model
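
A minimal sketch of this step, assuming the standard `transformers` loading API; the repo id below is a placeholder, not necessarily this model's actual id:

```python
from transformers import AutoModelForTokenClassification, AutoTokenizer

# placeholder: replace with this model's actual Hugging Face repo id
MODEL_ID = "<this-repo-id>"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)
model.eval()
```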

### Step 3: Obtain the accented words

3.1 Download the tags set file (`selected_tags_names.txt`) from this repo.
Assuming it's placed in the current directory, we can then load it:
```python
def _load_tags_set(fpath):
    labels = []
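    # ... remaining lines (not shown here) presumably read one tag name per line of fpath into labels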

    return labels

label_list = _load_tags_set("./selected_tags_names.txt")
assert len(label_list) == 528, f"Expected 528 tags, got {len(label_list)}"
```
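
Each tag encodes a conversion of the form `<unaccented>-<accented>`, and tag 0 (`-`) leaves a token unchanged. As an optional sanity check, a few entries can be inspected directly; the indices used here are simply taken from the example output further below:

```python
# assumed format "<unaccented>-<accented>"; indices 217 and 388 come from the example below
print(label_list[217], label_list[388], label_list[0])  # expected roughly: i-ì u-ữ -
```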

Printing the tokens and the predicted tags (`tokens` and `predictions` come from running the model in Step 2):

```python
print(tokens)
print(list(f"{pred} ({label_list[pred]})" for pred in predictions))
```

Obtained:
```python
['▁Nhi', 'n', '▁nhu', 'ng', '▁mua', '▁thu', '▁di', ',', '▁em', '▁nghe', '▁sau', '▁len', '▁trong', '▁nang', '.']
['217 (i-ì)', '217 (i-ì)', '388 (u-ữ)', '388 (u-ữ)', '407 (ua-ùa)', '378 (u-u)', '120 (di-đi)', '0 (-)', '185 (e-e)', '185 (e-e)', '41 (au-âu)', '188 (e-ê)', '302 (o-o)', '14 (a-ắ)', '0 (-)']
```

We can see here that our original words have been further split into smaller tokens by the model. But we know that the first token of each word starts with the special character "▁".

Here, we'd need to merge these tokens (and, similarly, the corresponding tags) back into our original Vietnamese words.
Then, for each word, we'd apply the first tag (if it's associated with more than one tag) that changes the word.

This can be done as follows:

```python
TOKENIZER_WORD_PREFIX = "▁"

def merge_tokens_and_preds(tokens, predictions):
    merged_tokens_preds = []
    i = 0
    while i < len(tokens):
        tok = tokens[i]
        label_indexes = set([predictions[i]])
        if tok.startswith(TOKENIZER_WORD_PREFIX):  # start a new word
            tok_no_prefix = tok[len(TOKENIZER_WORD_PREFIX):]
            cur_word_toks = [tok_no_prefix]
            # check if subsequent toks are part of this word
            j = i + 1
            while j < len(tokens):
                if not tokens[j].startswith(TOKENIZER_WORD_PREFIX):
                    cur_word_toks.append(tokens[j])
                    label_indexes.add(predictions[j])
                    j += 1
                else:
                    break
            cur_word = ''.join(cur_word_toks)
            merged_tokens_preds.append((cur_word, label_indexes))
            i = j
        else:
            merged_tokens_preds.append((tok, label_indexes))
            i += 1

    return merged_tokens_preds


merged_tokens_preds = merge_tokens_and_preds(tokens, predictions)
print(merged_tokens_preds)
```

Obtained:
```python
[('Nhin', {217}), ('nhung', {388}), ('mua', {407}), ('thu', {378}), ('di,', {120, 0}), ('em', {185}), ('nghe', {185}), ('sau', {41}), ('len', {188}), ('trong', {302}), ('nang.', {0, 14})]
```

Then our final part:

```python
def get_accented_words(merged_tokens_preds, label_list):
    accented_words = []
    for word_raw, label_indexes in merged_tokens_preds:
        # use the first label that changes word_raw
        for label_index in label_indexes:
            tag_name = label_list[int(label_index)]
            raw, vowel = tag_name.split("-")
            if raw and raw in word_raw:
                word_accented = word_raw.replace(raw, vowel)
                break
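        # this `else` belongs to the `for` loop above: it runs only if no tag changed the word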
        else:
            word_accented = word_raw

        accented_words.append(word_accented)

    return accented_words


accented_words = get_accented_words(merged_tokens_preds, label_list)
print(accented_words)
```

Obtained:
```python
['Nhìn', 'những', 'mùa', 'thu', 'đi,', 'em', 'nghe', 'sâu', 'lên', 'trong', 'nắng.']
```

In this example, the model made one mistake on the word "sầu": it predicted "sâu" instead.
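
For convenience, the steps above can be wrapped into a single helper. This is only a sketch: it assumes the `insert_accents(text, model, tokenizer)` call from the earlier steps is in scope, and it simply joins the resulting words with spaces:

```python
def add_accents(text, model, tokenizer, label_list):
    # run the model on the raw text (insert_accents comes from the earlier steps)
    tokens, predictions = insert_accents(text, model, tokenizer)
    # merge sub-word tokens back into words, then apply the predicted conversions
    merged_tokens_preds = merge_tokens_and_preds(tokens, predictions)
    return " ".join(get_accented_words(merged_tokens_preds, label_list))


print(add_accents("Nhin nhung mua thu di, em nghe sau len trong nang.",
                  model, tokenizer, label_list))
# expected, per the example above: "Nhìn những mùa thu đi, em nghe sâu lên trong nắng."
```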

## Limitations
- This model accepts a maximum of 512 tokens, a limitation inherited from the base pretrained XLM-RoBERTa model (a short truncation sketch follows below this list).
- It has a higher accuracy (97%) than an HMM version (91%).
  More info can be found <a href="https://peterhung.org/tech/insert-vietnamese-accent-transformer-model/#vs-hmm" target="_blank">here</a>.
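
One way to keep an input within that limit is to let the tokenizer truncate it. A minimal sketch, assuming the `tokenizer` loaded in Step 1 and some raw `text`:

```python
# truncate overly long inputs so they fit the model's 512-token limit
inputs = tokenizer(text, truncation=True, max_length=512, return_tensors="pt")
print(inputs["input_ids"].shape)  # at most 512 token ids per sequence
```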

## Live Demo
There is a live demo of this model available <a href="https://ai.vietnameseaccent.com/" target="_blank">here</a>.

This demo is run on CPU, so the speed is quite slow.