save all

- finetune.sh +4 -1
- run_common_voice.py +3 -3

finetune.sh
CHANGED
@@ -6,7 +6,6 @@ mkdir -p ${model_path}
 python run_common_voice.py \
     --dataloader_num_workers="8" \
     --model_name_or_path="facebook/wav2vec2-large-xlsr-53" \
-    #--overwrite_output_dir \
     --dataset_config_name="ar" \
     --output_dir=${model_path} \
     --num_train_epochs="50" \
@@ -32,6 +31,10 @@ python run_common_voice.py \
     --do_train --do_eval


+#--model_name_or_path="facebook/wav2vec2-large-xlsr-53" \
+#--overwrite_output_dir \
+#--model_name_or_path="/home/othrif/projects/wav2vec2/finetune-xlsr/models/ar/msa/wav2vec2-large-xlsr-arabic" \
+


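Note on the finetune.sh change: parking commented-out flags at the end of the script rather than inline is likely deliberate. Inside a backslash-continued command, a line such as `#--overwrite_output_dir \` is a comment through the end of the line, including the trailing `\`, so the continuation stops there and the remaining `--flag \` lines run as separate, broken commands. Keeping the argument list contiguous and collecting commented alternatives below `--do_train --do_eval` avoids that shell gotcha.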
run_common_voice.py
CHANGED
@@ -319,14 +319,14 @@ def main():
         batch["text"] = re.sub(chars_to_ignore_regex, "", batch["sentence"]).lower() + " "
         return batch

-    train_dataset = train_dataset.map(remove_special_characters, remove_columns=["sentence"])
-    eval_dataset = eval_dataset.map(remove_special_characters, remove_columns=["sentence"])
-
     # For arabic diacritics
     cleander = tn.Tnkeeh(remove_diacritics=True)
     train_dataset = cleander.clean_hf_dataset(train_dataset, 'sentence')
     eval_dataset = cleander.clean_hf_dataset(eval_dataset, 'sentence')

+    train_dataset = train_dataset.map(remove_special_characters, remove_columns=["sentence"])
+    eval_dataset = eval_dataset.map(remove_special_characters, remove_columns=["sentence"])
+
     def extract_all_chars(batch):
         all_text = " ".join(batch["text"])
         vocab = list(set(all_text))