othrif committed
Commit
cb918cb
1 Parent(s): 3e7d7b0
Files changed (2)
  1. finetune.sh +4 -1
  2. run_common_voice.py +3 -3
finetune.sh CHANGED
@@ -6,7 +6,6 @@ mkdir -p ${model_path}
 python run_common_voice.py \
     --dataloader_num_workers="8" \
     --model_name_or_path="facebook/wav2vec2-large-xlsr-53" \
-    #--overwrite_output_dir \
     --dataset_config_name="ar" \
     --output_dir=${model_path} \
     --num_train_epochs="50" \
@@ -32,6 +31,10 @@ python run_common_voice.py \
     --do_train --do_eval
 
 
+    #--model_name_or_path="facebook/wav2vec2-large-xlsr-53" \
+    #--overwrite_output_dir \
+    #--model_name_or_path="/home/othrif/projects/wav2vec2/finetune-xlsr/models/ar/msa/wav2vec2-large-xlsr-arabic" \
+
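Note on the first hunk: the deleted `#--overwrite_output_dir \` sat in the middle of a backslash-continued command. In bash, once a continued line begins with `#`, the rest of that physical line, including its trailing `\`, is a comment, so the continuation ends there and every flag after it is silently dropped from the command. The commit keeps the commented-out alternatives but parks them after the command instead. A minimal reproduction of the pitfall (illustrative `echo` stand-in, not from the repo):

# Broken: the comment line swallows its own trailing backslash,
# so the continuation ends and --b is never passed.
echo "args:" \
    --a \
    #--commented \
    --b
# Prints "args: --a"; the shell then tries to run `--b` as a separate command.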
 
run_common_voice.py CHANGED
@@ -319,14 +319,14 @@ def main():
         batch["text"] = re.sub(chars_to_ignore_regex, "", batch["sentence"]).lower() + " "
         return batch
 
-    train_dataset = train_dataset.map(remove_special_characters, remove_columns=["sentence"])
-    eval_dataset = eval_dataset.map(remove_special_characters, remove_columns=["sentence"])
-
     # For arabic diacritics
     cleander = tn.Tnkeeh(remove_diacritics=True)
     train_dataset = cleander.clean_hf_dataset(train_dataset, 'sentence')
     eval_dataset = cleander.clean_hf_dataset(eval_dataset, 'sentence')
 
+    train_dataset = train_dataset.map(remove_special_characters, remove_columns=["sentence"])
+    eval_dataset = eval_dataset.map(remove_special_characters, remove_columns=["sentence"])
+
     def extract_all_chars(batch):
         all_text = " ".join(batch["text"])
         vocab = list(set(all_text))
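Why the reorder: `remove_special_characters` derives `text` from `sentence`, and its `map(..., remove_columns=["sentence"])` drops the very column that `clean_hf_dataset` reads, so running it first left the diacritics pass nothing to clean. A minimal sketch of the dependency, assuming `tn` is the `tnkeeh` package (pip install tnkeeh); the sample sentence and the regex are stand-ins for illustration:

import re

import tnkeeh as tn
from datasets import Dataset

chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"]'  # stand-in for the script's real pattern

def remove_special_characters(batch):
    # Writes "text"; the map() call below also drops the "sentence" column.
    batch["text"] = re.sub(chars_to_ignore_regex, "", batch["sentence"]).lower() + " "
    return batch

dataset = Dataset.from_dict({"sentence": ["مَرْحَبًا بِالْعَالَمِ!"]})

# 1) Diacritics cleaning first: clean_hf_dataset reads "sentence", which still exists.
cleander = tn.Tnkeeh(remove_diacritics=True)
dataset = cleander.clean_hf_dataset(dataset, 'sentence')

# 2) Only now derive "text" and remove "sentence".
dataset = dataset.map(remove_special_characters, remove_columns=["sentence"])
print(dataset.column_names)  # ['text']; running step 2 first would break step 1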