m3hrdadfi committed
Commit 1809a17
1 Parent(s): 9eca64d

Add normalization steps, fix some bugs, add tfboard tracker

Files changed (4)
  1. .gitattributes +1 -0
  2. README.md +2 -2
  3. src/data_utils.py +3 -7
  4. src/requirements.txt +2 -1
.gitattributes CHANGED
@@ -14,3 +14,4 @@
 *.pb filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -33,9 +33,9 @@ python create_config.py --name_or_path gpt2-medium --params '{"vocab_size": 4200
 
 Steps:
 
-- [ ] Remove stretched words such as ســــــــــلام
+- [x] Remove stretched words such as ســــــــــلام
 
-- [ ] Remove links, user-mentioning (such as @jane_doe)
+- [x] Remove links, user-mentioning (such as @jane_doe)
 
 - [ ] Remove Telegram, Instagram advertisements, or posts (a whole record)
 
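The two steps checked off above are handled by the project's `normalize` routine (imported as `from normalizer import normalize` in `src/data_utils.py`), which this commit does not touch. As a rough illustration only, a regex-based sketch of what those steps involve might look like the following; the helper names and exact patterns are assumptions, not the project's actual code:

```python
import re

# Hypothetical sketch of the two README steps; the real logic lives in the
# repo's normalizer module and may differ.

def remove_stretching(text: str) -> str:
    """Collapse stretched words such as 'ســــــــــلام': drop the Arabic
    tatweel (kashida) and squeeze runs of three or more identical characters."""
    text = text.replace("\u0640", "")           # remove tatweel / kashida
    return re.sub(r"(.)\1{2,}", r"\1", text)    # e.g. 'سلاااام' -> 'سلام'

def remove_links_and_mentions(text: str) -> str:
    """Drop URLs and user mentions such as '@jane_doe'."""
    text = re.sub(r"https?://\S+|www\.\S+", " ", text)
    text = re.sub(r"@\w+", " ", text)
    return re.sub(r"\s+", " ", text).strip()

print(remove_stretching("ســــــــــلام"))                        # -> سلام
print(remove_links_and_mentions("سلام @jane_doe https://t.co/x"))  # -> سلام
```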
 
src/data_utils.py CHANGED
@@ -2,7 +2,6 @@ from hazm import word_tokenize
 from hazm import sent_tokenize
 import re
 import six
-import string
 
 from normalizer import normalize
 
@@ -13,15 +12,15 @@ def filter_by_lang_regex(text, ratio=0.7, regex="0-9۰۱۲۳۴۵۶۷۸۹ءآئا
     candidate_text = re.sub(r"[^" + regex + "]+", " ", six.ensure_str(text)).replace(" ", "")
     text = text.replace(" ", "")
 
-    return True if (len(candidate_text) / len(text)) > ratio else False
+    return (len(candidate_text) / len(text)) > ratio
 
 
 def filter_by_num_tokens(text, gt=64):
-    return True if len(word_tokenize(text)) > gt else False
+    return len(word_tokenize(text)) > gt
 
 
 def filter_by_num_sents(text, gt=2):
-    return True if len(sent_tokenize(text)) > gt else False
+    return len(sent_tokenize(text)) > gt
 
 
 def normalizer(text, do_lowercase=False):
@@ -31,6 +30,3 @@ def normalizer(text, do_lowercase=False):
         text = text.lower()
 
     return text
-
-
-
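The three filters now return their comparisons directly instead of the redundant `True if … else False` wrapping. For context, a minimal sketch of how they might be chained with the Hugging Face `datasets` library; the dataset name, config, and column are placeholders, not taken from this repository:

```python
from datasets import load_dataset
from data_utils import filter_by_lang_regex, filter_by_num_tokens, filter_by_num_sents, normalizer

# Placeholder corpus; the repo's own training script chooses the real dataset.
raw = load_dataset("oscar", "unshuffled_deduplicated_fa", split="train")

def keep(example):
    text = example["text"]
    return (
        filter_by_lang_regex(text)       # mostly Persian characters
        and filter_by_num_tokens(text)   # more than 64 tokens
        and filter_by_num_sents(text)    # more than 2 sentences
    )

cleaned = raw.filter(keep)
cleaned = cleaned.map(lambda ex: {"text": normalizer(ex["text"])})
```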
 
 
 
src/requirements.txt CHANGED
@@ -3,4 +3,5 @@ jax>=0.2.8
 jaxlib>=0.1.59
 flax>=0.3.4
 optax>=0.0.8
-hazm
+hazm
+tensorboard
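`tensorboard` backs the "tfboard tracker" mentioned in the commit message. A minimal sketch of how training metrics could be logged from a Flax loop, assuming `flax.metrics.tensorboard.SummaryWriter`; the log directory, tag, and dummy loop are illustrative, and the repo's training script may wire this up differently:

```python
from flax.metrics.tensorboard import SummaryWriter  # requires the tensorboard package

# Illustrative values only; the real training script picks the log dir and tags.
summary_writer = SummaryWriter("runs/gpt2-persian")

for step in range(1, 4):        # stand-in for the actual training loop
    train_loss = 1.0 / step     # dummy metric
    summary_writer.scalar("train/loss", train_loss, step)

summary_writer.flush()
```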