saied commited on
Commit
95cd35a
1 Parent(s): ec2c00e

edited data_utils: URL and HTML cleaning, stretched-alphabet (tatweel) removal

Browse files
Files changed (1) hide show
  1. src/data_utils.py +34 -1
src/data_utils.py CHANGED
@@ -2,11 +2,12 @@ from hazm import word_tokenize
2
  from hazm import sent_tokenize
3
  import re
4
  import six
 
5
 
6
  from normalizer import normalize
7
 
8
  persian_regex = "0-9۰۱۲۳۴۵۶۷۸۹ءآئابتثجحخدذرزسشصضطظعغفقلمنهوپچژکگیە\u200c"
9
-
10
 
11
  def filter_by_lang_regex(text, ratio=0.7, regex="0-9۰۱۲۳۴۵۶۷۸۹ءآئابتثجحخدذرزسشصضطظعغفقلمنهوپچژکگیە\u200c"):
12
  candidate_text = re.sub(r"[^" + regex + "]+", " ", six.ensure_str(text)).replace(" ", "")
@@ -25,7 +26,39 @@ def filter_by_num_sents(text, gt=2):
25
 
26
  def normalizer(text, do_lowercase=False):
27
  text = normalize(text)
 
28
  if do_lowercase:
29
  text = text.lower()
30
 
31
  return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  from hazm import sent_tokenize
3
  import re
4
  import six
5
+ import string
6
 
7
  from normalizer import normalize
8
 
9
  persian_regex = "0-9۰۱۲۳۴۵۶۷۸۹ءآئابتثجحخدذرزسشصضطظعغفقلمنهوپچژکگیە\u200c"
10
+ allowed_char = string.ascii_letters + string.digits + ':/@_-. '
11
 
12
  def filter_by_lang_regex(text, ratio=0.7, regex="0-9۰۱۲۳۴۵۶۷۸۹ءآئابتثجحخدذرزسشصضطظعغفقلمنهوپچژکگیە\u200c"):
13
  candidate_text = re.sub(r"[^" + regex + "]+", " ", six.ensure_str(text)).replace(" ", "")
 
26
 
27
  def normalizer(text, do_lowercase=False):
28
  text = normalize(text)
29
+ text = text.replace('ـ', '')
30
  if do_lowercase:
31
  text = text.lower()
32
 
33
  return text
34
+
35
+
36
+ def clean_url(text):
37
+ ## removing html tags
38
+ text = re.sub('<.*?>', '', text)
39
+ ## removing normal(without space urls)
40
+ text = re.sub(r'http\S+', "", text)
41
+ ## removing urls that contains space
42
+ result = ''
43
+ for char in text:
44
+ if char in allowed_char:
45
+ result += char
46
+ result = result.replace(' ', '')
47
+ result = result.split(':')
48
+ for phrase in result:
49
+ p = phrase.replace(' ', '')
50
+ # text = text.replace(p, "")
51
+ if "/ /" or "//" in p:
52
+ if ('https :' + p) or ('https:' + p) in text:
53
+ text = text.replace('https :' + p, '')
54
+ text = text.replace('https:' + p, '')
55
+ elif ('http :' + p) or ('http:' + p) in text:
56
+ text = text.replace('http :' + p, '')
57
+ text = text.replace('http:' + p, '')
58
+ elif '@' in p:
59
+ if p in text:
60
+ text = text.replace(p, '')
61
+ else:
62
+ text = text.replace(p, "")
63
+
64
+ return text