edited data_utils-url,html,stretched alphabet
Browse files- src/data_utils.py +34 -1
src/data_utils.py
CHANGED
@@ -2,11 +2,12 @@ from hazm import word_tokenize
|
|
2 |
from hazm import sent_tokenize
|
3 |
import re
|
4 |
import six
|
|
|
5 |
|
6 |
from normalizer import normalize
|
7 |
|
8 |
# Body of a regex character class (no surrounding brackets): ASCII digits
# 0-9, Persian digits, Persian/Arabic letters and the zero-width non-joiner
# (\u200c). The same literal is repeated as the default of
# filter_by_lang_regex's `regex` parameter.
persian_regex = "0-9۰۱۲۳۴۵۶۷۸۹ءآئابتثجحخدذرزسشصضطظعغفقلمنهوپچژکگیە\u200c"
|
9 |
-
|
10 |
|
11 |
def filter_by_lang_regex(text, ratio=0.7, regex="0-9۰۱۲۳۴۵۶۷۸۹ءآئابتثجحخدذرزسشصضطظعغفقلمنهوپچژکگیە\u200c"):
|
12 |
candidate_text = re.sub(r"[^" + regex + "]+", " ", six.ensure_str(text)).replace(" ", "")
|
@@ -25,7 +26,39 @@ def filter_by_num_sents(text, gt=2):
|
|
25 |
|
26 |
def normalizer(text, do_lowercase=False):
    """Normalize ``text`` and optionally lowercase it.

    Args:
        text: Input string, passed to the imported ``normalize`` helper.
        do_lowercase: When true, the normalized string is lowercased.

    Returns:
        The normalized (and possibly lowercased) string.
    """
    normalized = normalize(text)
    return normalized.lower() if do_lowercase else normalized
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
from hazm import sent_tokenize
|
3 |
import re
|
4 |
import six
|
5 |
+
import string
|
6 |
|
7 |
from normalizer import normalize
|
8 |
|
9 |
# Body of a regex character class (no surrounding brackets): ASCII digits
# 0-9, Persian digits, Persian/Arabic letters and the zero-width non-joiner
# (\u200c). The same literal is repeated as the default of
# filter_by_lang_regex's `regex` parameter.
persian_regex = "0-9۰۱۲۳۴۵۶۷۸۹ءآئابتثجحخدذرزسشصضطظعغفقلمنهوپچژکگیە\u200c"
|
10 |
+
# One string holding the ASCII letters, the ASCII digits, the punctuation
# ':/@_-.' and the space character.
# NOTE(review): presumably the characters considered able to appear in a
# URL or e-mail address -- confirm against the code that consumes it.
allowed_char = string.ascii_letters + string.digits + ':/@_-. '
|
11 |
|
12 |
def filter_by_lang_regex(text, ratio=0.7, regex="0-9۰۱۲۳۴۵۶۷۸۹ءآئابتثجحخدذرزسشصضطظعغفقلمنهوپچژکگیە\u200c"):
|
13 |
candidate_text = re.sub(r"[^" + regex + "]+", " ", six.ensure_str(text)).replace(" ", "")
|
|
|
26 |
|
27 |
def normalizer(text, do_lowercase=False):
    """Normalize ``text``, drop elongation marks and optionally lowercase it.

    Args:
        text: Input string, passed to the imported ``normalize`` helper.
        do_lowercase: When true, the result is lowercased.

    Returns:
        The normalized string with every 'ـ' (the tatweel/kashida character
        used to stretch letters) removed, lowercased if requested.
    """
    # Strip the stretching character only after normalize() has run.
    cleaned = normalize(text).replace('ـ', '')
    return cleaned.lower() if do_lowercase else cleaned
|
34 |
+
|
35 |
+
|
36 |
+
# Patterns are compiled once at import time because clean_url is meant to be
# applied to many documents.
_HTML_TAG_RE = re.compile(r'<.*?>')
# URL whose scheme, colon and slashes may be separated by whitespace
# (e.g. "https ://x", "http : / / x"); the URL body is the run of
# non-whitespace characters that follows the slashes.
_SPACED_URL_RE = re.compile(r'https?\s*:\s*/\s*/\s*\S+')
# Any remaining token that starts with "http" (kept from the original rule).
_PLAIN_URL_RE = re.compile(r'http\S+')
# ASCII e-mail addresses and @-handles.
_AT_TOKEN_RE = re.compile(r'[A-Za-z0-9_.\-]*@[A-Za-z0-9_.\-]+')


def clean_url(text):
    """Remove HTML tags, URLs and e-mail-like tokens from ``text``.

    The passes run in this order:

    1. HTML tags (``<...>`` on a single line) are removed.
    2. ``http``/``https`` URLs are removed even when whitespace was inserted
       between the scheme, the colon and the slashes. This must run before
       step 3, otherwise ``http\\S+`` would strip only the scheme of
       ``"https ://host"`` and leave ``" ://host"`` behind.
    3. Any remaining token beginning with ``http`` is removed.
    4. ASCII tokens containing ``@`` (e-mail addresses, handles) are removed.

    Earlier revisions guarded steps 2 and 4 with conditions such as
    ``if "/ /" or "//" in p`` which are always true, so those branches never
    ran as intended; the checks are now expressed as regular expressions.

    Args:
        text: Input string.

    Returns:
        ``text`` with the matched spans deleted. Surrounding whitespace is
        left untouched, so a removed URL between two spaces leaves both
        spaces in place.
    """
    text = _HTML_TAG_RE.sub('', text)
    text = _SPACED_URL_RE.sub('', text)
    text = _PLAIN_URL_RE.sub('', text)
    text = _AT_TOKEN_RE.sub('', text)
    return text
|