some modification in preprocessing
Browse files - src/data_utils.py +1 -1
src/data_utils.py
CHANGED
@@ -37,7 +37,7 @@ def clean_url(text):
|
|
37 |
## removing html tags
|
38 |
text = re.sub('<.*?>', '', text)
|
39 |
## removing normal urls (without spaces)
|
40 |
-
text = re.sub(r'http\S+', '', text)
|
41 |
## removing urls that contain spaces
|
42 |
result = ''
|
43 |
for char in text:
|
|
|
37 |
## removing html tags
|
38 |
text = re.sub('<.*?>', '', text)
|
39 |
## removing normal urls (without spaces)
|
40 |
+
text = re.sub(r'(?:(?:http|https):\/\/)?([-a-zA-Z0-9.]{2,256}\.[a-z]{2,4})\b(?:\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?', "", text)
|
41 |
## removing urls that contain spaces
|
42 |
result = ''
|
43 |
for char in text:
|