saied committed on
Commit
79fa2a7
1 Parent(s): 95cd35a

some modification in preprocessing

Browse files
Files changed (1) hide show
  1. src/data_utils.py +1 -1
src/data_utils.py CHANGED
@@ -37,7 +37,7 @@ def clean_url(text):
37
  ## removing html tags
38
  text = re.sub('<.*?>', '', text)
39
  ## removing normal(without space urls)
40
- text = re.sub(r'http\S+', "", text)
41
  ## removing urls that contains space
42
  result = ''
43
  for char in text:
 
37
  ## removing html tags
38
  text = re.sub('<.*?>', '', text)
39
  ## removing normal(without space urls)
40
+ text = re.sub(r'(?:(?:http|https):\/\/)?([-a-zA-Z0-9.]{2,256}\.[a-z]{2,4})\b(?:\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?', "", text)
41
  ## removing urls that contains space
42
  result = ''
43
  for char in text: