nam194 committed on
Commit
4d38a62
1 Parent(s): 916197c

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +4 -5
utils.py CHANGED
@@ -1,8 +1,5 @@
1
- import jdk
2
- jdk.install('11', jre=True)
3
  from imports import *
4
  import unicodedata
5
- rdrsegmenter = VnCoreNLP("./vncorenlp_segmenter/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m')
6
  dict_map = {
7
  "òa": "oà",
8
  "Òa": "Oà",
@@ -59,7 +56,8 @@ def replace_all(text, dict_map=dict_map):
59
  def normalize(text, segment=True):
60
  text = replace_all(text, dict_map)
61
  if segment:
62
- text = ' '.join([' '.join(sent) for sent in rdrsegmenter.tokenize(text)])
 
63
  return text
64
  def text_preprocess(document):
65
  punc = [i for i in ["\"", "-", ".", ":"]]#string.punctuation.replace(",","")]
@@ -85,7 +83,8 @@ def text_preprocess(document):
85
  document = re.sub(" ", " ", document)
86
  document = re.sub(" ", " ", document)
87
  try:
88
- document = ' '.join(rdrsegmenter.tokenize(document)[0])
 
89
  except:
90
  pass
91
  return document.lower()
 
 
 
1
  from imports import *
2
  import unicodedata
 
3
  dict_map = {
4
  "òa": "oà",
5
  "Òa": "Oà",
 
56
  def normalize(text, segment=True):
57
  text = replace_all(text, dict_map)
58
  if segment:
59
+ text = text.split(".")
60
+ text = ". ".join([underthesea.word_tokenize(i, format="text") for i in text)])
61
  return text
62
  def text_preprocess(document):
63
  punc = [i for i in ["\"", "-", ".", ":"]]#string.punctuation.replace(",","")]
 
83
  document = re.sub(" ", " ", document)
84
  document = re.sub(" ", " ", document)
85
  try:
86
+ document = document.split(".")
87
+ document = ". ".join([underthesea.word_tokenize(i, format="text") for i in document])
88
  except:
89
  pass
90
  return document.lower()