nam194 committed
Commit 1cafba1
1 Parent(s): 246d50e

Update all_datasets.py

Files changed (1)
  1. all_datasets.py +3 -43
all_datasets.py CHANGED
@@ -61,54 +61,14 @@ def sentiment_dataset(path_folder, train_file_name, test_file_name):
     custom_tokenized.set_format('torch',columns=["input_ids", 'token_type_ids', "attention_mask", "label"])
     return custom_tokenized
 
-# support function for ner task
-def get_dict_map(data, mode="token"):
-    if mode == "token":
-        vocab = list(set([j[0] for i in data for j in i]))
-    else:
-        vocab = list(set([j[1] for i in data for j in i]))
-    idx2tok = {idx:tok for idx, tok in enumerate(vocab)}
-    tok2idx = {tok:idx for idx, tok in enumerate(vocab)}
-    return tok2idx, idx2tok
-
-def read_csv_to_ner_data(path):
-    data = pd.read_csv(path, encoding="utf-8")
-    tok = list(data["token"])
-    tok = [replace_all(i) for i in tok]
-    lab = list(data["label"])
-    token = []
-    label = []
-    tmp = []
-    tmp_ = []
-    for i, txt in enumerate(tok):
-        if str(txt) != "nan":
-            tmp.append(txt)
-            tmp_.append(lab[i])
-        else:
-            token.append(tmp)
-            label.append(tmp_)
-            tmp = []
-            tmp_ = []
-
-    data = []
-    tmp = []
-    for i, sent in enumerate(token):
-        for j, tok in enumerate(sent):
-            tmp.append([tok, label[i][j]])
-        data.append(tmp)
-        tmp = []
-    return data
-
 # get feature for ner task
 def feature_for_phobert(data, tokenizer, max_seq_len: int=256, use_crf: bool = False) -> List[NerFeatures]:
     features = []
     tokens = []
     tag_ids = []
-    # args = parse_arguments()
-    path = os.path.abspath("./data/topic")
-    file_name = os.listdir(path)[0]
-    df = read_csv_to_ner_data(os.path.join(path, file_name))
-    tag2idx, idx2tag = get_dict_map(df, 'tag')
+
+    idx2tag = {0: 'B-chỗ để xe', 1: 'B-con người', 2: 'B-công việc', 3: 'B-cơ sở vật chất', 4: 'B-dự án', 5: 'B-lương', 6: 'B-môi trường làm việc', 7: 'B-ot/thời gian', 8: 'B-văn phòng', 9: 'B-đãi ngộ', 10: 'I-chỗ để xe', 11: 'I-con người', 12: 'I-công việc', 13: 'I-cơ sở vật chất', 14: 'I-dự án', 15: 'I-lương', 16: 'I-môi trường làm việc', 17: 'I-ot/thời gian', 18: 'I-văn phòng', 19: 'I-đãi ngộ', 20: 'O'}
+    tag2idx = {v: k for k, v in idx2tag.items()}
     for id, tokens in enumerate(data):
         if tokens == []:
             continue
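
In substance, the commit pins the NER label mapping in code: `tag2idx`/`idx2tag` are no longer rebuilt at feature-extraction time by reading whatever file `os.listdir` returns first in `./data/topic`. The removed `get_dict_map` built its vocabulary with `set()`, whose iteration order is not stable across Python runs (string hashes are salted), so the derived label ids could differ between training and inference. A minimal sketch, not part of the commit, showing that a deterministic derivation reproduces the pinned dict:

```python
# Sketch: derive the mapping deterministically instead of relying on set()
# order. Sorting the tag strings by default code-point order happens to
# reproduce the dict hard-coded in this commit.
tags = [
    'B-chỗ để xe', 'B-con người', 'B-công việc', 'B-cơ sở vật chất',
    'B-dự án', 'B-lương', 'B-môi trường làm việc', 'B-ot/thời gian',
    'B-văn phòng', 'B-đãi ngộ', 'I-chỗ để xe', 'I-con người',
    'I-công việc', 'I-cơ sở vật chất', 'I-dự án', 'I-lương',
    'I-môi trường làm việc', 'I-ot/thời gian', 'I-văn phòng',
    'I-đãi ngộ', 'O',
]
idx2tag = dict(enumerate(sorted(tags)))
tag2idx = {tag: idx for idx, tag in idx2tag.items()}
assert idx2tag[0] == 'B-chỗ để xe' and idx2tag[20] == 'O'
```

Hard-coding the result, as the commit does, additionally removes the runtime dependency on the data folder.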
 
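For context on the other deletion: `read_csv_to_ner_data` read a flat token/label CSV and grouped rows into sentences, treating rows whose token is NaN as sentence boundaries, then paired each token with its label (it also passed tokens through a `replace_all` normalizer defined elsewhere in this repo, omitted below). A minimal sketch of that input shape and grouping, with hypothetical sample rows built from tags in the mapping above:

```python
import io

import pandas as pd

# Hypothetical CSV in the shape the removed helper expected: one token per
# row; a row with an empty token (read as NaN) closes the current sentence.
csv_text = """token,label
Lương,B-lương
cao,I-lương
,
Văn,B-văn phòng
phòng,I-văn phòng
đẹp,O
,
"""

data = pd.read_csv(io.StringIO(csv_text))
sentences, current = [], []
for token, label in zip(data["token"], data["label"]):
    if str(token) != "nan":      # same NaN check the removed code used
        current.append([token, label])
    else:                        # separator row: close the sentence
        sentences.append(current)
        current = []

print(sentences)
# [[['Lương', 'B-lương'], ['cao', 'I-lương']],
#  [['Văn', 'B-văn phòng'], ['phòng', 'I-văn phòng'], ['đẹp', 'O']]]
```

Like the removed code, this only emits a sentence when it hits a separator row, which is why the sample CSV ends with one.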