tqfang229 committed on
Commit
2e3f432
1 Parent(s): b4a6d7e

Upload folder using huggingface_hub

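The commit message above indicates the files were pushed with the `huggingface_hub` client. A minimal sketch of how such an upload is typically done; the repo id and local folder path are placeholders, not details of this commit:

```python
# Sketch only: uploading a local checkpoint folder to the Hub.
# repo_id and folder_path are illustrative placeholders.
from huggingface_hub import HfApi

api = HfApi()  # assumes a token is already configured, e.g. via `huggingface-cli login`
api.upload_folder(
    folder_path="./output/my-deberta-checkpoint",
    repo_id="username/model-repo",
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
```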
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ cached_dev_deberta-mlm_128_atomic filter=lfs diff=lfs merge=lfs -text
+ cached_train_deberta-mlm_128_atomic filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,3 @@
+ {
+   "[MASK]": 128000
+ }
cached_dev_deberta-mlm_128_atomic ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:de2aeddcf9134d495f3461fedebd548864b101e1920dcc1facf27a6790e27e75
+ size 4501475
cached_train_deberta-mlm_128_atomic ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5be198ba32b721dd13ee5c578e9de10d3e49aaa3f273d7407b847bad3ae39e1b
+ size 365724007
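The two `cached_*` entries are Git-LFS pointers to feature caches that `load_and_cache_examples` in `run_pretrain.py` writes with `torch.save` (name pattern `cached_{split}_{model_type}_{max_seq_length}_{task}`). A small sketch of inspecting one, assuming the LFS blob has been fetched locally:

```python
# Sketch: each cached entry is the list produced by convert_examples_to_features,
# i.e. one [input_ids, label_ids, correct_index] triple per example.
import torch

features = torch.load("cached_dev_deberta-mlm_128_atomic")
print(len(features))                 # number of cached dev examples
input_ids, label_ids, correct = features[0]
print(len(input_ids), correct)       # candidates per example and the gold index
```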
config.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "_name_or_path": "microsoft/deberta-v3-large",
+   "architectures": [
+     "DebertaV2ForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "finetuning_task": "atomic",
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "layer_norm_eps": 1e-07,
+   "max_position_embeddings": 512,
+   "max_relative_positions": -1,
+   "model_type": "deberta-v2",
+   "norm_rel_ebd": "layer_norm",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 24,
+   "pad_token_id": 0,
+   "pooler_dropout": 0,
+   "pooler_hidden_act": "gelu",
+   "pooler_hidden_size": 1024,
+   "pos_att_type": [
+     "p2c",
+     "c2p"
+   ],
+   "position_biased_input": false,
+   "position_buckets": 256,
+   "relative_attention": true,
+   "share_att_key": true,
+   "torch_dtype": "float32",
+   "transformers_version": "4.31.0",
+   "type_vocab_size": 0,
+   "vocab_size": 128100
+ }
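The config corresponds to `microsoft/deberta-v3-large` with a masked-LM head, fine-tuned on the `atomic` task. A minimal sketch of loading the committed weights with `transformers` (the repo id/path is a placeholder):

```python
# Sketch: loading this checkpoint with transformers (the config was written by 4.31.0).
from transformers import DebertaV2ForMaskedLM, DebertaV2Tokenizer

repo = "path-or-repo-id-of-this-model"   # placeholder
tokenizer = DebertaV2Tokenizer.from_pretrained(repo)
model = DebertaV2ForMaskedLM.from_pretrained(repo).eval()
print(model.config.hidden_size, model.config.vocab_size)  # 1024, 128100 per config.json
```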
data_utils.py ADDED
@@ -0,0 +1,236 @@
+ import json
+ import logging
+
+ import nltk
+ from nltk.corpus import stopwords
+ from tqdm import tqdm
+ from tqdm import tqdm
+
+ skip_words = set(stopwords.words('english'))
+ skip_words.add('\'s')
+ skip_words.add('.')
+ skip_words.add(',')
+ PERSON_NAMES = ['Alex', 'Ash', 'Aspen', 'Bali', 'Berkeley', 'Cameron', 'Chris', 'Cody', 'Dana', 'Drew', 'Emory',
+                 'Flynn', 'Gale', 'Jamie', 'Jesse',
+                 'Kai', 'Kendall', 'Kyle', 'Lee', 'Logan', 'Max', 'Morgan', 'Nico', 'Paris', 'Pat', 'Quinn', 'Ray',
+                 'Robin', 'Rowan', 'Rudy', 'Sam', 'Skylar', 'Sydney',
+                 'Taylor', 'Tracy', 'West', 'Wynne']
+ logger = logging.getLogger(__name__)
+
+
+ def accuracy(out, labels):
+     return {'acc': (out == labels).mean()}
+
+
+ def handle_words(span, tokenizer, keywords=None, is_start=False):
+     inputs = []
+     labels = []
+     words = nltk.word_tokenize(span)
+     for w_i, w in enumerate(words):
+         if (w_i == 0 and is_start) or w == '.' or w == ',' or w.startswith('\''):
+             w_bpes = tokenizer.tokenize(w)
+         else:
+             w_bpes = tokenizer.tokenize(w, add_prefix_space=True)
+         inputs.extend(w_bpes)
+         if keywords != None:
+             if w in keywords:
+                 labels.extend(w_bpes)
+             else:
+                 labels.extend([-100] * len(w_bpes))
+         else:
+             if w not in PERSON_NAMES and w not in skip_words and w.lower() not in skip_words:
+                 labels.extend(w_bpes)
+             else:
+                 labels.extend([-100] * len(w_bpes))
+     return inputs, labels
+
+
+ def handle_underscores(suffix, tokenizer, keywords=None, prefix=False):
+     inputs = []
+     labels = []
+     if '_' in suffix:
+         suffix_parts = [i.strip() for i in suffix.split('___')]
+         for i, part in enumerate(suffix_parts):
+             if part:
+                 tmp_inputs, tmp_labels = handle_words(part, tokenizer, keywords=keywords, is_start=(i == 0 and prefix))
+                 inputs += tmp_inputs
+                 labels += tmp_labels
+
+                 if i != len(suffix_parts) - 1 and suffix_parts[i + 1]:
+                     inputs.append(tokenizer.mask_token)
+                     labels.append(-100)
+             else:
+                 inputs.append(tokenizer.mask_token)
+                 labels.append(-100)
+     else:
+         inputs, labels = handle_words(suffix, tokenizer, keywords=keywords, is_start=prefix)
+     return inputs, labels
+
+ from tqdm import tqdm
+ def convert_examples_to_features(examples, tokenizer, max_length=512):
+     data = []
+     for example in tqdm(examples, desc="converting examples to features"):
+         inputs, labels = handle_underscores(example['context'], tokenizer, keywords=example.get('keywords', None), prefix=True)
+         choices = [handle_underscores(cand, tokenizer) for cand in example['candidates']]
+         input_ids = [inputs + cand[0] for cand in choices]
+         input_ids = [tokenizer.convert_tokens_to_ids(cand) for cand in input_ids]
+         label_ids = [labels + cand[1] for cand in choices]
+         label_ids = [[t if t == -100 else input_ids[i][t_i] for t_i, t in enumerate(cand)] for i, cand in
+                      enumerate(label_ids)]
+         label_ids = [[-100] + cand + [-100] for cand in label_ids]
+         input_ids = [tokenizer.prepare_for_model(cand, max_length=max_length, truncation=True)['input_ids'] for cand in
+                      input_ids]
+         data.append([input_ids, label_ids, example['correct']])
+     return data
+
+
+ class ATOMICMLMProcessor(object):
+     def __init__(self, args):
+         self.D = []
+         self.filelist = [args.train_file, args.dev_file]
+
+     def get_train_examples(self):
+         self.load_data(self.filelist[0])
+         return self.D
+
+     def get_dev_examples(self):
+         data = []
+         with open(self.filelist[1], 'r') as f:
+             for row in tqdm(f):
+                 sample = json.loads(row)
+                 data.append(sample)
+         print(len(data))
+         return data
+
+     def load_data(self, filename):
+         with open(filename, "r") as f:
+             for row in tqdm(f):
+                 sample = json.loads(row)
+                 self.D.append({'id': sample['id'], 'context': sample['context'],
+                                'ending': sample['candidates'][sample['correct']], 'keywords': sample.get('keywords', None)})
+         print(len(self.D))
+
+
+ class ATOMICProcessor(object):
+     def __init__(self, args):
+         print('loading from %s %s' % (args.train_file, args.dev_file))
+         self.filelist = [args.train_file, args.dev_file]
+         self.D = [[], []]
+
+     def get_train_examples(self):
+         self.load_data(self.filelist[0], 0)
+         return self.D[0]
+
+     def get_dev_examples(self):
+         self.load_data(self.filelist[1], 1)
+         return self.D[1]
+
+     def load_data(self, filename, sid):
+         with open(filename, "r") as f:
+             for row in tqdm(f):
+                 sample = json.loads(row)
+                 self.D[sid].append(sample)
+         print(len(self.D[sid]))
+
+
+ class CWWVProcessor(object):
+     def __init__(self, args):
+         self.answerKey_mapping = {'A': 0, 'B': 1, 'C': 2}
+         self.D = [[], []]
+         if args.task_name == 'cskg':
+             print('loading from %s %s' % (args.second_train_file, args.second_dev_file))
+             self.filelist = [args.second_train_file, args.second_dev_file]
+         else:
+             print('loading from %s %s' % (args.train_file, args.dev_file))
+             self.filelist = [args.train_file, args.dev_file]
+
+     def get_train_examples(self):
+         self.load_data(self.filelist[0], 0)
+         return self.D[0]
+
+     def get_dev_examples(self):
+         self.load_data(self.filelist[1], 1)
+         return self.D[1]
+
+     def load_data(self, filename, sid):
+         skipped = 0
+         with open(filename, "r") as f:
+             for row in tqdm(f):
+                 sample = json.loads(row)
+                 context = sample['question']['stem']
+                 if context.endswith('.'):
+                     context = context[:-1]
+                 if not context.endswith('[MASK]'):
+                     skipped += 1
+                     context_parts = context.split('[MASK]')
+                     context = context_parts[0].strip()
+                     candidates = [c['text'] + context_parts[1] + '.' for c in sample['question']['choices']]
+                 else:
+                     context = context[:-7]
+                     candidates = [c['text'] + '.' for c in sample['question']['choices']]
+                 label = self.answerKey_mapping[sample['answerKey']]
+                 keywords = nltk.word_tokenize(sample['question']['head'])
+                 keywords = [w for w in keywords if w not in skip_words and w.lower() not in skip_words]
+                 self.D[sid].append({'id': sample['id'], 'context': context, 'correct': label, 'candidates': candidates,
+                                     'keywords': keywords})
+         print(len(self.D[sid]), skipped)
+
+
+ class CWWVMLMProcessor(object):
+     def __init__(self, args):
+         self.answerKey_mapping = {'A': 0, 'B': 1, 'C': 2}
+         self.D = []
+         self.filelist = [args.train_file, args.dev_file]
+         self.args = args
+
+     def get_train_examples(self):
+         self.load_data(self.filelist[0])
+         return self.D
+
+     def get_dev_examples(self):
+         processor = CSKGProcessor(self.args)
+         return processor.get_dev_examples()
+
+     def load_data(self, filename):
+         skipped = 0
+         with open(filename, "r") as f:
+             for row in tqdm(f):
+                 sample = json.loads(row)
+                 context = sample['question']['stem']
+                 if context.endswith('.'):
+                     context = context[:-1]
+                 assert context.endswith('[MASK]')
+                 context = context[:-7]
+                 candidates = [c['text'] + '.' for c in sample['question']['choices']]
+                 label = self.answerKey_mapping[sample['answerKey']]
+                 keywords = nltk.word_tokenize(sample['question']['head'])
+                 keywords = [w for w in keywords if w not in skip_words and w.lower() not in skip_words]
+                 self.D.append(
+                     {'id': sample['id'], 'context': context, 'ending': candidates[label], 'keywords': keywords})
+         print(len(self.D))
+
+
+ class CSKGProcessor(object):
+     def __init__(self, args):
+         # CWWV set always uses second train/dev file params
+         self.atomicprocessor = ATOMICProcessor(args)
+         self.cwwvprocessor = CWWVProcessor(args)
+
+     def get_train_examples(self):
+         cwwv_questions = self.cwwvprocessor.get_train_examples()
+         atomic_questions = self.atomicprocessor.get_train_examples()
+         return cwwv_questions + atomic_questions
+
+     def get_dev_examples(self):
+         cwwv_questions = self.cwwvprocessor.get_dev_examples()
+         atomic_questions = self.atomicprocessor.get_dev_examples()
+         return cwwv_questions + atomic_questions
+
+
+ myprocessors = {
+     "atomic": ATOMICProcessor,
+     "cwwv": CWWVProcessor,
+     "atomicmlm": ATOMICMLMProcessor,
+     "cwwvmlm": CWWVMLMProcessor,
+     "cskg": CSKGProcessor
+ }
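`convert_examples_to_features` takes multiple-choice examples whose `context` may contain a `___` blank, appends each candidate to the context, and records which token positions should later be masked (the keywords if given, otherwise everything except stopwords and the neutral person names). A usage sketch with a made-up example; the text and field values are illustrative, matching the format the processors emit:

```python
# Sketch: feature conversion on a toy ATOMIC-style example.
# Requires the NLTK 'punkt' and 'stopwords' data to be downloaded beforehand.
from transformers import DebertaV2Tokenizer
from data_utils import convert_examples_to_features

tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v3-large")
example = {  # hypothetical example in the processors' format
    "context": "PersonX gives PersonY a gift, so PersonY feels",
    "candidates": ["happy.", "angry.", "hungry."],
    "correct": 0,
}
features = convert_examples_to_features([example], tokenizer, max_length=128)
input_ids, label_ids, correct = features[0]
print(len(input_ids), correct)   # 3 candidate sequences, gold index 0
```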
eval_results.txt ADDED
@@ -0,0 +1 @@
+ acc = 0.525
logits_test.txt ADDED
@@ -0,0 +1,120 @@
1
+ -13.485865592956543 -12.554702758789062 -16.843822479248047 -17.034975051879883 -15.528239250183105
2
+ -11.945438385009766 -15.764880180358887 -15.65064811706543 -17.049840927124023 -15.45334243774414
3
+ -11.587425231933594 -16.005020141601562 -18.591140747070312 -13.294342041015625 -16.296398162841797
4
+ -10.009797096252441 -15.341827392578125 -10.539949417114258 -10.960021018981934 -14.047414779663086
5
+ -12.25959300994873 -10.891813278198242 -14.401248931884766 -11.447187423706055 -13.685133934020996
6
+ -12.804305076599121 -15.47064208984375 -12.90679931640625 -12.644388198852539 -16.551565170288086
7
+ -12.235984802246094 -17.26417350769043 -15.814994812011719 -14.540316581726074 -16.57213020324707
8
+ -13.064817428588867 -14.498727798461914 -12.224983215332031 -18.461196899414062 -15.455927848815918
9
+ -9.443912506103516 -15.756866455078125 -14.860000610351562 -11.844358444213867 -16.200241088867188
10
+ -11.700218200683594 -15.54508113861084 -18.18265724182129 -17.967411041259766 -15.61465072631836
11
+ -11.174132347106934 -17.90937614440918 -17.444185256958008 -14.454703330993652 -14.554675102233887
12
+ -12.42685604095459 -12.927202224731445 -14.598489761352539 -13.699914932250977 -16.133480072021484
13
+ -10.828542709350586 -11.421213150024414 -14.742795944213867 -16.211345672607422 -15.933958053588867
14
+ -9.981731414794922 -13.114375114440918 -10.851600646972656 -13.30392074584961 -15.448455810546875
15
+ -12.899808883666992 -12.504766464233398 -11.667335510253906 -11.96485710144043 -16.87687110900879
16
+ -10.543583869934082 -10.229654312133789 -11.832517623901367 -13.934309005737305 -14.101360321044922
17
+ -12.628169059753418 -14.552963256835938 -15.033495903015137 -14.496928215026855 -14.277275085449219
18
+ -10.540155410766602 -17.203994750976562 -15.181567192077637 -13.87678337097168 -16.33222198486328
19
+ -12.89242935180664 -12.802375793457031 -11.98313045501709 -12.95468807220459 -14.210517883300781
20
+ -13.707990646362305 -15.822447776794434 -14.352869033813477 -16.99176597595215 -15.332679748535156
21
+ -18.221946716308594 -14.591471672058105 -11.751192092895508 -15.28943157196045 -15.79006576538086
22
+ -10.999690055847168 -17.102828979492188 -11.563447952270508 -16.147525787353516 -15.312467575073242
23
+ -12.119155883789062 -13.997671127319336 -14.05282974243164 -15.500701904296875 -15.024360656738281
24
+ -12.406027793884277 -13.902008056640625 -14.89413070678711 -13.278053283691406 -16.163454055786133
25
+ -12.729238510131836 -15.645377159118652 -14.512473106384277 -16.292064666748047 -14.945101737976074
26
+ -11.565582275390625 -15.689852714538574 -15.196950912475586 -16.58242416381836 -16.84659194946289
27
+ -9.898529052734375 -12.66911506652832 -12.563604354858398 -15.67273235321045 -14.335538864135742
28
+ -9.923110961914062 -11.773245811462402 -11.786827087402344 -17.258573532104492 -15.622135162353516
29
+ -10.604410171508789 -18.388168334960938 -17.512317657470703 -19.24336051940918 -14.125904083251953
30
+ -16.949708938598633 -13.706928253173828 -16.456981658935547 -14.97553539276123 -14.764923095703125
31
+ -11.796066284179688 -17.549203872680664 -16.29804229736328 -14.065374374389648 -14.975845336914062
32
+ -16.109678268432617 -10.775965690612793 -14.450605392456055 -14.85343074798584 -14.199943542480469
33
+ -12.714162826538086 -14.389898300170898 -14.631216049194336 -13.585458755493164 -14.533248901367188
34
+ -11.866647720336914 -14.02271842956543 -16.863229751586914 -16.302730560302734 -15.423881530761719
35
+ -15.34256362915039 -13.611021995544434 -16.341060638427734 -18.729846954345703 -16.023725509643555
36
+ -13.729169845581055 -13.358748435974121 -16.144556045532227 -15.138229370117188 -15.170283317565918
37
+ -15.203313827514648 -15.215215682983398 -15.796660423278809 -13.29316520690918 -14.272794723510742
38
+ -10.206670761108398 -14.634307861328125 -16.95503807067871 -15.288354873657227 -14.735435485839844
39
+ -15.958247184753418 -11.922269821166992 -15.554479598999023 -16.529380798339844 -14.621512413024902
40
+ -11.756235122680664 -13.470741271972656 -14.978282928466797 -12.765861511230469 -15.564665794372559
41
+ -12.600726127624512 -15.769315719604492 -15.375072479248047 -18.390806198120117 -15.738916397094727
42
+ -15.593986511230469 -14.472156524658203 -13.198034286499023 -14.962379455566406 -16.735809326171875
43
+ -14.267927169799805 -12.278310775756836 -14.038803100585938 -15.891279220581055 -15.780318260192871
44
+ -12.514389038085938 -17.2401180267334 -14.838483810424805 -15.404850006103516 -15.5819091796875
45
+ -12.566061019897461 -12.968563079833984 -10.93451976776123 -13.795123100280762 -14.745218276977539
46
+ -21.77191162109375 -12.907783508300781 -14.735198974609375 -16.297962188720703 -16.843549728393555
47
+ -12.247344970703125 -13.780832290649414 -12.38991928100586 -14.87220573425293 -16.25807762145996
48
+ -10.589265823364258 -14.178890228271484 -15.336084365844727 -12.817083358764648 -15.201112747192383
49
+ -11.625978469848633 -12.869548797607422 -12.684764862060547 -13.059203147888184 -14.451128959655762
50
+ -15.250967979431152 -15.940725326538086 -12.647708892822266 -17.80953598022461 -14.153979301452637
51
+ -13.933172225952148 -11.741453170776367 -16.345247268676758 -15.836057662963867 -14.906820297241211
52
+ -11.782374382019043 -14.776283264160156 -14.698659896850586 -18.39876365661621 -15.45709228515625
53
+ -11.998146057128906 -16.717647552490234 -17.300548553466797 -18.50290298461914 -15.252758026123047
54
+ -13.041389465332031 -14.309408187866211 -16.311140060424805 -19.84041404724121 -15.644210815429688
55
+ -11.62677001953125 -14.909978866577148 -18.91885757446289 -16.421764373779297 -15.917211532592773
56
+ -15.454143524169922 -14.28858757019043 -14.329549789428711 -13.516512870788574 -15.161291122436523
57
+ -14.137840270996094 -14.48200511932373 -14.00358772277832 -17.134559631347656 -14.228231430053711
58
+ -14.116720199584961 -15.66433334350586 -12.732897758483887 -13.650927543640137 -13.985754013061523
59
+ -12.114367485046387 -13.667549133300781 -16.13338279724121 -16.44911766052246 -13.523371696472168
60
+ -13.346145629882812 -18.75728988647461 -17.335689544677734 -17.35544204711914 -14.020357131958008
61
+ -12.425326347351074 -16.646942138671875 -14.301322937011719 -15.461494445800781 -15.571342468261719
62
+ -12.522256851196289 -11.32237434387207 -12.392147064208984 -10.751494407653809 -13.668184280395508
63
+ -11.427050590515137 -12.420162200927734 -12.610843658447266 -13.642584800720215 -13.698797225952148
64
+ -10.28366470336914 -10.129068374633789 -17.50717544555664 -18.64311408996582 -14.564188957214355
65
+ -12.855567932128906 -15.540145874023438 -19.281057357788086 -16.280914306640625 -15.508527755737305
66
+ -13.631608963012695 -13.476893424987793 -10.74915599822998 -17.62717056274414 -15.61255168914795
67
+ -14.164091110229492 -14.324302673339844 -17.250370025634766 -13.515758514404297 -15.604305267333984
68
+ -15.42667293548584 -18.33716583251953 -14.98896598815918 -17.703462600708008 -14.412519454956055
69
+ -11.1312255859375 -13.484000205993652 -10.89915657043457 -14.660863876342773 -14.351375579833984
70
+ -11.8013916015625 -15.06019115447998 -14.530506134033203 -14.725985527038574 -15.17402458190918
71
+ -11.380867004394531 -16.774526596069336 -19.0806941986084 -14.300642013549805 -14.787707328796387
72
+ -15.317098617553711 -10.536006927490234 -16.74585723876953 -17.00075340270996 -14.233205795288086
73
+ -12.836723327636719 -14.365041732788086 -13.245519638061523 -14.606501579284668 -15.848045349121094
74
+ -14.671722412109375 -12.97309398651123 -18.96438980102539 -18.358306884765625 -14.383865356445312
75
+ -12.006148338317871 -19.101789474487305 -18.057790756225586 -17.27611541748047 -14.254199981689453
76
+ -12.239377975463867 -16.24175262451172 -18.39486312866211 -17.79523468017578 -15.77902889251709
77
+ -11.026527404785156 -17.164255142211914 -16.89369773864746 -14.13223648071289 -14.647666931152344
78
+ -13.34439468383789 -15.085588455200195 -14.015231132507324 -14.57590389251709 -15.548398971557617
79
+ -8.840858459472656 -13.32087230682373 -11.948570251464844 -12.7437744140625 -12.76047420501709
80
+ -15.12006664276123 -14.209697723388672 -16.21258544921875 -15.09268569946289 -15.725016593933105
81
+ -12.218896865844727 -19.344924926757812 -14.896772384643555 -15.128798484802246 -15.085726737976074
82
+ -13.855364799499512 -14.396207809448242 -14.475112915039062 -16.274978637695312 -16.561267852783203
83
+ -12.931468963623047 -20.261959075927734 -15.067851066589355 -18.67504119873047 -15.390913963317871
84
+ -10.541189193725586 -17.481229782104492 -16.9975643157959 -19.102977752685547 -14.517946243286133
85
+ -12.27475643157959 -16.989179611206055 -16.667797088623047 -15.321989059448242 -13.522154808044434
86
+ -13.157089233398438 -16.034408569335938 -18.0081729888916 -12.339900016784668 -14.611282348632812
87
+ -11.697013854980469 -18.469486236572266 -17.101667404174805 -18.21478271484375 -15.592073440551758
88
+ -10.410983085632324 -12.342016220092773 -19.91473960876465 -13.985015869140625 -13.253973007202148
89
+ -11.477904319763184 -10.292394638061523 -14.22382926940918 -14.867258071899414 -14.294317245483398
90
+ -13.928434371948242 -15.648571014404297 -13.287254333496094 -18.776403427124023 -14.822330474853516
91
+ -15.338903427124023 -17.95416831970215 -19.120832443237305 -16.684314727783203 -16.52161407470703
92
+ -11.756458282470703 -15.748832702636719 -16.6794376373291 -14.992359161376953 -14.095579147338867
93
+ -12.05335807800293 -16.373342514038086 -17.24437713623047 -16.61781883239746 -13.433090209960938
94
+ -10.43917465209961 -17.29708480834961 -14.298304557800293 -18.689228057861328 -13.535030364990234
95
+ -12.81536865234375 -16.188444137573242 -13.543594360351562 -17.28397560119629 -13.897306442260742
96
+ -11.524691581726074 -16.597545623779297 -16.99277114868164 -17.697744369506836 -15.411514282226562
97
+ -12.537017822265625 -15.651408195495605 -17.060205459594727 -16.015666961669922 -14.72602367401123
98
+ -16.58844757080078 -12.648893356323242 -12.805540084838867 -16.793424606323242 -14.052775382995605
99
+ -8.198223114013672 -14.490667343139648 -15.456783294677734 -18.407733917236328 -13.935432434082031
100
+ -12.235084533691406 -13.254622459411621 -17.220083236694336 -17.194355010986328 -16.625526428222656
101
+ -11.645500183105469 -12.380645751953125 -16.97612762451172 -16.918365478515625 -16.081634521484375
102
+ -15.551166534423828 -18.355005264282227 -17.210491180419922 -16.315765380859375 -16.989500045776367
103
+ -11.072067260742188 -14.111711502075195 -14.02056884765625 -13.801989555358887 -14.380411148071289
104
+ -10.74884033203125 -17.039485931396484 -16.053722381591797 -17.099529266357422 -14.157671928405762
105
+ -14.432212829589844 -15.668070793151855 -17.897315979003906 -19.030807495117188 -16.767248153686523
106
+ -10.875746726989746 -16.729921340942383 -16.830699920654297 -10.96748161315918 -14.065666198730469
107
+ -12.992372512817383 -13.389198303222656 -15.847464561462402 -18.74746322631836 -14.213224411010742
108
+ -16.27631378173828 -15.575210571289062 -15.83560562133789 -19.659807205200195 -15.64935302734375
109
+ -14.469172477722168 -17.462440490722656 -13.705839157104492 -14.783185958862305 -15.626546859741211
110
+ -12.304758071899414 -11.911407470703125 -15.557670593261719 -12.007896423339844 -13.70199966430664
111
+ -12.623868942260742 -13.375839233398438 -17.584022521972656 -14.576876640319824 -15.300482749938965
112
+ -12.100661277770996 -15.983474731445312 -13.280969619750977 -16.395549774169922 -13.329227447509766
113
+ -11.029035568237305 -17.723522186279297 -16.722801208496094 -16.074941635131836 -14.450968742370605
114
+ -14.947690963745117 -13.623953819274902 -17.58797836303711 -18.477333068847656 -14.811092376708984
115
+ -15.929779052734375 -12.438506126403809 -13.058927536010742 -18.52623748779297 -14.129800796508789
116
+ -14.234972953796387 -16.270557403564453 -16.478979110717773 -16.46598243713379 -16.054523468017578
117
+ -11.928365707397461 -12.55320930480957 -17.857826232910156 -15.371994018554688 -15.036209106445312
118
+ -13.607194900512695 -15.683792114257812 -16.34781837463379 -15.106184005737305 -13.208913803100586
119
+ -12.801432609558105 -18.26355743408203 -17.48211669921875 -12.184883117675781 -15.798808097839355
120
+ -14.617281913757324 -14.461203575134277 -18.154417037963867 -14.951545715332031 -14.540334701538086
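`logits_test.txt` is written by `save_logits` in `run_pretrain.py`: one line per evaluated example, one space-separated score per candidate (the average masked-LM log-likelihood, so higher is better). A sketch of recovering predictions from it; the gold labels themselves live in the dev `.jsonl`, so the accuracy line is only indicative:

```python
# Sketch: reading the saved per-candidate scores back in and picking the argmax,
# mirroring what evaluate() does before calling accuracy().
import numpy as np

scores = np.loadtxt("logits_test.txt")   # shape (num_examples, num_candidates)
preds = scores.argmax(axis=1)
print(preds[:10])
# acc = (preds == gold_labels).mean(), with gold_labels parsed from the dev file.
```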
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5f9e50c31777d2402062072d7ea15663f5a6b50395c09328957f06df9b7f7138
+ size 1740904889
run_pretrain.py ADDED
@@ -0,0 +1,651 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ from __future__ import absolute_import
18
+ from __future__ import division
19
+ from __future__ import print_function
20
+
21
+ import argparse
22
+ import json
23
+ import logging
24
+ import os
25
+ import random
26
+ import wandb
27
+
28
+ import numpy as np
29
+ import torch
30
+ from torch.optim import AdamW
31
+ from torch.utils.data import DataLoader
32
+ from torch.utils.data import RandomSampler
33
+ from torch.utils.data import SequentialSampler
34
+ from torch.utils.data.distributed import DistributedSampler
35
+ from torch.utils.tensorboard import SummaryWriter
36
+ from tqdm import tqdm
37
+ from tqdm import trange
38
+ from transformers import DebertaV2Config
39
+ from transformers import DebertaV2ForMaskedLM
40
+ from transformers import DebertaV2Tokenizer
41
+ from transformers import RobertaConfig
42
+ from transformers import RobertaForMaskedLM
43
+ from transformers import RobertaTokenizer
44
+ from transformers import get_linear_schedule_with_warmup
45
+
46
+ from data_utils import accuracy
47
+ from data_utils import convert_examples_to_features
48
+ from data_utils import myprocessors
49
+
50
+ from evaluate_DeBERTa import eval_tasks
51
+ from evaluate_DeBERTa import main as evaluate_main
52
+
53
+ logger = logging.getLogger(__name__)
54
+
55
+ from transformers import MODEL_WITH_LM_HEAD_MAPPING
56
+
57
+ MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
58
+ MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
59
+ MODEL_CLASSES = {
60
+ 'roberta-mlm': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
61
+ 'deberta-mlm': (DebertaV2Config, DebertaV2ForMaskedLM, DebertaV2Tokenizer)
62
+ }
63
+
64
+
65
+ class MyDataset(torch.utils.data.Dataset):
66
+
67
+ def __init__(self, data, pad_token, mask_token, max_words_to_mask):
68
+ self.data = data
69
+ self.pad_token = pad_token
70
+ self.mask_token = mask_token
71
+ self.max_words_to_mask = max_words_to_mask
72
+
73
+ def __len__(self):
74
+ return len(self.data)
75
+
76
+ def __getitem__(self, idx):
77
+ sample = self.data[idx]
78
+ return sample, self.pad_token, self.mask_token, self.max_words_to_mask
79
+
80
+
81
+ def mCollateFn(batch):
82
+ batch_input_ids = []
83
+ batch_input_mask = []
84
+ batch_input_labels = []
85
+ batch_label_ids = []
86
+ features = [b[0] for b in batch]
87
+ pad_token = batch[0][1]
88
+ mask_token = batch[0][2]
89
+ MAX_WORDS_TO_MASK = batch[0][3]
90
+ max_len = max([len(cand) for f in features for cand in f[0]])
91
+ for f in features:
92
+ batch_input_ids.append([])
93
+ batch_input_mask.append([])
94
+ batch_input_labels.append([])
95
+ batch_label_ids.append(f[2])
96
+ for i in range(len(f[0])):
97
+ masked_sequences = []
98
+ masked_labels = []
99
+ this_att_mask = []
100
+ sequence = f[0][i] + [pad_token] * (max_len - len(f[0][i]))
101
+ label_sequence = f[1][i] + [-100] * (max_len - len(f[1][i]))
102
+ valid_indices = [l_i for l_i, l in enumerate(label_sequence) if l != -100]
103
+ if len(valid_indices) > MAX_WORDS_TO_MASK:
104
+ rm_indices = random.sample(valid_indices, (len(valid_indices) - MAX_WORDS_TO_MASK))
105
+ label_sequence = [-100 if l_i in rm_indices else l for l_i, l in enumerate(label_sequence)]
106
+ for j, t in enumerate(label_sequence):
107
+ if t == -100:
108
+ continue
109
+ masked_sequences.append(sequence)
110
+ masked_labels.append([-100] * max_len)
111
+ else:
112
+ masked_sequences.append(sequence[:j] + [mask_token] + sequence[j + 1:])
113
+ masked_labels.append([-100] * j + [sequence[j]] + [-100] * (max_len - j - 1))
114
+ this_att_mask.append([1] * len(f[0][i]) + [0] * (max_len - len(f[0][i])))
115
+ batch_input_ids[-1].append(torch.tensor(masked_sequences, dtype=torch.long))
116
+ batch_input_mask[-1].append(torch.tensor(this_att_mask, dtype=torch.long))
117
+ batch_input_labels[-1].append(torch.tensor(masked_labels, dtype=torch.long))
118
+ return batch_input_ids, batch_input_mask, batch_input_labels, torch.tensor(batch_label_ids, dtype=torch.long)
119
+
120
+
121
+ def set_seed(args):
122
+ random.seed(args.seed)
123
+ np.random.seed(args.seed)
124
+ torch.manual_seed(args.seed)
125
+ if args.n_gpu > 0:
126
+ torch.cuda.manual_seed_all(args.seed)
127
+
128
+
129
+ def count_parameters(model):
130
+ return sum(p.numel() for p in model.parameters() if p.requires_grad)
131
+
132
+
133
+ def train(args, train_dataset, model, tokenizer, eval_dataset):
134
+ """ Train the model """
135
+ if args.local_rank in [-1, 0]:
136
+ tb_writer = SummaryWriter(os.path.join(args.output_dir, 'runs'))
137
+
138
+ args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
139
+ train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
140
+ train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size,
141
+ collate_fn=mCollateFn)
142
+
143
+ if args.max_steps > 0:
144
+ t_total = args.max_steps
145
+ args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
146
+ else:
147
+ t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
148
+
149
+ # Prepare optimizer and schedule (linear warmup and decay)
150
+ no_decay = ['bias', 'LayerNorm.weight']
151
+ optimizer_grouped_parameters = [
152
+ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
153
+ 'weight_decay': args.weight_decay},
154
+ {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
155
+ ]
156
+
157
+ warmup_steps = args.warmup_steps if args.warmup_steps != 0 else int(args.warmup_proportion * t_total)
158
+ logger.info("warm up steps = %d", warmup_steps)
159
+ optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon, betas=(0.9, 0.98))
160
+ scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)
161
+
162
+ if args.fp16:
163
+ try:
164
+ from apex import amp
165
+ except ImportError:
166
+ raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
167
+ model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
168
+
169
+ # multi-gpu training (should be after apex fp16 initialization)
170
+ if args.n_gpu > 1:
171
+ model = torch.nn.DataParallel(model)
172
+
173
+ # Distributed training (should be after apex fp16 initialization)
174
+ if args.local_rank != -1:
175
+ model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
176
+ output_device=args.local_rank,
177
+ find_unused_parameters=True)
178
+ # Train!
179
+ logger.info("***** Running training *****")
180
+ logger.info(" Num examples = %d", len(train_dataset))
181
+ logger.info(" Num Epochs = %d", args.num_train_epochs)
182
+ logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
183
+ logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
184
+ args.train_batch_size * args.gradient_accumulation_steps * (
185
+ torch.distributed.get_world_size() if args.local_rank != -1 else 1))
186
+ logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
187
+ logger.info(" Total optimization steps = %d", t_total)
188
+
189
+ global_step = 0
190
+ tr_loss, logging_loss = 0.0, 0.0
191
+ model.zero_grad()
192
+ train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
193
+ set_seed(args) # Added here for reproducibility (even between python 2 and 3)
194
+ curr_best = 0.0
195
+ CE = torch.nn.CrossEntropyLoss(reduction='none')
196
+ loss_fct = torch.nn.MultiMarginLoss(margin=args.margin)
197
+ for _ in train_iterator:
198
+ epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
199
+ for step, batch in tqdm(enumerate(epoch_iterator), desc=f"Train Epoch {_}"):
200
+ model.train()
201
+ num_cand = len(batch[0][0])
202
+ choice_loss = []
203
+ choice_seq_lens = np.array([0] + [len(c) for sample in batch[0] for c in sample])
204
+ choice_seq_lens = np.cumsum(choice_seq_lens)
205
+ input_ids = torch.cat([c for sample in batch[0] for c in sample], dim=0).to(args.device)
206
+ att_mask = torch.cat([c for sample in batch[1] for c in sample], dim=0).to(args.device)
207
+ input_labels = torch.cat([c for sample in batch[2] for c in sample], dim=0).to(args.device)
208
+
209
+ if len(input_ids) < args.max_sequence_per_time:
210
+ inputs = {'input_ids': input_ids,
211
+ 'attention_mask': att_mask}
212
+ outputs = model(**inputs)
213
+ ce_loss = CE(outputs[0].view(-1, outputs[0].size(-1)), input_labels.view(-1))
214
+ ce_loss = ce_loss.view(outputs[0].size(0), -1).sum(1)
215
+ else:
216
+ ce_loss = []
217
+ for chunk in range(0, len(input_ids), args.max_sequence_per_time):
218
+ inputs = {'input_ids': input_ids[chunk:chunk + args.max_sequence_per_time],
219
+ 'attention_mask': att_mask[chunk:chunk + args.max_sequence_per_time]}
220
+ outputs = model(**inputs)
221
+ tmp_ce_loss = CE(outputs[0].view(-1, outputs[0].size(-1)),
222
+ input_labels[chunk:chunk + args.max_sequence_per_time].view(-1))
223
+ tmp_ce_loss = tmp_ce_loss.view(outputs[0].size(0), -1).sum(1)
224
+ ce_loss.append(tmp_ce_loss)
225
+ ce_loss = torch.cat(ce_loss, dim=0)
226
+ # all tokens are valid
227
+ for c_i in range(len(choice_seq_lens) - 1):
228
+ start = choice_seq_lens[c_i]
229
+ end = choice_seq_lens[c_i + 1]
230
+ choice_loss.append(-ce_loss[start:end].sum() / (end - start))
231
+
232
+ choice_loss = torch.stack(choice_loss)
233
+ choice_loss = choice_loss.view(-1, num_cand)
234
+ loss = loss_fct(choice_loss, batch[3].to(args.device))
235
+
236
+ if args.n_gpu > 1:
237
+ loss = loss.mean() # mean() to average on multi-gpu parallel training
238
+ if args.gradient_accumulation_steps > 1:
239
+ loss = loss / args.gradient_accumulation_steps
240
+
241
+ if args.fp16:
242
+ with amp.scale_loss(loss, optimizer) as scaled_loss:
243
+ scaled_loss.backward()
244
+ else:
245
+ loss.backward()
246
+
247
+ tr_loss += loss.item()
248
+
249
+ if (step + 1) % args.gradient_accumulation_steps == 0:
250
+ optimizer.step()
251
+ scheduler.step() # Update learning rate schedule
252
+ model.zero_grad()
253
+ global_step += 1
254
+
255
+ if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
256
+ # Log metrics
257
+ tb_writer.add_scalar('lr', scheduler.get_last_lr()[0], global_step)
258
+ tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step)
259
+ tb_writer.add_scalar('Batch_loss', loss.item() * args.gradient_accumulation_steps, global_step)
260
+ logger.info(" global_step = %s, average loss = %s", global_step,
261
+ (tr_loss - logging_loss) / args.logging_steps)
262
+ wandb.log({"train/loss":loss.item()})
263
+ logging_loss = tr_loss
264
+
265
+ if args.local_rank == -1 and args.evaluate_during_training and global_step % args.save_steps == 0:
266
+ torch.cuda.empty_cache()
267
+ results = evaluate(args, model, tokenizer, eval_dataset)
268
+ wandb.log({"eval/"+k:v for k,v in results.items()})
269
+ for key, value in results.items():
270
+ tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
271
+ if results['acc'] > curr_best:
272
+ curr_best = results['acc']
273
+ print("At iteration {}, best acc is {}".format(global_step, curr_best))
274
+ # Save model checkpoint
275
+ output_dir = args.output_dir
276
+ if not os.path.exists(output_dir):
277
+ os.makedirs(output_dir)
278
+ model_to_save = model.module if hasattr(model,
279
+ 'module') else model # Take care of distributed/parallel training
280
+ model_to_save.save_pretrained(output_dir)
281
+ tokenizer.save_pretrained(output_dir)
282
+ torch.save(args, os.path.join(output_dir, 'training_args.bin'))
283
+ logger.info("Saving model checkpoint to %s", output_dir)
284
+
285
+ if args.max_steps > 0 and global_step > args.max_steps:
286
+ epoch_iterator.close()
287
+ break
288
+ if args.max_steps > 0 and global_step > args.max_steps:
289
+ train_iterator.close()
290
+ break
291
+ results = evaluate(args, model, tokenizer, eval_dataset)
292
+ for key, value in results.items():
293
+ tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
294
+ if results['acc'] > curr_best:
295
+ curr_best = results['acc']
296
+ # Save model checkpoint
297
+ output_dir = args.output_dir
298
+ if not os.path.exists(output_dir):
299
+ os.makedirs(output_dir)
300
+ model_to_save = model.module if hasattr(model,
301
+ 'module') else model # Take care of distributed/parallel training
302
+ model_to_save.save_pretrained(output_dir)
303
+ tokenizer.save_pretrained(output_dir)
304
+ torch.save(args, os.path.join(output_dir, 'training_args.bin'))
305
+ logger.info("Saving model checkpoint to %s", output_dir)
306
+ if args.local_rank in [-1, 0]:
307
+ tb_writer.close()
308
+ return global_step, tr_loss / global_step
309
+
310
+
311
+ def save_logits(logits_all, filename):
312
+ with open(filename, "w") as f:
313
+ for i in range(len(logits_all)):
314
+ for j in range(len(logits_all[i])):
315
+ f.write(str(logits_all[i][j]))
316
+ if j == len(logits_all[i]) - 1:
317
+ f.write("\n")
318
+ else:
319
+ f.write(" ")
320
+
321
+
322
+ def evaluate(args, model, tokenizer, eval_dataset):
323
+ results = {}
324
+ if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
325
+ os.makedirs(args.output_dir)
326
+
327
+ args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
328
+ # Note that DistributedSampler samples randomly
329
+ eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
330
+ eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size,
331
+ collate_fn=mCollateFn)
332
+
333
+ # Eval!
334
+ logger.info("***** Running evaluation *****")
335
+ logger.info(" Num examples = %d", len(eval_dataset))
336
+ logger.info(" Batch size = %d", args.eval_batch_size)
337
+ CE = torch.nn.CrossEntropyLoss(reduction='none')
338
+ preds = []
339
+ out_label_ids = []
340
+ for batch in tqdm(eval_dataloader, desc="Evaluating"):
341
+ model.eval()
342
+ with torch.no_grad():
343
+ num_cand = len(batch[0][0])
344
+ choice_loss = []
345
+ choice_seq_lens = np.array([0] + [len(c) for sample in batch[0] for c in sample])
346
+ choice_seq_lens = np.cumsum(choice_seq_lens)
347
+ input_ids = torch.cat([c for sample in batch[0] for c in sample], dim=0).to(args.device)
348
+ att_mask = torch.cat([c for sample in batch[1] for c in sample], dim=0).to(args.device)
349
+ input_labels = torch.cat([c for sample in batch[2] for c in sample], dim=0).to(args.device)
350
+ if len(input_ids) < args.max_sequence_per_time:
351
+ inputs = {'input_ids': input_ids,
352
+ 'attention_mask': att_mask}
353
+ outputs = model(**inputs)
354
+ ce_loss = CE(outputs[0].view(-1, outputs[0].size(-1)), input_labels.view(-1))
355
+ ce_loss = ce_loss.view(outputs[0].size(0), -1).sum(1)
356
+ else:
357
+ ce_loss = []
358
+ for chunk in range(0, len(input_ids), args.max_sequence_per_time):
359
+ inputs = {'input_ids': input_ids[chunk:chunk + args.max_sequence_per_time],
360
+ 'attention_mask': att_mask[chunk:chunk + args.max_sequence_per_time]}
361
+ outputs = model(**inputs)
362
+ tmp_ce_loss = CE(outputs[0].view(-1, outputs[0].size(-1)),
363
+ input_labels[chunk:chunk + args.max_sequence_per_time].view(-1))
364
+ tmp_ce_loss = tmp_ce_loss.view(outputs[0].size(0), -1).sum(1)
365
+ ce_loss.append(tmp_ce_loss)
366
+ ce_loss = torch.cat(ce_loss, dim=0)
367
+ for c_i in range(len(choice_seq_lens) - 1):
368
+ start = choice_seq_lens[c_i]
369
+ end = choice_seq_lens[c_i + 1]
370
+ choice_loss.append(-ce_loss[start:end].sum() / (end - start))
371
+ choice_loss = torch.stack(choice_loss)
372
+ choice_loss = choice_loss.view(-1, num_cand)
373
+ preds.append(choice_loss)
374
+ out_label_ids.append(batch[3].numpy())
375
+ preds = torch.cat(preds, dim=0).cpu().numpy()
376
+ save_logits(preds.tolist(), os.path.join(args.output_dir, args.logits_file))
377
+ preds = np.argmax(preds, axis=1)
378
+ result = accuracy(preds, np.concatenate(out_label_ids, axis=0))
379
+ results.update(result)
380
+ output_eval_file = os.path.join(args.output_dir, args.results_file)
381
+ with open(output_eval_file, "w") as writer:
382
+ logger.info("***** Eval results *****")
383
+ for key in sorted(result.keys()):
384
+ print("%s = %s\n" % (key, str(result[key])))
385
+ logger.info(" %s = %s", key, str(result[key]))
386
+ writer.write("%s = %s\n" % (key, str(result[key])))
387
+ return results
388
+
389
+
390
+ def write_data(filename, data):
391
+ with open(filename, 'w') as fout:
392
+ for sample in data:
393
+ fout.write(json.dumps(sample))
394
+ fout.write('\n')
395
+
396
+
397
+ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
398
+ if args.local_rank not in [-1, 0] and not evaluate:
399
+ torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
400
+ processor = myprocessors[task](args)
401
+ cached_features_file = os.path.join(args.output_dir, 'cached_{}_{}_{}_{}'.format(
402
+ 'dev' if evaluate else 'train',
403
+ str(args.model_type),
404
+ str(args.max_seq_length),
405
+ str(task)))
406
+ if os.path.exists(cached_features_file): # remove evaluate
407
+ features = torch.load(cached_features_file)
408
+ else:
409
+ examples = processor.get_dev_examples() if evaluate else processor.get_train_examples()
410
+ features = convert_examples_to_features(examples, tokenizer, max_length=args.max_seq_length)
411
+ # if evaluate:
412
+ torch.save(features, cached_features_file)
413
+ if args.local_rank == 0 and not evaluate:
414
+ torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
415
+ print('max_words_to_mask is %s for pretraining tasks %s' % (args.max_words_to_mask, task))
416
+ return MyDataset(features, tokenizer.pad_token_id, tokenizer.mask_token_id, args.max_words_to_mask)
417
+
418
+
419
+ def main():
420
+ parser = argparse.ArgumentParser()
421
+
422
+ ## Required parameters
423
+ parser.add_argument("--train_file", default=None, type=str, required=True,
424
+ help="The train file name")
425
+ parser.add_argument("--dev_file", default=None, type=str, required=True,
426
+ help="The dev file name")
427
+ parser.add_argument("--model_type", default=None, type=str, required=True,
428
+ help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
429
+ parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
430
+ help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(
431
+ MODEL_TYPES))
432
+ parser.add_argument("--config_name", default="", type=str,
433
+ help="Pretrained config name or path if not the same as model_name")
434
+ parser.add_argument("--tokenizer_name", default="", type=str,
435
+ help="Pretrained tokenizer name or path if not the same as model_name")
436
+ parser.add_argument("--cache_dir", default=".cache", type=str,
437
+ help="Where do you want to store the pre-trained models downloaded")
438
+ parser.add_argument("--task_name", default=None, type=str, required=True,
439
+ help="The name of the task to train selected in the list: " + ", ".join(myprocessors.keys()))
440
+ parser.add_argument("--output_dir", default=None, type=str, required=True,
441
+ help="The output directory where the model predictions and checkpoints will be written.")
442
+
443
+ ## Other parameters
444
+ parser.add_argument("--second_train_file", default=None, type=str,
445
+ help="Used when combining ATOMIC and CWWV")
446
+ parser.add_argument("--second_dev_file", default=None, type=str,
447
+ help="Used when combining ATOMIC and CWWV")
448
+ parser.add_argument("--max_seq_length", default=128, type=int,
449
+ help="The maximum total input sequence length after tokenization. Sequences longer "
450
+ "than this will be truncated, sequences shorter will be padded.")
451
+ parser.add_argument("--max_words_to_mask", default=6, type=int,
452
+ help="The maximum number of tokens to mask when computing scores")
453
+ parser.add_argument("--max_sequence_per_time", default=80, type=int,
454
+ help="The maximum number of sequences to feed into the model")
455
+ parser.add_argument("--do_train", action='store_true',
456
+ help="Whether to run training.")
457
+ parser.add_argument("--do_eval", action='store_true',
458
+ help="Whether to run eval on the dev set.")
459
+ parser.add_argument("--do_ext_eval", action='store_true',
460
+ help="Whether to run external eval on the downstream mcqa datasets.")
461
+ parser.add_argument("--evaluate_during_training", action='store_true',
462
+ help="Run evaluation during training at each logging step.")
463
+ parser.add_argument("--do_lower_case", action='store_true',
464
+ help="Set this flag if you are using an uncased model.")
465
+ parser.add_argument("--per_gpu_train_batch_size", default=1, type=int,
466
+ help="Batch size per GPU/CPU for training.")
467
+ parser.add_argument("--per_gpu_eval_batch_size", default=1, type=int,
468
+ help="Batch size per GPU/CPU for evaluation.")
469
+ parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
470
+ help="Number of updates steps to accumulate before performing a backward/update pass.")
471
+ parser.add_argument("--margin", default=1.0, type=float,
472
+ help="The margin for ranking loss")
473
+ parser.add_argument("--learning_rate", default=1e-5, type=float,
474
+ help="The initial learning rate for Adam.")
475
+ parser.add_argument("--weight_decay", default=0.01, type=float,
476
+ help="Weight deay if we apply some.")
477
+ parser.add_argument("--adam_epsilon", default=1e-6, type=float,
478
+ help="Epsilon for Adam optimizer.")
479
+ parser.add_argument("--max_grad_norm", default=1.0, type=float,
480
+ help="Max gradient norm.")
481
+ parser.add_argument("--num_train_epochs", default=1.0, type=float,
482
+ help="Total number of training epochs to perform.")
483
+ parser.add_argument("--max_steps", default=-1, type=int,
484
+ help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
485
+ parser.add_argument("--warmup_steps", default=0, type=int,
486
+ help="Linear warmup over warmup_steps.")
487
+ parser.add_argument("--warmup_proportion", default=0.05, type=float,
488
+ help="Linear warmup over warmup proportion.")
489
+ parser.add_argument('--logging_steps', type=int, default=50,
490
+ help="Log every X updates steps.")
491
+ parser.add_argument('--save_steps', type=int, default=50,
492
+ help="Save checkpoint every X updates steps.")
493
+ parser.add_argument("--logits_file", default='logits_test.txt', type=str,
494
+ help="The file where prediction logits will be written")
495
+ parser.add_argument("--results_file", default='eval_results.txt', type=str,
496
+ help="The file where eval results will be written")
497
+ parser.add_argument("--no_cuda", action='store_true',
498
+ help="Avoid using CUDA when available")
499
+ parser.add_argument('--overwrite_output_dir', action='store_true',
500
+ help="Overwrite the content of the output directory")
501
+ parser.add_argument('--seed', type=int, default=42,
502
+ help="random seed for initialization")
503
+ parser.add_argument('--fp16', action='store_true',
504
+ help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
505
+ parser.add_argument('--fp16_opt_level', type=str, default='O1',
506
+ help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
507
+ "See details at https://nvidia.github.io/apex/amp.html")
508
+ parser.add_argument("--local_rank", type=int, default=-1,
509
+ help="For distributed training: local_rank")
510
+ parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
511
+ parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
512
+
513
+ ### for extrinsic evaluation
514
+
515
+ parser.add_argument("--eval_output_dir", default="./output/eval_results", type=str, required=True,
516
+ help="output of the predictions")
517
+
518
+ args = parser.parse_args()
519
+
520
+ wandb.init(project="car_mcqa", config=args)
521
+
522
+ if os.path.exists(args.output_dir) and os.listdir(
523
+ args.output_dir) and not args.overwrite_output_dir and args.do_train:
524
+ raise ValueError(
525
+ "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
526
+ args.output_dir))
527
+ if not os.path.exists(args.output_dir):
528
+ os.makedirs(args.output_dir)
529
+
530
+ # Setup CUDA, GPU & distributed training
531
+ if args.local_rank == -1 or args.no_cuda:
532
+ device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
533
+ args.n_gpu = torch.cuda.device_count()
534
+ else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
535
+ torch.cuda.set_device(args.local_rank)
536
+ device = torch.device("cuda", args.local_rank)
537
+ torch.distributed.init_process_group(backend='nccl')
538
+ args.n_gpu = 1
539
+ args.device = device
540
+
541
+ if args.do_train:
542
+ for handler in logging.root.handlers[:]:
543
+ logging.root.removeHandler(handler)
544
+ # Setup logging
545
+ if args.do_train:
546
+ log_file = os.path.join(args.output_dir, 'train.log')
547
+ logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
548
+ datefmt='%m/%d/%Y %H:%M:%S',
549
+ level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
550
+ filename=log_file)
551
+ logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
552
+ args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
553
+ os.system("cp run_pretrain.py %s" % os.path.join(args.output_dir, 'run_pretrain.py'))
554
+ os.system("cp data_utils.py %s" % os.path.join(args.output_dir, 'data_utils.py'))
555
+
556
+ # Set seed
557
+ set_seed(args)
558
+ args.task_name = args.task_name.lower()
559
+ if args.task_name not in myprocessors:
560
+ raise ValueError("Task not found: %s" % (args.task_name))
561
+
562
+ args.model_type = args.model_type.lower()
563
+ config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
564
+ config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
565
+ finetuning_task=args.task_name, cache_dir=args.cache_dir)
566
+ tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
567
+ do_lower_case=args.do_lower_case, cache_dir=args.cache_dir)
568
+ model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path),
569
+ config=config, cache_dir=args.cache_dir)
570
+
571
+ count = count_parameters(model)
572
+ print("number of params", count)
573
+
574
+ if args.local_rank == 0:
575
+ torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
576
+
577
+ model.to(args.device)
578
+
579
+ logger.info("Training/evaluation parameters %s", args)
580
+
581
+
582
+ eval_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=True)
583
+ print("num of eval set", len(eval_dataset))
584
+
585
+ if args.do_train:
586
+ init_result = evaluate(args, model, tokenizer, eval_dataset)
587
+ print(init_result)
588
+
589
+ if args.do_train:
590
+ train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
591
+ print("num train examples", len(train_dataset))
592
+ global_step, tr_loss = train(args, train_dataset, model, tokenizer, eval_dataset)
593
+ logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
594
+
595
+ # Evaluation
596
+
597
+ results = {}
598
+ if args.do_eval:
599
+ tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
600
+ model = model_class.from_pretrained(args.output_dir)
601
+ model.eval()
602
+ model.to(args.device)
603
+ result = evaluate(args, model, tokenizer, eval_dataset)
604
+
605
+
606
+ # do extrinsic evaluation
607
+
608
+ if args.do_ext_eval:
609
+ del model
610
+ import gc
611
+ gc.collect()
612
+ torch.cuda.empty_cache()
613
+
614
+
615
+ ext_results = {}
616
+
617
+ for task_name, dataset_path in eval_tasks:
618
+ eval_args = argparse.Namespace()
619
+ eval_args.dataset_file = dataset_path
620
+ eval_args.lm = args.output_dir
621
+ eval_args.out_dir = os.path.join(args.eval_output_dir, os.path.basename( args.output_dir))
622
+ eval_args.device = 0
623
+ eval_args.reader = task_name
624
+ eval_args.overwrite_output_dir = args.overwrite_output_dir
625
+ eval_args.cache_dir = None
626
+ if task_name in ["socialiqa", "winogrande", "piqa", "commonsenseqa", "anli"]:
627
+ acc = evaluate_main(eval_args)
628
+ ext_results[task_name] = acc
629
+ else:
630
+ tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
631
+ model = model_class.from_pretrained(args.output_dir)
632
+ model.eval()
633
+ model.to(args.device)
634
+
635
+ # load data
636
+ examples = []
637
+ with open(dataset_path, "r") as f:
638
+ for row in tqdm(f):
639
+ sample = json.loads(row)
640
+ examples.append(sample)
641
+ features = convert_examples_to_features(examples, tokenizer, max_length=args.max_seq_length)
642
+ eval_dataset = MyDataset(features, tokenizer.pad_token_id, tokenizer.mask_token_id, args.max_words_to_mask)
643
+ result = evaluate(args, model, tokenizer, eval_dataset)
644
+ ext_results[task_name] = result['acc']
645
+
646
+ wandb.log({"ext/"+task_name:acc for task_name, acc in ext_results.items()})
647
+
648
+ # return results
649
+
650
+ if __name__ == "__main__":
651
+ main()
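Both `train()` and `evaluate()` score a candidate by masking each selected token in turn, accumulating the cross-entropy of the original token at the masked position, and normalizing by the number of masked copies; `MultiMarginLoss` is then applied over the per-candidate scores. A compact, self-contained sketch of that scoring rule (simplified: no batching, chunking, or `max_words_to_mask` cap):

```python
# Sketch: per-candidate masked-LM scoring, as implemented via mCollateFn + CE above.
import torch

def score_candidate(model, input_ids, label_ids, mask_token_id, device="cpu"):
    """input_ids: token ids of context+candidate; label_ids: same length, -100 where not scored."""
    ce = torch.nn.CrossEntropyLoss(reduction="sum")
    total, n_masked = 0.0, 0
    for j, lab in enumerate(label_ids):
        if lab == -100:
            continue
        masked = list(input_ids)
        masked[j] = mask_token_id                      # mask one scored position at a time
        with torch.no_grad():
            logits = model(input_ids=torch.tensor([masked], device=device))[0][0, j]
        total += ce(logits.unsqueeze(0), torch.tensor([input_ids[j]], device=device)).item()
        n_masked += 1
    return -total / max(n_masked, 1)                   # higher score = more plausible candidate
```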
runs/events.out.tfevents.1695471913.car-atm-2i-half-sample-name-1-0-0.28.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a757fc252e0640c729dd7f1f2fcf08dadb6184a9d3570e98e3011cc339dbd081
+ size 46125
special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "bos_token": "[CLS]",
+   "cls_token": "[CLS]",
+   "eos_token": "[SEP]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
spm.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
+ size 2464616
tokenizer_config.json ADDED
@@ -0,0 +1,16 @@
+ {
+ "bos_token": "[CLS]",
+ "clean_up_tokenization_spaces": true,
+ "cls_token": "[CLS]",
+ "do_lower_case": false,
+ "eos_token": "[SEP]",
+ "mask_token": "[MASK]",
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "[PAD]",
+ "sep_token": "[SEP]",
+ "sp_model_kwargs": {},
+ "split_by_punct": false,
+ "tokenizer_class": "DebertaV2Tokenizer",
+ "unk_token": "[UNK]",
+ "vocab_type": "spm"
+ }
train.log ADDED
@@ -0,0 +1,557 @@
1
+ 09/23/2023 12:10:45 - WARNING - __main__ - Process rank: -1, device: cuda, n_gpu: 1, distributed training: False, 16-bits training: False
2
+ 09/23/2023 12:11:04 - INFO - __main__ - Training/evaluation parameters Namespace(train_file='../../../data/mcqa/atomic/train_atm_n_2i_half_sample_name.jsonl', dev_file='../../../data/mcqa/atomic/dev_random_10k.jsonl', model_type='deberta-mlm', model_name_or_path='microsoft/deberta-v3-large', config_name='', tokenizer_name='', cache_dir='.cache', task_name='atomic', output_dir='output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6', second_train_file=None, second_dev_file=None, max_seq_length=128, max_words_to_mask=6, max_sequence_per_time=80, do_train=True, do_eval=True, do_ext_eval=True, evaluate_during_training=True, do_lower_case=False, per_gpu_train_batch_size=2, per_gpu_eval_batch_size=16, gradient_accumulation_steps=16, margin=1.0, learning_rate=5e-06, weight_decay=0.01, adam_epsilon=1e-06, max_grad_norm=1.0, num_train_epochs=1.0, max_steps=-1, warmup_steps=0, warmup_proportion=0.05, logging_steps=50, save_steps=500, logits_file='logits_test.txt', results_file='eval_results.txt', no_cuda=False, overwrite_output_dir=False, seed=42, fp16=False, fp16_opt_level='O1', local_rank=-1, server_ip='', server_port='', eval_output_dir='./eval_results', n_gpu=1, device=device(type='cuda'))
3
+ 09/23/2023 12:11:13 - INFO - __main__ - ***** Running evaluation *****
4
+ 09/23/2023 12:11:13 - INFO - __main__ - Num examples = 10000
5
+ 09/23/2023 12:11:13 - INFO - __main__ - Batch size = 16
6
+ 09/23/2023 12:15:11 - INFO - __main__ - ***** Eval results *****
7
+ 09/23/2023 12:15:11 - INFO - __main__ - acc = 0.3392
8
+ 09/23/2023 12:25:13 - INFO - __main__ - warm up steps = 835
9
+ 09/23/2023 12:25:13 - INFO - __main__ - ***** Running training *****
10
+ 09/23/2023 12:25:13 - INFO - __main__ - Num examples = 534833
11
+ 09/23/2023 12:25:13 - INFO - __main__ - Num Epochs = 1
12
+ 09/23/2023 12:25:13 - INFO - __main__ - Instantaneous batch size per GPU = 2
13
+ 09/23/2023 12:25:13 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 32
14
+ 09/23/2023 12:25:13 - INFO - __main__ - Gradient Accumulation steps = 16
15
+ 09/23/2023 12:25:13 - INFO - __main__ - Total optimization steps = 16713
16
+ 09/23/2023 12:28:54 - INFO - __main__ - global_step = 50, average loss = 0.6903331369534135
17
+ 09/23/2023 12:32:33 - INFO - __main__ - global_step = 100, average loss = 0.6819266405794769
18
+ 09/23/2023 12:36:13 - INFO - __main__ - global_step = 150, average loss = 0.6690767159638926
19
+ 09/23/2023 12:39:56 - INFO - __main__ - global_step = 200, average loss = 0.6476348407182377
20
+ 09/23/2023 12:43:39 - INFO - __main__ - global_step = 250, average loss = 0.6220815655076877
21
+ 09/23/2023 12:47:19 - INFO - __main__ - global_step = 300, average loss = 0.5299683179453859
22
+ 09/23/2023 12:50:56 - INFO - __main__ - global_step = 350, average loss = 0.39345016410181416
23
+ 09/23/2023 12:54:38 - INFO - __main__ - global_step = 400, average loss = 0.31127411118301096
24
+ 09/23/2023 12:58:19 - INFO - __main__ - global_step = 450, average loss = 0.25150225180907
25
+ 09/23/2023 13:02:00 - INFO - __main__ - global_step = 500, average loss = 0.22586858159028453
26
+ 09/23/2023 13:02:01 - INFO - __main__ - ***** Running evaluation *****
27
+ 09/23/2023 13:02:01 - INFO - __main__ - Num examples = 10000
28
+ 09/23/2023 13:02:01 - INFO - __main__ - Batch size = 16
29
+ 09/23/2023 13:05:56 - INFO - __main__ - ***** Eval results *****
30
+ 09/23/2023 13:05:56 - INFO - __main__ - acc = 0.6996
31
+ 09/23/2023 13:06:23 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
32
+ 09/23/2023 13:10:02 - INFO - __main__ - global_step = 550, average loss = 0.22251796642665794
33
+ 09/23/2023 13:13:46 - INFO - __main__ - global_step = 600, average loss = 0.19366045010890956
34
+ 09/23/2023 13:17:29 - INFO - __main__ - global_step = 650, average loss = 0.18587105088678071
35
+ 09/23/2023 13:21:15 - INFO - __main__ - global_step = 700, average loss = 0.1760789550206391
36
+ 09/23/2023 13:24:59 - INFO - __main__ - global_step = 750, average loss = 0.18312411408871412
37
+ 09/23/2023 13:28:42 - INFO - __main__ - global_step = 800, average loss = 0.15576540186157217
38
+ 09/23/2023 13:32:25 - INFO - __main__ - global_step = 850, average loss = 0.16302873345994157
39
+ 09/23/2023 13:36:07 - INFO - __main__ - global_step = 900, average loss = 0.15725697406036487
40
+ 09/23/2023 13:39:46 - INFO - __main__ - global_step = 950, average loss = 0.15640976145299645
41
+ 09/23/2023 13:43:33 - INFO - __main__ - global_step = 1000, average loss = 0.15606625928507128
42
+ 09/23/2023 13:43:34 - INFO - __main__ - ***** Running evaluation *****
43
+ 09/23/2023 13:43:34 - INFO - __main__ - Num examples = 10000
44
+ 09/23/2023 13:43:34 - INFO - __main__ - Batch size = 16
45
+ 09/23/2023 13:47:30 - INFO - __main__ - ***** Eval results *****
46
+ 09/23/2023 13:47:30 - INFO - __main__ - acc = 0.7961
47
+ 09/23/2023 13:47:58 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
48
+ 09/23/2023 13:51:41 - INFO - __main__ - global_step = 1050, average loss = 0.14431810150181262
49
+ 09/23/2023 13:55:20 - INFO - __main__ - global_step = 1100, average loss = 0.15233074207513708
50
+ 09/23/2023 13:59:01 - INFO - __main__ - global_step = 1150, average loss = 0.1404175848151772
51
+ 09/23/2023 14:02:44 - INFO - __main__ - global_step = 1200, average loss = 0.12134294869215864
52
+ 09/23/2023 14:06:20 - INFO - __main__ - global_step = 1250, average loss = 0.1363200130731275
53
+ 09/23/2023 14:09:59 - INFO - __main__ - global_step = 1300, average loss = 0.13769450530940958
54
+ 09/23/2023 14:13:43 - INFO - __main__ - global_step = 1350, average loss = 0.12156560226379952
55
+ 09/23/2023 14:17:18 - INFO - __main__ - global_step = 1400, average loss = 0.12623315585107775
56
+ 09/23/2023 14:20:59 - INFO - __main__ - global_step = 1450, average loss = 0.14377202547417256
57
+ 09/23/2023 14:24:33 - INFO - __main__ - global_step = 1500, average loss = 0.1286695548933858
58
+ 09/23/2023 14:24:34 - INFO - __main__ - ***** Running evaluation *****
59
+ 09/23/2023 14:24:34 - INFO - __main__ - Num examples = 10000
60
+ 09/23/2023 14:24:34 - INFO - __main__ - Batch size = 16
61
+ 09/23/2023 14:28:29 - INFO - __main__ - ***** Eval results *****
62
+ 09/23/2023 14:28:29 - INFO - __main__ - acc = 0.8048
63
+ 09/23/2023 14:28:56 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
64
+ 09/23/2023 14:32:42 - INFO - __main__ - global_step = 1550, average loss = 0.1198868363915244
65
+ 09/23/2023 14:36:24 - INFO - __main__ - global_step = 1600, average loss = 0.12324378551486007
66
+ 09/23/2023 14:40:00 - INFO - __main__ - global_step = 1650, average loss = 0.11938468464672042
67
+ 09/23/2023 14:43:41 - INFO - __main__ - global_step = 1700, average loss = 0.14236379045556533
68
+ 09/23/2023 14:47:22 - INFO - __main__ - global_step = 1750, average loss = 0.13320694023670512
69
+ 09/23/2023 14:51:02 - INFO - __main__ - global_step = 1800, average loss = 0.13622453257718006
70
+ 09/23/2023 14:54:42 - INFO - __main__ - global_step = 1850, average loss = 0.13987649206645072
71
+ 09/23/2023 14:58:22 - INFO - __main__ - global_step = 1900, average loss = 0.12299754774277971
72
+ 09/23/2023 15:02:05 - INFO - __main__ - global_step = 1950, average loss = 0.11868109124743569
73
+ 09/23/2023 15:05:47 - INFO - __main__ - global_step = 2000, average loss = 0.1415042275990345
74
+ 09/23/2023 15:05:47 - INFO - __main__ - ***** Running evaluation *****
75
+ 09/23/2023 15:05:47 - INFO - __main__ - Num examples = 10000
76
+ 09/23/2023 15:05:47 - INFO - __main__ - Batch size = 16
77
+ 09/23/2023 15:09:43 - INFO - __main__ - ***** Eval results *****
78
+ 09/23/2023 15:09:43 - INFO - __main__ - acc = 0.8063
79
+ 09/23/2023 15:10:10 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
80
+ 09/23/2023 15:13:51 - INFO - __main__ - global_step = 2050, average loss = 0.11399275673671581
81
+ 09/23/2023 15:17:31 - INFO - __main__ - global_step = 2100, average loss = 0.1065546132405143
82
+ 09/23/2023 15:21:11 - INFO - __main__ - global_step = 2150, average loss = 0.12809142941467144
83
+ 09/23/2023 15:24:51 - INFO - __main__ - global_step = 2200, average loss = 0.12454848410692648
84
+ 09/23/2023 15:28:34 - INFO - __main__ - global_step = 2250, average loss = 0.10986286829065647
85
+ 09/23/2023 15:32:14 - INFO - __main__ - global_step = 2300, average loss = 0.11237965747121052
86
+ 09/23/2023 15:35:56 - INFO - __main__ - global_step = 2350, average loss = 0.10897610924319451
87
+ 09/23/2023 15:39:41 - INFO - __main__ - global_step = 2400, average loss = 0.12056981857070241
88
+ 09/23/2023 15:43:24 - INFO - __main__ - global_step = 2450, average loss = 0.13911059297635803
89
+ 09/23/2023 15:47:10 - INFO - __main__ - global_step = 2500, average loss = 0.11335444856034883
90
+ 09/23/2023 15:47:10 - INFO - __main__ - ***** Running evaluation *****
91
+ 09/23/2023 15:47:10 - INFO - __main__ - Num examples = 10000
92
+ 09/23/2023 15:47:10 - INFO - __main__ - Batch size = 16
93
+ 09/23/2023 15:51:06 - INFO - __main__ - ***** Eval results *****
94
+ 09/23/2023 15:51:06 - INFO - __main__ - acc = 0.8234
95
+ 09/23/2023 15:51:32 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
96
+ 09/23/2023 15:55:10 - INFO - __main__ - global_step = 2550, average loss = 0.12103958850973867
97
+ 09/23/2023 15:58:57 - INFO - __main__ - global_step = 2600, average loss = 0.11913071399074397
98
+ 09/23/2023 16:02:38 - INFO - __main__ - global_step = 2650, average loss = 0.11255583499452769
99
+ 09/23/2023 16:06:28 - INFO - __main__ - global_step = 2700, average loss = 0.1006322616293619
100
+ 09/23/2023 16:10:12 - INFO - __main__ - global_step = 2750, average loss = 0.0932968783121487
101
+ 09/23/2023 16:13:51 - INFO - __main__ - global_step = 2800, average loss = 0.11056979637924087
102
+ 09/23/2023 16:17:38 - INFO - __main__ - global_step = 2850, average loss = 0.12318793082176853
103
+ 09/23/2023 16:21:21 - INFO - __main__ - global_step = 2900, average loss = 0.10864610994302439
104
+ 09/23/2023 16:25:03 - INFO - __main__ - global_step = 2950, average loss = 0.11261582636667299
105
+ 09/23/2023 16:28:40 - INFO - __main__ - global_step = 3000, average loss = 0.12150005620278534
106
+ 09/23/2023 16:28:40 - INFO - __main__ - ***** Running evaluation *****
107
+ 09/23/2023 16:28:40 - INFO - __main__ - Num examples = 10000
108
+ 09/23/2023 16:28:40 - INFO - __main__ - Batch size = 16
109
+ 09/23/2023 16:32:35 - INFO - __main__ - ***** Eval results *****
110
+ 09/23/2023 16:32:35 - INFO - __main__ - acc = 0.8261
111
+ 09/23/2023 16:33:02 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
112
+ 09/23/2023 16:36:46 - INFO - __main__ - global_step = 3050, average loss = 0.10565035182957218
113
+ 09/23/2023 16:40:30 - INFO - __main__ - global_step = 3100, average loss = 0.10429829731896462
114
+ 09/23/2023 16:44:14 - INFO - __main__ - global_step = 3150, average loss = 0.10812272985053824
115
+ 09/23/2023 16:47:54 - INFO - __main__ - global_step = 3200, average loss = 0.12238092143270478
116
+ 09/23/2023 16:51:33 - INFO - __main__ - global_step = 3250, average loss = 0.10868940783606376
117
+ 09/23/2023 16:55:14 - INFO - __main__ - global_step = 3300, average loss = 0.1209917226509424
118
+ 09/23/2023 16:58:59 - INFO - __main__ - global_step = 3350, average loss = 0.1191260662042896
119
+ 09/23/2023 17:02:41 - INFO - __main__ - global_step = 3400, average loss = 0.1174743126919202
120
+ 09/23/2023 17:06:26 - INFO - __main__ - global_step = 3450, average loss = 0.100895225374843
121
+ 09/23/2023 17:10:02 - INFO - __main__ - global_step = 3500, average loss = 0.0931866138278565
122
+ 09/23/2023 17:10:03 - INFO - __main__ - ***** Running evaluation *****
123
+ 09/23/2023 17:10:03 - INFO - __main__ - Num examples = 10000
124
+ 09/23/2023 17:10:03 - INFO - __main__ - Batch size = 16
125
+ 09/23/2023 17:13:58 - INFO - __main__ - ***** Eval results *****
126
+ 09/23/2023 17:13:58 - INFO - __main__ - acc = 0.8229
127
+ 09/23/2023 17:17:45 - INFO - __main__ - global_step = 3550, average loss = 0.10633477224648231
128
+ 09/23/2023 17:21:30 - INFO - __main__ - global_step = 3600, average loss = 0.1021722938354651
129
+ 09/23/2023 17:25:11 - INFO - __main__ - global_step = 3650, average loss = 0.10295378862727375
130
+ 09/23/2023 17:28:50 - INFO - __main__ - global_step = 3700, average loss = 0.1024187771679135
131
+ 09/23/2023 17:32:34 - INFO - __main__ - global_step = 3750, average loss = 0.09922411829451448
132
+ 09/23/2023 17:36:14 - INFO - __main__ - global_step = 3800, average loss = 0.11105157318372222
133
+ 09/23/2023 17:39:57 - INFO - __main__ - global_step = 3850, average loss = 0.12378941989987652
134
+ 09/23/2023 17:43:42 - INFO - __main__ - global_step = 3900, average loss = 0.1034327056143593
135
+ 09/23/2023 17:47:25 - INFO - __main__ - global_step = 3950, average loss = 0.09697925167827634
136
+ 09/23/2023 17:51:09 - INFO - __main__ - global_step = 4000, average loss = 0.11230336717126192
137
+ 09/23/2023 17:51:09 - INFO - __main__ - ***** Running evaluation *****
138
+ 09/23/2023 17:51:09 - INFO - __main__ - Num examples = 10000
139
+ 09/23/2023 17:51:09 - INFO - __main__ - Batch size = 16
140
+ 09/23/2023 17:55:05 - INFO - __main__ - ***** Eval results *****
141
+ 09/23/2023 17:55:05 - INFO - __main__ - acc = 0.8371
142
+ 09/23/2023 17:55:32 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
143
+ 09/23/2023 17:59:12 - INFO - __main__ - global_step = 4050, average loss = 0.10925351051962934
144
+ 09/23/2023 18:03:00 - INFO - __main__ - global_step = 4100, average loss = 0.09795216493275802
145
+ 09/23/2023 18:06:43 - INFO - __main__ - global_step = 4150, average loss = 0.09962472554965643
146
+ 09/23/2023 18:10:25 - INFO - __main__ - global_step = 4200, average loss = 0.10342389734141762
147
+ 09/23/2023 18:14:05 - INFO - __main__ - global_step = 4250, average loss = 0.09674815248567029
148
+ 09/23/2023 18:17:48 - INFO - __main__ - global_step = 4300, average loss = 0.10319628210134396
149
+ 09/23/2023 18:21:33 - INFO - __main__ - global_step = 4350, average loss = 0.09340641272166977
150
+ 09/23/2023 18:25:14 - INFO - __main__ - global_step = 4400, average loss = 0.10845618240913608
151
+ 09/23/2023 18:28:59 - INFO - __main__ - global_step = 4450, average loss = 0.11604906246473547
152
+ 09/23/2023 18:32:43 - INFO - __main__ - global_step = 4500, average loss = 0.09590314964269055
153
+ 09/23/2023 18:32:43 - INFO - __main__ - ***** Running evaluation *****
154
+ 09/23/2023 18:32:43 - INFO - __main__ - Num examples = 10000
155
+ 09/23/2023 18:32:43 - INFO - __main__ - Batch size = 16
156
+ 09/23/2023 18:36:38 - INFO - __main__ - ***** Eval results *****
157
+ 09/23/2023 18:36:38 - INFO - __main__ - acc = 0.8305
158
+ 09/23/2023 18:40:22 - INFO - __main__ - global_step = 4550, average loss = 0.09955280199857952
159
+ 09/23/2023 18:44:07 - INFO - __main__ - global_step = 4600, average loss = 0.09018894311768236
160
+ 09/23/2023 18:47:49 - INFO - __main__ - global_step = 4650, average loss = 0.11624654464081687
161
+ 09/23/2023 18:51:30 - INFO - __main__ - global_step = 4700, average loss = 0.11213955332923434
162
+ 09/23/2023 18:55:07 - INFO - __main__ - global_step = 4750, average loss = 0.11335175217776851
163
+ 09/23/2023 18:58:47 - INFO - __main__ - global_step = 4800, average loss = 0.10374061681199237
164
+ 09/23/2023 19:02:34 - INFO - __main__ - global_step = 4850, average loss = 0.09650620453016018
165
+ 09/23/2023 19:06:16 - INFO - __main__ - global_step = 4900, average loss = 0.1034209698169434
166
+ 09/23/2023 19:09:53 - INFO - __main__ - global_step = 4950, average loss = 0.10046588191311458
167
+ 09/23/2023 19:13:34 - INFO - __main__ - global_step = 5000, average loss = 0.10752027794980677
168
+ 09/23/2023 19:13:34 - INFO - __main__ - ***** Running evaluation *****
169
+ 09/23/2023 19:13:34 - INFO - __main__ - Num examples = 10000
170
+ 09/23/2023 19:13:34 - INFO - __main__ - Batch size = 16
171
+ 09/23/2023 19:17:29 - INFO - __main__ - ***** Eval results *****
172
+ 09/23/2023 19:17:29 - INFO - __main__ - acc = 0.8355
173
+ 09/23/2023 19:21:19 - INFO - __main__ - global_step = 5050, average loss = 0.10195030277842307
174
+ 09/23/2023 19:24:58 - INFO - __main__ - global_step = 5100, average loss = 0.10987481483532065
175
+ 09/23/2023 19:28:41 - INFO - __main__ - global_step = 5150, average loss = 0.10906005093554995
176
+ 09/23/2023 19:32:23 - INFO - __main__ - global_step = 5200, average loss = 0.09835696181547973
177
+ 09/23/2023 19:36:06 - INFO - __main__ - global_step = 5250, average loss = 0.10181126694624254
178
+ 09/23/2023 19:39:52 - INFO - __main__ - global_step = 5300, average loss = 0.08663028705283068
179
+ 09/23/2023 19:43:30 - INFO - __main__ - global_step = 5350, average loss = 0.10507196654667496
180
+ 09/23/2023 19:47:18 - INFO - __main__ - global_step = 5400, average loss = 0.108608085659871
181
+ 09/23/2023 19:51:03 - INFO - __main__ - global_step = 5450, average loss = 0.099619501844536
182
+ 09/23/2023 19:54:49 - INFO - __main__ - global_step = 5500, average loss = 0.10225338533447939
183
+ 09/23/2023 19:54:49 - INFO - __main__ - ***** Running evaluation *****
184
+ 09/23/2023 19:54:49 - INFO - __main__ - Num examples = 10000
185
+ 09/23/2023 19:54:49 - INFO - __main__ - Batch size = 16
186
+ 09/23/2023 19:58:45 - INFO - __main__ - ***** Eval results *****
187
+ 09/23/2023 19:58:45 - INFO - __main__ - acc = 0.8279
188
+ 09/23/2023 20:02:26 - INFO - __main__ - global_step = 5550, average loss = 0.10436682683890468
189
+ 09/23/2023 20:06:11 - INFO - __main__ - global_step = 5600, average loss = 0.10477761221260153
190
+ 09/23/2023 20:09:52 - INFO - __main__ - global_step = 5650, average loss = 0.09326410317778937
191
+ 09/23/2023 20:13:31 - INFO - __main__ - global_step = 5700, average loss = 0.11269167278223904
192
+ 09/23/2023 20:17:16 - INFO - __main__ - global_step = 5750, average loss = 0.10188864256499074
193
+ 09/23/2023 20:21:00 - INFO - __main__ - global_step = 5800, average loss = 0.10433580860199981
194
+ 09/23/2023 20:24:43 - INFO - __main__ - global_step = 5850, average loss = 0.08972063858884212
195
+ 09/23/2023 20:28:22 - INFO - __main__ - global_step = 5900, average loss = 0.1065664726671821
196
+ 09/23/2023 20:32:07 - INFO - __main__ - global_step = 5950, average loss = 0.10174332244623656
197
+ 09/23/2023 20:35:49 - INFO - __main__ - global_step = 6000, average loss = 0.08872646622621687
198
+ 09/23/2023 20:35:49 - INFO - __main__ - ***** Running evaluation *****
199
+ 09/23/2023 20:35:49 - INFO - __main__ - Num examples = 10000
200
+ 09/23/2023 20:35:49 - INFO - __main__ - Batch size = 16
201
+ 09/23/2023 20:39:45 - INFO - __main__ - ***** Eval results *****
202
+ 09/23/2023 20:39:45 - INFO - __main__ - acc = 0.8363
203
+ 09/23/2023 20:43:29 - INFO - __main__ - global_step = 6050, average loss = 0.10705330887685705
204
+ 09/23/2023 20:47:16 - INFO - __main__ - global_step = 6100, average loss = 0.09171272950654384
205
+ 09/23/2023 20:50:59 - INFO - __main__ - global_step = 6150, average loss = 0.0861645900901567
206
+ 09/23/2023 20:54:46 - INFO - __main__ - global_step = 6200, average loss = 0.08994678908144124
207
+ 09/23/2023 20:58:32 - INFO - __main__ - global_step = 6250, average loss = 0.08786970607354305
208
+ 09/23/2023 21:02:13 - INFO - __main__ - global_step = 6300, average loss = 0.09656520821336016
209
+ 09/23/2023 21:05:56 - INFO - __main__ - global_step = 6350, average loss = 0.09620310332989902
210
+ 09/23/2023 21:09:42 - INFO - __main__ - global_step = 6400, average loss = 0.09152124080545036
211
+ 09/23/2023 21:13:22 - INFO - __main__ - global_step = 6450, average loss = 0.09472263304131047
212
+ 09/23/2023 21:17:06 - INFO - __main__ - global_step = 6500, average loss = 0.10554198697194807
213
+ 09/23/2023 21:17:06 - INFO - __main__ - ***** Running evaluation *****
214
+ 09/23/2023 21:17:06 - INFO - __main__ - Num examples = 10000
215
+ 09/23/2023 21:17:06 - INFO - __main__ - Batch size = 16
216
+ 09/23/2023 21:21:01 - INFO - __main__ - ***** Eval results *****
217
+ 09/23/2023 21:21:01 - INFO - __main__ - acc = 0.841
218
+ 09/23/2023 21:21:28 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
219
+ 09/23/2023 21:25:14 - INFO - __main__ - global_step = 6550, average loss = 0.09830655160796596
220
+ 09/23/2023 21:28:55 - INFO - __main__ - global_step = 6600, average loss = 0.09539545015402837
221
+ 09/23/2023 21:32:40 - INFO - __main__ - global_step = 6650, average loss = 0.09118585625503328
222
+ 09/23/2023 21:36:18 - INFO - __main__ - global_step = 6700, average loss = 0.09700520555491493
223
+ 09/23/2023 21:40:03 - INFO - __main__ - global_step = 6750, average loss = 0.105271778342576
224
+ 09/23/2023 21:43:45 - INFO - __main__ - global_step = 6800, average loss = 0.10975144471223758
225
+ 09/23/2023 21:47:28 - INFO - __main__ - global_step = 6850, average loss = 0.09920243133579788
226
+ 09/23/2023 21:51:11 - INFO - __main__ - global_step = 6900, average loss = 0.09791661702009151
227
+ 09/23/2023 21:54:51 - INFO - __main__ - global_step = 6950, average loss = 0.08630025177910283
228
+ 09/23/2023 21:58:29 - INFO - __main__ - global_step = 7000, average loss = 0.09660528897402401
229
+ 09/23/2023 21:58:29 - INFO - __main__ - ***** Running evaluation *****
230
+ 09/23/2023 21:58:29 - INFO - __main__ - Num examples = 10000
231
+ 09/23/2023 21:58:29 - INFO - __main__ - Batch size = 16
232
+ 09/23/2023 22:02:25 - INFO - __main__ - ***** Eval results *****
233
+ 09/23/2023 22:02:25 - INFO - __main__ - acc = 0.843
234
+ 09/23/2023 22:02:51 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
235
+ 09/23/2023 22:06:33 - INFO - __main__ - global_step = 7050, average loss = 0.10305566756385814
236
+ 09/23/2023 22:10:07 - INFO - __main__ - global_step = 7100, average loss = 0.10687436608219286
237
+ 09/23/2023 22:13:47 - INFO - __main__ - global_step = 7150, average loss = 0.0946133067667688
238
+ 09/23/2023 22:17:27 - INFO - __main__ - global_step = 7200, average loss = 0.09795189084834419
239
+ 09/23/2023 22:21:17 - INFO - __main__ - global_step = 7250, average loss = 0.09060888570308634
240
+ 09/23/2023 22:24:59 - INFO - __main__ - global_step = 7300, average loss = 0.0877145413684775
241
+ 09/23/2023 22:28:35 - INFO - __main__ - global_step = 7350, average loss = 0.10495714643941029
242
+ 09/23/2023 22:32:21 - INFO - __main__ - global_step = 7400, average loss = 0.07401456630654138
243
+ 09/23/2023 22:36:03 - INFO - __main__ - global_step = 7450, average loss = 0.09523518772701209
244
+ 09/23/2023 22:39:41 - INFO - __main__ - global_step = 7500, average loss = 0.10137952610446518
245
+ 09/23/2023 22:39:41 - INFO - __main__ - ***** Running evaluation *****
246
+ 09/23/2023 22:39:41 - INFO - __main__ - Num examples = 10000
247
+ 09/23/2023 22:39:41 - INFO - __main__ - Batch size = 16
248
+ 09/23/2023 22:43:37 - INFO - __main__ - ***** Eval results *****
249
+ 09/23/2023 22:43:37 - INFO - __main__ - acc = 0.846
250
+ 09/23/2023 22:44:03 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
251
+ 09/23/2023 22:47:46 - INFO - __main__ - global_step = 7550, average loss = 0.09563293447645264
252
+ 09/23/2023 22:51:31 - INFO - __main__ - global_step = 7600, average loss = 0.09618103489105125
253
+ 09/23/2023 22:55:13 - INFO - __main__ - global_step = 7650, average loss = 0.08849806944810552
254
+ 09/23/2023 22:58:54 - INFO - __main__ - global_step = 7700, average loss = 0.10007433392238455
255
+ 09/23/2023 23:02:36 - INFO - __main__ - global_step = 7750, average loss = 0.09035434001329122
256
+ 09/23/2023 23:06:24 - INFO - __main__ - global_step = 7800, average loss = 0.09338357288788757
257
+ 09/23/2023 23:10:04 - INFO - __main__ - global_step = 7850, average loss = 0.09912064949181514
258
+ 09/23/2023 23:13:47 - INFO - __main__ - global_step = 7900, average loss = 0.08827902228244057
259
+ 09/23/2023 23:17:27 - INFO - __main__ - global_step = 7950, average loss = 0.11218067690118914
260
+ 09/23/2023 23:21:09 - INFO - __main__ - global_step = 8000, average loss = 0.08588292430682486
261
+ 09/23/2023 23:21:09 - INFO - __main__ - ***** Running evaluation *****
262
+ 09/23/2023 23:21:09 - INFO - __main__ - Num examples = 10000
263
+ 09/23/2023 23:21:09 - INFO - __main__ - Batch size = 16
264
+ 09/23/2023 23:25:05 - INFO - __main__ - ***** Eval results *****
265
+ 09/23/2023 23:25:05 - INFO - __main__ - acc = 0.8472
266
+ 09/23/2023 23:25:31 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
267
+ 09/23/2023 23:29:08 - INFO - __main__ - global_step = 8050, average loss = 0.09245043838061974
268
+ 09/23/2023 23:32:54 - INFO - __main__ - global_step = 8100, average loss = 0.08283289226481429
269
+ 09/23/2023 23:36:34 - INFO - __main__ - global_step = 8150, average loss = 0.08407623038449856
270
+ 09/23/2023 23:40:17 - INFO - __main__ - global_step = 8200, average loss = 0.09736820162237564
271
+ 09/23/2023 23:44:06 - INFO - __main__ - global_step = 8250, average loss = 0.08463705457368632
272
+ 09/23/2023 23:47:50 - INFO - __main__ - global_step = 8300, average loss = 0.10010304888644896
273
+ 09/23/2023 23:51:35 - INFO - __main__ - global_step = 8350, average loss = 0.09222401980725409
274
+ 09/23/2023 23:55:17 - INFO - __main__ - global_step = 8400, average loss = 0.08634746881416504
275
+ 09/23/2023 23:58:59 - INFO - __main__ - global_step = 8450, average loss = 0.08723288500368653
276
+ 09/24/2023 00:02:37 - INFO - __main__ - global_step = 8500, average loss = 0.10130320921433394
277
+ 09/24/2023 00:02:37 - INFO - __main__ - ***** Running evaluation *****
278
+ 09/24/2023 00:02:37 - INFO - __main__ - Num examples = 10000
279
+ 09/24/2023 00:02:37 - INFO - __main__ - Batch size = 16
280
+ 09/24/2023 00:06:32 - INFO - __main__ - ***** Eval results *****
281
+ 09/24/2023 00:06:32 - INFO - __main__ - acc = 0.8452
282
+ 09/24/2023 00:10:13 - INFO - __main__ - global_step = 8550, average loss = 0.0889340414837352
283
+ 09/24/2023 00:13:53 - INFO - __main__ - global_step = 8600, average loss = 0.0960574367789377
284
+ 09/24/2023 00:17:37 - INFO - __main__ - global_step = 8650, average loss = 0.07860265792332939
285
+ 09/24/2023 00:21:20 - INFO - __main__ - global_step = 8700, average loss = 0.09233207383847912
286
+ 09/24/2023 00:25:05 - INFO - __main__ - global_step = 8750, average loss = 0.09803196908305836
287
+ 09/24/2023 00:28:44 - INFO - __main__ - global_step = 8800, average loss = 0.08913468146740343
288
+ 09/24/2023 00:32:26 - INFO - __main__ - global_step = 8850, average loss = 0.0880054514182666
289
+ 09/24/2023 00:36:11 - INFO - __main__ - global_step = 8900, average loss = 0.0839999437017832
290
+ 09/24/2023 00:39:52 - INFO - __main__ - global_step = 8950, average loss = 0.10094311676693905
291
+ 09/24/2023 00:43:32 - INFO - __main__ - global_step = 9000, average loss = 0.10011614485312748
292
+ 09/24/2023 00:43:32 - INFO - __main__ - ***** Running evaluation *****
293
+ 09/24/2023 00:43:32 - INFO - __main__ - Num examples = 10000
294
+ 09/24/2023 00:43:32 - INFO - __main__ - Batch size = 16
295
+ 09/24/2023 00:47:27 - INFO - __main__ - ***** Eval results *****
296
+ 09/24/2023 00:47:27 - INFO - __main__ - acc = 0.8463
297
+ 09/24/2023 00:51:10 - INFO - __main__ - global_step = 9050, average loss = 0.09407024829903093
298
+ 09/24/2023 00:54:48 - INFO - __main__ - global_step = 9100, average loss = 0.09510339217069032
299
+ 09/24/2023 00:58:27 - INFO - __main__ - global_step = 9150, average loss = 0.09413513723055075
300
+ 09/24/2023 01:02:10 - INFO - __main__ - global_step = 9200, average loss = 0.08488880819528276
301
+ 09/24/2023 01:05:47 - INFO - __main__ - global_step = 9250, average loss = 0.09847264970565447
302
+ 09/24/2023 01:09:28 - INFO - __main__ - global_step = 9300, average loss = 0.08640140883806452
303
+ 09/24/2023 01:13:08 - INFO - __main__ - global_step = 9350, average loss = 0.07884123000112594
304
+ 09/24/2023 01:16:54 - INFO - __main__ - global_step = 9400, average loss = 0.0831154512307694
305
+ 09/24/2023 01:20:32 - INFO - __main__ - global_step = 9450, average loss = 0.09913980022422038
306
+ 09/24/2023 01:24:11 - INFO - __main__ - global_step = 9500, average loss = 0.09805536182444484
307
+ 09/24/2023 01:24:11 - INFO - __main__ - ***** Running evaluation *****
308
+ 09/24/2023 01:24:11 - INFO - __main__ - Num examples = 10000
309
+ 09/24/2023 01:24:11 - INFO - __main__ - Batch size = 16
310
+ 09/24/2023 01:28:07 - INFO - __main__ - ***** Eval results *****
311
+ 09/24/2023 01:28:07 - INFO - __main__ - acc = 0.8463
312
+ 09/24/2023 01:31:55 - INFO - __main__ - global_step = 9550, average loss = 0.0912455873134968
313
+ 09/24/2023 01:35:38 - INFO - __main__ - global_step = 9600, average loss = 0.10278063782119716
314
+ 09/24/2023 01:39:12 - INFO - __main__ - global_step = 9650, average loss = 0.08788584528032516
315
+ 09/24/2023 01:42:53 - INFO - __main__ - global_step = 9700, average loss = 0.08058010207216285
316
+ 09/24/2023 01:46:34 - INFO - __main__ - global_step = 9750, average loss = 0.08765123128723644
317
+ 09/24/2023 01:50:14 - INFO - __main__ - global_step = 9800, average loss = 0.09005017607181799
318
+ 09/24/2023 01:54:03 - INFO - __main__ - global_step = 9850, average loss = 0.07892634223760979
319
+ 09/24/2023 01:57:44 - INFO - __main__ - global_step = 9900, average loss = 0.07999062808303278
320
+ 09/24/2023 02:01:26 - INFO - __main__ - global_step = 9950, average loss = 0.09494447313452838
321
+ 09/24/2023 02:05:06 - INFO - __main__ - global_step = 10000, average loss = 0.0841888710015337
322
+ 09/24/2023 02:05:06 - INFO - __main__ - ***** Running evaluation *****
323
+ 09/24/2023 02:05:06 - INFO - __main__ - Num examples = 10000
324
+ 09/24/2023 02:05:06 - INFO - __main__ - Batch size = 16
325
+ 09/24/2023 02:09:01 - INFO - __main__ - ***** Eval results *****
326
+ 09/24/2023 02:09:01 - INFO - __main__ - acc = 0.8471
327
+ 09/24/2023 02:12:40 - INFO - __main__ - global_step = 10050, average loss = 0.08929907138342968
328
+ 09/24/2023 02:16:20 - INFO - __main__ - global_step = 10100, average loss = 0.10172551687661326
329
+ 09/24/2023 02:20:00 - INFO - __main__ - global_step = 10150, average loss = 0.09577305402533966
330
+ 09/24/2023 02:23:46 - INFO - __main__ - global_step = 10200, average loss = 0.09480085656211486
331
+ 09/24/2023 02:27:27 - INFO - __main__ - global_step = 10250, average loss = 0.07956519629078684
332
+ 09/24/2023 02:31:05 - INFO - __main__ - global_step = 10300, average loss = 0.08291967767250753
333
+ 09/24/2023 02:34:47 - INFO - __main__ - global_step = 10350, average loss = 0.09592102762369904
334
+ 09/24/2023 02:38:29 - INFO - __main__ - global_step = 10400, average loss = 0.08570889301292482
335
+ 09/24/2023 02:42:13 - INFO - __main__ - global_step = 10450, average loss = 0.07362440132081247
336
+ 09/24/2023 02:45:58 - INFO - __main__ - global_step = 10500, average loss = 0.08574875552483718
337
+ 09/24/2023 02:45:58 - INFO - __main__ - ***** Running evaluation *****
338
+ 09/24/2023 02:45:58 - INFO - __main__ - Num examples = 10000
339
+ 09/24/2023 02:45:58 - INFO - __main__ - Batch size = 16
340
+ 09/24/2023 02:49:53 - INFO - __main__ - ***** Eval results *****
341
+ 09/24/2023 02:49:53 - INFO - __main__ - acc = 0.8524
342
+ 09/24/2023 02:50:20 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
343
+ 09/24/2023 02:54:03 - INFO - __main__ - global_step = 10550, average loss = 0.08846153970320302
344
+ 09/24/2023 02:57:43 - INFO - __main__ - global_step = 10600, average loss = 0.08381684645668429
345
+ 09/24/2023 03:01:26 - INFO - __main__ - global_step = 10650, average loss = 0.09288432469184045
346
+ 09/24/2023 03:05:08 - INFO - __main__ - global_step = 10700, average loss = 0.08199916316298186
347
+ 09/24/2023 03:08:56 - INFO - __main__ - global_step = 10750, average loss = 0.09068042659768252
348
+ 09/24/2023 03:12:37 - INFO - __main__ - global_step = 10800, average loss = 0.08719110449641448
349
+ 09/24/2023 03:16:20 - INFO - __main__ - global_step = 10850, average loss = 0.09036207084544003
350
+ 09/24/2023 03:20:04 - INFO - __main__ - global_step = 10900, average loss = 0.095746248819637
351
+ 09/24/2023 03:23:45 - INFO - __main__ - global_step = 10950, average loss = 0.1019882604497252
352
+ 09/24/2023 03:27:25 - INFO - __main__ - global_step = 11000, average loss = 0.08660416512644588
353
+ 09/24/2023 03:27:25 - INFO - __main__ - ***** Running evaluation *****
354
+ 09/24/2023 03:27:25 - INFO - __main__ - Num examples = 10000
355
+ 09/24/2023 03:27:25 - INFO - __main__ - Batch size = 16
356
+ 09/24/2023 03:31:21 - INFO - __main__ - ***** Eval results *****
357
+ 09/24/2023 03:31:21 - INFO - __main__ - acc = 0.8521
358
+ 09/24/2023 03:35:00 - INFO - __main__ - global_step = 11050, average loss = 0.07959849048202158
359
+ 09/24/2023 03:38:42 - INFO - __main__ - global_step = 11100, average loss = 0.08480279741248524
360
+ 09/24/2023 03:42:25 - INFO - __main__ - global_step = 11150, average loss = 0.07940411141982623
361
+ 09/24/2023 03:46:06 - INFO - __main__ - global_step = 11200, average loss = 0.08627346496621613
362
+ 09/24/2023 03:49:48 - INFO - __main__ - global_step = 11250, average loss = 0.08515130840663915
363
+ 09/24/2023 03:53:28 - INFO - __main__ - global_step = 11300, average loss = 0.08047833000106039
364
+ 09/24/2023 03:57:07 - INFO - __main__ - global_step = 11350, average loss = 0.08884227124826338
365
+ 09/24/2023 04:00:47 - INFO - __main__ - global_step = 11400, average loss = 0.09542614945773494
366
+ 09/24/2023 04:04:26 - INFO - __main__ - global_step = 11450, average loss = 0.08332637125422479
367
+ 09/24/2023 04:08:07 - INFO - __main__ - global_step = 11500, average loss = 0.09769482501476887
368
+ 09/24/2023 04:08:07 - INFO - __main__ - ***** Running evaluation *****
369
+ 09/24/2023 04:08:07 - INFO - __main__ - Num examples = 10000
370
+ 09/24/2023 04:08:07 - INFO - __main__ - Batch size = 16
371
+ 09/24/2023 04:12:02 - INFO - __main__ - ***** Eval results *****
372
+ 09/24/2023 04:12:02 - INFO - __main__ - acc = 0.851
373
+ 09/24/2023 04:15:51 - INFO - __main__ - global_step = 11550, average loss = 0.09137944790694746
374
+ 09/24/2023 04:19:38 - INFO - __main__ - global_step = 11600, average loss = 0.07454582622590351
375
+ 09/24/2023 04:23:20 - INFO - __main__ - global_step = 11650, average loss = 0.08284565404814202
376
+ 09/24/2023 04:26:59 - INFO - __main__ - global_step = 11700, average loss = 0.0969824349215196
377
+ 09/24/2023 04:30:41 - INFO - __main__ - global_step = 11750, average loss = 0.09389037321489013
378
+ 09/24/2023 04:34:23 - INFO - __main__ - global_step = 11800, average loss = 0.08608788483528769
379
+ 09/24/2023 04:38:05 - INFO - __main__ - global_step = 11850, average loss = 0.09322659247220144
380
+ 09/24/2023 04:41:49 - INFO - __main__ - global_step = 11900, average loss = 0.09286965438863262
381
+ 09/24/2023 04:45:31 - INFO - __main__ - global_step = 11950, average loss = 0.08214385434631367
382
+ 09/24/2023 04:49:12 - INFO - __main__ - global_step = 12000, average loss = 0.09392224536069989
383
+ 09/24/2023 04:49:12 - INFO - __main__ - ***** Running evaluation *****
384
+ 09/24/2023 04:49:12 - INFO - __main__ - Num examples = 10000
385
+ 09/24/2023 04:49:12 - INFO - __main__ - Batch size = 16
386
+ 09/24/2023 04:53:07 - INFO - __main__ - ***** Eval results *****
387
+ 09/24/2023 04:53:07 - INFO - __main__ - acc = 0.8514
388
+ 09/24/2023 04:56:53 - INFO - __main__ - global_step = 12050, average loss = 0.08019034011129406
389
+ 09/24/2023 05:00:34 - INFO - __main__ - global_step = 12100, average loss = 0.08210711618239656
390
+ 09/24/2023 05:04:16 - INFO - __main__ - global_step = 12150, average loss = 0.08764273267355747
391
+ 09/24/2023 05:08:02 - INFO - __main__ - global_step = 12200, average loss = 0.08758470895321807
392
+ 09/24/2023 05:11:48 - INFO - __main__ - global_step = 12250, average loss = 0.07766548367973883
393
+ 09/24/2023 05:15:27 - INFO - __main__ - global_step = 12300, average loss = 0.08148344823415755
394
+ 09/24/2023 05:19:08 - INFO - __main__ - global_step = 12350, average loss = 0.08814196670609817
395
+ 09/24/2023 05:22:50 - INFO - __main__ - global_step = 12400, average loss = 0.08936668847491092
396
+ 09/24/2023 05:26:29 - INFO - __main__ - global_step = 12450, average loss = 0.08240065188347216
397
+ 09/24/2023 05:30:12 - INFO - __main__ - global_step = 12500, average loss = 0.08683115135392655
398
+ 09/24/2023 05:30:12 - INFO - __main__ - ***** Running evaluation *****
399
+ 09/24/2023 05:30:12 - INFO - __main__ - Num examples = 10000
400
+ 09/24/2023 05:30:12 - INFO - __main__ - Batch size = 16
401
+ 09/24/2023 05:34:07 - INFO - __main__ - ***** Eval results *****
402
+ 09/24/2023 05:34:07 - INFO - __main__ - acc = 0.8515
403
+ 09/24/2023 05:37:53 - INFO - __main__ - global_step = 12550, average loss = 0.08871277472944712
404
+ 09/24/2023 05:41:34 - INFO - __main__ - global_step = 12600, average loss = 0.08797626828309149
405
+ 09/24/2023 05:45:11 - INFO - __main__ - global_step = 12650, average loss = 0.10095825259459616
406
+ 09/24/2023 05:48:58 - INFO - __main__ - global_step = 12700, average loss = 0.07953012495926487
407
+ 09/24/2023 05:52:41 - INFO - __main__ - global_step = 12750, average loss = 0.08843418272979761
408
+ 09/24/2023 05:56:19 - INFO - __main__ - global_step = 12800, average loss = 0.07413991435227217
409
+ 09/24/2023 05:59:59 - INFO - __main__ - global_step = 12850, average loss = 0.07519575585451094
410
+ 09/24/2023 06:03:48 - INFO - __main__ - global_step = 12900, average loss = 0.08996981896292709
411
+ 09/24/2023 06:07:28 - INFO - __main__ - global_step = 12950, average loss = 0.08996171029284597
412
+ 09/24/2023 06:11:11 - INFO - __main__ - global_step = 13000, average loss = 0.08077499923689174
413
+ 09/24/2023 06:11:11 - INFO - __main__ - ***** Running evaluation *****
414
+ 09/24/2023 06:11:11 - INFO - __main__ - Num examples = 10000
415
+ 09/24/2023 06:11:11 - INFO - __main__ - Batch size = 16
416
+ 09/24/2023 06:15:06 - INFO - __main__ - ***** Eval results *****
417
+ 09/24/2023 06:15:06 - INFO - __main__ - acc = 0.8527
418
+ 09/24/2023 06:15:33 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
419
+ 09/24/2023 06:19:13 - INFO - __main__ - global_step = 13050, average loss = 0.08447560470420284
420
+ 09/24/2023 06:22:54 - INFO - __main__ - global_step = 13100, average loss = 0.08299598100831646
421
+ 09/24/2023 06:26:32 - INFO - __main__ - global_step = 13150, average loss = 0.08393764879734135
422
+ 09/24/2023 06:30:08 - INFO - __main__ - global_step = 13200, average loss = 0.09848508099505125
423
+ 09/24/2023 06:33:47 - INFO - __main__ - global_step = 13250, average loss = 0.09162080157435412
424
+ 09/24/2023 06:37:28 - INFO - __main__ - global_step = 13300, average loss = 0.0914362099875143
425
+ 09/24/2023 06:41:09 - INFO - __main__ - global_step = 13350, average loss = 0.07781068138462616
426
+ 09/24/2023 06:44:55 - INFO - __main__ - global_step = 13400, average loss = 0.08868030074576382
427
+ 09/24/2023 06:48:36 - INFO - __main__ - global_step = 13450, average loss = 0.08357623873533157
428
+ 09/24/2023 06:52:18 - INFO - __main__ - global_step = 13500, average loss = 0.08828085365807055
429
+ 09/24/2023 06:52:18 - INFO - __main__ - ***** Running evaluation *****
430
+ 09/24/2023 06:52:18 - INFO - __main__ - Num examples = 10000
431
+ 09/24/2023 06:52:18 - INFO - __main__ - Batch size = 16
432
+ 09/24/2023 06:56:14 - INFO - __main__ - ***** Eval results *****
433
+ 09/24/2023 06:56:14 - INFO - __main__ - acc = 0.8499
434
+ 09/24/2023 06:59:57 - INFO - __main__ - global_step = 13550, average loss = 0.08140521681067185
435
+ 09/24/2023 07:03:37 - INFO - __main__ - global_step = 13600, average loss = 0.08341409597109305
436
+ 09/24/2023 07:07:17 - INFO - __main__ - global_step = 13650, average loss = 0.08142950747031136
437
+ 09/24/2023 07:10:56 - INFO - __main__ - global_step = 13700, average loss = 0.09089667504686076
438
+ 09/24/2023 07:14:45 - INFO - __main__ - global_step = 13750, average loss = 0.07177684095106088
439
+ 09/24/2023 07:18:24 - INFO - __main__ - global_step = 13800, average loss = 0.08592368463818274
440
+ 09/24/2023 07:22:01 - INFO - __main__ - global_step = 13850, average loss = 0.08120634569131653
441
+ 09/24/2023 07:25:48 - INFO - __main__ - global_step = 13900, average loss = 0.08909589071197843
442
+ 09/24/2023 07:29:30 - INFO - __main__ - global_step = 13950, average loss = 0.08629100337015189
443
+ 09/24/2023 07:33:10 - INFO - __main__ - global_step = 14000, average loss = 0.07722124511306902
444
+ 09/24/2023 07:33:10 - INFO - __main__ - ***** Running evaluation *****
445
+ 09/24/2023 07:33:10 - INFO - __main__ - Num examples = 10000
446
+ 09/24/2023 07:33:10 - INFO - __main__ - Batch size = 16
447
+ 09/24/2023 07:37:05 - INFO - __main__ - ***** Eval results *****
448
+ 09/24/2023 07:37:05 - INFO - __main__ - acc = 0.8533
449
+ 09/24/2023 07:37:32 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
450
+ 09/24/2023 07:41:11 - INFO - __main__ - global_step = 14050, average loss = 0.08182521525057382
451
+ 09/24/2023 07:44:48 - INFO - __main__ - global_step = 14100, average loss = 0.0902410151962249
452
+ 09/24/2023 07:48:28 - INFO - __main__ - global_step = 14150, average loss = 0.07409664937826164
453
+ 09/24/2023 07:52:12 - INFO - __main__ - global_step = 14200, average loss = 0.08879891355274594
454
+ 09/24/2023 07:55:53 - INFO - __main__ - global_step = 14250, average loss = 0.09268313445325475
455
+ 09/24/2023 07:59:30 - INFO - __main__ - global_step = 14300, average loss = 0.08798344542199629
456
+ 09/24/2023 08:03:13 - INFO - __main__ - global_step = 14350, average loss = 0.09607475698139752
457
+ 09/24/2023 08:06:59 - INFO - __main__ - global_step = 14400, average loss = 0.07222031111843535
458
+ 09/24/2023 08:10:40 - INFO - __main__ - global_step = 14450, average loss = 0.07480319764195884
459
+ 09/24/2023 08:14:19 - INFO - __main__ - global_step = 14500, average loss = 0.0838716509303049
460
+ 09/24/2023 08:14:19 - INFO - __main__ - ***** Running evaluation *****
461
+ 09/24/2023 08:14:19 - INFO - __main__ - Num examples = 10000
462
+ 09/24/2023 08:14:19 - INFO - __main__ - Batch size = 16
463
+ 09/24/2023 08:18:16 - INFO - __main__ - ***** Eval results *****
464
+ 09/24/2023 08:18:16 - INFO - __main__ - acc = 0.8542
465
+ 09/24/2023 08:18:42 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
466
+ 09/24/2023 08:22:18 - INFO - __main__ - global_step = 14550, average loss = 0.08034001361316769
467
+ 09/24/2023 08:25:55 - INFO - __main__ - global_step = 14600, average loss = 0.07689567271547276
468
+ 09/24/2023 08:29:37 - INFO - __main__ - global_step = 14650, average loss = 0.09093381941405823
469
+ 09/24/2023 08:33:25 - INFO - __main__ - global_step = 14700, average loss = 0.07569706412876258
470
+ 09/24/2023 08:37:04 - INFO - __main__ - global_step = 14750, average loss = 0.07479940189456101
471
+ 09/24/2023 08:40:47 - INFO - __main__ - global_step = 14800, average loss = 0.08522207450543647
472
+ 09/24/2023 08:44:34 - INFO - __main__ - global_step = 14850, average loss = 0.0889268495763099
473
+ 09/24/2023 08:48:16 - INFO - __main__ - global_step = 14900, average loss = 0.08616152721479012
474
+ 09/24/2023 08:51:56 - INFO - __main__ - global_step = 14950, average loss = 0.07867321850848384
475
+ 09/24/2023 08:55:39 - INFO - __main__ - global_step = 15000, average loss = 0.08426695556714549
476
+ 09/24/2023 08:55:39 - INFO - __main__ - ***** Running evaluation *****
477
+ 09/24/2023 08:55:39 - INFO - __main__ - Num examples = 10000
478
+ 09/24/2023 08:55:39 - INFO - __main__ - Batch size = 16
479
+ 09/24/2023 08:59:34 - INFO - __main__ - ***** Eval results *****
480
+ 09/24/2023 08:59:34 - INFO - __main__ - acc = 0.8542
481
+ 09/24/2023 09:03:12 - INFO - __main__ - global_step = 15050, average loss = 0.07868185437655484
482
+ 09/24/2023 09:07:00 - INFO - __main__ - global_step = 15100, average loss = 0.08520105790423259
483
+ 09/24/2023 09:10:42 - INFO - __main__ - global_step = 15150, average loss = 0.09536004922925713
484
+ 09/24/2023 09:14:19 - INFO - __main__ - global_step = 15200, average loss = 0.08502999547665241
485
+ 09/24/2023 09:17:58 - INFO - __main__ - global_step = 15250, average loss = 0.08957034896484402
486
+ 09/24/2023 09:21:34 - INFO - __main__ - global_step = 15300, average loss = 0.07968287494033575
487
+ 09/24/2023 09:25:14 - INFO - __main__ - global_step = 15350, average loss = 0.08545487473544199
488
+ 09/24/2023 09:28:55 - INFO - __main__ - global_step = 15400, average loss = 0.08528959889241378
489
+ 09/24/2023 09:32:38 - INFO - __main__ - global_step = 15450, average loss = 0.08095955706679887
490
+ 09/24/2023 09:36:19 - INFO - __main__ - global_step = 15500, average loss = 0.08725373520917856
491
+ 09/24/2023 09:36:19 - INFO - __main__ - ***** Running evaluation *****
492
+ 09/24/2023 09:36:19 - INFO - __main__ - Num examples = 10000
493
+ 09/24/2023 09:36:19 - INFO - __main__ - Batch size = 16
494
+ 09/24/2023 09:40:15 - INFO - __main__ - ***** Eval results *****
495
+ 09/24/2023 09:40:15 - INFO - __main__ - acc = 0.8545
496
+ 09/24/2023 09:40:42 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
497
+ 09/24/2023 09:44:22 - INFO - __main__ - global_step = 15550, average loss = 0.0843266883040269
498
+ 09/24/2023 09:48:03 - INFO - __main__ - global_step = 15600, average loss = 0.07855528741223679
499
+ 09/24/2023 09:51:47 - INFO - __main__ - global_step = 15650, average loss = 0.09478737017554523
500
+ 09/24/2023 09:55:32 - INFO - __main__ - global_step = 15700, average loss = 0.08910313490487169
501
+ 09/24/2023 09:59:16 - INFO - __main__ - global_step = 15750, average loss = 0.07736712342710234
502
+ 09/24/2023 10:02:53 - INFO - __main__ - global_step = 15800, average loss = 0.08501649839432503
503
+ 09/24/2023 10:06:37 - INFO - __main__ - global_step = 15850, average loss = 0.08495221398276044
504
+ 09/24/2023 10:10:23 - INFO - __main__ - global_step = 15900, average loss = 0.08510145512744202
505
+ 09/24/2023 10:14:07 - INFO - __main__ - global_step = 15950, average loss = 0.08335533107921947
506
+ 09/24/2023 10:17:49 - INFO - __main__ - global_step = 16000, average loss = 0.09103241352764599
507
+ 09/24/2023 10:17:49 - INFO - __main__ - ***** Running evaluation *****
508
+ 09/24/2023 10:17:49 - INFO - __main__ - Num examples = 10000
509
+ 09/24/2023 10:17:49 - INFO - __main__ - Batch size = 16
510
+ 09/24/2023 10:21:45 - INFO - __main__ - ***** Eval results *****
511
+ 09/24/2023 10:21:45 - INFO - __main__ - acc = 0.8549
512
+ 09/24/2023 10:22:12 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
513
+ 09/24/2023 10:25:53 - INFO - __main__ - global_step = 16050, average loss = 0.0808029190406296
514
+ 09/24/2023 10:29:33 - INFO - __main__ - global_step = 16100, average loss = 0.0950222506766113
515
+ 09/24/2023 10:33:15 - INFO - __main__ - global_step = 16150, average loss = 0.08560644885961664
516
+ 09/24/2023 10:36:53 - INFO - __main__ - global_step = 16200, average loss = 0.07925290400889935
517
+ 09/24/2023 10:40:34 - INFO - __main__ - global_step = 16250, average loss = 0.08252620983123052
518
+ 09/24/2023 10:44:15 - INFO - __main__ - global_step = 16300, average loss = 0.08747977073326182
519
+ 09/24/2023 10:47:55 - INFO - __main__ - global_step = 16350, average loss = 0.08805208059333382
520
+ 09/24/2023 10:51:41 - INFO - __main__ - global_step = 16400, average loss = 0.07935831163018064
521
+ 09/24/2023 10:55:23 - INFO - __main__ - global_step = 16450, average loss = 0.0807358610859228
522
+ 09/24/2023 10:59:03 - INFO - __main__ - global_step = 16500, average loss = 0.0775301494665473
523
+ 09/24/2023 10:59:03 - INFO - __main__ - ***** Running evaluation *****
524
+ 09/24/2023 10:59:03 - INFO - __main__ - Num examples = 10000
525
+ 09/24/2023 10:59:03 - INFO - __main__ - Batch size = 16
526
+ 09/24/2023 11:02:59 - INFO - __main__ - ***** Eval results *****
527
+ 09/24/2023 11:02:59 - INFO - __main__ - acc = 0.8532
528
+ 09/24/2023 11:06:39 - INFO - __main__ - global_step = 16550, average loss = 0.06899339191091712
529
+ 09/24/2023 11:10:25 - INFO - __main__ - global_step = 16600, average loss = 0.08612027997849508
530
+ 09/24/2023 11:14:10 - INFO - __main__ - global_step = 16650, average loss = 0.08232147437905951
531
+ 09/24/2023 11:17:50 - INFO - __main__ - global_step = 16700, average loss = 0.08530993062430753
532
+ 09/24/2023 11:18:50 - INFO - __main__ - ***** Running evaluation *****
533
+ 09/24/2023 11:18:50 - INFO - __main__ - Num examples = 10000
534
+ 09/24/2023 11:18:50 - INFO - __main__ - Batch size = 16
535
+ 09/24/2023 11:22:45 - INFO - __main__ - ***** Eval results *****
536
+ 09/24/2023 11:22:45 - INFO - __main__ - acc = 0.8533
537
+ 09/24/2023 11:22:45 - INFO - __main__ - global_step = 16713, average loss = 0.11041826268834619
538
+ 09/24/2023 11:23:18 - INFO - __main__ - ***** Running evaluation *****
539
+ 09/24/2023 11:23:18 - INFO - __main__ - Num examples = 10000
540
+ 09/24/2023 11:23:18 - INFO - __main__ - Batch size = 16
541
+ 09/24/2023 11:27:13 - INFO - __main__ - ***** Eval results *****
542
+ 09/24/2023 11:27:13 - INFO - __main__ - acc = 0.8549
543
+ 09/24/2023 11:27:16 - INFO - evaluate_DeBERTa - Namespace(dataset_file='../../../data/mcqa/eval/socialiqa_dev.jsonl', lm='output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6', out_dir='./eval_results/deberta-v3-large_2i_atm_half_sample_name_5e-6', device=0, reader='socialiqa', overwrite_output_dir=False, cache_dir=None)
544
+ 09/24/2023 11:27:16 - INFO - evaluate_DeBERTa - Initializing output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
545
+ 09/24/2023 11:34:38 - INFO - evaluate_DeBERTa - Namespace(dataset_file='../../../data/mcqa/eval/winogrande_dev.jsonl', lm='output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6', out_dir='./eval_results/deberta-v3-large_2i_atm_half_sample_name_5e-6', device=0, reader='winogrande', overwrite_output_dir=False, cache_dir=None)
546
+ 09/24/2023 11:34:38 - INFO - evaluate_DeBERTa - Initializing output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
547
+ 09/24/2023 11:37:05 - INFO - evaluate_DeBERTa - Namespace(dataset_file='../../../data/mcqa/eval/piqa_dev.jsonl', lm='output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6', out_dir='./eval_results/deberta-v3-large_2i_atm_half_sample_name_5e-6', device=0, reader='piqa', overwrite_output_dir=False, cache_dir=None)
548
+ 09/24/2023 11:37:05 - INFO - evaluate_DeBERTa - Initializing output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
549
+ 09/24/2023 11:43:59 - INFO - evaluate_DeBERTa - Namespace(dataset_file='../../../data/mcqa/eval/commonsenseqa_dev.jsonl', lm='output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6', out_dir='./eval_results/deberta-v3-large_2i_atm_half_sample_name_5e-6', device=0, reader='commonsenseqa', overwrite_output_dir=False, cache_dir=None)
550
+ 09/24/2023 11:43:59 - INFO - evaluate_DeBERTa - Initializing output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
551
+ 09/24/2023 11:49:43 - INFO - evaluate_DeBERTa - Namespace(dataset_file='../../../data/mcqa/eval/anli_dev.jsonl', lm='output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6', out_dir='./eval_results/deberta-v3-large_2i_atm_half_sample_name_5e-6', device=0, reader='anli', overwrite_output_dir=False, cache_dir=None)
552
+ 09/24/2023 11:49:43 - INFO - evaluate_DeBERTa - Initializing output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
553
+ 09/24/2023 11:54:31 - INFO - __main__ - ***** Running evaluation *****
554
+ 09/24/2023 11:54:31 - INFO - __main__ - Num examples = 120
555
+ 09/24/2023 11:54:31 - INFO - __main__ - Batch size = 16
556
+ 09/24/2023 11:54:47 - INFO - __main__ - ***** Eval results *****
557
+ 09/24/2023 11:54:47 - INFO - __main__ - acc = 0.525
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bb357eae91ca6dee772e1aa051d51d1ac15dfb3d6939fc85c99c233728675db4
+ size 1915