Upload folder using huggingface_hub
- .gitattributes +2 -0
- added_tokens.json +3 -0
- cached_dev_deberta-mlm_128_atomic +3 -0
- cached_train_deberta-mlm_128_atomic +3 -0
- config.json +36 -0
- data_utils.py +236 -0
- eval_results.txt +1 -0
- logits_test.txt +120 -0
- pytorch_model.bin +3 -0
- run_pretrain.py +651 -0
- runs/events.out.tfevents.1695471913.car-atm-2i-half-sample-name-1-0-0.28.0 +3 -0
- special_tokens_map.json +9 -0
- spm.model +3 -0
- tokenizer_config.json +16 -0
- train.log +557 -0
- training_args.bin +3 -0
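
The commit title above records how this snapshot was produced. As a hedged sketch (the folder path and repo id are placeholders, not taken from this commit; only `upload_folder` itself is the real huggingface_hub API), the producing call would look like:

from huggingface_hub import HfApi

# Sketch only: folder_path and repo_id are hypothetical stand-ins.
api = HfApi()
api.upload_folder(folder_path="./output", repo_id="<user>/<repo>", repo_type="model")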
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+cached_dev_deberta-mlm_128_atomic filter=lfs diff=lfs merge=lfs -text
+cached_train_deberta-mlm_128_atomic filter=lfs diff=lfs merge=lfs -text
added_tokens.json
ADDED
@@ -0,0 +1,3 @@
+{
+  "[MASK]": 128000
+}
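
A quick sketch of what this mapping means in practice, assuming a local checkout of this repository and standard transformers behavior (illustrative, not part of the upload):

# Sketch: added_tokens.json appends [MASK] after DeBERTa-v3's 128,000-token base vocabulary.
from transformers import DebertaV2Tokenizer

tok = DebertaV2Tokenizer.from_pretrained(".")  # "." = assumed local checkout of this repo
print(tok.mask_token, tok.mask_token_id)       # expected: [MASK] 128000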
cached_dev_deberta-mlm_128_atomic
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de2aeddcf9134d495f3461fedebd548864b101e1920dcc1facf27a6790e27e75
+size 4501475
cached_train_deberta-mlm_128_atomic
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5be198ba32b721dd13ee5c578e9de10d3e49aaa3f273d7407b847bad3ae39e1b
+size 365724007
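
Both cached feature files are tracked with Git LFS, so the repository stores only a three-line pointer (spec version, sha256 oid, payload size in bytes) while the tensors themselves live in LFS storage. A minimal sketch of reading such a pointer back, assuming the standard v1 pointer layout:

# Sketch: split a git-lfs pointer file into its key/value fields.
def parse_lfs_pointer(path):
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(' ')
            fields[key] = value  # e.g. 'size' -> '4501475'
    return fields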
config.json
ADDED
@@ -0,0 +1,36 @@
+{
+  "_name_or_path": "microsoft/deberta-v3-large",
+  "architectures": [
+    "DebertaV2ForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "finetuning_task": "atomic",
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-07,
+  "max_position_embeddings": 512,
+  "max_relative_positions": -1,
+  "model_type": "deberta-v2",
+  "norm_rel_ebd": "layer_norm",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "pad_token_id": 0,
+  "pooler_dropout": 0,
+  "pooler_hidden_act": "gelu",
+  "pooler_hidden_size": 1024,
+  "pos_att_type": [
+    "p2c",
+    "c2p"
+  ],
+  "position_biased_input": false,
+  "position_buckets": 256,
+  "relative_attention": true,
+  "share_att_key": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.31.0",
+  "type_vocab_size": 0,
+  "vocab_size": 128100
+}
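
The config pins this checkpoint to microsoft/deberta-v3-large fine-tuned on the atomic task. A minimal sketch of how the files in this commit fit together at load time (the local path is an assumption, not part of the upload):

# Sketch: load the uploaded checkpoint from a local checkout of this repository.
from transformers import DebertaV2Config, DebertaV2ForMaskedLM, DebertaV2Tokenizer

repo = "."                                            # assumed local path
config = DebertaV2Config.from_pretrained(repo)        # reads config.json above
tokenizer = DebertaV2Tokenizer.from_pretrained(repo)  # reads spm.model, added_tokens.json, ...
model = DebertaV2ForMaskedLM.from_pretrained(repo)    # reads pytorch_model.bin
assert config.vocab_size == 128100 and config.hidden_size == 1024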
data_utils.py
ADDED
@@ -0,0 +1,236 @@
+import json
+import logging
+
+import nltk
+from nltk.corpus import stopwords
+from tqdm import tqdm
+
+skip_words = set(stopwords.words('english'))
+skip_words.add('\'s')
+skip_words.add('.')
+skip_words.add(',')
+PERSON_NAMES = ['Alex', 'Ash', 'Aspen', 'Bali', 'Berkeley', 'Cameron', 'Chris', 'Cody', 'Dana', 'Drew', 'Emory',
+                'Flynn', 'Gale', 'Jamie', 'Jesse',
+                'Kai', 'Kendall', 'Kyle', 'Lee', 'Logan', 'Max', 'Morgan', 'Nico', 'Paris', 'Pat', 'Quinn', 'Ray',
+                'Robin', 'Rowan', 'Rudy', 'Sam', 'Skylar', 'Sydney',
+                'Taylor', 'Tracy', 'West', 'Wynne']
+logger = logging.getLogger(__name__)
+
+
+def accuracy(out, labels):
+    return {'acc': (out == labels).mean()}
+
+
+def handle_words(span, tokenizer, keywords=None, is_start=False):
+    inputs = []
+    labels = []
+    words = nltk.word_tokenize(span)
+    for w_i, w in enumerate(words):
+        if (w_i == 0 and is_start) or w == '.' or w == ',' or w.startswith('\''):
+            w_bpes = tokenizer.tokenize(w)
+        else:
+            w_bpes = tokenizer.tokenize(w, add_prefix_space=True)
+        inputs.extend(w_bpes)
+        if keywords is not None:
+            if w in keywords:
+                labels.extend(w_bpes)
+            else:
+                labels.extend([-100] * len(w_bpes))
+        else:
+            # without explicit keywords, every non-name, non-stopword token is maskable
+            if w not in PERSON_NAMES and w not in skip_words and w.lower() not in skip_words:
+                labels.extend(w_bpes)
+            else:
+                labels.extend([-100] * len(w_bpes))
+    return inputs, labels
+
+
+def handle_underscores(suffix, tokenizer, keywords=None, prefix=False):
+    inputs = []
+    labels = []
+    if '_' in suffix:
+        # '___' marks a blank in the text; each blank becomes a single mask token
+        suffix_parts = [i.strip() for i in suffix.split('___')]
+        for i, part in enumerate(suffix_parts):
+            if part:
+                tmp_inputs, tmp_labels = handle_words(part, tokenizer, keywords=keywords, is_start=(i == 0 and prefix))
+                inputs += tmp_inputs
+                labels += tmp_labels
+
+                if i != len(suffix_parts) - 1 and suffix_parts[i + 1]:
+                    inputs.append(tokenizer.mask_token)
+                    labels.append(-100)
+            else:
+                inputs.append(tokenizer.mask_token)
+                labels.append(-100)
+    else:
+        inputs, labels = handle_words(suffix, tokenizer, keywords=keywords, is_start=prefix)
+    return inputs, labels
+
+
+def convert_examples_to_features(examples, tokenizer, max_length=512):
+    data = []
+    for example in tqdm(examples, desc="converting examples to features"):
+        inputs, labels = handle_underscores(example['context'], tokenizer, keywords=example.get('keywords', None), prefix=True)
+        choices = [handle_underscores(cand, tokenizer) for cand in example['candidates']]
+        input_ids = [inputs + cand[0] for cand in choices]
+        input_ids = [tokenizer.convert_tokens_to_ids(cand) for cand in input_ids]
+        label_ids = [labels + cand[1] for cand in choices]
+        label_ids = [[t if t == -100 else input_ids[i][t_i] for t_i, t in enumerate(cand)] for i, cand in
+                     enumerate(label_ids)]
+        # pad label rows with -100 for the special tokens prepare_for_model adds
+        label_ids = [[-100] + cand + [-100] for cand in label_ids]
+        input_ids = [tokenizer.prepare_for_model(cand, max_length=max_length, truncation=True)['input_ids'] for cand in
+                     input_ids]
+        data.append([input_ids, label_ids, example['correct']])
+    return data
+
+
+class ATOMICMLMProcessor(object):
+    def __init__(self, args):
+        self.D = []
+        self.filelist = [args.train_file, args.dev_file]
+
+    def get_train_examples(self):
+        self.load_data(self.filelist[0])
+        return self.D
+
+    def get_dev_examples(self):
+        data = []
+        with open(self.filelist[1], 'r') as f:
+            for row in tqdm(f):
+                sample = json.loads(row)
+                data.append(sample)
+        print(len(data))
+        return data
+
+    def load_data(self, filename):
+        with open(filename, "r") as f:
+            for row in tqdm(f):
+                sample = json.loads(row)
+                self.D.append({'id': sample['id'], 'context': sample['context'],
+                               'ending': sample['candidates'][sample['correct']], 'keywords': sample.get('keywords', None)})
+        print(len(self.D))
+
+
+class ATOMICProcessor(object):
+    def __init__(self, args):
+        print('loading from %s %s' % (args.train_file, args.dev_file))
+        self.filelist = [args.train_file, args.dev_file]
+        self.D = [[], []]
+
+    def get_train_examples(self):
+        self.load_data(self.filelist[0], 0)
+        return self.D[0]
+
+    def get_dev_examples(self):
+        self.load_data(self.filelist[1], 1)
+        return self.D[1]
+
+    def load_data(self, filename, sid):
+        with open(filename, "r") as f:
+            for row in tqdm(f):
+                sample = json.loads(row)
+                self.D[sid].append(sample)
+        print(len(self.D[sid]))
+
+
+class CWWVProcessor(object):
+    def __init__(self, args):
+        self.answerKey_mapping = {'A': 0, 'B': 1, 'C': 2}
+        self.D = [[], []]
+        if args.task_name == 'cskg':
+            print('loading from %s %s' % (args.second_train_file, args.second_dev_file))
+            self.filelist = [args.second_train_file, args.second_dev_file]
+        else:
+            print('loading from %s %s' % (args.train_file, args.dev_file))
+            self.filelist = [args.train_file, args.dev_file]
+
+    def get_train_examples(self):
+        self.load_data(self.filelist[0], 0)
+        return self.D[0]
+
+    def get_dev_examples(self):
+        self.load_data(self.filelist[1], 1)
+        return self.D[1]
+
+    def load_data(self, filename, sid):
+        skipped = 0
+        with open(filename, "r") as f:
+            for row in tqdm(f):
+                sample = json.loads(row)
+                context = sample['question']['stem']
+                if context.endswith('.'):
+                    context = context[:-1]
+                if not context.endswith('[MASK]'):
+                    skipped += 1
+                    context_parts = context.split('[MASK]')
+                    context = context_parts[0].strip()
+                    candidates = [c['text'] + context_parts[1] + '.' for c in sample['question']['choices']]
+                else:
+                    context = context[:-7]
+                    candidates = [c['text'] + '.' for c in sample['question']['choices']]
+                label = self.answerKey_mapping[sample['answerKey']]
+                keywords = nltk.word_tokenize(sample['question']['head'])
+                keywords = [w for w in keywords if w not in skip_words and w.lower() not in skip_words]
+                self.D[sid].append({'id': sample['id'], 'context': context, 'correct': label, 'candidates': candidates,
+                                    'keywords': keywords})
+        print(len(self.D[sid]), skipped)
+
+
+class CWWVMLMProcessor(object):
+    def __init__(self, args):
+        self.answerKey_mapping = {'A': 0, 'B': 1, 'C': 2}
+        self.D = []
+        self.filelist = [args.train_file, args.dev_file]
+        self.args = args
+
+    def get_train_examples(self):
+        self.load_data(self.filelist[0])
+        return self.D
+
+    def get_dev_examples(self):
+        processor = CSKGProcessor(self.args)
+        return processor.get_dev_examples()
+
+    def load_data(self, filename):
+        skipped = 0
+        with open(filename, "r") as f:
+            for row in tqdm(f):
+                sample = json.loads(row)
+                context = sample['question']['stem']
+                if context.endswith('.'):
+                    context = context[:-1]
+                assert context.endswith('[MASK]')
+                context = context[:-7]
+                candidates = [c['text'] + '.' for c in sample['question']['choices']]
+                label = self.answerKey_mapping[sample['answerKey']]
+                keywords = nltk.word_tokenize(sample['question']['head'])
+                keywords = [w for w in keywords if w not in skip_words and w.lower() not in skip_words]
+                self.D.append(
+                    {'id': sample['id'], 'context': context, 'ending': candidates[label], 'keywords': keywords})
+        print(len(self.D))
+
+
+class CSKGProcessor(object):
+    def __init__(self, args):
+        # CWWV set always uses second train/dev file params
+        self.atomicprocessor = ATOMICProcessor(args)
+        self.cwwvprocessor = CWWVProcessor(args)
+
+    def get_train_examples(self):
+        cwwv_questions = self.cwwvprocessor.get_train_examples()
+        atomic_questions = self.atomicprocessor.get_train_examples()
+        return cwwv_questions + atomic_questions
+
+    def get_dev_examples(self):
+        cwwv_questions = self.cwwvprocessor.get_dev_examples()
+        atomic_questions = self.atomicprocessor.get_dev_examples()
+        return cwwv_questions + atomic_questions
+
+
+myprocessors = {
+    "atomic": ATOMICProcessor,
+    "cwwv": CWWVProcessor,
+    "atomicmlm": ATOMICMLMProcessor,
+    "cwwvmlm": CWWVMLMProcessor,
+    "cskg": CSKGProcessor
+}
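
A hedged usage sketch for the module above (the file names and tokenizer choice are placeholders): a processor from myprocessors turns JSONL rows into examples with a context, candidate endings, keywords, and a correct index, and convert_examples_to_features tokenizes each context-plus-candidate pair while marking the tokens eligible for masking (label != -100):

# Sketch only; 'atomic_*.jsonl' paths are illustrative, not files in this repo.
from argparse import Namespace
from transformers import DebertaV2Tokenizer
from data_utils import myprocessors, convert_examples_to_features

args = Namespace(train_file='atomic_train.jsonl', dev_file='atomic_dev.jsonl')
tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-large')
examples = myprocessors['atomic'](args).get_dev_examples()
features = convert_examples_to_features(examples, tokenizer, max_length=128)
# each feature: [input_ids per candidate, label_ids per candidate, correct candidate index]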
eval_results.txt
ADDED
@@ -0,0 +1 @@
+acc = 0.525
logits_test.txt
ADDED
@@ -0,0 +1,120 @@
+-13.485865592956543 -12.554702758789062 -16.843822479248047 -17.034975051879883 -15.528239250183105
+-11.945438385009766 -15.764880180358887 -15.65064811706543 -17.049840927124023 -15.45334243774414
+-11.587425231933594 -16.005020141601562 -18.591140747070312 -13.294342041015625 -16.296398162841797
+-10.009797096252441 -15.341827392578125 -10.539949417114258 -10.960021018981934 -14.047414779663086
+-12.25959300994873 -10.891813278198242 -14.401248931884766 -11.447187423706055 -13.685133934020996
+-12.804305076599121 -15.47064208984375 -12.90679931640625 -12.644388198852539 -16.551565170288086
+-12.235984802246094 -17.26417350769043 -15.814994812011719 -14.540316581726074 -16.57213020324707
+-13.064817428588867 -14.498727798461914 -12.224983215332031 -18.461196899414062 -15.455927848815918
+-9.443912506103516 -15.756866455078125 -14.860000610351562 -11.844358444213867 -16.200241088867188
+-11.700218200683594 -15.54508113861084 -18.18265724182129 -17.967411041259766 -15.61465072631836
+-11.174132347106934 -17.90937614440918 -17.444185256958008 -14.454703330993652 -14.554675102233887
+-12.42685604095459 -12.927202224731445 -14.598489761352539 -13.699914932250977 -16.133480072021484
+-10.828542709350586 -11.421213150024414 -14.742795944213867 -16.211345672607422 -15.933958053588867
+-9.981731414794922 -13.114375114440918 -10.851600646972656 -13.30392074584961 -15.448455810546875
+-12.899808883666992 -12.504766464233398 -11.667335510253906 -11.96485710144043 -16.87687110900879
+-10.543583869934082 -10.229654312133789 -11.832517623901367 -13.934309005737305 -14.101360321044922
+-12.628169059753418 -14.552963256835938 -15.033495903015137 -14.496928215026855 -14.277275085449219
+-10.540155410766602 -17.203994750976562 -15.181567192077637 -13.87678337097168 -16.33222198486328
+-12.89242935180664 -12.802375793457031 -11.98313045501709 -12.95468807220459 -14.210517883300781
+-13.707990646362305 -15.822447776794434 -14.352869033813477 -16.99176597595215 -15.332679748535156
+-18.221946716308594 -14.591471672058105 -11.751192092895508 -15.28943157196045 -15.79006576538086
+-10.999690055847168 -17.102828979492188 -11.563447952270508 -16.147525787353516 -15.312467575073242
+-12.119155883789062 -13.997671127319336 -14.05282974243164 -15.500701904296875 -15.024360656738281
+-12.406027793884277 -13.902008056640625 -14.89413070678711 -13.278053283691406 -16.163454055786133
+-12.729238510131836 -15.645377159118652 -14.512473106384277 -16.292064666748047 -14.945101737976074
+-11.565582275390625 -15.689852714538574 -15.196950912475586 -16.58242416381836 -16.84659194946289
+-9.898529052734375 -12.66911506652832 -12.563604354858398 -15.67273235321045 -14.335538864135742
+-9.923110961914062 -11.773245811462402 -11.786827087402344 -17.258573532104492 -15.622135162353516
+-10.604410171508789 -18.388168334960938 -17.512317657470703 -19.24336051940918 -14.125904083251953
+-16.949708938598633 -13.706928253173828 -16.456981658935547 -14.97553539276123 -14.764923095703125
+-11.796066284179688 -17.549203872680664 -16.29804229736328 -14.065374374389648 -14.975845336914062
+-16.109678268432617 -10.775965690612793 -14.450605392456055 -14.85343074798584 -14.199943542480469
+-12.714162826538086 -14.389898300170898 -14.631216049194336 -13.585458755493164 -14.533248901367188
+-11.866647720336914 -14.02271842956543 -16.863229751586914 -16.302730560302734 -15.423881530761719
+-15.34256362915039 -13.611021995544434 -16.341060638427734 -18.729846954345703 -16.023725509643555
+-13.729169845581055 -13.358748435974121 -16.144556045532227 -15.138229370117188 -15.170283317565918
+-15.203313827514648 -15.215215682983398 -15.796660423278809 -13.29316520690918 -14.272794723510742
+-10.206670761108398 -14.634307861328125 -16.95503807067871 -15.288354873657227 -14.735435485839844
+-15.958247184753418 -11.922269821166992 -15.554479598999023 -16.529380798339844 -14.621512413024902
+-11.756235122680664 -13.470741271972656 -14.978282928466797 -12.765861511230469 -15.564665794372559
+-12.600726127624512 -15.769315719604492 -15.375072479248047 -18.390806198120117 -15.738916397094727
+-15.593986511230469 -14.472156524658203 -13.198034286499023 -14.962379455566406 -16.735809326171875
+-14.267927169799805 -12.278310775756836 -14.038803100585938 -15.891279220581055 -15.780318260192871
+-12.514389038085938 -17.2401180267334 -14.838483810424805 -15.404850006103516 -15.5819091796875
+-12.566061019897461 -12.968563079833984 -10.93451976776123 -13.795123100280762 -14.745218276977539
+-21.77191162109375 -12.907783508300781 -14.735198974609375 -16.297962188720703 -16.843549728393555
+-12.247344970703125 -13.780832290649414 -12.38991928100586 -14.87220573425293 -16.25807762145996
+-10.589265823364258 -14.178890228271484 -15.336084365844727 -12.817083358764648 -15.201112747192383
+-11.625978469848633 -12.869548797607422 -12.684764862060547 -13.059203147888184 -14.451128959655762
+-15.250967979431152 -15.940725326538086 -12.647708892822266 -17.80953598022461 -14.153979301452637
+-13.933172225952148 -11.741453170776367 -16.345247268676758 -15.836057662963867 -14.906820297241211
+-11.782374382019043 -14.776283264160156 -14.698659896850586 -18.39876365661621 -15.45709228515625
+-11.998146057128906 -16.717647552490234 -17.300548553466797 -18.50290298461914 -15.252758026123047
+-13.041389465332031 -14.309408187866211 -16.311140060424805 -19.84041404724121 -15.644210815429688
+-11.62677001953125 -14.909978866577148 -18.91885757446289 -16.421764373779297 -15.917211532592773
+-15.454143524169922 -14.28858757019043 -14.329549789428711 -13.516512870788574 -15.161291122436523
+-14.137840270996094 -14.48200511932373 -14.00358772277832 -17.134559631347656 -14.228231430053711
+-14.116720199584961 -15.66433334350586 -12.732897758483887 -13.650927543640137 -13.985754013061523
+-12.114367485046387 -13.667549133300781 -16.13338279724121 -16.44911766052246 -13.523371696472168
+-13.346145629882812 -18.75728988647461 -17.335689544677734 -17.35544204711914 -14.020357131958008
+-12.425326347351074 -16.646942138671875 -14.301322937011719 -15.461494445800781 -15.571342468261719
+-12.522256851196289 -11.32237434387207 -12.392147064208984 -10.751494407653809 -13.668184280395508
+-11.427050590515137 -12.420162200927734 -12.610843658447266 -13.642584800720215 -13.698797225952148
+-10.28366470336914 -10.129068374633789 -17.50717544555664 -18.64311408996582 -14.564188957214355
+-12.855567932128906 -15.540145874023438 -19.281057357788086 -16.280914306640625 -15.508527755737305
+-13.631608963012695 -13.476893424987793 -10.74915599822998 -17.62717056274414 -15.61255168914795
+-14.164091110229492 -14.324302673339844 -17.250370025634766 -13.515758514404297 -15.604305267333984
+-15.42667293548584 -18.33716583251953 -14.98896598815918 -17.703462600708008 -14.412519454956055
+-11.1312255859375 -13.484000205993652 -10.89915657043457 -14.660863876342773 -14.351375579833984
+-11.8013916015625 -15.06019115447998 -14.530506134033203 -14.725985527038574 -15.17402458190918
+-11.380867004394531 -16.774526596069336 -19.0806941986084 -14.300642013549805 -14.787707328796387
+-15.317098617553711 -10.536006927490234 -16.74585723876953 -17.00075340270996 -14.233205795288086
+-12.836723327636719 -14.365041732788086 -13.245519638061523 -14.606501579284668 -15.848045349121094
+-14.671722412109375 -12.97309398651123 -18.96438980102539 -18.358306884765625 -14.383865356445312
+-12.006148338317871 -19.101789474487305 -18.057790756225586 -17.27611541748047 -14.254199981689453
+-12.239377975463867 -16.24175262451172 -18.39486312866211 -17.79523468017578 -15.77902889251709
+-11.026527404785156 -17.164255142211914 -16.89369773864746 -14.13223648071289 -14.647666931152344
+-13.34439468383789 -15.085588455200195 -14.015231132507324 -14.57590389251709 -15.548398971557617
+-8.840858459472656 -13.32087230682373 -11.948570251464844 -12.7437744140625 -12.76047420501709
+-15.12006664276123 -14.209697723388672 -16.21258544921875 -15.09268569946289 -15.725016593933105
+-12.218896865844727 -19.344924926757812 -14.896772384643555 -15.128798484802246 -15.085726737976074
+-13.855364799499512 -14.396207809448242 -14.475112915039062 -16.274978637695312 -16.561267852783203
+-12.931468963623047 -20.261959075927734 -15.067851066589355 -18.67504119873047 -15.390913963317871
+-10.541189193725586 -17.481229782104492 -16.9975643157959 -19.102977752685547 -14.517946243286133
+-12.27475643157959 -16.989179611206055 -16.667797088623047 -15.321989059448242 -13.522154808044434
+-13.157089233398438 -16.034408569335938 -18.0081729888916 -12.339900016784668 -14.611282348632812
+-11.697013854980469 -18.469486236572266 -17.101667404174805 -18.21478271484375 -15.592073440551758
+-10.410983085632324 -12.342016220092773 -19.91473960876465 -13.985015869140625 -13.253973007202148
+-11.477904319763184 -10.292394638061523 -14.22382926940918 -14.867258071899414 -14.294317245483398
+-13.928434371948242 -15.648571014404297 -13.287254333496094 -18.776403427124023 -14.822330474853516
+-15.338903427124023 -17.95416831970215 -19.120832443237305 -16.684314727783203 -16.52161407470703
+-11.756458282470703 -15.748832702636719 -16.6794376373291 -14.992359161376953 -14.095579147338867
+-12.05335807800293 -16.373342514038086 -17.24437713623047 -16.61781883239746 -13.433090209960938
+-10.43917465209961 -17.29708480834961 -14.298304557800293 -18.689228057861328 -13.535030364990234
+-12.81536865234375 -16.188444137573242 -13.543594360351562 -17.28397560119629 -13.897306442260742
+-11.524691581726074 -16.597545623779297 -16.99277114868164 -17.697744369506836 -15.411514282226562
+-12.537017822265625 -15.651408195495605 -17.060205459594727 -16.015666961669922 -14.72602367401123
+-16.58844757080078 -12.648893356323242 -12.805540084838867 -16.793424606323242 -14.052775382995605
+-8.198223114013672 -14.490667343139648 -15.456783294677734 -18.407733917236328 -13.935432434082031
+-12.235084533691406 -13.254622459411621 -17.220083236694336 -17.194355010986328 -16.625526428222656
+-11.645500183105469 -12.380645751953125 -16.97612762451172 -16.918365478515625 -16.081634521484375
+-15.551166534423828 -18.355005264282227 -17.210491180419922 -16.315765380859375 -16.989500045776367
+-11.072067260742188 -14.111711502075195 -14.02056884765625 -13.801989555358887 -14.380411148071289
+-10.74884033203125 -17.039485931396484 -16.053722381591797 -17.099529266357422 -14.157671928405762
+-14.432212829589844 -15.668070793151855 -17.897315979003906 -19.030807495117188 -16.767248153686523
+-10.875746726989746 -16.729921340942383 -16.830699920654297 -10.96748161315918 -14.065666198730469
+-12.992372512817383 -13.389198303222656 -15.847464561462402 -18.74746322631836 -14.213224411010742
+-16.27631378173828 -15.575210571289062 -15.83560562133789 -19.659807205200195 -15.64935302734375
+-14.469172477722168 -17.462440490722656 -13.705839157104492 -14.783185958862305 -15.626546859741211
+-12.304758071899414 -11.911407470703125 -15.557670593261719 -12.007896423339844 -13.70199966430664
+-12.623868942260742 -13.375839233398438 -17.584022521972656 -14.576876640319824 -15.300482749938965
+-12.100661277770996 -15.983474731445312 -13.280969619750977 -16.395549774169922 -13.329227447509766
+-11.029035568237305 -17.723522186279297 -16.722801208496094 -16.074941635131836 -14.450968742370605
+-14.947690963745117 -13.623953819274902 -17.58797836303711 -18.477333068847656 -14.811092376708984
+-15.929779052734375 -12.438506126403809 -13.058927536010742 -18.52623748779297 -14.129800796508789
+-14.234972953796387 -16.270557403564453 -16.478979110717773 -16.46598243713379 -16.054523468017578
+-11.928365707397461 -12.55320930480957 -17.857826232910156 -15.371994018554688 -15.036209106445312
+-13.607194900512695 -15.683792114257812 -16.34781837463379 -15.106184005737305 -13.208913803100586
+-12.801432609558105 -18.26355743408203 -17.48211669921875 -12.184883117675781 -15.798808097839355
+-14.617281913757324 -14.461203575134277 -18.154417037963867 -14.951545715332031 -14.540334701538086
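
Each row above holds the scores that evaluate() in run_pretrain.py assigns to the five candidate endings of one dev question; a score is the negative mean masked-token cross-entropy, so higher is better and the row-wise argmax is the predicted answer (acc = 0.525 in eval_results.txt is the fraction of rows whose argmax hits the gold label). A small sketch of reading the file back:

# Sketch: recover per-question predictions from logits_test.txt.
import numpy as np

logits = np.loadtxt('logits_test.txt')  # shape (120, 5)
preds = logits.argmax(axis=1)           # highest-scoring candidate per question
# given gold labels, accuracy would be (preds == gold).mean()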
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f9e50c31777d2402062072d7ea15663f5a6b50395c09328957f06df9b7f7138
+size 1740904889
run_pretrain.py
ADDED
@@ -0,0 +1,651 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import json
+import logging
+import os
+import random
+import wandb
+
+import numpy as np
+import torch
+from torch.optim import AdamW
+from torch.utils.data import DataLoader
+from torch.utils.data import RandomSampler
+from torch.utils.data import SequentialSampler
+from torch.utils.data.distributed import DistributedSampler
+from torch.utils.tensorboard import SummaryWriter
+from tqdm import tqdm
+from tqdm import trange
+from transformers import DebertaV2Config
+from transformers import DebertaV2ForMaskedLM
+from transformers import DebertaV2Tokenizer
+from transformers import RobertaConfig
+from transformers import RobertaForMaskedLM
+from transformers import RobertaTokenizer
+from transformers import get_linear_schedule_with_warmup
+
+from data_utils import accuracy
+from data_utils import convert_examples_to_features
+from data_utils import myprocessors
+
+from evaluate_DeBERTa import eval_tasks
+from evaluate_DeBERTa import main as evaluate_main
+
+logger = logging.getLogger(__name__)
+
+from transformers import MODEL_WITH_LM_HEAD_MAPPING
+
+MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
+MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+MODEL_CLASSES = {
+    'roberta-mlm': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
+    'deberta-mlm': (DebertaV2Config, DebertaV2ForMaskedLM, DebertaV2Tokenizer)
+}
+
+
+class MyDataset(torch.utils.data.Dataset):
+
+    def __init__(self, data, pad_token, mask_token, max_words_to_mask):
+        self.data = data
+        self.pad_token = pad_token
+        self.mask_token = mask_token
+        self.max_words_to_mask = max_words_to_mask
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        sample = self.data[idx]
+        return sample, self.pad_token, self.mask_token, self.max_words_to_mask
+
+
+def mCollateFn(batch):
+    batch_input_ids = []
+    batch_input_mask = []
+    batch_input_labels = []
+    batch_label_ids = []
+    features = [b[0] for b in batch]
+    pad_token = batch[0][1]
+    mask_token = batch[0][2]
+    MAX_WORDS_TO_MASK = batch[0][3]
+    max_len = max([len(cand) for f in features for cand in f[0]])
+    for f in features:
+        batch_input_ids.append([])
+        batch_input_mask.append([])
+        batch_input_labels.append([])
+        batch_label_ids.append(f[2])
+        for i in range(len(f[0])):
+            masked_sequences = []
+            masked_labels = []
+            this_att_mask = []
+            sequence = f[0][i] + [pad_token] * (max_len - len(f[0][i]))
+            label_sequence = f[1][i] + [-100] * (max_len - len(f[1][i]))
+            valid_indices = [l_i for l_i, l in enumerate(label_sequence) if l != -100]
+            if len(valid_indices) > MAX_WORDS_TO_MASK:
+                rm_indices = random.sample(valid_indices, (len(valid_indices) - MAX_WORDS_TO_MASK))
+                label_sequence = [-100 if l_i in rm_indices else l for l_i, l in enumerate(label_sequence)]
+            for j, t in enumerate(label_sequence):
+                if t == -100:
+                    continue
+                else:
+                    # one masked copy of the candidate per maskable token
+                    masked_sequences.append(sequence[:j] + [mask_token] + sequence[j + 1:])
+                    masked_labels.append([-100] * j + [sequence[j]] + [-100] * (max_len - j - 1))
+                this_att_mask.append([1] * len(f[0][i]) + [0] * (max_len - len(f[0][i])))
+            batch_input_ids[-1].append(torch.tensor(masked_sequences, dtype=torch.long))
+            batch_input_mask[-1].append(torch.tensor(this_att_mask, dtype=torch.long))
+            batch_input_labels[-1].append(torch.tensor(masked_labels, dtype=torch.long))
+    return batch_input_ids, batch_input_mask, batch_input_labels, torch.tensor(batch_label_ids, dtype=torch.long)
+
+
+def set_seed(args):
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if args.n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+
+def count_parameters(model):
+    return sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+
+def train(args, train_dataset, model, tokenizer, eval_dataset):
+    """ Train the model """
+    if args.local_rank in [-1, 0]:
+        tb_writer = SummaryWriter(os.path.join(args.output_dir, 'runs'))
+
+    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
+    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
+    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size,
+                                  collate_fn=mCollateFn)
+
+    if args.max_steps > 0:
+        t_total = args.max_steps
+        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
+    else:
+        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+
+    # Prepare optimizer and schedule (linear warmup and decay)
+    no_decay = ['bias', 'LayerNorm.weight']
+    optimizer_grouped_parameters = [
+        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+         'weight_decay': args.weight_decay},
+        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+    ]
+
+    warmup_steps = args.warmup_steps if args.warmup_steps != 0 else int(args.warmup_proportion * t_total)
+    logger.info("warm up steps = %d", warmup_steps)
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon, betas=(0.9, 0.98))
+    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)
+
+    if args.fp16:
+        try:
+            from apex import amp
+        except ImportError:
+            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
+
+    # multi-gpu training (should be after apex fp16 initialization)
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    # Distributed training (should be after apex fp16 initialization)
+    if args.local_rank != -1:
+        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
+                                                          output_device=args.local_rank,
+                                                          find_unused_parameters=True)
+    # Train!
+    logger.info("***** Running training *****")
+    logger.info("  Num examples = %d", len(train_dataset))
+    logger.info("  Num Epochs = %d", args.num_train_epochs)
+    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
+    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
+                args.train_batch_size * args.gradient_accumulation_steps * (
+                    torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
+    logger.info("  Total optimization steps = %d", t_total)
+
+    global_step = 0
+    tr_loss, logging_loss = 0.0, 0.0
+    model.zero_grad()
+    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
+    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
+    curr_best = 0.0
+    CE = torch.nn.CrossEntropyLoss(reduction='none')
+    loss_fct = torch.nn.MultiMarginLoss(margin=args.margin)
+    for _ in train_iterator:
+        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
+        for step, batch in tqdm(enumerate(epoch_iterator), desc=f"Train Epoch {_}"):
+            model.train()
+            num_cand = len(batch[0][0])
+            choice_loss = []
+            choice_seq_lens = np.array([0] + [len(c) for sample in batch[0] for c in sample])
+            choice_seq_lens = np.cumsum(choice_seq_lens)
+            input_ids = torch.cat([c for sample in batch[0] for c in sample], dim=0).to(args.device)
+            att_mask = torch.cat([c for sample in batch[1] for c in sample], dim=0).to(args.device)
+            input_labels = torch.cat([c for sample in batch[2] for c in sample], dim=0).to(args.device)
+
+            if len(input_ids) < args.max_sequence_per_time:
+                inputs = {'input_ids': input_ids,
+                          'attention_mask': att_mask}
+                outputs = model(**inputs)
+                ce_loss = CE(outputs[0].view(-1, outputs[0].size(-1)), input_labels.view(-1))
+                ce_loss = ce_loss.view(outputs[0].size(0), -1).sum(1)
+            else:
+                ce_loss = []
+                for chunk in range(0, len(input_ids), args.max_sequence_per_time):
+                    inputs = {'input_ids': input_ids[chunk:chunk + args.max_sequence_per_time],
+                              'attention_mask': att_mask[chunk:chunk + args.max_sequence_per_time]}
+                    outputs = model(**inputs)
+                    tmp_ce_loss = CE(outputs[0].view(-1, outputs[0].size(-1)),
+                                     input_labels[chunk:chunk + args.max_sequence_per_time].view(-1))
+                    tmp_ce_loss = tmp_ce_loss.view(outputs[0].size(0), -1).sum(1)
+                    ce_loss.append(tmp_ce_loss)
+                ce_loss = torch.cat(ce_loss, dim=0)
+            # all tokens are valid
+            for c_i in range(len(choice_seq_lens) - 1):
+                start = choice_seq_lens[c_i]
+                end = choice_seq_lens[c_i + 1]
+                choice_loss.append(-ce_loss[start:end].sum() / (end - start))
+
+            choice_loss = torch.stack(choice_loss)
+            choice_loss = choice_loss.view(-1, num_cand)
+            loss = loss_fct(choice_loss, batch[3].to(args.device))
+
+            if args.n_gpu > 1:
+                loss = loss.mean()  # mean() to average on multi-gpu parallel training
+            if args.gradient_accumulation_steps > 1:
+                loss = loss / args.gradient_accumulation_steps
+
+            if args.fp16:
+                with amp.scale_loss(loss, optimizer) as scaled_loss:
+                    scaled_loss.backward()
+            else:
+                loss.backward()
+
+            tr_loss += loss.item()
+
+            if (step + 1) % args.gradient_accumulation_steps == 0:
+                optimizer.step()
+                scheduler.step()  # Update learning rate schedule
+                model.zero_grad()
+                global_step += 1
+
+                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
+                    # Log metrics
+                    tb_writer.add_scalar('lr', scheduler.get_last_lr()[0], global_step)
+                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step)
+                    tb_writer.add_scalar('Batch_loss', loss.item() * args.gradient_accumulation_steps, global_step)
+                    logger.info(" global_step = %s, average loss = %s", global_step,
+                                (tr_loss - logging_loss) / args.logging_steps)
+                    wandb.log({"train/loss": loss.item()})
+                    logging_loss = tr_loss
+
+                if args.local_rank == -1 and args.evaluate_during_training and global_step % args.save_steps == 0:
+                    torch.cuda.empty_cache()
+                    results = evaluate(args, model, tokenizer, eval_dataset)
+                    wandb.log({"eval/" + k: v for k, v in results.items()})
+                    for key, value in results.items():
+                        tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
+                    if results['acc'] > curr_best:
+                        curr_best = results['acc']
+                        print("At iteration {}, best acc is {}".format(global_step, curr_best))
+                        # Save model checkpoint
+                        output_dir = args.output_dir
+                        if not os.path.exists(output_dir):
+                            os.makedirs(output_dir)
+                        model_to_save = model.module if hasattr(model,
+                                                                'module') else model  # Take care of distributed/parallel training
+                        model_to_save.save_pretrained(output_dir)
+                        tokenizer.save_pretrained(output_dir)
+                        torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                        logger.info("Saving model checkpoint to %s", output_dir)
+
+            if args.max_steps > 0 and global_step > args.max_steps:
+                epoch_iterator.close()
+                break
+        if args.max_steps > 0 and global_step > args.max_steps:
+            train_iterator.close()
+            break
+    results = evaluate(args, model, tokenizer, eval_dataset)
+    for key, value in results.items():
+        tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
+    if results['acc'] > curr_best:
+        curr_best = results['acc']
+        # Save model checkpoint
+        output_dir = args.output_dir
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir)
+        model_to_save = model.module if hasattr(model,
+                                                'module') else model  # Take care of distributed/parallel training
+        model_to_save.save_pretrained(output_dir)
+        tokenizer.save_pretrained(output_dir)
+        torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+        logger.info("Saving model checkpoint to %s", output_dir)
+    if args.local_rank in [-1, 0]:
+        tb_writer.close()
+    return global_step, tr_loss / global_step
+
+
+def save_logits(logits_all, filename):
+    with open(filename, "w") as f:
+        for i in range(len(logits_all)):
+            for j in range(len(logits_all[i])):
+                f.write(str(logits_all[i][j]))
+                if j == len(logits_all[i]) - 1:
+                    f.write("\n")
+                else:
+                    f.write(" ")
+
+
+def evaluate(args, model, tokenizer, eval_dataset):
+    results = {}
+    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+        os.makedirs(args.output_dir)
+
+    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
+    # Note that DistributedSampler samples randomly
+    eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
+    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size,
+                                 collate_fn=mCollateFn)
+
+    # Eval!
+    logger.info("***** Running evaluation *****")
+    logger.info("  Num examples = %d", len(eval_dataset))
+    logger.info("  Batch size = %d", args.eval_batch_size)
+    CE = torch.nn.CrossEntropyLoss(reduction='none')
+    preds = []
+    out_label_ids = []
+    for batch in tqdm(eval_dataloader, desc="Evaluating"):
+        model.eval()
+        with torch.no_grad():
+            num_cand = len(batch[0][0])
+            choice_loss = []
+            choice_seq_lens = np.array([0] + [len(c) for sample in batch[0] for c in sample])
+            choice_seq_lens = np.cumsum(choice_seq_lens)
+            input_ids = torch.cat([c for sample in batch[0] for c in sample], dim=0).to(args.device)
+            att_mask = torch.cat([c for sample in batch[1] for c in sample], dim=0).to(args.device)
+            input_labels = torch.cat([c for sample in batch[2] for c in sample], dim=0).to(args.device)
+            if len(input_ids) < args.max_sequence_per_time:
+                inputs = {'input_ids': input_ids,
+                          'attention_mask': att_mask}
+                outputs = model(**inputs)
+                ce_loss = CE(outputs[0].view(-1, outputs[0].size(-1)), input_labels.view(-1))
+                ce_loss = ce_loss.view(outputs[0].size(0), -1).sum(1)
+            else:
+                ce_loss = []
+                for chunk in range(0, len(input_ids), args.max_sequence_per_time):
+                    inputs = {'input_ids': input_ids[chunk:chunk + args.max_sequence_per_time],
+                              'attention_mask': att_mask[chunk:chunk + args.max_sequence_per_time]}
+                    outputs = model(**inputs)
+                    tmp_ce_loss = CE(outputs[0].view(-1, outputs[0].size(-1)),
+                                     input_labels[chunk:chunk + args.max_sequence_per_time].view(-1))
+                    tmp_ce_loss = tmp_ce_loss.view(outputs[0].size(0), -1).sum(1)
+                    ce_loss.append(tmp_ce_loss)
+                ce_loss = torch.cat(ce_loss, dim=0)
+            for c_i in range(len(choice_seq_lens) - 1):
+                start = choice_seq_lens[c_i]
+                end = choice_seq_lens[c_i + 1]
+                choice_loss.append(-ce_loss[start:end].sum() / (end - start))
+            choice_loss = torch.stack(choice_loss)
+            choice_loss = choice_loss.view(-1, num_cand)
+        preds.append(choice_loss)
+        out_label_ids.append(batch[3].numpy())
+    preds = torch.cat(preds, dim=0).cpu().numpy()
+    save_logits(preds.tolist(), os.path.join(args.output_dir, args.logits_file))
+    preds = np.argmax(preds, axis=1)
+    result = accuracy(preds, np.concatenate(out_label_ids, axis=0))
+    results.update(result)
+    output_eval_file = os.path.join(args.output_dir, args.results_file)
+    with open(output_eval_file, "w") as writer:
+        logger.info("***** Eval results *****")
+        for key in sorted(result.keys()):
+            print("%s = %s\n" % (key, str(result[key])))
+            logger.info("  %s = %s", key, str(result[key]))
+            writer.write("%s = %s\n" % (key, str(result[key])))
+    return results
+
+
+def write_data(filename, data):
+    with open(filename, 'w') as fout:
+        for sample in data:
+            fout.write(json.dumps(sample))
+            fout.write('\n')
+
+
+def load_and_cache_examples(args, task, tokenizer, evaluate=False):
+    if args.local_rank not in [-1, 0] and not evaluate:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+    processor = myprocessors[task](args)
+    cached_features_file = os.path.join(args.output_dir, 'cached_{}_{}_{}_{}'.format(
+        'dev' if evaluate else 'train',
+        str(args.model_type),
+        str(args.max_seq_length),
+        str(task)))
+    if os.path.exists(cached_features_file):  # remove evaluate
+        features = torch.load(cached_features_file)
+    else:
+        examples = processor.get_dev_examples() if evaluate else processor.get_train_examples()
+        features = convert_examples_to_features(examples, tokenizer, max_length=args.max_seq_length)
+        # if evaluate:
+        torch.save(features, cached_features_file)
+    if args.local_rank == 0 and not evaluate:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+    print('max_words_to_mask is %s for pretraining tasks %s' % (args.max_words_to_mask, task))
+    return MyDataset(features, tokenizer.pad_token_id, tokenizer.mask_token_id, args.max_words_to_mask)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    ## Required parameters
+    parser.add_argument("--train_file", default=None, type=str, required=True,
+                        help="The train file name")
+    parser.add_argument("--dev_file", default=None, type=str, required=True,
+                        help="The dev file name")
+    parser.add_argument("--model_type", default=None, type=str, required=True,
+                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
+    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
+                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(
+                            MODEL_TYPES))
+    parser.add_argument("--config_name", default="", type=str,
+                        help="Pretrained config name or path if not the same as model_name")
+    parser.add_argument("--tokenizer_name", default="", type=str,
+                        help="Pretrained tokenizer name or path if not the same as model_name")
+    parser.add_argument("--cache_dir", default=".cache", type=str,
+                        help="Where do you want to store the pre-trained models downloaded")
+    parser.add_argument("--task_name", default=None, type=str, required=True,
+                        help="The name of the task to train selected in the list: " + ", ".join(myprocessors.keys()))
+    parser.add_argument("--output_dir", default=None, type=str, required=True,
+                        help="The output directory where the model predictions and checkpoints will be written.")
+
+    ## Other parameters
+    parser.add_argument("--second_train_file", default=None, type=str,
+                        help="Used when combining ATOMIC and CWWV")
+    parser.add_argument("--second_dev_file", default=None, type=str,
+                        help="Used when combining ATOMIC and CWWV")
+    parser.add_argument("--max_seq_length", default=128, type=int,
+                        help="The maximum total input sequence length after tokenization. Sequences longer "
+                             "than this will be truncated, sequences shorter will be padded.")
+    parser.add_argument("--max_words_to_mask", default=6, type=int,
+                        help="The maximum number of tokens to mask when computing scores")
+    parser.add_argument("--max_sequence_per_time", default=80, type=int,
+                        help="The maximum number of sequences to feed into the model")
+    parser.add_argument("--do_train", action='store_true',
+                        help="Whether to run training.")
+    parser.add_argument("--do_eval", action='store_true',
+                        help="Whether to run eval on the dev set.")
+    parser.add_argument("--do_ext_eval", action='store_true',
+                        help="Whether to run external eval on the downstream mcqa datasets.")
+    parser.add_argument("--evaluate_during_training", action='store_true',
+                        help="Run evaluation during training at each logging step.")
+    parser.add_argument("--do_lower_case", action='store_true',
+                        help="Set this flag if you are using an uncased model.")
+    parser.add_argument("--per_gpu_train_batch_size", default=1, type=int,
+                        help="Batch size per GPU/CPU for training.")
+    parser.add_argument("--per_gpu_eval_batch_size", default=1, type=int,
+                        help="Batch size per GPU/CPU for evaluation.")
+    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
+                        help="Number of update steps to accumulate before performing a backward/update pass.")
+    parser.add_argument("--margin", default=1.0, type=float,
+                        help="The margin for ranking loss")
+    parser.add_argument("--learning_rate", default=1e-5, type=float,
+                        help="The initial learning rate for Adam.")
+    parser.add_argument("--weight_decay", default=0.01, type=float,
+                        help="Weight decay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-6, type=float,
+                        help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float,
+                        help="Max gradient norm.")
+    parser.add_argument("--num_train_epochs", default=1.0, type=float,
+                        help="Total number of training epochs to perform.")
+    parser.add_argument("--max_steps", default=-1, type=int,
+                        help="If > 0: set total number of training steps to perform. Overrides num_train_epochs.")
+    parser.add_argument("--warmup_steps", default=0, type=int,
+                        help="Linear warmup over warmup_steps.")
+    parser.add_argument("--warmup_proportion", default=0.05, type=float,
+                        help="Linear warmup over warmup proportion.")
+    parser.add_argument('--logging_steps', type=int, default=50,
+                        help="Log every X updates steps.")
+    parser.add_argument('--save_steps', type=int, default=50,
+                        help="Save checkpoint every X updates steps.")
+    parser.add_argument("--logits_file", default='logits_test.txt', type=str,
+                        help="The file where prediction logits will be written")
+    parser.add_argument("--results_file", default='eval_results.txt', type=str,
+                        help="The file where eval results will be written")
+    parser.add_argument("--no_cuda", action='store_true',
+                        help="Avoid using CUDA when available")
+    parser.add_argument('--overwrite_output_dir', action='store_true',
+                        help="Overwrite the content of the output directory")
+    parser.add_argument('--seed', type=int, default=42,
+                        help="random seed for initialization")
+    parser.add_argument('--fp16', action='store_true',
+                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
+    parser.add_argument('--fp16_opt_level', type=str, default='O1',
+                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+                             "See details at https://nvidia.github.io/apex/amp.html")
+    parser.add_argument("--local_rank", type=int, default=-1,
+                        help="For distributed training: local_rank")
+    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
+    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
+
+    ### for extrinsic evaluation
+
+    parser.add_argument("--eval_output_dir", default="./output/eval_results", type=str, required=True,
+                        help="output of the predictions")
+
+    args = parser.parse_args()
+
+    wandb.init(project="car_mcqa", config=args)
+
+    if os.path.exists(args.output_dir) and os.listdir(
+            args.output_dir) and not args.overwrite_output_dir and args.do_train:
+        raise ValueError(
+            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                args.output_dir))
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)
+
+    # Setup CUDA, GPU & distributed training
+    if args.local_rank == -1 or args.no_cuda:
+        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        args.n_gpu = torch.cuda.device_count()
+    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+        torch.distributed.init_process_group(backend='nccl')
+        args.n_gpu = 1
+    args.device = device
+
+    if args.do_train:
+        for handler in logging.root.handlers[:]:
+            logging.root.removeHandler(handler)
+    # Setup logging
+    if args.do_train:
+        log_file = os.path.join(args.output_dir, 'train.log')
+        logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+                            datefmt='%m/%d/%Y %H:%M:%S',
+                            level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
+                            filename=log_file)
+        logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+                       args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+        os.system("cp run_pretrain.py %s" % os.path.join(args.output_dir, 'run_pretrain.py'))
+        os.system("cp data_utils.py %s" % os.path.join(args.output_dir, 'data_utils.py'))
+
+    # Set seed
+    set_seed(args)
+    args.task_name = args.task_name.lower()
+    if args.task_name not in myprocessors:
+        raise ValueError("Task not found: %s" % (args.task_name))
+
+    args.model_type = args.model_type.lower()
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
+                                          finetuning_task=args.task_name, cache_dir=args.cache_dir)
+    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+                                                do_lower_case=args.do_lower_case, cache_dir=args.cache_dir)
+    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path),
+                                        config=config, cache_dir=args.cache_dir)
+
+    count = count_parameters(model)
+    print("number of params", count)
+
+    if args.local_rank == 0:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    model.to(args.device)
+
+    logger.info("Training/evaluation parameters %s", args)
+
+    eval_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=True)
|
583 |
+
print("num of eval set", len(eval_dataset))
|
584 |
+
|
585 |
+
if args.do_train:
|
586 |
+
init_result = evaluate(args, model, tokenizer, eval_dataset)
|
587 |
+
print(init_result)
|
588 |
+
|
589 |
+
if args.do_train:
|
590 |
+
train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
|
591 |
+
print("num train examples", len(train_dataset))
|
592 |
+
global_step, tr_loss = train(args, train_dataset, model, tokenizer, eval_dataset)
|
593 |
+
logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
|
594 |
+
|
595 |
+
# Evaluation
|
596 |
+
|
597 |
+
results = {}
|
598 |
+
if args.do_eval:
|
599 |
+
tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
|
600 |
+
model = model_class.from_pretrained(args.output_dir)
|
601 |
+
model.eval()
|
602 |
+
model.to(args.device)
|
603 |
+
result = evaluate(args, model, tokenizer, eval_dataset)
|
604 |
+
|
605 |
+
|
606 |
+
# do extrinsic evaluation
|
607 |
+
|
608 |
+
if args.do_ext_eval:
|
609 |
+
del model
|
610 |
+
import gc
|
611 |
+
gc.collect()
|
612 |
+
torch.cuda.empty_cache()
|
613 |
+
|
614 |
+
|
615 |
+
ext_results = {}
|
616 |
+
|
617 |
+
for task_name, dataset_path in eval_tasks:
|
618 |
+
eval_args = argparse.Namespace()
|
619 |
+
eval_args.dataset_file = dataset_path
|
620 |
+
eval_args.lm = args.output_dir
|
621 |
+
eval_args.out_dir = os.path.join(args.eval_output_dir, os.path.basename( args.output_dir))
|
622 |
+
eval_args.device = 0
|
623 |
+
eval_args.reader = task_name
|
624 |
+
eval_args.overwrite_output_dir = args.overwrite_output_dir
|
625 |
+
eval_args.cache_dir = None
|
626 |
+
if task_name in ["socialiqa", "winogrande", "piqa", "commonsenseqa", "anli"]:
|
627 |
+
acc = evaluate_main(eval_args)
|
628 |
+
ext_results[task_name] = acc
|
629 |
+
else:
|
630 |
+
tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
|
631 |
+
model = model_class.from_pretrained(args.output_dir)
|
632 |
+
model.eval()
|
633 |
+
model.to(args.device)
|
634 |
+
|
635 |
+
# load data
|
636 |
+
examples = []
|
637 |
+
with open(dataset_path, "r") as f:
|
638 |
+
for row in tqdm(f):
|
639 |
+
sample = json.loads(row)
|
640 |
+
examples.append(sample)
|
641 |
+
features = convert_examples_to_features(examples, tokenizer, max_length=args.max_seq_length)
|
642 |
+
eval_dataset = MyDataset(features, tokenizer.pad_token_id, tokenizer.mask_token_id, args.max_words_to_mask)
|
643 |
+
result = evaluate(args, model, tokenizer, eval_dataset)
|
644 |
+
ext_results[task_name] = result['acc']
|
645 |
+
|
646 |
+
wandb.log({"ext/"+task_name:acc for task_name, acc in ext_results.items()})
|
647 |
+
|
648 |
+
# return results
|
649 |
+
|
650 |
+
if __name__ == "__main__":
|
651 |
+
main()
|
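The Namespace echoed in train.log below pins down the whole training schedule. A minimal sketch, not part of the repo, that reproduces the numbers the log reports, assuming the usual transformers-style bookkeeping and the single GPU the run used:

# Schedule arithmetic implied by the Namespace recorded in train.log.
# All constants are copied from the log; the formulas are an assumption,
# not code from run_pretrain.py.
num_examples = 534833             # "Num examples" in train.log
per_gpu_train_batch_size = 2      # from the logged Namespace
gradient_accumulation_steps = 16
n_gpu = 1
num_train_epochs = 1
warmup_proportion = 0.05

effective_batch = per_gpu_train_batch_size * gradient_accumulation_steps * n_gpu
print(effective_batch)            # 32, matching "Total train batch size ... = 32"

total_steps = num_examples // (per_gpu_train_batch_size * n_gpu) // gradient_accumulation_steps * num_train_epochs
print(total_steps)                # 16713, matching "Total optimization steps = 16713"

print(int(total_steps * warmup_proportion))  # 835, matching "warm up steps = 835"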
runs/events.out.tfevents.1695471913.car-atm-2i-half-sample-name-1-0-0.28.0
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a757fc252e0640c729dd7f1f2fcf08dadb6184a9d3570e98e3011cc339dbd081
size 46125
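The uploaded event file is readable offline with TensorBoard's event reader. A minimal sketch, assuming the tensorboard package is installed; the scalar tag names are whatever run_pretrain.py chose to log, so the sketch lists them rather than guessing:

# Inspect the uploaded tfevents file with the standard EventAccumulator API.
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

ea = EventAccumulator("runs/events.out.tfevents.1695471913.car-atm-2i-half-sample-name-1-0-0.28.0")
ea.Reload()                       # parse the event file
for tag in ea.Tags()["scalars"]:  # discover the logged scalar tags
    events = ea.Scalars(tag)
    print(tag, len(events), events[-1].value)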
special_tokens_map.json
ADDED
@@ -0,0 +1,9 @@
{
  "bos_token": "[CLS]",
  "cls_token": "[CLS]",
  "eos_token": "[SEP]",
  "mask_token": "[MASK]",
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "unk_token": "[UNK]"
}
spm.model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c679fbf93643d19aab7ee10c0b99e460bdbc02fedf34b92b05af343b4af586fd
size 2464616
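spm.model is the SentencePiece vocabulary that tokenizer_config.json points at (vocab_type "spm"). A minimal inspection sketch, assuming the sentencepiece package; the sample sentence is arbitrary, not from the commit:

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load("spm.model")
print(sp.GetPieceSize())                        # size of the SentencePiece vocabulary
print(sp.EncodeAsPieces("PersonX buys a car"))  # subword segmentation of an arbitrary sentence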
tokenizer_config.json
ADDED
@@ -0,0 +1,16 @@
{
  "bos_token": "[CLS]",
  "clean_up_tokenization_spaces": true,
  "cls_token": "[CLS]",
  "do_lower_case": false,
  "eos_token": "[SEP]",
  "mask_token": "[MASK]",
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "sp_model_kwargs": {},
  "split_by_punct": false,
  "tokenizer_class": "DebertaV2Tokenizer",
  "unk_token": "[UNK]",
  "vocab_type": "spm"
}
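config.json declares DebertaV2ForMaskedLM and tokenizer_config.json declares DebertaV2Tokenizer, so the checkpoint should load through the standard transformers Auto classes. A minimal loading-and-scoring sketch; "path/to/this/repo" is a placeholder, and the probe sentence is an arbitrary ATOMIC-style example, not something the commit specifies:

import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

repo = "path/to/this/repo"  # placeholder: a local clone or the Hub repo id
tokenizer = AutoTokenizer.from_pretrained(repo)
model = AutoModelForMaskedLM.from_pretrained(repo)
model.eval()

# Score the [MASK] position, mirroring the masked-LM objective the model
# was fine-tuned with.
inputs = tokenizer("PersonX buys a new car, so PersonX feels [MASK].", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
mask_pos = (inputs.input_ids == tokenizer.mask_token_id).nonzero()[0, 1]
pred_id = logits[0, mask_pos].argmax(-1).item()
print(tokenizer.convert_ids_to_tokens(pred_id))  # most likely filler for [MASK]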
train.log
ADDED
@@ -0,0 +1,557 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
09/23/2023 12:10:45 - WARNING - __main__ - Process rank: -1, device: cuda, n_gpu: 1, distributed training: False, 16-bits training: False
|
2 |
+
09/23/2023 12:11:04 - INFO - __main__ - Training/evaluation parameters Namespace(train_file='../../../data/mcqa/atomic/train_atm_n_2i_half_sample_name.jsonl', dev_file='../../../data/mcqa/atomic/dev_random_10k.jsonl', model_type='deberta-mlm', model_name_or_path='microsoft/deberta-v3-large', config_name='', tokenizer_name='', cache_dir='.cache', task_name='atomic', output_dir='output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6', second_train_file=None, second_dev_file=None, max_seq_length=128, max_words_to_mask=6, max_sequence_per_time=80, do_train=True, do_eval=True, do_ext_eval=True, evaluate_during_training=True, do_lower_case=False, per_gpu_train_batch_size=2, per_gpu_eval_batch_size=16, gradient_accumulation_steps=16, margin=1.0, learning_rate=5e-06, weight_decay=0.01, adam_epsilon=1e-06, max_grad_norm=1.0, num_train_epochs=1.0, max_steps=-1, warmup_steps=0, warmup_proportion=0.05, logging_steps=50, save_steps=500, logits_file='logits_test.txt', results_file='eval_results.txt', no_cuda=False, overwrite_output_dir=False, seed=42, fp16=False, fp16_opt_level='O1', local_rank=-1, server_ip='', server_port='', eval_output_dir='./eval_results', n_gpu=1, device=device(type='cuda'))
|
3 |
+
09/23/2023 12:11:13 - INFO - __main__ - ***** Running evaluation *****
|
4 |
+
09/23/2023 12:11:13 - INFO - __main__ - Num examples = 10000
|
5 |
+
09/23/2023 12:11:13 - INFO - __main__ - Batch size = 16
|
6 |
+
09/23/2023 12:15:11 - INFO - __main__ - ***** Eval results *****
|
7 |
+
09/23/2023 12:15:11 - INFO - __main__ - acc = 0.3392
|
8 |
+
09/23/2023 12:25:13 - INFO - __main__ - warm up steps = 835
|
9 |
+
09/23/2023 12:25:13 - INFO - __main__ - ***** Running training *****
|
10 |
+
09/23/2023 12:25:13 - INFO - __main__ - Num examples = 534833
|
11 |
+
09/23/2023 12:25:13 - INFO - __main__ - Num Epochs = 1
|
12 |
+
09/23/2023 12:25:13 - INFO - __main__ - Instantaneous batch size per GPU = 2
|
13 |
+
09/23/2023 12:25:13 - INFO - __main__ - Total train batch size (w. parallel, distributed & accumulation) = 32
|
14 |
+
09/23/2023 12:25:13 - INFO - __main__ - Gradient Accumulation steps = 16
|
15 |
+
09/23/2023 12:25:13 - INFO - __main__ - Total optimization steps = 16713
|
16 |
+
09/23/2023 12:28:54 - INFO - __main__ - global_step = 50, average loss = 0.6903331369534135
|
17 |
+
09/23/2023 12:32:33 - INFO - __main__ - global_step = 100, average loss = 0.6819266405794769
|
18 |
+
09/23/2023 12:36:13 - INFO - __main__ - global_step = 150, average loss = 0.6690767159638926
|
19 |
+
09/23/2023 12:39:56 - INFO - __main__ - global_step = 200, average loss = 0.6476348407182377
|
20 |
+
09/23/2023 12:43:39 - INFO - __main__ - global_step = 250, average loss = 0.6220815655076877
|
21 |
+
09/23/2023 12:47:19 - INFO - __main__ - global_step = 300, average loss = 0.5299683179453859
|
22 |
+
09/23/2023 12:50:56 - INFO - __main__ - global_step = 350, average loss = 0.39345016410181416
|
23 |
+
09/23/2023 12:54:38 - INFO - __main__ - global_step = 400, average loss = 0.31127411118301096
|
24 |
+
09/23/2023 12:58:19 - INFO - __main__ - global_step = 450, average loss = 0.25150225180907
|
25 |
+
09/23/2023 13:02:00 - INFO - __main__ - global_step = 500, average loss = 0.22586858159028453
|
26 |
+
09/23/2023 13:02:01 - INFO - __main__ - ***** Running evaluation *****
|
27 |
+
09/23/2023 13:02:01 - INFO - __main__ - Num examples = 10000
|
28 |
+
09/23/2023 13:02:01 - INFO - __main__ - Batch size = 16
|
29 |
+
09/23/2023 13:05:56 - INFO - __main__ - ***** Eval results *****
|
30 |
+
09/23/2023 13:05:56 - INFO - __main__ - acc = 0.6996
|
31 |
+
09/23/2023 13:06:23 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
|
32 |
+
09/23/2023 13:10:02 - INFO - __main__ - global_step = 550, average loss = 0.22251796642665794
|
33 |
+
09/23/2023 13:13:46 - INFO - __main__ - global_step = 600, average loss = 0.19366045010890956
|
34 |
+
09/23/2023 13:17:29 - INFO - __main__ - global_step = 650, average loss = 0.18587105088678071
|
35 |
+
09/23/2023 13:21:15 - INFO - __main__ - global_step = 700, average loss = 0.1760789550206391
|
36 |
+
09/23/2023 13:24:59 - INFO - __main__ - global_step = 750, average loss = 0.18312411408871412
|
37 |
+
09/23/2023 13:28:42 - INFO - __main__ - global_step = 800, average loss = 0.15576540186157217
|
38 |
+
09/23/2023 13:32:25 - INFO - __main__ - global_step = 850, average loss = 0.16302873345994157
|
39 |
+
09/23/2023 13:36:07 - INFO - __main__ - global_step = 900, average loss = 0.15725697406036487
|
40 |
+
09/23/2023 13:39:46 - INFO - __main__ - global_step = 950, average loss = 0.15640976145299645
|
41 |
+
09/23/2023 13:43:33 - INFO - __main__ - global_step = 1000, average loss = 0.15606625928507128
|
42 |
+
09/23/2023 13:43:34 - INFO - __main__ - ***** Running evaluation *****
|
43 |
+
09/23/2023 13:43:34 - INFO - __main__ - Num examples = 10000
|
44 |
+
09/23/2023 13:43:34 - INFO - __main__ - Batch size = 16
|
45 |
+
09/23/2023 13:47:30 - INFO - __main__ - ***** Eval results *****
|
46 |
+
09/23/2023 13:47:30 - INFO - __main__ - acc = 0.7961
|
47 |
+
09/23/2023 13:47:58 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
|
48 |
+
09/23/2023 13:51:41 - INFO - __main__ - global_step = 1050, average loss = 0.14431810150181262
|
49 |
+
09/23/2023 13:55:20 - INFO - __main__ - global_step = 1100, average loss = 0.15233074207513708
|
50 |
+
09/23/2023 13:59:01 - INFO - __main__ - global_step = 1150, average loss = 0.1404175848151772
|
51 |
+
09/23/2023 14:02:44 - INFO - __main__ - global_step = 1200, average loss = 0.12134294869215864
|
52 |
+
09/23/2023 14:06:20 - INFO - __main__ - global_step = 1250, average loss = 0.1363200130731275
|
53 |
+
09/23/2023 14:09:59 - INFO - __main__ - global_step = 1300, average loss = 0.13769450530940958
|
54 |
+
09/23/2023 14:13:43 - INFO - __main__ - global_step = 1350, average loss = 0.12156560226379952
|
55 |
+
09/23/2023 14:17:18 - INFO - __main__ - global_step = 1400, average loss = 0.12623315585107775
|
56 |
+
09/23/2023 14:20:59 - INFO - __main__ - global_step = 1450, average loss = 0.14377202547417256
|
57 |
+
09/23/2023 14:24:33 - INFO - __main__ - global_step = 1500, average loss = 0.1286695548933858
|
58 |
+
09/23/2023 14:24:34 - INFO - __main__ - ***** Running evaluation *****
|
59 |
+
09/23/2023 14:24:34 - INFO - __main__ - Num examples = 10000
|
60 |
+
09/23/2023 14:24:34 - INFO - __main__ - Batch size = 16
|
61 |
+
09/23/2023 14:28:29 - INFO - __main__ - ***** Eval results *****
|
62 |
+
09/23/2023 14:28:29 - INFO - __main__ - acc = 0.8048
|
63 |
+
09/23/2023 14:28:56 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
|
64 |
+
09/23/2023 14:32:42 - INFO - __main__ - global_step = 1550, average loss = 0.1198868363915244
|
65 |
+
09/23/2023 14:36:24 - INFO - __main__ - global_step = 1600, average loss = 0.12324378551486007
|
66 |
+
09/23/2023 14:40:00 - INFO - __main__ - global_step = 1650, average loss = 0.11938468464672042
|
67 |
+
09/23/2023 14:43:41 - INFO - __main__ - global_step = 1700, average loss = 0.14236379045556533
|
68 |
+
09/23/2023 14:47:22 - INFO - __main__ - global_step = 1750, average loss = 0.13320694023670512
|
69 |
+
09/23/2023 14:51:02 - INFO - __main__ - global_step = 1800, average loss = 0.13622453257718006
|
70 |
+
09/23/2023 14:54:42 - INFO - __main__ - global_step = 1850, average loss = 0.13987649206645072
|
71 |
+
09/23/2023 14:58:22 - INFO - __main__ - global_step = 1900, average loss = 0.12299754774277971
|
72 |
+
09/23/2023 15:02:05 - INFO - __main__ - global_step = 1950, average loss = 0.11868109124743569
|
73 |
+
09/23/2023 15:05:47 - INFO - __main__ - global_step = 2000, average loss = 0.1415042275990345
|
74 |
+
09/23/2023 15:05:47 - INFO - __main__ - ***** Running evaluation *****
|
75 |
+
09/23/2023 15:05:47 - INFO - __main__ - Num examples = 10000
|
76 |
+
09/23/2023 15:05:47 - INFO - __main__ - Batch size = 16
|
77 |
+
09/23/2023 15:09:43 - INFO - __main__ - ***** Eval results *****
|
78 |
+
09/23/2023 15:09:43 - INFO - __main__ - acc = 0.8063
|
79 |
+
09/23/2023 15:10:10 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
|
80 |
+
09/23/2023 15:13:51 - INFO - __main__ - global_step = 2050, average loss = 0.11399275673671581
|
81 |
+
09/23/2023 15:17:31 - INFO - __main__ - global_step = 2100, average loss = 0.1065546132405143
|
82 |
+
09/23/2023 15:21:11 - INFO - __main__ - global_step = 2150, average loss = 0.12809142941467144
|
83 |
+
09/23/2023 15:24:51 - INFO - __main__ - global_step = 2200, average loss = 0.12454848410692648
|
84 |
+
09/23/2023 15:28:34 - INFO - __main__ - global_step = 2250, average loss = 0.10986286829065647
|
85 |
+
09/23/2023 15:32:14 - INFO - __main__ - global_step = 2300, average loss = 0.11237965747121052
|
86 |
+
09/23/2023 15:35:56 - INFO - __main__ - global_step = 2350, average loss = 0.10897610924319451
|
87 |
+
09/23/2023 15:39:41 - INFO - __main__ - global_step = 2400, average loss = 0.12056981857070241
|
88 |
+
09/23/2023 15:43:24 - INFO - __main__ - global_step = 2450, average loss = 0.13911059297635803
|
89 |
+
09/23/2023 15:47:10 - INFO - __main__ - global_step = 2500, average loss = 0.11335444856034883
|
90 |
+
09/23/2023 15:47:10 - INFO - __main__ - ***** Running evaluation *****
|
91 |
+
09/23/2023 15:47:10 - INFO - __main__ - Num examples = 10000
|
92 |
+
09/23/2023 15:47:10 - INFO - __main__ - Batch size = 16
|
93 |
+
09/23/2023 15:51:06 - INFO - __main__ - ***** Eval results *****
|
94 |
+
09/23/2023 15:51:06 - INFO - __main__ - acc = 0.8234
|
95 |
+
09/23/2023 15:51:32 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
|
96 |
+
09/23/2023 15:55:10 - INFO - __main__ - global_step = 2550, average loss = 0.12103958850973867
|
97 |
+
09/23/2023 15:58:57 - INFO - __main__ - global_step = 2600, average loss = 0.11913071399074397
|
98 |
+
09/23/2023 16:02:38 - INFO - __main__ - global_step = 2650, average loss = 0.11255583499452769
|
99 |
+
09/23/2023 16:06:28 - INFO - __main__ - global_step = 2700, average loss = 0.1006322616293619
|
100 |
+
09/23/2023 16:10:12 - INFO - __main__ - global_step = 2750, average loss = 0.0932968783121487
|
101 |
+
09/23/2023 16:13:51 - INFO - __main__ - global_step = 2800, average loss = 0.11056979637924087
|
102 |
+
09/23/2023 16:17:38 - INFO - __main__ - global_step = 2850, average loss = 0.12318793082176853
|
103 |
+
09/23/2023 16:21:21 - INFO - __main__ - global_step = 2900, average loss = 0.10864610994302439
|
104 |
+
09/23/2023 16:25:03 - INFO - __main__ - global_step = 2950, average loss = 0.11261582636667299
|
105 |
+
09/23/2023 16:28:40 - INFO - __main__ - global_step = 3000, average loss = 0.12150005620278534
|
106 |
+
09/23/2023 16:28:40 - INFO - __main__ - ***** Running evaluation *****
|
107 |
+
09/23/2023 16:28:40 - INFO - __main__ - Num examples = 10000
|
108 |
+
09/23/2023 16:28:40 - INFO - __main__ - Batch size = 16
|
109 |
+
09/23/2023 16:32:35 - INFO - __main__ - ***** Eval results *****
|
110 |
+
09/23/2023 16:32:35 - INFO - __main__ - acc = 0.8261
|
111 |
+
09/23/2023 16:33:02 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
|
112 |
+
09/23/2023 16:36:46 - INFO - __main__ - global_step = 3050, average loss = 0.10565035182957218
|
113 |
+
09/23/2023 16:40:30 - INFO - __main__ - global_step = 3100, average loss = 0.10429829731896462
|
114 |
+
09/23/2023 16:44:14 - INFO - __main__ - global_step = 3150, average loss = 0.10812272985053824
|
115 |
+
09/23/2023 16:47:54 - INFO - __main__ - global_step = 3200, average loss = 0.12238092143270478
|
116 |
+
09/23/2023 16:51:33 - INFO - __main__ - global_step = 3250, average loss = 0.10868940783606376
|
117 |
+
09/23/2023 16:55:14 - INFO - __main__ - global_step = 3300, average loss = 0.1209917226509424
|
118 |
+
09/23/2023 16:58:59 - INFO - __main__ - global_step = 3350, average loss = 0.1191260662042896
|
119 |
+
09/23/2023 17:02:41 - INFO - __main__ - global_step = 3400, average loss = 0.1174743126919202
|
120 |
+
09/23/2023 17:06:26 - INFO - __main__ - global_step = 3450, average loss = 0.100895225374843
|
121 |
+
09/23/2023 17:10:02 - INFO - __main__ - global_step = 3500, average loss = 0.0931866138278565
|
122 |
+
09/23/2023 17:10:03 - INFO - __main__ - ***** Running evaluation *****
|
123 |
+
09/23/2023 17:10:03 - INFO - __main__ - Num examples = 10000
|
124 |
+
09/23/2023 17:10:03 - INFO - __main__ - Batch size = 16
|
125 |
+
09/23/2023 17:13:58 - INFO - __main__ - ***** Eval results *****
|
126 |
+
09/23/2023 17:13:58 - INFO - __main__ - acc = 0.8229
|
127 |
+
09/23/2023 17:17:45 - INFO - __main__ - global_step = 3550, average loss = 0.10633477224648231
|
128 |
+
09/23/2023 17:21:30 - INFO - __main__ - global_step = 3600, average loss = 0.1021722938354651
|
129 |
+
09/23/2023 17:25:11 - INFO - __main__ - global_step = 3650, average loss = 0.10295378862727375
|
130 |
+
09/23/2023 17:28:50 - INFO - __main__ - global_step = 3700, average loss = 0.1024187771679135
|
131 |
+
09/23/2023 17:32:34 - INFO - __main__ - global_step = 3750, average loss = 0.09922411829451448
|
132 |
+
09/23/2023 17:36:14 - INFO - __main__ - global_step = 3800, average loss = 0.11105157318372222
|
133 |
+
09/23/2023 17:39:57 - INFO - __main__ - global_step = 3850, average loss = 0.12378941989987652
|
134 |
+
09/23/2023 17:43:42 - INFO - __main__ - global_step = 3900, average loss = 0.1034327056143593
|
135 |
+
09/23/2023 17:47:25 - INFO - __main__ - global_step = 3950, average loss = 0.09697925167827634
|
136 |
+
09/23/2023 17:51:09 - INFO - __main__ - global_step = 4000, average loss = 0.11230336717126192
|
137 |
+
09/23/2023 17:51:09 - INFO - __main__ - ***** Running evaluation *****
|
138 |
+
09/23/2023 17:51:09 - INFO - __main__ - Num examples = 10000
|
139 |
+
09/23/2023 17:51:09 - INFO - __main__ - Batch size = 16
|
140 |
+
09/23/2023 17:55:05 - INFO - __main__ - ***** Eval results *****
|
141 |
+
09/23/2023 17:55:05 - INFO - __main__ - acc = 0.8371
|
142 |
+
09/23/2023 17:55:32 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
|
143 |
+
09/23/2023 17:59:12 - INFO - __main__ - global_step = 4050, average loss = 0.10925351051962934
|
144 |
+
09/23/2023 18:03:00 - INFO - __main__ - global_step = 4100, average loss = 0.09795216493275802
|
145 |
+
09/23/2023 18:06:43 - INFO - __main__ - global_step = 4150, average loss = 0.09962472554965643
|
146 |
+
09/23/2023 18:10:25 - INFO - __main__ - global_step = 4200, average loss = 0.10342389734141762
|
147 |
+
09/23/2023 18:14:05 - INFO - __main__ - global_step = 4250, average loss = 0.09674815248567029
|
148 |
+
09/23/2023 18:17:48 - INFO - __main__ - global_step = 4300, average loss = 0.10319628210134396
|
149 |
+
09/23/2023 18:21:33 - INFO - __main__ - global_step = 4350, average loss = 0.09340641272166977
|
150 |
+
09/23/2023 18:25:14 - INFO - __main__ - global_step = 4400, average loss = 0.10845618240913608
|
151 |
+
09/23/2023 18:28:59 - INFO - __main__ - global_step = 4450, average loss = 0.11604906246473547
|
152 |
+
09/23/2023 18:32:43 - INFO - __main__ - global_step = 4500, average loss = 0.09590314964269055
|
153 |
+
09/23/2023 18:32:43 - INFO - __main__ - ***** Running evaluation *****
|
154 |
+
09/23/2023 18:32:43 - INFO - __main__ - Num examples = 10000
|
155 |
+
09/23/2023 18:32:43 - INFO - __main__ - Batch size = 16
|
156 |
+
09/23/2023 18:36:38 - INFO - __main__ - ***** Eval results *****
|
157 |
+
09/23/2023 18:36:38 - INFO - __main__ - acc = 0.8305
|
158 |
+
09/23/2023 18:40:22 - INFO - __main__ - global_step = 4550, average loss = 0.09955280199857952
|
159 |
+
09/23/2023 18:44:07 - INFO - __main__ - global_step = 4600, average loss = 0.09018894311768236
|
160 |
+
09/23/2023 18:47:49 - INFO - __main__ - global_step = 4650, average loss = 0.11624654464081687
|
161 |
+
09/23/2023 18:51:30 - INFO - __main__ - global_step = 4700, average loss = 0.11213955332923434
|
162 |
+
09/23/2023 18:55:07 - INFO - __main__ - global_step = 4750, average loss = 0.11335175217776851
|
163 |
+
09/23/2023 18:58:47 - INFO - __main__ - global_step = 4800, average loss = 0.10374061681199237
|
164 |
+
09/23/2023 19:02:34 - INFO - __main__ - global_step = 4850, average loss = 0.09650620453016018
|
165 |
+
09/23/2023 19:06:16 - INFO - __main__ - global_step = 4900, average loss = 0.1034209698169434
|
166 |
+
09/23/2023 19:09:53 - INFO - __main__ - global_step = 4950, average loss = 0.10046588191311458
|
167 |
+
09/23/2023 19:13:34 - INFO - __main__ - global_step = 5000, average loss = 0.10752027794980677
|
168 |
+
09/23/2023 19:13:34 - INFO - __main__ - ***** Running evaluation *****
|
169 |
+
09/23/2023 19:13:34 - INFO - __main__ - Num examples = 10000
|
170 |
+
09/23/2023 19:13:34 - INFO - __main__ - Batch size = 16
|
171 |
+
09/23/2023 19:17:29 - INFO - __main__ - ***** Eval results *****
|
172 |
+
09/23/2023 19:17:29 - INFO - __main__ - acc = 0.8355
|
173 |
+
09/23/2023 19:21:19 - INFO - __main__ - global_step = 5050, average loss = 0.10195030277842307
|
174 |
+
09/23/2023 19:24:58 - INFO - __main__ - global_step = 5100, average loss = 0.10987481483532065
|
175 |
+
09/23/2023 19:28:41 - INFO - __main__ - global_step = 5150, average loss = 0.10906005093554995
|
176 |
+
09/23/2023 19:32:23 - INFO - __main__ - global_step = 5200, average loss = 0.09835696181547973
|
177 |
+
09/23/2023 19:36:06 - INFO - __main__ - global_step = 5250, average loss = 0.10181126694624254
|
178 |
+
09/23/2023 19:39:52 - INFO - __main__ - global_step = 5300, average loss = 0.08663028705283068
|
179 |
+
09/23/2023 19:43:30 - INFO - __main__ - global_step = 5350, average loss = 0.10507196654667496
|
180 |
+
09/23/2023 19:47:18 - INFO - __main__ - global_step = 5400, average loss = 0.108608085659871
|
181 |
+
09/23/2023 19:51:03 - INFO - __main__ - global_step = 5450, average loss = 0.099619501844536
|
182 |
+
09/23/2023 19:54:49 - INFO - __main__ - global_step = 5500, average loss = 0.10225338533447939
|
183 |
+
09/23/2023 19:54:49 - INFO - __main__ - ***** Running evaluation *****
|
184 |
+
09/23/2023 19:54:49 - INFO - __main__ - Num examples = 10000
|
185 |
+
09/23/2023 19:54:49 - INFO - __main__ - Batch size = 16
|
186 |
+
09/23/2023 19:58:45 - INFO - __main__ - ***** Eval results *****
|
187 |
+
09/23/2023 19:58:45 - INFO - __main__ - acc = 0.8279
|
188 |
+
09/23/2023 20:02:26 - INFO - __main__ - global_step = 5550, average loss = 0.10436682683890468
|
189 |
+
09/23/2023 20:06:11 - INFO - __main__ - global_step = 5600, average loss = 0.10477761221260153
|
190 |
+
09/23/2023 20:09:52 - INFO - __main__ - global_step = 5650, average loss = 0.09326410317778937
|
191 |
+
09/23/2023 20:13:31 - INFO - __main__ - global_step = 5700, average loss = 0.11269167278223904
|
192 |
+
09/23/2023 20:17:16 - INFO - __main__ - global_step = 5750, average loss = 0.10188864256499074
|
193 |
+
09/23/2023 20:21:00 - INFO - __main__ - global_step = 5800, average loss = 0.10433580860199981
|
194 |
+
09/23/2023 20:24:43 - INFO - __main__ - global_step = 5850, average loss = 0.08972063858884212
|
195 |
+
09/23/2023 20:28:22 - INFO - __main__ - global_step = 5900, average loss = 0.1065664726671821
|
196 |
+
09/23/2023 20:32:07 - INFO - __main__ - global_step = 5950, average loss = 0.10174332244623656
|
197 |
+
09/23/2023 20:35:49 - INFO - __main__ - global_step = 6000, average loss = 0.08872646622621687
|
198 |
+
09/23/2023 20:35:49 - INFO - __main__ - ***** Running evaluation *****
|
199 |
+
09/23/2023 20:35:49 - INFO - __main__ - Num examples = 10000
|
200 |
+
09/23/2023 20:35:49 - INFO - __main__ - Batch size = 16
|
201 |
+
09/23/2023 20:39:45 - INFO - __main__ - ***** Eval results *****
|
202 |
+
09/23/2023 20:39:45 - INFO - __main__ - acc = 0.8363
|
203 |
+
09/23/2023 20:43:29 - INFO - __main__ - global_step = 6050, average loss = 0.10705330887685705
|
204 |
+
09/23/2023 20:47:16 - INFO - __main__ - global_step = 6100, average loss = 0.09171272950654384
|
205 |
+
09/23/2023 20:50:59 - INFO - __main__ - global_step = 6150, average loss = 0.0861645900901567
|
206 |
+
09/23/2023 20:54:46 - INFO - __main__ - global_step = 6200, average loss = 0.08994678908144124
|
207 |
+
09/23/2023 20:58:32 - INFO - __main__ - global_step = 6250, average loss = 0.08786970607354305
|
208 |
+
09/23/2023 21:02:13 - INFO - __main__ - global_step = 6300, average loss = 0.09656520821336016
|
209 |
+
09/23/2023 21:05:56 - INFO - __main__ - global_step = 6350, average loss = 0.09620310332989902
|
210 |
+
09/23/2023 21:09:42 - INFO - __main__ - global_step = 6400, average loss = 0.09152124080545036
|
211 |
+
09/23/2023 21:13:22 - INFO - __main__ - global_step = 6450, average loss = 0.09472263304131047
|
212 |
+
09/23/2023 21:17:06 - INFO - __main__ - global_step = 6500, average loss = 0.10554198697194807
|
213 |
+
09/23/2023 21:17:06 - INFO - __main__ - ***** Running evaluation *****
|
214 |
+
09/23/2023 21:17:06 - INFO - __main__ - Num examples = 10000
|
215 |
+
09/23/2023 21:17:06 - INFO - __main__ - Batch size = 16
|
216 |
+
09/23/2023 21:21:01 - INFO - __main__ - ***** Eval results *****
|
217 |
+
09/23/2023 21:21:01 - INFO - __main__ - acc = 0.841
|
218 |
+
09/23/2023 21:21:28 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
|
219 |
+
09/23/2023 21:25:14 - INFO - __main__ - global_step = 6550, average loss = 0.09830655160796596
|
220 |
+
09/23/2023 21:28:55 - INFO - __main__ - global_step = 6600, average loss = 0.09539545015402837
|
221 |
+
09/23/2023 21:32:40 - INFO - __main__ - global_step = 6650, average loss = 0.09118585625503328
|
222 |
+
09/23/2023 21:36:18 - INFO - __main__ - global_step = 6700, average loss = 0.09700520555491493
|
223 |
+
09/23/2023 21:40:03 - INFO - __main__ - global_step = 6750, average loss = 0.105271778342576
|
224 |
+
09/23/2023 21:43:45 - INFO - __main__ - global_step = 6800, average loss = 0.10975144471223758
|
225 |
+
09/23/2023 21:47:28 - INFO - __main__ - global_step = 6850, average loss = 0.09920243133579788
|
226 |
+
09/23/2023 21:51:11 - INFO - __main__ - global_step = 6900, average loss = 0.09791661702009151
|
227 |
+
09/23/2023 21:54:51 - INFO - __main__ - global_step = 6950, average loss = 0.08630025177910283
|
228 |
+
09/23/2023 21:58:29 - INFO - __main__ - global_step = 7000, average loss = 0.09660528897402401
|
229 |
+
09/23/2023 21:58:29 - INFO - __main__ - ***** Running evaluation *****
|
230 |
+
09/23/2023 21:58:29 - INFO - __main__ - Num examples = 10000
|
231 |
+
09/23/2023 21:58:29 - INFO - __main__ - Batch size = 16
|
232 |
+
09/23/2023 22:02:25 - INFO - __main__ - ***** Eval results *****
|
233 |
+
09/23/2023 22:02:25 - INFO - __main__ - acc = 0.843
|
234 |
+
09/23/2023 22:02:51 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
|
235 |
+
09/23/2023 22:06:33 - INFO - __main__ - global_step = 7050, average loss = 0.10305566756385814
|
236 |
+
09/23/2023 22:10:07 - INFO - __main__ - global_step = 7100, average loss = 0.10687436608219286
|
237 |
+
09/23/2023 22:13:47 - INFO - __main__ - global_step = 7150, average loss = 0.0946133067667688
|
238 |
+
09/23/2023 22:17:27 - INFO - __main__ - global_step = 7200, average loss = 0.09795189084834419
|
239 |
+
09/23/2023 22:21:17 - INFO - __main__ - global_step = 7250, average loss = 0.09060888570308634
|
240 |
+
09/23/2023 22:24:59 - INFO - __main__ - global_step = 7300, average loss = 0.0877145413684775
|
241 |
+
09/23/2023 22:28:35 - INFO - __main__ - global_step = 7350, average loss = 0.10495714643941029
|
242 |
+
09/23/2023 22:32:21 - INFO - __main__ - global_step = 7400, average loss = 0.07401456630654138
|
243 |
+
09/23/2023 22:36:03 - INFO - __main__ - global_step = 7450, average loss = 0.09523518772701209
|
244 |
+
09/23/2023 22:39:41 - INFO - __main__ - global_step = 7500, average loss = 0.10137952610446518
|
245 |
+
09/23/2023 22:39:41 - INFO - __main__ - ***** Running evaluation *****
|
246 |
+
09/23/2023 22:39:41 - INFO - __main__ - Num examples = 10000
|
247 |
+
09/23/2023 22:39:41 - INFO - __main__ - Batch size = 16
|
248 |
+
09/23/2023 22:43:37 - INFO - __main__ - ***** Eval results *****
|
249 |
+
09/23/2023 22:43:37 - INFO - __main__ - acc = 0.846
|
250 |
+
09/23/2023 22:44:03 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
|
251 |
+
09/23/2023 22:47:46 - INFO - __main__ - global_step = 7550, average loss = 0.09563293447645264
|
252 |
+
09/23/2023 22:51:31 - INFO - __main__ - global_step = 7600, average loss = 0.09618103489105125
|
253 |
+
09/23/2023 22:55:13 - INFO - __main__ - global_step = 7650, average loss = 0.08849806944810552
|
254 |
+
09/23/2023 22:58:54 - INFO - __main__ - global_step = 7700, average loss = 0.10007433392238455
|
255 |
+
09/23/2023 23:02:36 - INFO - __main__ - global_step = 7750, average loss = 0.09035434001329122
|
256 |
+
09/23/2023 23:06:24 - INFO - __main__ - global_step = 7800, average loss = 0.09338357288788757
|
257 |
+
09/23/2023 23:10:04 - INFO - __main__ - global_step = 7850, average loss = 0.09912064949181514
|
258 |
+
09/23/2023 23:13:47 - INFO - __main__ - global_step = 7900, average loss = 0.08827902228244057
|
259 |
+
09/23/2023 23:17:27 - INFO - __main__ - global_step = 7950, average loss = 0.11218067690118914
|
260 |
+
09/23/2023 23:21:09 - INFO - __main__ - global_step = 8000, average loss = 0.08588292430682486
|
261 |
+
09/23/2023 23:21:09 - INFO - __main__ - ***** Running evaluation *****
|
262 |
+
09/23/2023 23:21:09 - INFO - __main__ - Num examples = 10000
|
263 |
+
09/23/2023 23:21:09 - INFO - __main__ - Batch size = 16
|
264 |
+
09/23/2023 23:25:05 - INFO - __main__ - ***** Eval results *****
|
265 |
+
09/23/2023 23:25:05 - INFO - __main__ - acc = 0.8472
|
266 |
+
09/23/2023 23:25:31 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
|
267 |
+
09/23/2023 23:29:08 - INFO - __main__ - global_step = 8050, average loss = 0.09245043838061974
|
268 |
+
09/23/2023 23:32:54 - INFO - __main__ - global_step = 8100, average loss = 0.08283289226481429
|
269 |
+
09/23/2023 23:36:34 - INFO - __main__ - global_step = 8150, average loss = 0.08407623038449856
|
270 |
+
09/23/2023 23:40:17 - INFO - __main__ - global_step = 8200, average loss = 0.09736820162237564
|
271 |
+
09/23/2023 23:44:06 - INFO - __main__ - global_step = 8250, average loss = 0.08463705457368632
|
272 |
+
09/23/2023 23:47:50 - INFO - __main__ - global_step = 8300, average loss = 0.10010304888644896
|
273 |
+
09/23/2023 23:51:35 - INFO - __main__ - global_step = 8350, average loss = 0.09222401980725409
|
274 |
+
09/23/2023 23:55:17 - INFO - __main__ - global_step = 8400, average loss = 0.08634746881416504
|
275 |
+
09/23/2023 23:58:59 - INFO - __main__ - global_step = 8450, average loss = 0.08723288500368653
|
276 |
+
09/24/2023 00:02:37 - INFO - __main__ - global_step = 8500, average loss = 0.10130320921433394
|
277 |
+
09/24/2023 00:02:37 - INFO - __main__ - ***** Running evaluation *****
|
278 |
+
09/24/2023 00:02:37 - INFO - __main__ - Num examples = 10000
|
279 |
+
09/24/2023 00:02:37 - INFO - __main__ - Batch size = 16
|
280 |
+
09/24/2023 00:06:32 - INFO - __main__ - ***** Eval results *****
|
281 |
+
09/24/2023 00:06:32 - INFO - __main__ - acc = 0.8452
|
282 |
+
09/24/2023 00:10:13 - INFO - __main__ - global_step = 8550, average loss = 0.0889340414837352
|
283 |
+
09/24/2023 00:13:53 - INFO - __main__ - global_step = 8600, average loss = 0.0960574367789377
|
284 |
+
09/24/2023 00:17:37 - INFO - __main__ - global_step = 8650, average loss = 0.07860265792332939
|
285 |
+
09/24/2023 00:21:20 - INFO - __main__ - global_step = 8700, average loss = 0.09233207383847912
|
286 |
+
09/24/2023 00:25:05 - INFO - __main__ - global_step = 8750, average loss = 0.09803196908305836
|
287 |
+
09/24/2023 00:28:44 - INFO - __main__ - global_step = 8800, average loss = 0.08913468146740343
|
288 |
+
09/24/2023 00:32:26 - INFO - __main__ - global_step = 8850, average loss = 0.0880054514182666
|
289 |
+
09/24/2023 00:36:11 - INFO - __main__ - global_step = 8900, average loss = 0.0839999437017832
|
290 |
+
09/24/2023 00:39:52 - INFO - __main__ - global_step = 8950, average loss = 0.10094311676693905
|
291 |
+
09/24/2023 00:43:32 - INFO - __main__ - global_step = 9000, average loss = 0.10011614485312748
|
292 |
+
09/24/2023 00:43:32 - INFO - __main__ - ***** Running evaluation *****
|
293 |
+
09/24/2023 00:43:32 - INFO - __main__ - Num examples = 10000
|
294 |
+
09/24/2023 00:43:32 - INFO - __main__ - Batch size = 16
|
295 |
+
09/24/2023 00:47:27 - INFO - __main__ - ***** Eval results *****
|
296 |
+
09/24/2023 00:47:27 - INFO - __main__ - acc = 0.8463
|
297 |
+
09/24/2023 00:51:10 - INFO - __main__ - global_step = 9050, average loss = 0.09407024829903093
|
298 |
+
09/24/2023 00:54:48 - INFO - __main__ - global_step = 9100, average loss = 0.09510339217069032
|
299 |
+
09/24/2023 00:58:27 - INFO - __main__ - global_step = 9150, average loss = 0.09413513723055075
|
300 |
+
09/24/2023 01:02:10 - INFO - __main__ - global_step = 9200, average loss = 0.08488880819528276
|
301 |
+
09/24/2023 01:05:47 - INFO - __main__ - global_step = 9250, average loss = 0.09847264970565447
|
302 |
+
09/24/2023 01:09:28 - INFO - __main__ - global_step = 9300, average loss = 0.08640140883806452
|
303 |
+
09/24/2023 01:13:08 - INFO - __main__ - global_step = 9350, average loss = 0.07884123000112594
|
304 |
+
09/24/2023 01:16:54 - INFO - __main__ - global_step = 9400, average loss = 0.0831154512307694
|
305 |
+
09/24/2023 01:20:32 - INFO - __main__ - global_step = 9450, average loss = 0.09913980022422038
|
306 |
+
09/24/2023 01:24:11 - INFO - __main__ - global_step = 9500, average loss = 0.09805536182444484
|
307 |
+
09/24/2023 01:24:11 - INFO - __main__ - ***** Running evaluation *****
|
308 |
+
09/24/2023 01:24:11 - INFO - __main__ - Num examples = 10000
|
309 |
+
09/24/2023 01:24:11 - INFO - __main__ - Batch size = 16
|
310 |
+
09/24/2023 01:28:07 - INFO - __main__ - ***** Eval results *****
|
311 |
+
09/24/2023 01:28:07 - INFO - __main__ - acc = 0.8463
|
312 |
+
09/24/2023 01:31:55 - INFO - __main__ - global_step = 9550, average loss = 0.0912455873134968
|
313 |
+
09/24/2023 01:35:38 - INFO - __main__ - global_step = 9600, average loss = 0.10278063782119716
|
314 |
+
09/24/2023 01:39:12 - INFO - __main__ - global_step = 9650, average loss = 0.08788584528032516
|
315 |
+
09/24/2023 01:42:53 - INFO - __main__ - global_step = 9700, average loss = 0.08058010207216285
|
316 |
+
09/24/2023 01:46:34 - INFO - __main__ - global_step = 9750, average loss = 0.08765123128723644
|
317 |
+
09/24/2023 01:50:14 - INFO - __main__ - global_step = 9800, average loss = 0.09005017607181799
|
318 |
+
09/24/2023 01:54:03 - INFO - __main__ - global_step = 9850, average loss = 0.07892634223760979
|
319 |
+
09/24/2023 01:57:44 - INFO - __main__ - global_step = 9900, average loss = 0.07999062808303278
|
320 |
+
09/24/2023 02:01:26 - INFO - __main__ - global_step = 9950, average loss = 0.09494447313452838
|
321 |
+
09/24/2023 02:05:06 - INFO - __main__ - global_step = 10000, average loss = 0.0841888710015337
|
322 |
+
09/24/2023 02:05:06 - INFO - __main__ - ***** Running evaluation *****
|
323 |
+
09/24/2023 02:05:06 - INFO - __main__ - Num examples = 10000
|
324 |
+
09/24/2023 02:05:06 - INFO - __main__ - Batch size = 16
|
325 |
+
09/24/2023 02:09:01 - INFO - __main__ - ***** Eval results *****
|
326 |
+
09/24/2023 02:09:01 - INFO - __main__ - acc = 0.8471
|
327 |
+
09/24/2023 02:12:40 - INFO - __main__ - global_step = 10050, average loss = 0.08929907138342968
|
328 |
+
09/24/2023 02:16:20 - INFO - __main__ - global_step = 10100, average loss = 0.10172551687661326
|
329 |
+
09/24/2023 02:20:00 - INFO - __main__ - global_step = 10150, average loss = 0.09577305402533966
|
330 |
+
09/24/2023 02:23:46 - INFO - __main__ - global_step = 10200, average loss = 0.09480085656211486
|
331 |
+
09/24/2023 02:27:27 - INFO - __main__ - global_step = 10250, average loss = 0.07956519629078684
|
332 |
+
09/24/2023 02:31:05 - INFO - __main__ - global_step = 10300, average loss = 0.08291967767250753
|
333 |
+
09/24/2023 02:34:47 - INFO - __main__ - global_step = 10350, average loss = 0.09592102762369904
|
334 |
+
09/24/2023 02:38:29 - INFO - __main__ - global_step = 10400, average loss = 0.08570889301292482
|
335 |
+
09/24/2023 02:42:13 - INFO - __main__ - global_step = 10450, average loss = 0.07362440132081247
|
336 |
+
09/24/2023 02:45:58 - INFO - __main__ - global_step = 10500, average loss = 0.08574875552483718
|
337 |
+
09/24/2023 02:45:58 - INFO - __main__ - ***** Running evaluation *****
|
338 |
+
09/24/2023 02:45:58 - INFO - __main__ - Num examples = 10000
|
339 |
+
09/24/2023 02:45:58 - INFO - __main__ - Batch size = 16
|
340 |
+
09/24/2023 02:49:53 - INFO - __main__ - ***** Eval results *****
|
341 |
+
09/24/2023 02:49:53 - INFO - __main__ - acc = 0.8524
|
342 |
+
09/24/2023 02:50:20 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
|
343 |
+
09/24/2023 02:54:03 - INFO - __main__ - global_step = 10550, average loss = 0.08846153970320302
|
344 |
+
09/24/2023 02:57:43 - INFO - __main__ - global_step = 10600, average loss = 0.08381684645668429
|
345 |
+
09/24/2023 03:01:26 - INFO - __main__ - global_step = 10650, average loss = 0.09288432469184045
|
346 |
+
09/24/2023 03:05:08 - INFO - __main__ - global_step = 10700, average loss = 0.08199916316298186
|
347 |
+
09/24/2023 03:08:56 - INFO - __main__ - global_step = 10750, average loss = 0.09068042659768252
|
348 |
+
09/24/2023 03:12:37 - INFO - __main__ - global_step = 10800, average loss = 0.08719110449641448
|
349 |
+
09/24/2023 03:16:20 - INFO - __main__ - global_step = 10850, average loss = 0.09036207084544003
|
350 |
+
09/24/2023 03:20:04 - INFO - __main__ - global_step = 10900, average loss = 0.095746248819637
|
351 |
+
09/24/2023 03:23:45 - INFO - __main__ - global_step = 10950, average loss = 0.1019882604497252
|
352 |
+
09/24/2023 03:27:25 - INFO - __main__ - global_step = 11000, average loss = 0.08660416512644588
|
353 |
+
09/24/2023 03:27:25 - INFO - __main__ - ***** Running evaluation *****
|
354 |
+
09/24/2023 03:27:25 - INFO - __main__ - Num examples = 10000
|
355 |
+
09/24/2023 03:27:25 - INFO - __main__ - Batch size = 16
|
356 |
+
09/24/2023 03:31:21 - INFO - __main__ - ***** Eval results *****
|
357 |
+
09/24/2023 03:31:21 - INFO - __main__ - acc = 0.8521
|
358 |
+
09/24/2023 03:35:00 - INFO - __main__ - global_step = 11050, average loss = 0.07959849048202158
|
359 |
+
09/24/2023 03:38:42 - INFO - __main__ - global_step = 11100, average loss = 0.08480279741248524
|
360 |
+
09/24/2023 03:42:25 - INFO - __main__ - global_step = 11150, average loss = 0.07940411141982623
|
361 |
+
09/24/2023 03:46:06 - INFO - __main__ - global_step = 11200, average loss = 0.08627346496621613
|
362 |
+
09/24/2023 03:49:48 - INFO - __main__ - global_step = 11250, average loss = 0.08515130840663915
|
363 |
+
09/24/2023 03:53:28 - INFO - __main__ - global_step = 11300, average loss = 0.08047833000106039
|
364 |
+
09/24/2023 03:57:07 - INFO - __main__ - global_step = 11350, average loss = 0.08884227124826338
|
365 |
+
09/24/2023 04:00:47 - INFO - __main__ - global_step = 11400, average loss = 0.09542614945773494
|
366 |
+
09/24/2023 04:04:26 - INFO - __main__ - global_step = 11450, average loss = 0.08332637125422479
|
367 |
+
09/24/2023 04:08:07 - INFO - __main__ - global_step = 11500, average loss = 0.09769482501476887
|
368 |
+
09/24/2023 04:08:07 - INFO - __main__ - ***** Running evaluation *****
|
369 |
+
09/24/2023 04:08:07 - INFO - __main__ - Num examples = 10000
|
370 |
+
09/24/2023 04:08:07 - INFO - __main__ - Batch size = 16
|
371 |
+
09/24/2023 04:12:02 - INFO - __main__ - ***** Eval results *****
|
372 |
+
09/24/2023 04:12:02 - INFO - __main__ - acc = 0.851
|
373 |
+
09/24/2023 04:15:51 - INFO - __main__ - global_step = 11550, average loss = 0.09137944790694746
|
374 |
+
09/24/2023 04:19:38 - INFO - __main__ - global_step = 11600, average loss = 0.07454582622590351
|
375 |
+
09/24/2023 04:23:20 - INFO - __main__ - global_step = 11650, average loss = 0.08284565404814202
|
376 |
+
09/24/2023 04:26:59 - INFO - __main__ - global_step = 11700, average loss = 0.0969824349215196
|
377 |
+
09/24/2023 04:30:41 - INFO - __main__ - global_step = 11750, average loss = 0.09389037321489013
|
378 |
+
09/24/2023 04:34:23 - INFO - __main__ - global_step = 11800, average loss = 0.08608788483528769
|
379 |
+
09/24/2023 04:38:05 - INFO - __main__ - global_step = 11850, average loss = 0.09322659247220144
|
380 |
+
09/24/2023 04:41:49 - INFO - __main__ - global_step = 11900, average loss = 0.09286965438863262
|
381 |
+
09/24/2023 04:45:31 - INFO - __main__ - global_step = 11950, average loss = 0.08214385434631367
|
382 |
+
09/24/2023 04:49:12 - INFO - __main__ - global_step = 12000, average loss = 0.09392224536069989
|
383 |
+
09/24/2023 04:49:12 - INFO - __main__ - ***** Running evaluation *****
|
384 |
+
09/24/2023 04:49:12 - INFO - __main__ - Num examples = 10000
|
385 |
+
09/24/2023 04:49:12 - INFO - __main__ - Batch size = 16
|
386 |
+
09/24/2023 04:53:07 - INFO - __main__ - ***** Eval results *****
|
387 |
+
09/24/2023 04:53:07 - INFO - __main__ - acc = 0.8514
|
388 |
+
09/24/2023 04:56:53 - INFO - __main__ - global_step = 12050, average loss = 0.08019034011129406
|
389 |
+
09/24/2023 05:00:34 - INFO - __main__ - global_step = 12100, average loss = 0.08210711618239656
|
390 |
+
09/24/2023 05:04:16 - INFO - __main__ - global_step = 12150, average loss = 0.08764273267355747
|
391 |
+
09/24/2023 05:08:02 - INFO - __main__ - global_step = 12200, average loss = 0.08758470895321807
|
392 |
+
09/24/2023 05:11:48 - INFO - __main__ - global_step = 12250, average loss = 0.07766548367973883
|
393 |
+
09/24/2023 05:15:27 - INFO - __main__ - global_step = 12300, average loss = 0.08148344823415755
|
394 |
+
09/24/2023 05:19:08 - INFO - __main__ - global_step = 12350, average loss = 0.08814196670609817
|
395 |
+
09/24/2023 05:22:50 - INFO - __main__ - global_step = 12400, average loss = 0.08936668847491092
|
396 |
+
09/24/2023 05:26:29 - INFO - __main__ - global_step = 12450, average loss = 0.08240065188347216
|
397 |
+
09/24/2023 05:30:12 - INFO - __main__ - global_step = 12500, average loss = 0.08683115135392655
|
398 |
+
09/24/2023 05:30:12 - INFO - __main__ - ***** Running evaluation *****
|
399 |
+
09/24/2023 05:30:12 - INFO - __main__ - Num examples = 10000
|
400 |
+
09/24/2023 05:30:12 - INFO - __main__ - Batch size = 16
|
401 |
+
09/24/2023 05:34:07 - INFO - __main__ - ***** Eval results *****
|
402 |
+
09/24/2023 05:34:07 - INFO - __main__ - acc = 0.8515
|
403 |
+
09/24/2023 05:37:53 - INFO - __main__ - global_step = 12550, average loss = 0.08871277472944712
|
404 |
+
09/24/2023 05:41:34 - INFO - __main__ - global_step = 12600, average loss = 0.08797626828309149
|
405 |
+
09/24/2023 05:45:11 - INFO - __main__ - global_step = 12650, average loss = 0.10095825259459616
|
406 |
+
09/24/2023 05:48:58 - INFO - __main__ - global_step = 12700, average loss = 0.07953012495926487
|
407 |
+
09/24/2023 05:52:41 - INFO - __main__ - global_step = 12750, average loss = 0.08843418272979761
|
408 |
+
09/24/2023 05:56:19 - INFO - __main__ - global_step = 12800, average loss = 0.07413991435227217
|
409 |
+
09/24/2023 05:59:59 - INFO - __main__ - global_step = 12850, average loss = 0.07519575585451094
|
410 |
+
09/24/2023 06:03:48 - INFO - __main__ - global_step = 12900, average loss = 0.08996981896292709
|
411 |
+
09/24/2023 06:07:28 - INFO - __main__ - global_step = 12950, average loss = 0.08996171029284597
|
412 |
+
09/24/2023 06:11:11 - INFO - __main__ - global_step = 13000, average loss = 0.08077499923689174
|
413 |
+
09/24/2023 06:11:11 - INFO - __main__ - ***** Running evaluation *****
|
414 |
+
09/24/2023 06:11:11 - INFO - __main__ - Num examples = 10000
|
415 |
+
09/24/2023 06:11:11 - INFO - __main__ - Batch size = 16
|
416 |
+
09/24/2023 06:15:06 - INFO - __main__ - ***** Eval results *****
|
417 |
+
09/24/2023 06:15:06 - INFO - __main__ - acc = 0.8527
|
418 |
+
09/24/2023 06:15:33 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
|
419 |
+
09/24/2023 06:19:13 - INFO - __main__ - global_step = 13050, average loss = 0.08447560470420284
|
420 |
+
09/24/2023 06:22:54 - INFO - __main__ - global_step = 13100, average loss = 0.08299598100831646
|
421 |
+
09/24/2023 06:26:32 - INFO - __main__ - global_step = 13150, average loss = 0.08393764879734135
|
422 |
+
09/24/2023 06:30:08 - INFO - __main__ - global_step = 13200, average loss = 0.09848508099505125
|
423 |
+
09/24/2023 06:33:47 - INFO - __main__ - global_step = 13250, average loss = 0.09162080157435412
|
424 |
+
09/24/2023 06:37:28 - INFO - __main__ - global_step = 13300, average loss = 0.0914362099875143
|
425 |
+
09/24/2023 06:41:09 - INFO - __main__ - global_step = 13350, average loss = 0.07781068138462616
|
426 |
+
09/24/2023 06:44:55 - INFO - __main__ - global_step = 13400, average loss = 0.08868030074576382
|
427 |
+
09/24/2023 06:48:36 - INFO - __main__ - global_step = 13450, average loss = 0.08357623873533157
|
428 |
+
09/24/2023 06:52:18 - INFO - __main__ - global_step = 13500, average loss = 0.08828085365807055
|
429 |
+
09/24/2023 06:52:18 - INFO - __main__ - ***** Running evaluation *****
|
430 |
+
09/24/2023 06:52:18 - INFO - __main__ - Num examples = 10000
|
431 |
+
09/24/2023 06:52:18 - INFO - __main__ - Batch size = 16
|
432 |
+
09/24/2023 06:56:14 - INFO - __main__ - ***** Eval results *****
|
433 |
+
09/24/2023 06:56:14 - INFO - __main__ - acc = 0.8499
|
434 |
+
09/24/2023 06:59:57 - INFO - __main__ - global_step = 13550, average loss = 0.08140521681067185
|
435 |
+
09/24/2023 07:03:37 - INFO - __main__ - global_step = 13600, average loss = 0.08341409597109305
|
436 |
+
09/24/2023 07:07:17 - INFO - __main__ - global_step = 13650, average loss = 0.08142950747031136
|
437 |
+
09/24/2023 07:10:56 - INFO - __main__ - global_step = 13700, average loss = 0.09089667504686076
|
438 |
+
09/24/2023 07:14:45 - INFO - __main__ - global_step = 13750, average loss = 0.07177684095106088
|
439 |
+
09/24/2023 07:18:24 - INFO - __main__ - global_step = 13800, average loss = 0.08592368463818274
|
440 |
+
09/24/2023 07:22:01 - INFO - __main__ - global_step = 13850, average loss = 0.08120634569131653
|
441 |
+
09/24/2023 07:25:48 - INFO - __main__ - global_step = 13900, average loss = 0.08909589071197843
|
442 |
+
09/24/2023 07:29:30 - INFO - __main__ - global_step = 13950, average loss = 0.08629100337015189
|
443 |
+
09/24/2023 07:33:10 - INFO - __main__ - global_step = 14000, average loss = 0.07722124511306902
|
444 |
+
09/24/2023 07:33:10 - INFO - __main__ - ***** Running evaluation *****
|
445 |
+
09/24/2023 07:33:10 - INFO - __main__ - Num examples = 10000
|
446 |
+
09/24/2023 07:33:10 - INFO - __main__ - Batch size = 16
|
447 |
+
09/24/2023 07:37:05 - INFO - __main__ - ***** Eval results *****
|
448 |
+
09/24/2023 07:37:05 - INFO - __main__ - acc = 0.8533
|
449 |
+
09/24/2023 07:37:32 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
|
450 |
+
09/24/2023 07:41:11 - INFO - __main__ - global_step = 14050, average loss = 0.08182521525057382
09/24/2023 07:44:48 - INFO - __main__ - global_step = 14100, average loss = 0.0902410151962249
09/24/2023 07:48:28 - INFO - __main__ - global_step = 14150, average loss = 0.07409664937826164
09/24/2023 07:52:12 - INFO - __main__ - global_step = 14200, average loss = 0.08879891355274594
09/24/2023 07:55:53 - INFO - __main__ - global_step = 14250, average loss = 0.09268313445325475
09/24/2023 07:59:30 - INFO - __main__ - global_step = 14300, average loss = 0.08798344542199629
09/24/2023 08:03:13 - INFO - __main__ - global_step = 14350, average loss = 0.09607475698139752
09/24/2023 08:06:59 - INFO - __main__ - global_step = 14400, average loss = 0.07222031111843535
09/24/2023 08:10:40 - INFO - __main__ - global_step = 14450, average loss = 0.07480319764195884
09/24/2023 08:14:19 - INFO - __main__ - global_step = 14500, average loss = 0.0838716509303049
09/24/2023 08:14:19 - INFO - __main__ - ***** Running evaluation *****
09/24/2023 08:14:19 - INFO - __main__ - Num examples = 10000
09/24/2023 08:14:19 - INFO - __main__ - Batch size = 16
09/24/2023 08:18:16 - INFO - __main__ - ***** Eval results *****
09/24/2023 08:18:16 - INFO - __main__ - acc = 0.8542
09/24/2023 08:18:42 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
09/24/2023 08:22:18 - INFO - __main__ - global_step = 14550, average loss = 0.08034001361316769
09/24/2023 08:25:55 - INFO - __main__ - global_step = 14600, average loss = 0.07689567271547276
09/24/2023 08:29:37 - INFO - __main__ - global_step = 14650, average loss = 0.09093381941405823
09/24/2023 08:33:25 - INFO - __main__ - global_step = 14700, average loss = 0.07569706412876258
09/24/2023 08:37:04 - INFO - __main__ - global_step = 14750, average loss = 0.07479940189456101
09/24/2023 08:40:47 - INFO - __main__ - global_step = 14800, average loss = 0.08522207450543647
09/24/2023 08:44:34 - INFO - __main__ - global_step = 14850, average loss = 0.0889268495763099
09/24/2023 08:48:16 - INFO - __main__ - global_step = 14900, average loss = 0.08616152721479012
09/24/2023 08:51:56 - INFO - __main__ - global_step = 14950, average loss = 0.07867321850848384
09/24/2023 08:55:39 - INFO - __main__ - global_step = 15000, average loss = 0.08426695556714549
09/24/2023 08:55:39 - INFO - __main__ - ***** Running evaluation *****
09/24/2023 08:55:39 - INFO - __main__ - Num examples = 10000
09/24/2023 08:55:39 - INFO - __main__ - Batch size = 16
09/24/2023 08:59:34 - INFO - __main__ - ***** Eval results *****
09/24/2023 08:59:34 - INFO - __main__ - acc = 0.8542
09/24/2023 09:03:12 - INFO - __main__ - global_step = 15050, average loss = 0.07868185437655484
09/24/2023 09:07:00 - INFO - __main__ - global_step = 15100, average loss = 0.08520105790423259
09/24/2023 09:10:42 - INFO - __main__ - global_step = 15150, average loss = 0.09536004922925713
09/24/2023 09:14:19 - INFO - __main__ - global_step = 15200, average loss = 0.08502999547665241
09/24/2023 09:17:58 - INFO - __main__ - global_step = 15250, average loss = 0.08957034896484402
09/24/2023 09:21:34 - INFO - __main__ - global_step = 15300, average loss = 0.07968287494033575
09/24/2023 09:25:14 - INFO - __main__ - global_step = 15350, average loss = 0.08545487473544199
09/24/2023 09:28:55 - INFO - __main__ - global_step = 15400, average loss = 0.08528959889241378
09/24/2023 09:32:38 - INFO - __main__ - global_step = 15450, average loss = 0.08095955706679887
09/24/2023 09:36:19 - INFO - __main__ - global_step = 15500, average loss = 0.08725373520917856
09/24/2023 09:36:19 - INFO - __main__ - ***** Running evaluation *****
09/24/2023 09:36:19 - INFO - __main__ - Num examples = 10000
09/24/2023 09:36:19 - INFO - __main__ - Batch size = 16
09/24/2023 09:40:15 - INFO - __main__ - ***** Eval results *****
09/24/2023 09:40:15 - INFO - __main__ - acc = 0.8545
09/24/2023 09:40:42 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
09/24/2023 09:44:22 - INFO - __main__ - global_step = 15550, average loss = 0.0843266883040269
09/24/2023 09:48:03 - INFO - __main__ - global_step = 15600, average loss = 0.07855528741223679
09/24/2023 09:51:47 - INFO - __main__ - global_step = 15650, average loss = 0.09478737017554523
09/24/2023 09:55:32 - INFO - __main__ - global_step = 15700, average loss = 0.08910313490487169
09/24/2023 09:59:16 - INFO - __main__ - global_step = 15750, average loss = 0.07736712342710234
09/24/2023 10:02:53 - INFO - __main__ - global_step = 15800, average loss = 0.08501649839432503
09/24/2023 10:06:37 - INFO - __main__ - global_step = 15850, average loss = 0.08495221398276044
09/24/2023 10:10:23 - INFO - __main__ - global_step = 15900, average loss = 0.08510145512744202
09/24/2023 10:14:07 - INFO - __main__ - global_step = 15950, average loss = 0.08335533107921947
09/24/2023 10:17:49 - INFO - __main__ - global_step = 16000, average loss = 0.09103241352764599
09/24/2023 10:17:49 - INFO - __main__ - ***** Running evaluation *****
09/24/2023 10:17:49 - INFO - __main__ - Num examples = 10000
09/24/2023 10:17:49 - INFO - __main__ - Batch size = 16
09/24/2023 10:21:45 - INFO - __main__ - ***** Eval results *****
09/24/2023 10:21:45 - INFO - __main__ - acc = 0.8549
09/24/2023 10:22:12 - INFO - __main__ - Saving model checkpoint to output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
09/24/2023 10:25:53 - INFO - __main__ - global_step = 16050, average loss = 0.0808029190406296
09/24/2023 10:29:33 - INFO - __main__ - global_step = 16100, average loss = 0.0950222506766113
09/24/2023 10:33:15 - INFO - __main__ - global_step = 16150, average loss = 0.08560644885961664
09/24/2023 10:36:53 - INFO - __main__ - global_step = 16200, average loss = 0.07925290400889935
09/24/2023 10:40:34 - INFO - __main__ - global_step = 16250, average loss = 0.08252620983123052
09/24/2023 10:44:15 - INFO - __main__ - global_step = 16300, average loss = 0.08747977073326182
09/24/2023 10:47:55 - INFO - __main__ - global_step = 16350, average loss = 0.08805208059333382
09/24/2023 10:51:41 - INFO - __main__ - global_step = 16400, average loss = 0.07935831163018064
09/24/2023 10:55:23 - INFO - __main__ - global_step = 16450, average loss = 0.0807358610859228
09/24/2023 10:59:03 - INFO - __main__ - global_step = 16500, average loss = 0.0775301494665473
09/24/2023 10:59:03 - INFO - __main__ - ***** Running evaluation *****
09/24/2023 10:59:03 - INFO - __main__ - Num examples = 10000
09/24/2023 10:59:03 - INFO - __main__ - Batch size = 16
09/24/2023 11:02:59 - INFO - __main__ - ***** Eval results *****
09/24/2023 11:02:59 - INFO - __main__ - acc = 0.8532
09/24/2023 11:06:39 - INFO - __main__ - global_step = 16550, average loss = 0.06899339191091712
09/24/2023 11:10:25 - INFO - __main__ - global_step = 16600, average loss = 0.08612027997849508
09/24/2023 11:14:10 - INFO - __main__ - global_step = 16650, average loss = 0.08232147437905951
09/24/2023 11:17:50 - INFO - __main__ - global_step = 16700, average loss = 0.08530993062430753
09/24/2023 11:18:50 - INFO - __main__ - ***** Running evaluation *****
09/24/2023 11:18:50 - INFO - __main__ - Num examples = 10000
09/24/2023 11:18:50 - INFO - __main__ - Batch size = 16
09/24/2023 11:22:45 - INFO - __main__ - ***** Eval results *****
09/24/2023 11:22:45 - INFO - __main__ - acc = 0.8533
09/24/2023 11:22:45 - INFO - __main__ - global_step = 16713, average loss = 0.11041826268834619
09/24/2023 11:23:18 - INFO - __main__ - ***** Running evaluation *****
09/24/2023 11:23:18 - INFO - __main__ - Num examples = 10000
09/24/2023 11:23:18 - INFO - __main__ - Batch size = 16
09/24/2023 11:27:13 - INFO - __main__ - ***** Eval results *****
09/24/2023 11:27:13 - INFO - __main__ - acc = 0.8549
09/24/2023 11:27:16 - INFO - evaluate_DeBERTa - Namespace(dataset_file='../../../data/mcqa/eval/socialiqa_dev.jsonl', lm='output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6', out_dir='./eval_results/deberta-v3-large_2i_atm_half_sample_name_5e-6', device=0, reader='socialiqa', overwrite_output_dir=False, cache_dir=None)
09/24/2023 11:27:16 - INFO - evaluate_DeBERTa - Initializing output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
09/24/2023 11:34:38 - INFO - evaluate_DeBERTa - Namespace(dataset_file='../../../data/mcqa/eval/winogrande_dev.jsonl', lm='output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6', out_dir='./eval_results/deberta-v3-large_2i_atm_half_sample_name_5e-6', device=0, reader='winogrande', overwrite_output_dir=False, cache_dir=None)
09/24/2023 11:34:38 - INFO - evaluate_DeBERTa - Initializing output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
09/24/2023 11:37:05 - INFO - evaluate_DeBERTa - Namespace(dataset_file='../../../data/mcqa/eval/piqa_dev.jsonl', lm='output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6', out_dir='./eval_results/deberta-v3-large_2i_atm_half_sample_name_5e-6', device=0, reader='piqa', overwrite_output_dir=False, cache_dir=None)
09/24/2023 11:37:05 - INFO - evaluate_DeBERTa - Initializing output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
09/24/2023 11:43:59 - INFO - evaluate_DeBERTa - Namespace(dataset_file='../../../data/mcqa/eval/commonsenseqa_dev.jsonl', lm='output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6', out_dir='./eval_results/deberta-v3-large_2i_atm_half_sample_name_5e-6', device=0, reader='commonsenseqa', overwrite_output_dir=False, cache_dir=None)
09/24/2023 11:43:59 - INFO - evaluate_DeBERTa - Initializing output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
09/24/2023 11:49:43 - INFO - evaluate_DeBERTa - Namespace(dataset_file='../../../data/mcqa/eval/anli_dev.jsonl', lm='output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6', out_dir='./eval_results/deberta-v3-large_2i_atm_half_sample_name_5e-6', device=0, reader='anli', overwrite_output_dir=False, cache_dir=None)
09/24/2023 11:49:43 - INFO - evaluate_DeBERTa - Initializing output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6
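Training is followed by zero-shot evaluation of the saved checkpoint on five multiple-choice dev sets (SocialIQA, WinoGrande, PIQA, CommonsenseQA, aNLI). A sketch of how those five runs could be driven; the script name evaluate_DeBERTa.py is an assumption, while the flag names mirror the Namespace fields logged above:

```python
# Hypothetical driver for the five evaluate_DeBERTa runs logged above.
import subprocess

LM = "output/Output_ATOMIC-pseudo-wWC/deberta-v3-large_2i_atm_half_sample_name_5e-6"
OUT = "./eval_results/deberta-v3-large_2i_atm_half_sample_name_5e-6"

for task in ["socialiqa", "winogrande", "piqa", "commonsenseqa", "anli"]:
    subprocess.run(
        [
            "python", "evaluate_DeBERTa.py",  # assumed script name
            "--dataset_file", f"../../../data/mcqa/eval/{task}_dev.jsonl",
            "--lm", LM,
            "--out_dir", OUT,
            "--device", "0",
            "--reader", task,
        ],
        check=True,  # stop if any evaluation run fails
    )
```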
09/24/2023 11:54:31 - INFO - __main__ - ***** Running evaluation *****
09/24/2023 11:54:31 - INFO - __main__ - Num examples = 120
09/24/2023 11:54:31 - INFO - __main__ - Batch size = 16
09/24/2023 11:54:47 - INFO - __main__ - ***** Eval results *****
09/24/2023 11:54:47 - INFO - __main__ - acc = 0.525
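Because every line of train.log follows one of the fixed formats above, the loss curve and the dev accuracies can be recovered with two regular expressions. A small parsing sketch, run next to the downloaded train.log:

```python
# Sketch: extract (global_step, average_loss) pairs and dev accuracies
# from a log in the format shown above.
import re

step_re = re.compile(r"global_step = (\d+), average loss = ([\d.]+)")
acc_re = re.compile(r"acc = ([\d.]+)")

steps, accs = [], []
with open("train.log") as f:
    for line in f:
        if (m := step_re.search(line)):
            steps.append((int(m.group(1)), float(m.group(2))))
        elif (m := acc_re.search(line)):
            accs.append(float(m.group(1)))

print(f"{len(steps)} loss points, best dev acc = {max(accs):.4f}")
```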
training_args.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bb357eae91ca6dee772e1aa051d51d1ac15dfb3d6939fc85c99c233728675db4
size 1915
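training_args.bin is a small (1,915-byte) pickled object holding the arguments the run was launched with. A hedged sketch for inspecting it, assuming it was written with torch.save(args, ...) as in the usual Transformers example-script convention (weights_only=False is needed on recent PyTorch to unpickle arbitrary objects):

```python
# Sketch: inspect the pickled training arguments; adjust the path to your
# local snapshot of this repository.
import torch

args = torch.load("training_args.bin", weights_only=False)  # a pickled object, not tensors
print(args)
```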