picocreator
commited on
Commit
•
ba750a1
1
Parent(s):
550e990
8fa61cb158cb691aa1bb84da41ff32d779b7b4be6043cf0e368ee6c84931c0f0
Browse files
experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-part1.ipynb
CHANGED
@@ -3,13 +3,13 @@
|
|
3 |
{
|
4 |
"attachments": {},
|
5 |
"cell_type": "markdown",
|
6 |
-
"id": "
|
7 |
"metadata": {
|
8 |
"papermill": {
|
9 |
-
"duration": 0.
|
10 |
-
"end_time": "2023-09-
|
11 |
"exception": false,
|
12 |
-
"start_time": "2023-09-
|
13 |
"status": "completed"
|
14 |
},
|
15 |
"tags": []
|
@@ -23,13 +23,13 @@
|
|
23 |
{
|
24 |
"attachments": {},
|
25 |
"cell_type": "markdown",
|
26 |
-
"id": "
|
27 |
"metadata": {
|
28 |
"papermill": {
|
29 |
-
"duration": 0.
|
30 |
-
"end_time": "2023-09-
|
31 |
"exception": false,
|
32 |
-
"start_time": "2023-09-
|
33 |
"status": "completed"
|
34 |
},
|
35 |
"tags": []
|
@@ -41,19 +41,19 @@
|
|
41 |
{
|
42 |
"cell_type": "code",
|
43 |
"execution_count": 1,
|
44 |
-
"id": "
|
45 |
"metadata": {
|
46 |
"execution": {
|
47 |
-
"iopub.execute_input": "2023-09-
|
48 |
-
"iopub.status.busy": "2023-09-
|
49 |
-
"iopub.status.idle": "2023-09-
|
50 |
-
"shell.execute_reply": "2023-09-
|
51 |
},
|
52 |
"papermill": {
|
53 |
-
"duration": 0.
|
54 |
-
"end_time": "2023-09-
|
55 |
"exception": false,
|
56 |
-
"start_time": "2023-09-
|
57 |
"status": "completed"
|
58 |
},
|
59 |
"tags": []
|
@@ -69,19 +69,19 @@
|
|
69 |
{
|
70 |
"cell_type": "code",
|
71 |
"execution_count": 2,
|
72 |
-
"id": "
|
73 |
"metadata": {
|
74 |
"execution": {
|
75 |
-
"iopub.execute_input": "2023-09-
|
76 |
-
"iopub.status.busy": "2023-09-
|
77 |
-
"iopub.status.idle": "2023-09-
|
78 |
-
"shell.execute_reply": "2023-09-
|
79 |
},
|
80 |
"papermill": {
|
81 |
-
"duration": 0.
|
82 |
-
"end_time": "2023-09-
|
83 |
"exception": false,
|
84 |
-
"start_time": "2023-09-
|
85 |
"status": "completed"
|
86 |
},
|
87 |
"tags": []
|
@@ -140,19 +140,19 @@
|
|
140 |
{
|
141 |
"cell_type": "code",
|
142 |
"execution_count": 3,
|
143 |
-
"id": "
|
144 |
"metadata": {
|
145 |
"execution": {
|
146 |
-
"iopub.execute_input": "2023-09-
|
147 |
-
"iopub.status.busy": "2023-09-
|
148 |
-
"iopub.status.idle": "2023-09-
|
149 |
-
"shell.execute_reply": "2023-09-
|
150 |
},
|
151 |
"papermill": {
|
152 |
-
"duration":
|
153 |
-
"end_time": "2023-09-
|
154 |
"exception": false,
|
155 |
-
"start_time": "2023-09-
|
156 |
"status": "completed"
|
157 |
},
|
158 |
"tags": []
|
@@ -162,14 +162,20 @@
|
|
162 |
"name": "stdout",
|
163 |
"output_type": "stream",
|
164 |
"text": [
|
165 |
-
"[2023-09-29
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
]
|
167 |
},
|
168 |
{
|
169 |
"name": "stdout",
|
170 |
"output_type": "stream",
|
171 |
"text": [
|
172 |
-
"[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n",
|
173 |
"---- Initializing model ----\r\n",
|
174 |
"No of layers: 6\r\n",
|
175 |
"Embedding size: 2048\r\n",
|
@@ -228,42 +234,42 @@
|
|
228 |
"output_type": "stream",
|
229 |
"text": [
|
230 |
"2048 2048 0 blocks.0.ffn.receptance.weight\r\n",
|
231 |
-
"2048 7168 0 blocks.0.ffn.value.weight\r\n"
|
|
|
232 |
]
|
233 |
},
|
234 |
{
|
235 |
"name": "stdout",
|
236 |
"output_type": "stream",
|
237 |
"text": [
|
238 |
-
"2048 2048 1.0 blocks.1.att.
|
239 |
]
|
240 |
},
|
241 |
{
|
242 |
"name": "stdout",
|
243 |
"output_type": "stream",
|
244 |
"text": [
|
245 |
-
"2048 2048 1.0 blocks.1.att.
|
246 |
]
|
247 |
},
|
248 |
{
|
249 |
"name": "stdout",
|
250 |
"output_type": "stream",
|
251 |
"text": [
|
252 |
-
"2048 2048 1.0 blocks.1.att.
|
253 |
]
|
254 |
},
|
255 |
{
|
256 |
"name": "stdout",
|
257 |
"output_type": "stream",
|
258 |
"text": [
|
259 |
-
"2048 2048
|
260 |
]
|
261 |
},
|
262 |
{
|
263 |
"name": "stdout",
|
264 |
"output_type": "stream",
|
265 |
"text": [
|
266 |
-
"2048 2048 0 blocks.1.att.output.weight\r\n",
|
267 |
"7168 2048 1.0 blocks.1.ffn.key.weight\r\n"
|
268 |
]
|
269 |
},
|
@@ -272,42 +278,42 @@
|
|
272 |
"output_type": "stream",
|
273 |
"text": [
|
274 |
"2048 2048 0 blocks.1.ffn.receptance.weight\r\n",
|
275 |
-
"2048 7168 0 blocks.1.ffn.value.weight\r\n"
|
276 |
-
"2048 2048 1.0 blocks.2.att.gate.weight\r\n"
|
277 |
]
|
278 |
},
|
279 |
{
|
280 |
"name": "stdout",
|
281 |
"output_type": "stream",
|
282 |
"text": [
|
283 |
-
"2048 2048 1.0 blocks.2.att.
|
284 |
]
|
285 |
},
|
286 |
{
|
287 |
"name": "stdout",
|
288 |
"output_type": "stream",
|
289 |
"text": [
|
290 |
-
"2048 2048 1.0 blocks.2.att.
|
291 |
]
|
292 |
},
|
293 |
{
|
294 |
"name": "stdout",
|
295 |
"output_type": "stream",
|
296 |
"text": [
|
297 |
-
"2048 2048 1.0 blocks.2.att.
|
298 |
]
|
299 |
},
|
300 |
{
|
301 |
"name": "stdout",
|
302 |
"output_type": "stream",
|
303 |
"text": [
|
304 |
-
"2048 2048 0
|
305 |
]
|
306 |
},
|
307 |
{
|
308 |
"name": "stdout",
|
309 |
"output_type": "stream",
|
310 |
"text": [
|
|
|
311 |
"7168 2048 1.0 blocks.2.ffn.key.weight\r\n"
|
312 |
]
|
313 |
},
|
@@ -360,13 +366,7 @@
|
|
360 |
"output_type": "stream",
|
361 |
"text": [
|
362 |
"2048 2048 0 blocks.3.ffn.receptance.weight\r\n",
|
363 |
-
"2048 7168 0 blocks.3.ffn.value.weight\r\n"
|
364 |
-
]
|
365 |
-
},
|
366 |
-
{
|
367 |
-
"name": "stdout",
|
368 |
-
"output_type": "stream",
|
369 |
-
"text": [
|
370 |
"2048 2048 1.0 blocks.4.att.gate.weight\r\n"
|
371 |
]
|
372 |
},
|
@@ -404,13 +404,7 @@
|
|
404 |
"output_type": "stream",
|
405 |
"text": [
|
406 |
"2048 2048 0 blocks.4.ffn.receptance.weight\r\n",
|
407 |
-
"2048 7168 0 blocks.4.ffn.value.weight\r\n"
|
408 |
-
]
|
409 |
-
},
|
410 |
-
{
|
411 |
-
"name": "stdout",
|
412 |
-
"output_type": "stream",
|
413 |
-
"text": [
|
414 |
"2048 2048 1.0 blocks.5.att.gate.weight\r\n"
|
415 |
]
|
416 |
},
|
@@ -439,13 +433,7 @@
|
|
439 |
"name": "stdout",
|
440 |
"output_type": "stream",
|
441 |
"text": [
|
442 |
-
"2048 2048 0 blocks.5.att.output.weight\r\n"
|
443 |
-
]
|
444 |
-
},
|
445 |
-
{
|
446 |
-
"name": "stdout",
|
447 |
-
"output_type": "stream",
|
448 |
-
"text": [
|
449 |
"7168 2048 1.0 blocks.5.ffn.key.weight\r\n"
|
450 |
]
|
451 |
},
|
@@ -471,13 +459,13 @@
|
|
471 |
},
|
472 |
{
|
473 |
"cell_type": "markdown",
|
474 |
-
"id": "
|
475 |
"metadata": {
|
476 |
"papermill": {
|
477 |
-
"duration": 0.
|
478 |
-
"end_time": "2023-09-
|
479 |
"exception": false,
|
480 |
-
"start_time": "2023-09-
|
481 |
"status": "completed"
|
482 |
},
|
483 |
"tags": []
|
@@ -489,19 +477,19 @@
|
|
489 |
{
|
490 |
"cell_type": "code",
|
491 |
"execution_count": 4,
|
492 |
-
"id": "
|
493 |
"metadata": {
|
494 |
"execution": {
|
495 |
-
"iopub.execute_input": "2023-09-
|
496 |
-
"iopub.status.busy": "2023-09-
|
497 |
-
"iopub.status.idle": "2023-09-
|
498 |
-
"shell.execute_reply": "2023-09-
|
499 |
},
|
500 |
"papermill": {
|
501 |
-
"duration": 5.
|
502 |
-
"end_time": "2023-09-
|
503 |
"exception": false,
|
504 |
-
"start_time": "2023-09-
|
505 |
"status": "completed"
|
506 |
},
|
507 |
"tags": []
|
@@ -527,19 +515,19 @@
|
|
527 |
{
|
528 |
"cell_type": "code",
|
529 |
"execution_count": 5,
|
530 |
-
"id": "
|
531 |
"metadata": {
|
532 |
"execution": {
|
533 |
-
"iopub.execute_input": "2023-09-
|
534 |
-
"iopub.status.busy": "2023-09-
|
535 |
-
"iopub.status.idle": "2023-09-
|
536 |
-
"shell.execute_reply": "2023-09-
|
537 |
},
|
538 |
"papermill": {
|
539 |
-
"duration": 0.
|
540 |
-
"end_time": "2023-09-
|
541 |
"exception": false,
|
542 |
-
"start_time": "2023-09-
|
543 |
"status": "completed"
|
544 |
},
|
545 |
"tags": []
|
@@ -549,17 +537,16 @@
|
|
549 |
"name": "stdout",
|
550 |
"output_type": "stream",
|
551 |
"text": [
|
552 |
-
"/usr/bin/sh: 1:
|
553 |
]
|
554 |
}
|
555 |
],
|
556 |
"source": [
|
557 |
"# Start the foundation model training\n",
|
558 |
"!cd \"{TRAINER_DIR}\" && \\\n",
|
559 |
-
" export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n",
|
560 |
" export WANDB_MODE=\"{WANDB_MODE}\" && \\\n",
|
561 |
" python lightning_trainer.py fit \\\n",
|
562 |
-
" -c \"{NOTEBOOK_DIR}/v5base-enwiki-4k.yaml\" \\\n",
|
563 |
" --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Enwiki-4k Part 1 (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n",
|
564 |
" --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n",
|
565 |
" --trainer.devices=\"{GPU_DEVICES}\" \\\n",
|
@@ -572,19 +559,19 @@
|
|
572 |
{
|
573 |
"cell_type": "code",
|
574 |
"execution_count": 6,
|
575 |
-
"id": "
|
576 |
"metadata": {
|
577 |
"execution": {
|
578 |
-
"iopub.execute_input": "2023-09-
|
579 |
-
"iopub.status.busy": "2023-09-
|
580 |
-
"iopub.status.idle": "2023-09-
|
581 |
-
"shell.execute_reply": "2023-09-
|
582 |
},
|
583 |
"papermill": {
|
584 |
-
"duration": 0.
|
585 |
-
"end_time": "2023-09-
|
586 |
"exception": false,
|
587 |
-
"start_time": "2023-09-
|
588 |
"status": "completed"
|
589 |
},
|
590 |
"tags": []
|
@@ -615,19 +602,19 @@
|
|
615 |
{
|
616 |
"cell_type": "code",
|
617 |
"execution_count": 7,
|
618 |
-
"id": "
|
619 |
"metadata": {
|
620 |
"execution": {
|
621 |
-
"iopub.execute_input": "2023-09-
|
622 |
-
"iopub.status.busy": "2023-09-
|
623 |
-
"iopub.status.idle": "2023-09-
|
624 |
-
"shell.execute_reply": "2023-09-
|
625 |
},
|
626 |
"papermill": {
|
627 |
-
"duration":
|
628 |
-
"end_time": "2023-09-
|
629 |
"exception": false,
|
630 |
-
"start_time": "2023-09-
|
631 |
"status": "completed"
|
632 |
},
|
633 |
"tags": []
|
@@ -637,14 +624,28 @@
|
|
637 |
"name": "stdout",
|
638 |
"output_type": "stream",
|
639 |
"text": [
|
640 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
641 |
]
|
642 |
}
|
643 |
],
|
644 |
"source": [
|
645 |
"# # Lets do a quick dragon prompt validation\n",
|
646 |
"!cd \"{INFERENCE_DIR}\" && \\\n",
|
647 |
-
" export RWKV_WAVENET_LAYERS=\"{RWKV_WAVENET_LAYERS}\" && \\\n",
|
648 |
" python3 dragon_test.py \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\" \"cuda fp32\""
|
649 |
]
|
650 |
}
|
@@ -669,14 +670,14 @@
|
|
669 |
},
|
670 |
"papermill": {
|
671 |
"default_parameters": {},
|
672 |
-
"duration":
|
673 |
-
"end_time": "2023-09-
|
674 |
"environment_variables": {},
|
675 |
"exception": null,
|
676 |
"input_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-part1.ipynb",
|
677 |
"output_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-part1.ipynb",
|
678 |
"parameters": {},
|
679 |
-
"start_time": "2023-09-
|
680 |
"version": "2.4.0"
|
681 |
}
|
682 |
},
|
|
|
3 |
{
|
4 |
"attachments": {},
|
5 |
"cell_type": "markdown",
|
6 |
+
"id": "ef458e0c",
|
7 |
"metadata": {
|
8 |
"papermill": {
|
9 |
+
"duration": 0.002614,
|
10 |
+
"end_time": "2023-09-29T05:06:25.725060",
|
11 |
"exception": false,
|
12 |
+
"start_time": "2023-09-29T05:06:25.722446",
|
13 |
"status": "completed"
|
14 |
},
|
15 |
"tags": []
|
|
|
23 |
{
|
24 |
"attachments": {},
|
25 |
"cell_type": "markdown",
|
26 |
+
"id": "58eb3f3e",
|
27 |
"metadata": {
|
28 |
"papermill": {
|
29 |
+
"duration": 0.00201,
|
30 |
+
"end_time": "2023-09-29T05:06:25.730966",
|
31 |
"exception": false,
|
32 |
+
"start_time": "2023-09-29T05:06:25.728956",
|
33 |
"status": "completed"
|
34 |
},
|
35 |
"tags": []
|
|
|
41 |
{
|
42 |
"cell_type": "code",
|
43 |
"execution_count": 1,
|
44 |
+
"id": "e0abbad9",
|
45 |
"metadata": {
|
46 |
"execution": {
|
47 |
+
"iopub.execute_input": "2023-09-29T05:06:25.737449Z",
|
48 |
+
"iopub.status.busy": "2023-09-29T05:06:25.736495Z",
|
49 |
+
"iopub.status.idle": "2023-09-29T05:06:26.482958Z",
|
50 |
+
"shell.execute_reply": "2023-09-29T05:06:26.482054Z"
|
51 |
},
|
52 |
"papermill": {
|
53 |
+
"duration": 0.751859,
|
54 |
+
"end_time": "2023-09-29T05:06:26.485032",
|
55 |
"exception": false,
|
56 |
+
"start_time": "2023-09-29T05:06:25.733173",
|
57 |
"status": "completed"
|
58 |
},
|
59 |
"tags": []
|
|
|
69 |
{
|
70 |
"cell_type": "code",
|
71 |
"execution_count": 2,
|
72 |
+
"id": "42d56a7f",
|
73 |
"metadata": {
|
74 |
"execution": {
|
75 |
+
"iopub.execute_input": "2023-09-29T05:06:26.491452Z",
|
76 |
+
"iopub.status.busy": "2023-09-29T05:06:26.490928Z",
|
77 |
+
"iopub.status.idle": "2023-09-29T05:06:26.499148Z",
|
78 |
+
"shell.execute_reply": "2023-09-29T05:06:26.498384Z"
|
79 |
},
|
80 |
"papermill": {
|
81 |
+
"duration": 0.013307,
|
82 |
+
"end_time": "2023-09-29T05:06:26.500768",
|
83 |
"exception": false,
|
84 |
+
"start_time": "2023-09-29T05:06:26.487461",
|
85 |
"status": "completed"
|
86 |
},
|
87 |
"tags": []
|
|
|
140 |
{
|
141 |
"cell_type": "code",
|
142 |
"execution_count": 3,
|
143 |
+
"id": "5514ed91",
|
144 |
"metadata": {
|
145 |
"execution": {
|
146 |
+
"iopub.execute_input": "2023-09-29T05:06:26.507274Z",
|
147 |
+
"iopub.status.busy": "2023-09-29T05:06:26.506786Z",
|
148 |
+
"iopub.status.idle": "2023-09-29T05:06:55.991075Z",
|
149 |
+
"shell.execute_reply": "2023-09-29T05:06:55.990231Z"
|
150 |
},
|
151 |
"papermill": {
|
152 |
+
"duration": 29.490941,
|
153 |
+
"end_time": "2023-09-29T05:06:55.994238",
|
154 |
"exception": false,
|
155 |
+
"start_time": "2023-09-29T05:06:26.503297",
|
156 |
"status": "completed"
|
157 |
},
|
158 |
"tags": []
|
|
|
162 |
"name": "stdout",
|
163 |
"output_type": "stream",
|
164 |
"text": [
|
165 |
+
"[2023-09-29 05:06:30,625] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n"
|
166 |
+
]
|
167 |
+
},
|
168 |
+
{
|
169 |
+
"name": "stdout",
|
170 |
+
"output_type": "stream",
|
171 |
+
"text": [
|
172 |
+
"[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n"
|
173 |
]
|
174 |
},
|
175 |
{
|
176 |
"name": "stdout",
|
177 |
"output_type": "stream",
|
178 |
"text": [
|
|
|
179 |
"---- Initializing model ----\r\n",
|
180 |
"No of layers: 6\r\n",
|
181 |
"Embedding size: 2048\r\n",
|
|
|
234 |
"output_type": "stream",
|
235 |
"text": [
|
236 |
"2048 2048 0 blocks.0.ffn.receptance.weight\r\n",
|
237 |
+
"2048 7168 0 blocks.0.ffn.value.weight\r\n",
|
238 |
+
"2048 2048 1.0 blocks.1.att.gate.weight\r\n"
|
239 |
]
|
240 |
},
|
241 |
{
|
242 |
"name": "stdout",
|
243 |
"output_type": "stream",
|
244 |
"text": [
|
245 |
+
"2048 2048 1.0 blocks.1.att.receptance.weight\r\n"
|
246 |
]
|
247 |
},
|
248 |
{
|
249 |
"name": "stdout",
|
250 |
"output_type": "stream",
|
251 |
"text": [
|
252 |
+
"2048 2048 1.0 blocks.1.att.key.weight\r\n"
|
253 |
]
|
254 |
},
|
255 |
{
|
256 |
"name": "stdout",
|
257 |
"output_type": "stream",
|
258 |
"text": [
|
259 |
+
"2048 2048 1.0 blocks.1.att.value.weight\r\n"
|
260 |
]
|
261 |
},
|
262 |
{
|
263 |
"name": "stdout",
|
264 |
"output_type": "stream",
|
265 |
"text": [
|
266 |
+
"2048 2048 0 blocks.1.att.output.weight\r\n"
|
267 |
]
|
268 |
},
|
269 |
{
|
270 |
"name": "stdout",
|
271 |
"output_type": "stream",
|
272 |
"text": [
|
|
|
273 |
"7168 2048 1.0 blocks.1.ffn.key.weight\r\n"
|
274 |
]
|
275 |
},
|
|
|
278 |
"output_type": "stream",
|
279 |
"text": [
|
280 |
"2048 2048 0 blocks.1.ffn.receptance.weight\r\n",
|
281 |
+
"2048 7168 0 blocks.1.ffn.value.weight\r\n"
|
|
|
282 |
]
|
283 |
},
|
284 |
{
|
285 |
"name": "stdout",
|
286 |
"output_type": "stream",
|
287 |
"text": [
|
288 |
+
"2048 2048 1.0 blocks.2.att.gate.weight\r\n"
|
289 |
]
|
290 |
},
|
291 |
{
|
292 |
"name": "stdout",
|
293 |
"output_type": "stream",
|
294 |
"text": [
|
295 |
+
"2048 2048 1.0 blocks.2.att.receptance.weight\r\n"
|
296 |
]
|
297 |
},
|
298 |
{
|
299 |
"name": "stdout",
|
300 |
"output_type": "stream",
|
301 |
"text": [
|
302 |
+
"2048 2048 1.0 blocks.2.att.key.weight\r\n"
|
303 |
]
|
304 |
},
|
305 |
{
|
306 |
"name": "stdout",
|
307 |
"output_type": "stream",
|
308 |
"text": [
|
309 |
+
"2048 2048 1.0 blocks.2.att.value.weight\r\n"
|
310 |
]
|
311 |
},
|
312 |
{
|
313 |
"name": "stdout",
|
314 |
"output_type": "stream",
|
315 |
"text": [
|
316 |
+
"2048 2048 0 blocks.2.att.output.weight\r\n",
|
317 |
"7168 2048 1.0 blocks.2.ffn.key.weight\r\n"
|
318 |
]
|
319 |
},
|
|
|
366 |
"output_type": "stream",
|
367 |
"text": [
|
368 |
"2048 2048 0 blocks.3.ffn.receptance.weight\r\n",
|
369 |
+
"2048 7168 0 blocks.3.ffn.value.weight\r\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
370 |
"2048 2048 1.0 blocks.4.att.gate.weight\r\n"
|
371 |
]
|
372 |
},
|
|
|
404 |
"output_type": "stream",
|
405 |
"text": [
|
406 |
"2048 2048 0 blocks.4.ffn.receptance.weight\r\n",
|
407 |
+
"2048 7168 0 blocks.4.ffn.value.weight\r\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
408 |
"2048 2048 1.0 blocks.5.att.gate.weight\r\n"
|
409 |
]
|
410 |
},
|
|
|
433 |
"name": "stdout",
|
434 |
"output_type": "stream",
|
435 |
"text": [
|
436 |
+
"2048 2048 0 blocks.5.att.output.weight\r\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
437 |
"7168 2048 1.0 blocks.5.ffn.key.weight\r\n"
|
438 |
]
|
439 |
},
|
|
|
459 |
},
|
460 |
{
|
461 |
"cell_type": "markdown",
|
462 |
+
"id": "8afd9e50",
|
463 |
"metadata": {
|
464 |
"papermill": {
|
465 |
+
"duration": 0.005752,
|
466 |
+
"end_time": "2023-09-29T05:06:56.006385",
|
467 |
"exception": false,
|
468 |
+
"start_time": "2023-09-29T05:06:56.000633",
|
469 |
"status": "completed"
|
470 |
},
|
471 |
"tags": []
|
|
|
477 |
{
|
478 |
"cell_type": "code",
|
479 |
"execution_count": 4,
|
480 |
+
"id": "ff78d2bd",
|
481 |
"metadata": {
|
482 |
"execution": {
|
483 |
+
"iopub.execute_input": "2023-09-29T05:06:56.020959Z",
|
484 |
+
"iopub.status.busy": "2023-09-29T05:06:56.020447Z",
|
485 |
+
"iopub.status.idle": "2023-09-29T05:07:01.579575Z",
|
486 |
+
"shell.execute_reply": "2023-09-29T05:07:01.578476Z"
|
487 |
},
|
488 |
"papermill": {
|
489 |
+
"duration": 5.569483,
|
490 |
+
"end_time": "2023-09-29T05:07:01.582319",
|
491 |
"exception": false,
|
492 |
+
"start_time": "2023-09-29T05:06:56.012836",
|
493 |
"status": "completed"
|
494 |
},
|
495 |
"tags": []
|
|
|
515 |
{
|
516 |
"cell_type": "code",
|
517 |
"execution_count": 5,
|
518 |
+
"id": "f656d56b",
|
519 |
"metadata": {
|
520 |
"execution": {
|
521 |
+
"iopub.execute_input": "2023-09-29T05:07:01.598719Z",
|
522 |
+
"iopub.status.busy": "2023-09-29T05:07:01.597947Z",
|
523 |
+
"iopub.status.idle": "2023-09-29T05:07:01.851778Z",
|
524 |
+
"shell.execute_reply": "2023-09-29T05:07:01.850738Z"
|
525 |
},
|
526 |
"papermill": {
|
527 |
+
"duration": 0.265316,
|
528 |
+
"end_time": "2023-09-29T05:07:01.854564",
|
529 |
"exception": false,
|
530 |
+
"start_time": "2023-09-29T05:07:01.589248",
|
531 |
"status": "completed"
|
532 |
},
|
533 |
"tags": []
|
|
|
537 |
"name": "stdout",
|
538 |
"output_type": "stream",
|
539 |
"text": [
|
540 |
+
"/usr/bin/sh: 1: python: not found\r\n"
|
541 |
]
|
542 |
}
|
543 |
],
|
544 |
"source": [
|
545 |
"# Start the foundation model training\n",
|
546 |
"!cd \"{TRAINER_DIR}\" && \\\n",
|
|
|
547 |
" export WANDB_MODE=\"{WANDB_MODE}\" && \\\n",
|
548 |
" python lightning_trainer.py fit \\\n",
|
549 |
+
" -c \"{NOTEBOOK_DIR}/v5base-enwiki-4k-part1.yaml\" \\\n",
|
550 |
" --trainer.logger.init_args.name=\"{WANDB_PREFIX} - Enwiki-4k Part 1 (train-ctx=4k, {DEEPSPEED_STRAT})\" \\\n",
|
551 |
" --trainer.strategy=\"{DEEPSPEED_STRAT}\" \\\n",
|
552 |
" --trainer.devices=\"{GPU_DEVICES}\" \\\n",
|
|
|
559 |
{
|
560 |
"cell_type": "code",
|
561 |
"execution_count": 6,
|
562 |
+
"id": "c7b46f94",
|
563 |
"metadata": {
|
564 |
"execution": {
|
565 |
+
"iopub.execute_input": "2023-09-29T05:07:01.871225Z",
|
566 |
+
"iopub.status.busy": "2023-09-29T05:07:01.870345Z",
|
567 |
+
"iopub.status.idle": "2023-09-29T05:07:02.373808Z",
|
568 |
+
"shell.execute_reply": "2023-09-29T05:07:02.372753Z"
|
569 |
},
|
570 |
"papermill": {
|
571 |
+
"duration": 0.51526,
|
572 |
+
"end_time": "2023-09-29T05:07:02.376685",
|
573 |
"exception": false,
|
574 |
+
"start_time": "2023-09-29T05:07:01.861425",
|
575 |
"status": "completed"
|
576 |
},
|
577 |
"tags": []
|
|
|
602 |
{
|
603 |
"cell_type": "code",
|
604 |
"execution_count": 7,
|
605 |
+
"id": "9f558c57",
|
606 |
"metadata": {
|
607 |
"execution": {
|
608 |
+
"iopub.execute_input": "2023-09-29T05:07:02.393471Z",
|
609 |
+
"iopub.status.busy": "2023-09-29T05:07:02.392695Z",
|
610 |
+
"iopub.status.idle": "2023-09-29T05:07:08.804315Z",
|
611 |
+
"shell.execute_reply": "2023-09-29T05:07:08.803244Z"
|
612 |
},
|
613 |
"papermill": {
|
614 |
+
"duration": 6.42299,
|
615 |
+
"end_time": "2023-09-29T05:07:08.806769",
|
616 |
"exception": false,
|
617 |
+
"start_time": "2023-09-29T05:07:02.383779",
|
618 |
"status": "completed"
|
619 |
},
|
620 |
"tags": []
|
|
|
624 |
"name": "stdout",
|
625 |
"output_type": "stream",
|
626 |
"text": [
|
627 |
+
"[2023-09-29 05:07:06,749] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\r\n"
|
628 |
+
]
|
629 |
+
},
|
630 |
+
{
|
631 |
+
"name": "stdout",
|
632 |
+
"output_type": "stream",
|
633 |
+
"text": [
|
634 |
+
"[RWKV.model] Running RWKV model using 'torch-jit' with torch '2.0.1+cu118'\r\n",
|
635 |
+
"Traceback (most recent call last):\r\n",
|
636 |
+
" File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/dragon_test.py\", line 52, in <module>\r\n",
|
637 |
+
" model = SimpleRWKV(MODEL_PATH, device=DEVICE)\r\n",
|
638 |
+
" File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 1420, in __init__\r\n",
|
639 |
+
" self.model = RWKV(**model_config)\r\n",
|
640 |
+
" File \"/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/RWKV-v5/src/model.py\", line 566, in __init__\r\n",
|
641 |
+
" raise ValueError(f\"load_model file '{load_model}' does not exist\")\r\n",
|
642 |
+
"ValueError: load_model file '../model/v5-L6-D2048-E0_01-enwiki-4k-p1.pth' does not exist\r\n"
|
643 |
]
|
644 |
}
|
645 |
],
|
646 |
"source": [
|
647 |
"# # Lets do a quick dragon prompt validation\n",
|
648 |
"!cd \"{INFERENCE_DIR}\" && \\\n",
|
|
|
649 |
" python3 dragon_test.py \"../model/{FILENAME_PREFIX}-enwiki-4k-p1.pth\" \"cuda fp32\""
|
650 |
]
|
651 |
}
|
|
|
670 |
},
|
671 |
"papermill": {
|
672 |
"default_parameters": {},
|
673 |
+
"duration": 44.644446,
|
674 |
+
"end_time": "2023-09-29T05:07:09.133994",
|
675 |
"environment_variables": {},
|
676 |
"exception": null,
|
677 |
"input_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/notebook/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-part1.ipynb",
|
678 |
"output_path": "/actions-runner/_work/RWKV-infctx-trainer/RWKV-infctx-trainer/output/experiment/rwkv-x-exp/multi-size-train/v5-L6-D2048-part1.ipynb",
|
679 |
"parameters": {},
|
680 |
+
"start_time": "2023-09-29T05:06:24.489548",
|
681 |
"version": "2.4.0"
|
682 |
}
|
683 |
},
|