Training in progress, step 300, checkpoint
- last-checkpoint/config.json +1 -1
- last-checkpoint/generation_config.json +1 -1
- last-checkpoint/global_step300/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +1 -1
- last-checkpoint/global_step300/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +1 -1
- last-checkpoint/global_step300/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +1 -1
- last-checkpoint/global_step300/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +1 -1
- last-checkpoint/global_step300/mp_rank_00_model_states.pt +1 -1
- last-checkpoint/latest +1 -1
- last-checkpoint/model-00001-of-00002.safetensors +1 -1
- last-checkpoint/model-00002-of-00002.safetensors +1 -1
- last-checkpoint/rng_state_0.pth +1 -1
- last-checkpoint/rng_state_1.pth +1 -1
- last-checkpoint/rng_state_2.pth +1 -1
- last-checkpoint/rng_state_3.pth +1 -1
- last-checkpoint/scheduler.pt +1 -1
- last-checkpoint/tokenizer.json +1 -1
- last-checkpoint/trainer_state.json +0 -0
- last-checkpoint/training_args.bin +1 -1
- last-checkpoint/zero_to_fp32.py +13 -30
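Together these files form a DeepSpeed ZeRO checkpoint at step 300: four per-rank bf16 optimizer-state shards plus consolidated model states under global_step300/, two safetensors weight shards, per-rank RNG states, the scheduler, tokenizer, and trainer state. A minimal resume sketch with the Hugging Face Trainer, assuming the repository is cloned locally and the original model, data, and DeepSpeed setup are recreated (the trainer object here is a placeholder, not part of this commit):

from transformers import Trainer

def resume_from_step_300(trainer: Trainer):
    # Restores model weights, ZeRO optimizer shards, scheduler state,
    # and per-rank RNG states saved at global step 300.
    return trainer.train(resume_from_checkpoint="last-checkpoint")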
last-checkpoint/config.json
CHANGED
@@ -30,7 +30,7 @@
   "rope_theta": 10000.0,
   "sliding_window": 4096,
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.
+  "transformers_version": "4.43.4",
   "use_cache": false,
   "vocab_size": 256001
 }
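The only change to config.json stamps the transformers version that wrote this checkpoint. A quick sanity check after pulling the repo, using the standard transformers API (the local path is an assumption):

from transformers import AutoConfig

config = AutoConfig.from_pretrained("last-checkpoint")
print(config.transformers_version)  # expected: 4.43.4
print(config.vocab_size)            # expected: 256001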
last-checkpoint/generation_config.json
CHANGED
@@ -7,5 +7,5 @@
     107
   ],
   "pad_token_id": 0,
-  "transformers_version": "4.
+  "transformers_version": "4.43.4"
 }
last-checkpoint/global_step300/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:490395e6ddb61a55d1f5621fad1deedde177291f376a1cab36fc05a41e4c8ca6
 size 7843036668
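Every binary in this commit is stored as a Git LFS pointer: a three-line stub giving the spec version, the blob's sha256, and its size in bytes (the previous hashes are truncated in the rendered diff and left as-is below). A small sketch for verifying a downloaded shard against its pointer; streaming the read avoids holding the ~7.8 GB file in memory:

import hashlib

def sha256_of(path, chunk_size=1 << 20):
    # Stream the file through sha256 in 1 MiB chunks.
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for block in iter(lambda: f.read(chunk_size), b""):
            digest.update(block)
    return digest.hexdigest()

expected = "490395e6ddb61a55d1f5621fad1deedde177291f376a1cab36fc05a41e4c8ca6"
path = "last-checkpoint/global_step300/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt"
assert sha256_of(path) == expected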
last-checkpoint/global_step300/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:8cb520c96f9c73c5d0868140ecc4ea16fea2f0da04ee42d790c9a77c67e41f62
 size 7843043580
last-checkpoint/global_step300/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:27e83bf3193945caa8218113e40161c4401aee5092fda15fbdb265676a30c2fa
 size 7843043004
last-checkpoint/global_step300/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:44d4dd32831c7a9646cd49d1a6dbc7df7f8738372bd5a23dd0f3c1b95f5118cb
 size 7843043388
last-checkpoint/global_step300/mp_rank_00_model_states.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:f855eb13a1f303c00e73dda783c536d3bba9a9118989fde748f77807ba0c98bd
 size 5228775200
last-checkpoint/latest
CHANGED
@@ -1 +1 @@
-
+global_step300
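The latest file is a one-line marker that DeepSpeed (and zero_to_fp32.py below, when tag=None) reads to find the current step directory. Conceptually the lookup is just this (a sketch, not the script's verbatim code):

import os

checkpoint_dir = "last-checkpoint"
with open(os.path.join(checkpoint_dir, "latest")) as f:
    tag = f.read().strip()  # "global_step300" after this commit
ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)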
last-checkpoint/model-00001-of-00002.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:0a95052f77e0ee3c96f0f7cae869217e2cc93260d3e50d9852b75c2f0521adcb
 size 4988030368
last-checkpoint/model-00002-of-00002.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:7c5a4fd382cb8a85c300b8a5648653039f0191901b30d466d7a75044be659a1d
 size 1420344488
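The model weights are split across two safetensors shards (~4.99 GB and ~1.42 GB). In practice from_pretrained resolves the shard index automatically, but the shards can also be merged by hand, assuming both have been pulled from LFS (a sketch using the safetensors API):

from safetensors.torch import load_file

state_dict = {}
for shard in ("model-00001-of-00002.safetensors", "model-00002-of-00002.safetensors"):
    # Each shard holds a disjoint subset of the model's tensors.
    state_dict.update(load_file(f"last-checkpoint/{shard}"))
print(f"loaded {len(state_dict)} tensors")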
last-checkpoint/rng_state_0.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:92cc13315f24c28015d695b6cde08bb1cd6fea4cbc435998485ed6fbe4c91285
 size 15024
last-checkpoint/rng_state_1.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:f4c154b6a63e0b1f98f7d2847944398f99f1657d35e8eddf7fdf0ae2c24b0552
 size 15024
last-checkpoint/rng_state_2.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:f784c6a9507b51189f2caffbd178ea9882103b75852e31c15f47fdae6a43af1d
 size 15024
last-checkpoint/rng_state_3.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:34b023e05bc2d12b91dc436d4922b990d50ec8dc56d40dc3e36b3bb34fc81341
 size 15024
last-checkpoint/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:3c9281bdbed11a5fa989179d3990f8ea1577b41ba21b300ef8adcb469edf99b5
 size 1064
last-checkpoint/tokenizer.json
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:987ad1b8e70d3ba898f587a434ba487d544c2800b1b9dcf020ffcbe7a5ac1d12
 size 17525539
last-checkpoint/trainer_state.json
CHANGED
The diff for this file is too large to render; see the raw diff.
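trainer_state.json records the Trainer's progress (global step, epoch, accumulated log history), and the log history grows with every checkpoint, which is why its diff is too large to render. A quick consistency check against the commit message, using the standard trainer_state fields:

import json

with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)
assert state["global_step"] == 300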
last-checkpoint/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:45001beea15a94e59d46abe242dc94b7cf1ec836fd936b0dfb68a48a54f36abe
 size 7096
last-checkpoint/zero_to_fp32.py
CHANGED
@@ -191,7 +191,7 @@ def parse_optim_states(files, ds_checkpoint_dir):
     return zero_stage, world_size, fp32_flat_groups


-def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
     """
     Returns fp32 state_dict reconstructed from ds checkpoint

@@ -211,11 +211,9 @@ def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
     print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')

     if zero_stage <= 2:
-        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
-                                                          exclude_frozen_parameters)
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states)
     elif zero_stage == 3:
-        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
-                                                          exclude_frozen_parameters)
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states)


 def _zero2_merge_frozen_params(state_dict, zero_model_states):
@@ -250,11 +248,6 @@ def _zero2_merge_frozen_params(state_dict, zero_model_states):
     print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")


-def _has_callable(obj, fn):
-    attr = getattr(obj, fn, None)
-    return callable(attr)
-
-
 def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
     param_shapes = zero_model_states[0].param_shapes

@@ -294,7 +287,7 @@ def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
         avail_numel = full_single_fp32_vector.numel()
         for name, shape in shapes.items():

-            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            unpartitioned_numel = shape.numel()
             total_numel += unpartitioned_numel
             total_params += 1

@@ -328,8 +321,7 @@ def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
     print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")


-def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
-                                               exclude_frozen_parameters):
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states):
     state_dict = OrderedDict()

     # buffers
@@ -338,8 +330,7 @@ def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, exclude_frozen_parameters):
     if debug:
         print(f"added {len(buffers)} buffers")

-    if not exclude_frozen_parameters:
-        _zero2_merge_frozen_params(state_dict, zero_model_states)
+    _zero2_merge_frozen_params(state_dict, zero_model_states)

     _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

@@ -448,8 +439,7 @@ def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
     print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")


-def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
-                                               exclude_frozen_parameters):
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states):
     state_dict = OrderedDict()

     # buffers
@@ -458,8 +448,7 @@ def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, exclude_frozen_parameters):
     if debug:
         print(f"added {len(buffers)} buffers")

-    if not exclude_frozen_parameters:
-        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+    _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)

     _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

@@ -471,7 +460,7 @@ def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, exclude_frozen_parameters):
     return state_dict


-def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
     """
     Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
     ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example

@@ -480,7 +469,6 @@ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
     Args:
         - ``checkpoint_dir``: path to the desired checkpoint folder
         - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
-        - ``exclude_frozen_parameters``: exclude frozen parameters

     Returns:
         - pytorch ``state_dict``

@@ -518,10 +506,10 @@ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
     if not os.path.isdir(ds_checkpoint_dir):
         raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")

-    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)


-def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
     """
     Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
     loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.

@@ -530,10 +518,9 @@ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False):
         - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
         - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
         - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
-        - ``exclude_frozen_parameters``: exclude frozen parameters
     """

-    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
     print(f"Saving fp32 state dict to {output_file}")
     torch.save(state_dict, output_file)

@@ -592,13 +579,9 @@ if __name__ == "__main__":
                         type=str,
                         default=None,
                         help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
-    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
     parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
     args = parser.parse_args()

     debug = args.debug

-    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
-                                               args.output_file,
-                                               tag=args.tag,
-                                               exclude_frozen_parameters=args.exclude_frozen_parameters)
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag)
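With the simplified signatures above, consolidating this checkpoint into a single fp32 state_dict is one call, mirroring what the script's __main__ block now does (the output filename is an example):

# Run from within last-checkpoint/, where zero_to_fp32.py lives.
from zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict

convert_zero_checkpoint_to_fp32_state_dict(".", "pytorch_model_fp32.bin", tag="global_step300")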