# 19M parameter model, & local setup with some additional simplifications

```yaml
{
  # Settings to make the test setup as lightweight as possible
  "data_path": "data/enwik8/enwik8_text_document",
  "vocab_file": "data/gpt2-vocab.json",
  "merge_file": "data/gpt2-merges.txt",
  "lr_decay_iters": 20,
  "train_iters": 20,
  "hostfile": "None",
  "include": "localhost:1",
  "use_wandb": false,

  # Settings copied from 19M parameter config (some modifications above, meaning we can't use configs/19M.yml directly)
  "pipe_parallel_size": 1,
  "model_parallel_size": 1,

  # model settings
  "num_layers": 2,
  "hidden_size": 8,
  "num_attention_heads": 4,
  "seq_length": 1024,
  "max_position_embeddings": 1024,
  "pos_emb": "rotary",
  "no_weight_tying": true,
  "gpt_j_residual": false,
  "output_layer_parallelism": "column",

  "scaled_upper_triang_masked_softmax_fusion": false,
  "bias_gelu_fusion": false,
  "rope_fusion": false,
  "layernorm_fusion": false,

  # Optimizer
  "optimizer": {
    "type": "sm3",
    "params": {},
  },

  # precision
  "precision": "fp16",

  # init methods
  "init_method": "small_init",
  "output_layer_init_method": "wang_init",

  "train_micro_batch_size_per_gpu": 4,
  "gradient_accumulation_steps": 1,
  "data_impl": "mmap",
  "num_workers": 1,

  # activation checkpointing
  "checkpoint_activations": true,
  "checkpoint_num_layers": 1,
  "partition_activations": true,
  "synchronize_each_layer": true,

  # regularization
  "gradient_clipping": 1.0,
  "weight_decay": 0.1,
  "hidden_dropout": 0,
  "attention_dropout": 0,

  "distributed_backend": "nccl",
  "lr_decay_style": "cosine",
  "warmup": 0.01,
  "checkpoint_factor": 1000,
  "eval_interval": 100000,
  "eval_iters": 10,

  "log_interval": 10,
  "steps_per_print": 10,
  "wall_clock_breakdown": true,

  # additional deepspeed args not specified above
  "deepspeed_extra_args": {
    "comms_logger": {
      "enabled": true,
      "verbose": true,
      "prof_all": true,
      "debug": false
    },
  }
}
```
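Assuming the config above is saved somewhere like `configs/test_setup.yml` (the filename here is only an illustration, not part of the config itself), a run of this sort is typically launched with GPT-NeoX's standard `deepy.py` wrapper:

```bash
# Launch the 20-iteration test run using the config above.
# The config path is an assumed example; point it at wherever you saved the file.
python ./deepy.py train.py configs/test_setup.yml
```

If the tokenized enwik8 data referenced by `data_path` is not already present, GPT-NeoX's `prepare_data.py` helper (whose default dataset is enwik8) can generate it; check the repo README for the exact invocation. Because `train_iters` and `lr_decay_iters` are both 20 and the model is only two very small layers, the run finishes quickly, and with `comms_logger` enabled and `verbose: true`, DeepSpeed logs each communication operation it performs during those iterations.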