# Suggested data paths when using GPT-NeoX locally.
# This file is a YAML flow mapping (JSON-like syntax plus `#` comments), not strict JSON.
{
  # Single-dataset path.
  "data_path": "/workspace/gpt-neox-main/data/enwik8/enwik8_text_document",

  # Or, for weighted datasets, list the paths and matching weights per split:
  # "train-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],
  # "test-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],
  # "valid-data-paths": ["data/enwik8/enwik8_text_document", "data/enwik8/enwik8_text_document"],
  # "train-data-weights": [1., 2.],
  # "test-data-weights": [2., 1.],
  # "valid-data-weights": [0.5, 0.4],

  # If weight_by_num_documents is True, dataset weights are built from a multinomial
  # distribution over groups of data according to the number of documents in each group.
  # WARNING: setting this to True will override any user-provided weights.
  # "weight_by_num_documents": false,
  # "weighted_sampler_alpha": 0.3,

  # Tokenizer vocabulary and merges files.
  "vocab_file": "/workspace/gpt-neox-main/data/gpt2-vocab.json",
  "merge_file": "/workspace/gpt-neox-main/data/gpt2-merges.txt",

  # Checkpoint save/load locations.
  "save": "checkpoints",
  "load": "checkpoints",
  "checkpoint_validation_with_forward_pass": False,

  # Logging output directories.
  "tensorboard_dir": "tensorboard",
  "log_dir": "logs",

  # Comet experiment tracking.
  "use_comet": True,
  # "comet_workspace": "test_workspace", # CHANGE ME
  "comet_project": "test_project",
  "comet_experiment_name": "test_experiment",
  "comet_tags": ["test_tag1", "test_tag2"],
  # NOTE(review): in YAML flow syntax a lone key parses as a mapping entry with a
  # null value ({"test_others": null}) -- confirm whether a key/value pair was intended.
  "comet_others": {"test_others"},
}