akswelh
/

NEOX

Model card Files Files and versions Community

akswelh commited on 24 days ago

Commit

d90b3a8

•

1 Parent(s): dbd7285

Upload 251 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.clang-format +155 -0
.dockerignore +1 -0
.gitattributes +1 -0
.github/CODEOWNERS +1 -0
.github/ISSUE_TEMPLATE/bug_report.md +34 -0
.github/ISSUE_TEMPLATE/feature_request.md +20 -0
.github/workflows/.cpu_ci_on_pr.yml +19 -0
.github/workflows/coverity_scan.yml +61 -0
.github/workflows/cpu_ci.yml +34 -0
.github/workflows/cpu_ci_dispatch.yml +20 -0
.github/workflows/docker_build.yml +50 -0
.github/workflows/pull_request.yml +60 -0
.gitignore +157 -0
.pre-commit-config.yaml +40 -0
CITATION.cff +79 -0
CONTRIBUTING.md +86 -0
Dockerfile +90 -0
LICENSE +467 -0
MANIFEST.in +2 -0
README-MUP.md +49 -0
README.md +863 -0
configs/1-3B.yml +93 -0
configs/125M-dmoe.yml +101 -0
configs/125M-json.yml +80 -0
configs/125M-moe.yml +101 -0
configs/125M.yml +96 -0
configs/13B.yml +94 -0
configs/175B.yml +92 -0
configs/19M.yml +97 -0
configs/2-7B.yml +93 -0
configs/20B.yml +113 -0
configs/350M.yml +92 -0
configs/49M.yml +93 -0
configs/6-7B.yml +93 -0
configs/760M.yml +93 -0
configs/800M.yml +86 -0
configs/README.md +368 -0
configs/autotuning_configs/small_tune.json +78 -0
configs/autotuning_configs/tune.json +72 -0
configs/autotuning_configs/tune_1-3B.json +86 -0
configs/autotuning_configs/tune_6-7B.json +77 -0
configs/bf16_125M.yml +80 -0
configs/bnb_125M.yml +87 -0
configs/cpu_mock_config.yml +5 -0
configs/docker/pythia-paths.yml +12 -0
configs/eleutherai_cluster.yml +29 -0
configs/finetuning_configs/6-9B.yml +89 -0
configs/gen_docs.py +96 -0
configs/gmlp_small.yml +72 -0
configs/llama/13B.yml +26 -0

.clang-format ADDED Viewed

	@@ -0,0 +1,155 @@

+---
+# Refer to the following link for the explanation of each params:
+#   http://releases.llvm.org/8.0.0/tools/clang/docs/ClangFormatStyleOptions.html
+Language: Cpp
+# BasedOnStyle: Google
+AccessModifierOffset: -4
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlines: Left
+AlignOperands: true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: true
+AllowShortCaseLabelsOnASingleLine: true
+AllowShortFunctionsOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: true
+AllowShortLoopsOnASingleLine: true
+# This is deprecated
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: true
+AlwaysBreakTemplateDeclarations: true
+BinPackArguments:  false
+BinPackParameters: false
+BraceWrapping:
+  AfterClass:            false
+  AfterControlStatement: false
+  AfterEnum:             false
+  AfterFunction:         false
+  AfterNamespace:        false
+  AfterObjCDeclaration:  false
+  AfterStruct:           false
+  AfterUnion:            false
+  AfterExternBlock:      false
+  BeforeCatch:           false
+  BeforeElse:            false
+  IndentBraces:          false
+  # disabling the below splits, else, they'll just add to the vertical length of source files!
+  SplitEmptyFunction: false
+  SplitEmptyRecord: false
+  SplitEmptyNamespace: false
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: WebKit
+BreakBeforeInheritanceComma: false
+BreakInheritanceList: BeforeColon
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+BreakConstructorInitializers: BeforeColon
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: true
+ColumnLimit: 100
+CommentPragmas: '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+# Kept the below 2 to be the same as `IndentWidth` to keep everything uniform
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+DisableFormat: false
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+ForEachMacros:
+  - foreach
+  - Q_FOREACH
+  - BOOST_FOREACH
+IncludeBlocks: Preserve
+IncludeCategories:
+  - Regex:           '^<ext/.*\.h>'
+    Priority:        2
+  - Regex:           '^<.*\.h>'
+    Priority:        1
+  - Regex:           '^<.*'
+    Priority:        2
+  - Regex:           '.*'
+    Priority:        3
+IncludeIsMainRegex: '([-_](test|unittest))?$'
+IndentCaseLabels: true
+IndentPPDirectives: None
+IndentWidth:     4
+IndentWrappedFunctionNames: false
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBinPackProtocolList: Never
+ObjCBlockIndentWidth: 4
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PenaltyBreakAssignment: 4
+PenaltyBreakBeforeFirstCallParameter: 1
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 200
+PointerAlignment: Left
+RawStringFormats:
+  - Language: Cpp
+    Delimiters:
+      - cc
+      - CC
+      - cpp
+      - Cpp
+      - CPP
+      - 'c++'
+      - 'C++'
+    CanonicalDelimiter: ''
+  - Language: TextProto
+    Delimiters:
+      - pb
+      - PB
+      - proto
+      - PROTO
+    EnclosingFunctions:
+      - EqualsProto
+      - EquivToProto
+      - PARSE_PARTIAL_TEXT_PROTO
+      - PARSE_TEST_PROTO
+      - PARSE_TEXT_PROTO
+      - ParseTextOrDie
+      - ParseTextProtoOrDie
+    CanonicalDelimiter: ''
+    BasedOnStyle: google
+# Enabling comment reflow causes doxygen comments to be messed up in their formats!
+ReflowComments: true
+SortIncludes: true
+SortUsingDeclarations: true
+SpaceAfterCStyleCast: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 2
+SpacesInAngles: false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: Cpp11
+StatementMacros:
+  - Q_UNUSED
+  - QT_REQUIRE_VERSION
+# Be consistent with indent-width, even for people who use tab for indentation!
+TabWidth: 4
+UseTab: Never

.dockerignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ 20B_checkpoints/

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+images/memory_profiling.png filter=lfs diff=lfs merge=lfs -text

.github/CODEOWNERS ADDED Viewed

	@@ -0,0 +1 @@


1	+ * @Quentin-Anthony

.github/ISSUE_TEMPLATE/bug_report.md ADDED Viewed

	@@ -0,0 +1,34 @@

+---
+name: Bug report
+about: Create a report to help us improve
+title: ''
+labels: bug
+assignees: ''
+---
+**Describe the bug**
+A clear and concise description of what the bug is.
+**To Reproduce**
+Steps to reproduce the behavior:
+1. Go to '...'
+2. Click on '....'
+3. Scroll down to '....'
+4. See error
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+**Proposed solution**
+If you have an idea for how we can fix this problem, describe it here.
+**Screenshots**
+If applicable, add screenshots to help explain your problem.
+**Environment (please complete the following information):**
+ - GPUs:
+- Configs:
+**Additional context**
+Add any other context about the problem here.

.github/ISSUE_TEMPLATE/feature_request.md ADDED Viewed

	@@ -0,0 +1,20 @@

+---
+name: Feature request
+about: Suggest an idea for this project
+title: ''
+labels: feature request
+assignees: ''
+---
+**Is your feature request related to a problem? Please describe.**
+A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+**Describe the solution you'd like**
+A clear and concise description of what you want to happen.
+**Describe alternatives you've considered**
+A clear and concise description of any alternative solutions or features you've considered.
+**Additional context**
+Add any other context or screenshots about the feature request here.

.github/workflows/.cpu_ci_on_pr.yml ADDED Viewed

	@@ -0,0 +1,19 @@

+# This file is hidden (.cpu_ci_on_pr.yml) to minimize the number of runner minutes consumed.
+name: "Pull Request CPU Tests"
+on:
+  pull_request:
+    paths: # job only triggers when the PR changes files under megatron directory
+      - "megatron/**"
+jobs:
+  run-tests:
+    runs-on: ubuntu-22.04 # ubuntu-latest currently points to ubuntu-22.04 but 24.04 is in beta - recommend testing on 24.04 and then changing instead of using ubuntu-latest
+    steps:
+    - name: Checkout Repository
+      uses: actions/checkout@v4
+    - name: Run CPU tests
+      uses: ./tests/cpu_tests
+      with:
+        target_test_ref: ${{ github.event.pull_request.base.sha }}

.github/workflows/coverity_scan.yml ADDED Viewed

	@@ -0,0 +1,61 @@

+name: Coverity
+on:
+  workflow_dispatch:
+    inputs:
+      build_version:
+        description: "Version of GPT-NeoX being submitted for scan"
+        required: false
+        default: "GPT-NeoX build version"
+      build_description:
+        description: "Description of the current build"
+        required: false
+        default: "Current build of GPT-NeoX"
+jobs:
+  coverity:
+    runs-on: ubuntu-latest
+    env:
+      COV_USER: ${{ secrets.COV_USER }} # needs to be an email with access to the Coverity stream - add to secrets/actions
+      COVERITY_PROJECT: ${{ secrets.COVERITY_PROJECT }}
+      COVERITY_TOKEN: ${{ secrets.COVERITY_TOKEN }} # you can get this token from Coverity stream dashboard:
+        # https://scan.coverity.com/projects/<project>?tab=project_settings
+    steps:
+    - uses: actions/checkout@v2
+      with:
+        path: gpt-neox
+    - name: Install utils
+      run: |
+        sudo apt update -y && sudo apt upgrade -y
+        sudo apt install curl jq wget -y
+    - name: Coverity Download
+      run: |
+        wget https://scan.coverity.com/download/linux64 --post-data "token=$COVERITY_TOKEN&project=$COVERITY_PROJECT" -O coverity_tool.tgz --no-verbose
+        mkdir $GITHUB_WORKSPACE/coverity && tar xvf coverity_tool.tgz -C $GITHUB_WORKSPACE/coverity --strip-components=1
+        $GITHUB_WORKSPACE/coverity/bin/cov-configure --python
+        $GITHUB_WORKSPACE/coverity/bin/cov-configure --gcc
+    - name: Coverity Scan and Upload
+      run: |
+        set -x
+        pushd $GITHUB_WORKSPACE
+        cd $GITHUB_WORKSPACE/gpt-neox
+        $GITHUB_WORKSPACE/coverity/bin/cov-build --dir $GITHUB_WORKSPACE/cov-int --no-command --fs-capture-search ./
+        popd
+        tar caf build-results.bz2 cov-int
+        curl --form token=$COVERITY_TOKEN \
+          --form email=$COV_USER \
+          --form file=@build-results.bz2 \
+          --form version="${{ inputs.build_version }}" \
+          --form description="${{ inputs.build_description }}" \
+          https://scan.coverity.com/builds?project=$COVERITY_PROJECT
+    - name: Upload Scan Build as Artifact
+      uses: actions/upload-artifact@v3
+      with:
+        name: coverity-build-${{ github.sha }}
+        path: build-results.bz2

.github/workflows/cpu_ci.yml ADDED Viewed

	@@ -0,0 +1,34 @@

+name: "Run CPU Tests"
+on: "push"
+jobs:
+  run-tests:
+    #runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v3
+      - name: Install Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
+          cache: "pip"
+          cache-dependency-path: "**/requirements*.txt"
+      - name: Upgrade Pip
+        run: python -m pip install --upgrade pip
+      - name: Install Dependencies
+        run: |
+          sudo apt-get install libopenmpi-dev -y
+          pip install torch==1.8.2 torchvision==0.9.2 torchaudio==0.8.2 --extra-index-url https://download.pytorch.org/whl/lts/1.8/cpu
+          pip install -r requirements/requirements.txt
+          pip install -r requirements/requirements-dev.txt
+          pip install -r requirements/requirements-wandb.txt
+      - name: Prepare Data
+        run: python prepare_data.py
+      - name: Run CPU Tests
+        run: PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python pytest tests -m cpu

.github/workflows/cpu_ci_dispatch.yml ADDED Viewed

	@@ -0,0 +1,20 @@

+name: "Workflow Dispatch CPU Tests"
+on:
+  workflow_dispatch:
+    inputs:
+      ref:
+        description: 'Target ref / SHA to run tests against'
+        required: true
+        default: 'main'
+jobs:
+  run-tests:
+    runs-on: ubuntu-22.04
+    steps:
+    - name: Checkout Repository
+      uses: actions/checkout@v4
+    - name: Run CPU tests
+      uses: ./tests/cpu_tests
+      with:
+        target_test_ref: ${{ inputs.ref }}

.github/workflows/docker_build.yml ADDED Viewed

	@@ -0,0 +1,50 @@

+name: docker_build
+on:
+  push:
+    branches:
+      - '**'
+jobs:
+  main:
+    runs-on: ubuntu-latest
+    steps:
+      -
+        name: Checkout
+        uses: actions/checkout@v2
+      -
+        name: Docker meta
+        id: docker_meta
+        uses: crazy-max/ghaction-docker-meta@v1
+        with:
+          images: leogao2/gpt-neox # list of Docker images to use as base name for tags
+          tag-sha: true # add git short SHA as Docker tag
+      -
+        name: Set up QEMU
+        uses: docker/setup-qemu-action@v1
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v1
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v1
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+      -
+        name: Build and push
+        id: docker_build
+        uses: docker/build-push-action@v2
+        with:
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.docker_meta.outputs.tags }}
+          labels: ${{ steps.docker_meta.outputs.labels }}
+      -
+        name: Image digest
+        run: echo ${{ steps.docker_build.outputs.digest }}

.github/workflows/pull_request.yml ADDED Viewed

	@@ -0,0 +1,60 @@

+name: Pull Request
+#on: [pull_request, workflow_dispatch]
+on: workflow_dispatch
+jobs:
+  pre-commit:
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/setup-python@v4
+        with:
+          python-version: "3.10.14"
+          cache: "pip"
+          cache-dependency-path: "**/requirements*.txt"
+      # Need the right version of clang-format
+      - run: pip install -r requirements/requirements-dev.txt
+      - uses: pre-commit/action@v2.0.3
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v1
+      -
+        name: Docker build
+        id: docker_build
+        uses: docker/build-push-action@v2
+  update-documentation:
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          ref: ${{ github.event.pull_request.head.ref}}
+      - run: |
+          rm megatron/__init__.py
+          pip install shortuuid
+          rm megatron/neox_arguments/__init__.py
+          python configs/gen_docs.py
+          git config user.name github-actions
+          git config user.email github-actions@github.com
+          git add configs/neox_arguments.md
+          git commit -m "Update NeoXArgs docs automatically"
+          git push
+  run-tests:
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/setup-python@v4
+        with:
+          python-version: "3.10.13"
+          cache-dependency-path: "**/requirements*.txt"
+      - name: prepare data
+        run: python3 prepare_data.py
+      - name: install pytest
+        run: python3 -m pip install pytest pytest-forked pyyaml requests wandb
+      - name: install torch
+        run: python3 -m pip install torch
+      - name: install requirements
+        run: pip install -r requirements/requirements.txt
+      - name: Run Tests
+        run: pytest --forked tests

.gitignore ADDED Viewed

	@@ -0,0 +1,157 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+.python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# wandb logs
+wandb/
+# data files
+data/**/*.idx
+data/**/*.bin
+data/**/*.json*
+data/**/*.txt
+data/**/*.gz
+data/**/*.zip
+data/**/*.np*
+data/**/*.npy
+checkpoints/
+.vscode/
+*.pt
+*.ckpt
+#test logs
+test_checkpoint/
+test_logs/
+logs/
+tensorboard/
+src/
+# test data files
+tests/data/*.bin
+tests/data/*.idx

.pre-commit-config.yaml ADDED Viewed

	@@ -0,0 +1,40 @@

+repos:
+    - repo: https://github.com/pre-commit/pre-commit-hooks
+      rev: v4.1.0
+      hooks:
+          - id: check-case-conflict
+          - id: check-json
+          - id: check-symlinks
+          - id: check-yaml
+          - id: destroyed-symlinks
+          - id: end-of-file-fixer
+            exclude: ^(docs/CNAME/|configs/neox_arguments.md)
+          - id: fix-byte-order-marker
+          - id: fix-encoding-pragma
+            args: [--remove]
+          - id: mixed-line-ending
+            args: [--fix=lf]
+          - id: requirements-txt-fixer
+          - id: trailing-whitespace
+            exclude: ^(docs/CNAME/|configs/neox_arguments.md)
+    - repo: https://gitlab.com/daverona/pre-commit/cpp
+      rev: 0.8.0
+      hooks:
+          - id: clang-format  # formatter of C/C++ code based on a style guide: LLVM, Google, Chromium, Mozilla, and WebKit available
+            args: []
+    - repo: https://github.com/psf/black
+      rev: 22.3.0
+      hooks:
+          - id: black
+            language_version: python3
+    - repo: https://github.com/codespell-project/codespell
+      rev: v2.1.0
+      hooks:
+      - id: codespell
+        args: [
+              '--ignore-words-list=reord,dout,te',  # Word used in error messages that need rewording. te --> transformerengine
+              --check-filenames,
+              --check-hidden,
+          ]
+        exclude: tests/data/hf_cache/tokenizer/gpt2.json

CITATION.cff ADDED Viewed

	@@ -0,0 +1,79 @@

+# YAML 1.2
+---
+authors:
+  - affiliation: EleutherAI
+    family-names: Andonian
+    given-names: Alex
+  - affiliation: EleutherAI
+    family-names: Anthony
+    given-names: Quentin
+  - affiliation: EleutherAI
+    family-names: Biderman
+    given-names: Stella
+  - affiliation: EleutherAI
+    family-names: Black
+    given-names: Sid
+  - affiliation: EleutherAI
+    family-names: Gali
+    given-names: Preetham
+  - affiliation: EleutherAI
+    family-names: Gao
+    given-names: Leo
+  - affiliation: EleutherAI
+    family-names: Hallahan
+    given-names: Eric
+  - affiliation: EleutherAI
+    family-names: Levy-Kramer
+    given-names: Josh
+  - affiliation: EleutherAI
+    family-names: Leahy
+    given-names: Connor
+  - affiliation: EleutherAI
+    family-names: Nestler
+    given-names: Lucas
+  - affiliation: EleutherAI
+    family-names: Parker
+    given-names: Kip
+  - affiliation: EleutherAI
+    family-names: Pieler
+    given-names: Michael
+  - affiliation: EleutherAI
+    family-names: Phang
+    given-names: Jason
+  - affiliation: EleutherAI
+    family-names: Purohit
+    given-names: Shivanshu
+  - affiliation: EleutherAI
+    family-names: Schoelkopf
+    given-names: Hailey
+  - affiliation: EleutherAI
+    family-names: Stander
+    given-names: Dashiell
+  - affiliation: EleutherAI
+    family-names: Songz
+    given-names: Tri
+  - affiliation: EleutherAI
+    family-names: Tigges
+    given-names: Curt
+  - affiliation: EleutherAI
+    family-names: Thérien
+    given-names: Benjamin
+  - affiliation: EleutherAI
+    family-names: Wang
+    given-names: Phil
+  - affiliation: EleutherAI
+    family-names: Weinbach
+    given-names: Samuel
+cff-version: "1.1.0"
+keywords:
+  - "Transformers"
+  - "Massive language model"
+  - "Autoregressive language model"
+license: "Apache-2.0"
+message: "If you use this software, please cite it using these metadata."
+repository-code: "https://www.github.com/eleutherai/gpt-neox"
+title: "GPT-NeoX: Large Scale Autoregressive Language Modeling in PyTorch"
+version: "2.0.0"
+doi: "10.5281/zenodo.5879544"
+date-released: 2021-08-23
+...

CONTRIBUTING.md ADDED Viewed

	@@ -0,0 +1,86 @@

+# Contributing
+GPT-NeoX welcomes your contributions!
+## Prerequisites
+GPT-NeoX uses [pre-commit](https://pre-commit.com/) to ensure that formatting is
+consistent across GPT-NeoX. First, ensure that `pre-commit` is installed with
+`pip install pre-commit`. Next, the pre-commit hooks must be installed once
+before commits can be made:
+```bash
+pre-commit install
+```
+Please install `clang-format` from Conda:
+```bash
+conda install clang-format
+```
+Afterwards, our suite of formatting tests runs automatically before each `git commit`. You
+can also run these manually:
+```bash
+pre-commit run --all-files
+```
+If a formatting test fails, it will fix the modified code in place and abort
+the `git commit`. After looking over the changes, you can `git add <modified files>`
+and then repeat the previous `git commit` command.
+## Testing
+GPT-NeoX tracks two types of tests: unit tests and more costly model convergence tests.
+Unit tests are found in `tests/unit/` and the model convergence tests are found in
+`tests/model/`.
+### Unit Tests
+[PyTest](https://docs.pytest.org/en/latest/) is used to execute tests. PyTest can be
+installed from PyPI via `pip install pytest`. Simply invoke `pytest --forked` to run the
+unit tests:
+```bash
+pytest --forked tests/unit/
+```
+You can also provide the `-v` flag to `pytest` to see additional information about the
+tests. Note that [pytest-forked](https://github.com/pytest-dev/pytest-forked) and the
+`--forked` flag are required to test CUDA functionality in distributed tests.
+### Model Tests
+To execute model tests, first install GPT-NeoX. Next, execute the model test driver:
+```bash
+cd tests/model/
+pytest run_sanity_check.py
+```
+Note that the `--forked` flag is not necessary for the model tests.
+## Contributor License Agreement
+This project welcomes contributions and suggestions. Most contributions require you to
+agree to a Contributor License Agreement (CLA) declaring that you have the right to, and
+actually do, grant us the rights to use your contribution. For details, visit
+https://cla-assistant.io/EleutherAI/gpt-neox.
+When you submit a pull request, a CLA bot will automatically determine whether you need
+to provide a CLA and decorate the PR appropriately (e.g., status check, comment). Simply
+follow the instructions provided by the bot. You will only need to do this once across
+all repos using our CLA.
+## New Feature Contribution Guidelines
+Unlike bug fixes or improvements to existing features (where users usually submit a PR directly and we review it), adding a new feature to GPT-NeoX requires several steps: (1) proposal and discussion, (2) implementation and verification, (3) release and maintenance. This general guideline applies to all new feature contributions. Core GPT-NeoX team member contributions may complete step 1 internally.
+### Step 1: Proposal and Discussion
+We ask users to first post their intended feature in an issue. This issue needs to include:
+* A description of the proposed feature.
+* A motivation of why it will be useful to GPT-NeoX users.
+* A rough design of how you implement the feature inside GPT-NeoX.
+* (Important) Results or planned experiments to demonstrate the effectiveness and correctness of the feature.
+  * If the feature only affects performance and does not affect training convergence, we require testing on a fraction of training to demonstrate that the training/validation loss are consistent with baseline, and that the performance is better than baseline.
+  * If the feature does affect training convergence, we require testing the whole training to demonstrate that the feature achieves better/on-par final model quality and training performance compared to baseline.
+Based on the issue we shall discuss the merit of the new feature and decide whether to accept or decline the proposal. Once accepted and after we confirm the design and implementation plan, we are ready for step 2.
+### Step 2: Implementation and Verification
+The contributor will proceed and implement the feature, and the GPT-NeoX team will provide guidance/help as needed. The required deliverables include:
+* A PR to [EleutherAI/GPT-NeoX](https://github.com/EleutherAI/gpt-neox) including (1) the feature implementation (2) unit tests (3) documentation (4) example usage.
+* In the implementation (code, documentation, tutorial), we require the feature author to record their GitHub username as a contact method for future questions/maintenance.
+After receiving the PRs, we will review them and merge them after necessary tests/fixes.
+### Step 3: Release and Maintenance
+After the PRs are merged, we will announce the feature on our website (with credit to the feature author). We ask the feature author to commit to the maintenance of the feature.

Dockerfile ADDED Viewed

	@@ -0,0 +1,90 @@

+# Copyright (c) 2024, EleutherAI
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+FROM nvcr.io/nvidia/pytorch:24.02-py3
+ENV DEBIAN_FRONTEND=noninteractive
+# metainformation
+LABEL org.opencontainers.image.version = "2.0"
+LABEL org.opencontainers.image.authors = "contact@eleuther.ai"
+LABEL org.opencontainers.image.source = "https://www.github.com/eleutherai/gpt-neox"
+LABEL org.opencontainers.image.licenses = " Apache-2.0"
+LABEL org.opencontainers.image.base.name="nvcr.io/nvidia/pytorch:24.02-py3"
+#### System package (uses default Python 3 version in Ubuntu 22.04, the base OS of the 24.02 image)
+RUN apt-get update -y && \
+    apt-get install -y \
+    python3-pip sudo pdsh \
+    htop tmux zstd software-properties-common \
+    nfs-common pdsh cmake htop iftop iotop ssh \
+    iputils-ping net-tools libcupti-dev libmlx4-1 infiniband-diags ibutils \
+    rdmacm-utils perftest rdma-core && \
+    update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \
+    update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \
+    python -m pip install --upgrade pip && \
+    python -m pip install gpustat
+### SSH
+RUN mkdir /var/run/sshd && \
+    # Prevent user being kicked off after login
+    sed -i 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' /etc/pam.d/sshd && \
+    echo 'AuthorizedKeysFile     .ssh/authorized_keys' >> /etc/ssh/sshd_config && \
+    echo 'PasswordAuthentication yes' >> /etc/ssh/sshd_config && \
+    # FIX SUDO BUG: https://github.com/sudo-project/sudo/issues/42
+    echo "Set disable_coredump false" >> /etc/sudo.conf
+# Expose SSH port
+EXPOSE 22
+# Needs to be in docker PATH if compiling other items & bashrc PATH (later)
+ENV PATH=/usr/local/mpi/bin:${PATH} \
+    LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
+# Create a wrapper for OpenMPI to allow running as root by default
+RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
+    echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \
+    echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \
+    chmod a+x /usr/local/mpi/bin/mpirun
+#### User account
+RUN useradd --create-home --uid 1000 --shell /bin/bash mchorse && \
+    usermod -aG sudo mchorse && \
+    echo "mchorse ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
+## SSH config and bashrc
+RUN mkdir -p /home/mchorse/.ssh /job && \
+    echo 'Host *' > /home/mchorse/.ssh/config && \
+    echo '    StrictHostKeyChecking no' >> /home/mchorse/.ssh/config && \
+    echo 'export PDSH_RCMD_TYPE=ssh' >> /home/mchorse/.bashrc && \
+    echo 'export PATH=/home/mchorse/.local/bin:$PATH' >> /home/mchorse/.bashrc && \
+    echo 'export PATH=/usr/local/mpi/bin:$PATH' >> /home/mchorse/.bashrc && \
+    echo 'export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:$LD_LIBRARY_PATH' >> /home/mchorse/.bashrc
+#### Python packages
+COPY requirements/* ./
+RUN python -m pip install --no-cache-dir -r requirements.txt && pip install -r requirements-onebitadam.txt
+RUN python -m pip install -r requirements-wandb.txt
+RUN python -m pip install protobuf==3.20.*
+COPY megatron/fused_kernels/ /megatron/fused_kernels
+WORKDIR /megatron/fused_kernels
+RUN python setup.py install
+# Clear staging
+RUN mkdir -p /tmp && chmod 0777 /tmp
+#### SWITCH TO mchorse USER
+USER mchorse
+WORKDIR /home/mchorse

LICENSE ADDED Viewed

	@@ -0,0 +1,467 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+--
+This repository also contains code from Hugging Face Inc., Google Research,
+and Facebook (from their Fairseq project). Files from these
+organizations have notices at the top of each file. Below are licenses
+used in those files, as indicated.
+------------- LICENSE FOR NVIDIA code  --------------
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+------------- LICENSE FOR huggingface and Google Research code  --------------
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+------------- LICENSE FOR Facebook Fairseq code --------------
+MIT License
+Copyright (c) Facebook, Inc. and its affiliates.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

MANIFEST.in ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ include megatron/data/Makefile
2	+ include megatron/data/helpers.cpp

README-MUP.md ADDED Viewed

	@@ -0,0 +1,49 @@

+# How to use Mup (https://github.com/microsoft/mup)
+## Add mup neox args to your config
+```
+# mup
+"use-mup": true,
+"save-base-shapes": false, # this only needs to be enabled once in order to generate the base-shapes-file on each rank
+"base-shapes-file": "base-shapes", # load base shapes from this file
+"coord-check": false, # generate coord check plots to verify mup's implementation in neox
+# mup hp search
+"mup-init-scale": 1.0,
+"mup-attn-temp": 1.0,
+"mup-output-temp": 1.0,
+"mup-embedding-mult": 1.0,
+"mup-rp-embedding-mult": 1.0,
+```
+## Generate base shapes
+1. Set use-mup to true
+2. Set save-base-shapes to true
+3. Run once. gpt-neox will instantiate a base model and a delta model, then save one file per rank named <base-shapes-file>.<rank>. gpt-neox will exit immediately.
+4. Set save-base-shapes to false
+## Generate coord check plots (optional)
+1. Keep use-mup true
+2. Set coord-check to true
+3. Run once. gpt-neox will output jpg images similar to https://github.com/microsoft/mutransformers/blob/main/README.md#coord-check. gpt-neox will exit immediately.
+4. Set coord-check to false
+## Tune mup hyperparameters and LR
+The values under `mup hp search` were added and correspond to appendix F.4 from https://arxiv.org/pdf/2203.03466.pdf. These and LR are tuned with a random search using the scaled-up config (tested with 6-7B.yml) but with hidden-size set to the value from the scaled-down config (125M.yml).
+## Transfer
+With the best LR set and the best mup HPs set, revert the value of hidden-size in the scaled-up config and run again.

README.md ADDED Viewed

	@@ -0,0 +1,863 @@

+[![GitHub issues](https://img.shields.io/github/issues/EleutherAI/gpt-neox)](https://github.com/EleutherAI/gpt-neox/issues)
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Weights & Biases monitoring" height=20>](https://wandb.ai/eleutherai/neox)
+# GPT-NeoX
+This repository records [EleutherAI](https://www.eleuther.ai)'s library for training large-scale language models on GPUs. Our current framework is based on NVIDIA's [Megatron Language Model](https://github.com/NVIDIA/Megatron-LM) and has been augmented with techniques from [DeepSpeed](https://www.deepspeed.ai) as well as some novel optimizations. We aim to make this repo a centralized and accessible place to gather techniques for training large-scale autoregressive language models, and accelerate research into large-scale training. This library is in widespread use in [academic, industry, and government labs](https://github.com/EleutherAI/gpt-neox#adoption-and-publications), including by researchers at Oak Ridge National Lab, CarperAI, Stability AI, Together.ai, Korea University, Carnegie Mellon University, and the University of Tokyo among others. Uniquely among similar libraries GPT-NeoX supports a wide variety of systems and hardwares, including launching via Slurm, MPI, and the IBM Job Step Manager, and has been run at scale on [AWS](https://aws.amazon.com/), [CoreWeave](https://www.coreweave.com/), [ORNL Summit](https://www.olcf.ornl.gov/summit/), [ORNL Frontier](https://www.olcf.ornl.gov/frontier/),  [LUMI](https://www.lumi-supercomputer.eu/), and others.
+**If you are not looking to train models with billions of parameters from scratch, this is likely the wrong library to use. For generic inference needs, we recommend you use the Hugging Face `transformers` library instead which supports GPT-NeoX models.**
+## Why GPT-NeoX?
+GPT-NeoX leverages many of the same features and technologies as the popular Megatron-DeepSpeed library but with substantially increased usability and novel optimizations. Major features include:
+* Distributed training with ZeRO and 3D parallelism
+* Support for a wide variety of systems and hardware, including launching via Slurm, MPI, and the IBM Job Step Manager. GPT-NeoX has been run at scale on [AWS](https://aws.amazon.com/), [CoreWeave](https://www.coreweave.com/), Oak Ridge's [Summit](https://www.olcf.ornl.gov/summit/) and [Frontier](https://www.olcf.ornl.gov/frontier/), [Pacific Northwest National Laboratory](https://hpc.pnl.gov/index.shtml), Argonne's [Polaris](https://docs.alcf.anl.gov/polaris/data-science-workflows/applications/gpt-neox/), [LUMI](https://www.lumi-supercomputer.eu/), and more.
+* Cutting edge architectural innovations including rotary and alibi positional embeddings, parallel feedforward attention layers, and flash attention.
+* Predefined configurations for popular architectures including Pythia, PaLM, Falcon, and LLaMA 1 \& 2
+* Curriculum Learning
+* Easy connections with the open source ecosystem, including Hugging Face's [tokenizers](https://github.com/huggingface/tokenizers) and [transformers](https://github.com/huggingface/transformers/) libraries, experiment monitoring via [WandB](https://wandb.ai/site)/[Comet](https://www.comet.com/site/)/TensorBoard, and evaluation via our [Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness).
+## News
+**[9/9/2024]** We now support preference learning via [DPO](https://arxiv.org/abs/2305.18290), [KTO](https://arxiv.org/abs/2402.01306), and reward modeling
+**[9/9/2024]** We now support integration with [Comet ML](https://www.comet.com/site/), a machine learning monitoring platform
+**[5/21/2024]** We now support [RWKV](https://www.rwkv.com/) with pipeline parallelism! See the PRs for [RWKV](https://github.com/EleutherAI/gpt-neox/pull/1198) and [RWKV+pipeline](https://github.com/EleutherAI/gpt-neox/pull/1221)
+**[3/21/2024]** We now support Mixture-of-Experts (MoE)
+**[3/17/2024]** We now support AMD MI250X GPUs
+**[3/15/2024]** We now support [Mamba](https://github.com/state-spaces/mamba) with tensor parallelism! See [the PR](https://github.com/EleutherAI/gpt-neox/pull/1184)
+**[8/10/2023]** We now support checkpointing with AWS S3! Activate with the `s3_path` config option (for more detail, see [the PR](https://github.com/EleutherAI/gpt-neox/pull/1010))
+**[9/20/2023]** As of https://github.com/EleutherAI/gpt-neox/pull/1035, we have deprecated Flash Attention 0.x and 1.x, and migrated support to Flash Attention 2.x. We don't believe this will cause problems, but if you have a specific use-case that requires old flash support using the latest GPT-NeoX, please raise an issue.
+**[8/10/2023]** We have experimental support for LLaMA 2 and Flash Attention v2 in our [math-lm](https://github.com/EleutherAI/math-lm) project, which will be upstreamed later this month.
+**[5/17/2023]** After fixing some miscellaneous bugs we now fully support bf16.
+**[4/11/2023]** We have upgraded our Flash Attention implementation to now support Alibi positional embeddings.
+**[3/9/2023]** We have released GPT-NeoX 2.0.0, an upgraded version built on the latest DeepSpeed, with which it will be regularly synced going forward.
+## Versions
+Prior to 3/9/2023, GPT-NeoX relied on [DeeperSpeed](https://github.com/EleutherAI/DeeperSpeed), which was based on an old version of DeepSpeed (0.3.15). In order to migrate to the latest upstream DeepSpeed version while allowing users to access the old versions of GPT-NeoX and DeeperSpeed, we have introduced two versioned releases for both libraries:
+- Version 2.0 of [GPT-NeoX](https://github.com/EleutherAI/gpt-neox/releases/tag/v2.0) and [DeeperSpeed](https://github.com/EleutherAI/DeeperSpeed/releases/tag/v2.0) are the latest versions built on the latest DeepSpeed, and will be maintained going forward.
+- Version 1.0 of [GPT-NeoX](https://github.com/EleutherAI/gpt-neox/releases/tag/v1.0) and [DeeperSpeed](https://github.com/EleutherAI/DeeperSpeed/releases/tag/v1.0) maintain snapshots of the old stable versions that [GPT-NeoX-20B](https://arxiv.org/abs/2204.06745) and the [Pythia Suite](https://github.com/EleutherAI/pythia) were trained on.
+# Contents
+- [GPT-NeoX](#gpt-neox)
+  * [Why GPT-NeoX?](#why-gpt-neox)
+  * [News](#news)
+  * [Versions](#versions)
+- [Contents](#contents)
+- [Quick Start](#quick-start)
+  * [Environment and Dependencies](#environment-and-dependencies)
+    + [Host Setup](#host-setup)
+    + [Flash Attention](#flash-attention)
+    + [Multi-Node Launching](#multi-node-launching)
+    + [Containerized Setup](#containerized-setup)
+  * [Usage](#usage)
+- [Configuration](#configuration)
+    * [Mixture of Experts](#mixture-of-experts)
+- [Datasets](#datasets)
+  * [Preconfigured Datasets](#preconfigured-datasets)
+  * [Using Custom Data](#using-custom-data)
+- [Training and Finetuning](#training-and-finetuning)
+  * [Pretrained Models](#pretrained-models)
+    + [GPT-NeoX-20B](#gpt-neox-20b)
+    + [Pythia](#pythia)
+    + [Polyglot](#polyglot)
+- [Inference](#inference)
+- [Evaluation](#evaluation)
+- [Exporting to Hugging Face](#exporting-to-hugging-face)
+- [Monitoring](#monitoring)
+  * [Weights and Biases](#weights-and-biases)
+  * [TensorBoard](#tensorboard)
+- [Running on multi-node](#running-on-multi-node)
+- [Profiling](#profiling)
+- [Adoption and Publications](#adoption-and-publications)
+  * [Publications](#publications)
+  * [Models](#models)
+    + [English LLMs](#english-llms)
+    + [Non-English LLMs](#non-english-llms)
+    + [Code Models](#code-models)
+    + [Other Modalities](#other-modalities)
+- [Administrative Notes](#administrative-notes)
+  * [Citing GPT-NeoX](#citing-gpt-neox)
+  * [Contributing](#contributing)
+  * [Licensing](#licensing)
+  * [Acknowledgements](#acknowledgements)
+# Quick Start
+## Environment and Dependencies
+### Host Setup
+First make sure you are in an environment with Python 3.8 with an appropriate version of PyTorch 1.8 or later installed. **Note:** Some of the libraries that GPT-NeoX depends on have not been updated to be compatible with Python 3.10+. Python 3.9 appears to work, but this codebase has been developed and tested for Python 3.8.
+To install the remaining basic dependencies, run:
+```bash
+pip install -r requirements/requirements.txt
+pip install -r requirements/requirements-wandb.txt # optional, if logging using WandB
+pip install -r requirements/requirements-tensorboard.txt # optional, if logging via tensorboard
+pip install -r requirements/requirements-comet.txt # optional, if logging via Comet
+```
+from the repository root.
+> [!Warning]
+> Our codebase relies on [DeeperSpeed](https://github.com/EleutherAI/DeeperSpeed), our fork of the [DeepSpeed](https://github.com/microsoft/DeepSpeed) library with some added changes. We strongly recommend using Anaconda, a virtual machine, or some other form of environment isolation before continuing. Failure to do so may cause other repositories that rely on DeepSpeed to break.
+
+### Fused Kernels
+We now support AMD GPUs (MI100, MI250X) through JIT fused-kernel compilation. Fused kernels will be built and loaded as needed. To avoid waiting during job launching, you can also do the following for manual pre-build:
+```python
+python
+from megatron.fused_kernels import load
+load()
+```
+This will automatically adapt the build process to different GPU vendors (AMD, NVIDIA) without platform-specific code changes. To further test fused kernels using `pytest`, use `pytest tests/model/test_fused_kernels.py`.
+### Flash Attention
+To use [Flash-Attention](https://github.com/HazyResearch/flash-attention), install the additional dependencies in  `./requirements/requirements-flashattention.txt` and set the attention type in your configuration accordingly (see [configs](./configs/)). This can provide significant speed-ups over regular attention on certain GPU architectures, including Ampere GPUs (such as A100s); see the repository for more details.
+### Multi-Node Launching
+NeoX and Deep(er)Speed support training on multiple different nodes and you have the option of using a variety of different launchers to orchestrate multi-node jobs.
+In general there needs to be a "hostfile" somewhere accessible with the format:
+```bash
+node1_ip slots=8
+node2_ip slots=8
+```
+where the first column contains the IP address for each node in your setup and the number of slots is the number of GPUs that node has access to. In your config you must pass in the path to the hostfile with `"hostfile": "/path/to/hostfile"`. Alternatively the path to the hostfile can be in the environment variable `DLTS_HOSTFILE`.
+#### pdsh
+`pdsh` is the default launcher, and if you're using `pdsh` then all you must do (besides ensuring that pdsh is installed in your environment) is set `{"launcher": "pdsh"}` in your config files.
+#### MPI
+If using MPI then you must specify the MPI library (DeepSpeed/GPT-NeoX currently supports `mvapich`, `openmpi`, `mpich`, and `impi`, though `openmpi` is the most commonly used and tested) as well as pass the `deepspeed_mpi` flag in your config file:
+```json
+{
+    "launcher": "openmpi",
+    "deepspeed_mpi": true
+}
+```
+With your environment properly set up and the correct configuration files you can use `deepy.py` like a normal python script and start (for example) a training job with:
+`python3 deepy.py train.py /path/to/configs/my_model.yml`
+#### Slurm
+Using Slurm can be slightly more involved. Like with MPI, you must add the following to your config:
+```json
+{
+    "launcher": "slurm",
+    "deepspeed_slurm": true
+}
+```
+If you do not have ssh access to the compute nodes in your Slurm cluster, you need to add `{"no_ssh_check": true}` to your config.
+#### (Advanced) Custom Launching
+There are many cases where the above default launching options are not sufficient:
+- Many clusters have their own unique job scheduler or specific MPI/Slurm arguments necessary for launching jobs such as [Summit JSRun](https://docs.olcf.ornl.gov/systems/summit_user_guide.html#job-launcher-jsrun) or [LLNL Flux](https://computing.llnl.gov/projects/flux-building-framework-resource-management)
+- While the above Slurm/MPI/pdsh default options are enough for most job runs, advanced users may want to add arguments for optimization or debugging purposes
+In these cases, you will need to modify the DeepSpeed [multinode runner](https://github.com/microsoft/DeepSpeed/blob/17957728c0362bf8ae70feca308e491e55ef9feb/deepspeed/launcher/multinode_runner.py) utility to support your use case. Broadly, these enhancements fall under two categories:
+##### 1. Adding a Launcher (e.g. [JSRun](https://docs.olcf.ornl.gov/systems/summit_user_guide.html#job-launcher-jsrun), [Flux](https://computing.llnl.gov/projects/flux-building-framework-resource-management), etc)
+In this case, you must add a new multinode runner class to `deepspeed/launcher/multinode_runner.py` and expose it as a configuration option in GPT-NeoX. Examples on how we did this for [Summit JSRun](https://docs.olcf.ornl.gov/systems/summit_user_guide.html#job-launcher-jsrun) are in [this DeeperSpeed commit](https://github.com/EleutherAI/DeeperSpeed/commit/9aed6c8500d7c492d85c5c88687322dbda70e370) and [this GPT-NeoX commit](https://github.com/EleutherAI/gpt-neox/commit/3782c7ae60f8624e566e3879b89bb09e8b59b869), respectively.
+##### 2. Modifying Run Command or Environment Variables
+We have encountered many cases where we wish to modify the MPI/Slurm run command for an optimization or to debug (e.g. to modify the [Slurm srun CPU binding](https://slurm.schedmd.com/srun.html#OPT_cpu-bind) or to tag MPI logs with the rank). In this case, you must modify the multinode runner class' run command under its `get_cmd` method (e.g. [mpirun_cmd](https://github.com/microsoft/DeepSpeed/blob/17957728c0362bf8ae70feca308e491e55ef9feb/deepspeed/launcher/multinode_runner.py#L135-L147) for OpenMPI). Examples on how we did this to provide optimized and rank-tagged run commands using Slurm and OpenMPI for the Stability cluster are in [this DeeperSpeed branch](https://github.com/microsoft/DeepSpeed/compare/master...EleutherAI:DeeperSpeed:v2.0-stability)
+#### Hostfile Generation
+In general you will not be able to have a single fixed hostfile, so you need to have a script to generate one dynamically when your job starts. An example script to dynamically generate a hostfile using [Slurm](https://slurm.schedmd.com/documentation.html) and 8 GPUs per node is:
+```bash
+#!/bin/bash
+GPUS_PER_NODE=8
+mkdir -p /sample/path/to/hostfiles
+# need to add the current slurm jobid to hostfile name so that we don't add to previous hostfile
+hostfile=/sample/path/to/hostfiles/hosts_$SLURM_JOBID
+# be extra sure we aren't appending to a previous hostfile
+rm $hostfile &> /dev/null
+# loop over the node names
+for i in `scontrol show hostnames $SLURM_NODELIST`
+do
+    # add a line to the hostfile
+    echo $i slots=$GPUS_PER_NODE >>$hostfile
+done
+```
+`$SLURM_JOBID` and `$SLURM_NODELIST` are environment variables that Slurm will create for you. See the [sbatch documentation](https://slurm.schedmd.com/sbatch.html#SECTION_OUTPUT-ENVIRONMENT-VARIABLES) for a full list of available Slurm environment variables set at job creation time.
+#### Job Launching
+Then you can create an [sbatch](https://slurm.schedmd.com/sbatch.html) script from which to kick off your GPT-NeoX job. A bare-bones sbatch script on a Slurm-based cluster with 8 GPUs per node would look like this:
+```bash
+#!/bin/bash
+#SBATCH --job-name="neox"
+#SBATCH --partition=your-partition
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=8
+#SBATCH --gres=gpu:8
+# Some potentially useful distributed environment variables
+export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"`
+export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
+export MASTER_PORT=12802
+export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l`
+# Your hostfile creation script from above
+./write_hostfile.sh
+# Tell DeepSpeed where to find our generated hostfile via DLTS_HOSTFILE
+export DLTS_HOSTFILE=/sample/path/to/hostfiles/hosts_$SLURM_JOBID
+# Launch training
+python3 deepy.py train.py /sample/path/to/your/configs/my_model.yml
+```
+You can then kick off a training run with `sbatch my_sbatch_script.sh`
+### Containerized Setup
+We also provide a Dockerfile and docker-compose configuration if you prefer to run NeoX in a container.
+Requirements to run the container are to have appropriate GPU drivers, an up-to-date installation of Docker, and [nvidia-container-toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) installed. To test if your installation is good you can use their "sample workload", which is:
+```
+docker run --rm --runtime=nvidia --gpus all ubuntu nvidia-smi
+```
+Provided that will run, you need to export NEOX_DATA_PATH and NEOX_CHECKPOINT_PATH in your environment to specify your data directory and directory for storing and loading checkpoints:
+```
+export NEOX_DATA_PATH=/mnt/sda/data/enwiki8 #or wherever your data is stored on your system
+export NEOX_CHECKPOINT_PATH=/mnt/sda/checkpoints
+```
+And then, from the gpt-neox directory, you can build the image and run a shell in a container with
+```
+docker compose run gpt-neox bash
+```
+After the build, you should be able to do this:
+```
+mchorse@537851ed67de:~$ echo $(pwd)
+/home/mchorse
+mchorse@537851ed67de:~$ ls -al
+total 48
+drwxr-xr-x  1 mchorse mchorse 4096 Jan  8 05:33 .
+drwxr-xr-x  1 root    root    4096 Jan  8 04:09 ..
+-rw-r--r--  1 mchorse mchorse  220 Feb 25  2020 .bash_logout
+-rw-r--r--  1 mchorse mchorse 3972 Jan  8 04:09 .bashrc
+drwxr-xr-x  4 mchorse mchorse 4096 Jan  8 05:35 .cache
+drwx------  3 mchorse mchorse 4096 Jan  8 05:33 .nv
+-rw-r--r--  1 mchorse mchorse  807 Feb 25  2020 .profile
+drwxr-xr-x  2 root    root    4096 Jan  8 04:09 .ssh
+drwxrwxr-x  8 mchorse mchorse 4096 Jan  8 05:35 chk
+drwxrwxrwx  6 root    root    4096 Jan  7 17:02 data
+drwxr-xr-x 11 mchorse mchorse 4096 Jan  8 03:52 gpt-neox
+```
+For a long-running job, you should run
+```
+docker compose up -d
+```
+to run the container in detached mode, and then, in a separate terminal session, run
+```
+docker compose exec gpt-neox bash
+```
+You can then run any job you want from inside the container.
+Concerns when running for a long time or in detached mode include
+ - You will have to terminate the container manually when you are no longer using it
+ - If you want processes to continue running when your shell session ends, you will need to background them.
+ - If you then want logging, you will have to make sure to pipe logs to disk, and set up wandb and/or Comet logging.
+If you prefer to run the prebuilt container image from dockerhub, you can run the docker compose commands with ```-f docker-compose-dockerhub.yml``` instead, e.g.,
+```
+docker compose -f docker-compose-dockerhub.yml run gpt-neox bash
+```
+## Usage
+All functionality should be launched using `deepy.py`, a wrapper around the `deepspeed` launcher.
+We currently offer three main functions:
+1. `train.py` is used for training and finetuning models.
+2. `eval.py` is used to evaluate a trained model using the [language model evaluation harness](https://github.com/EleutherAI/lm-evaluation-harness).
+3. `generate.py` is used to sample text from a trained model.
+which can be launched with:
+```bash
+./deepy.py [script.py] [./path/to/config_1.yml] [./path/to/config_2.yml] ... [./path/to/config_n.yml]
+```
+For example, to launch training you can run
+```bash
+./deepy.py train.py ./configs/20B.yml ./configs/local_cluster.yml
+```
+For more details on each entry point, see the [Training and Finetuning](#training-and-finetuning), [Inference](#inference), and [Evaluation](#evaluation) sections, respectively.
+# Configuration
+GPT-NeoX parameters are defined in a YAML configuration file which is passed to the deepy.py launcher. We have provided some example .yml files in [configs](./configs/), showing a diverse array of features and model sizes.
+These files are generally complete, but non-optimal. For example, depending on your specific GPU configuration, you may need to change some settings such as `pipe-parallel-size`, `model-parallel-size` to increase or decrease the degree of parallelisation, `train_micro_batch_size_per_gpu` or `gradient-accumulation-steps` to modify batch size related settings, or the `zero_optimization` dict to modify how optimizer states are parallelised across workers.
+For a more detailed guide to the features available and how to configure them, see [the configuration README](configs/README.md), and for documentation of every possible argument, see [configs/neox_arguments.md](configs/neox_arguments.md).
+## Mixture of Experts
+GPT-NeoX includes multiple expert implementations for MoE. To select between them, specify `moe_type` of `megablocks` (default) or `deepspeed`.
+Both are based on the DeepSpeed MoE parallelism framework, which supports tensor-expert-data parallelism.
+Both allow you to toggle between token-dropping and dropless (default, and this is what Megablocks was designed for).
+Sinkhorn routing to come soon!
+For an example of a basic complete configuration, see configs/125M-dmoe.yml (for Megablocks dropless) or configs/125M-moe.yml.
+Most MoE related configuration arguments are prefixed with `moe`. Some common configuration parameters and their defaults are as follows:
+```
+moe_type: megablocks
+moe_num_experts: 1 # 1 disables MoE. 8 is a reasonable value.
+moe_loss_coeff: 0.1
+expert_interval: 2 # See details below
+enable_expert_tensor_parallelism: false # See details below
+moe_expert_parallel_size: 1 # See details below
+moe_token_dropping: false
+```
+DeepSpeed can be further configured with the following:
+```
+moe_top_k: 1
+moe_min_capacity: 4
+moe_train_capacity_factor: 1.0 # Setting to 1.0
+moe_eval_capacity_factor: 1.0 # Setting to 1.0
+```
+One MoE layer is present every `expert_interval` transformer layers including the first, so with 12 layers total:
+```
+0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+```
+Experts would be in these layers:
+```
+0, 2, 4, 6, 8, 10
+```
+By default, we use expert-data parallelism, so any available tensor parallelism (`model_parallel_size`) will be used for expert routing. For instance, given the following:
+```
+expert_parallel_size: 4
+model_parallel_size: 2 # aka tensor parallelism
+```
+With 32 GPUs, the behavior will look like:
+- In non-expert layers:
+  - Tensor parallelism is 2. (There are 32 / 2 = 16 such tensor parallel groups, each of size 2.)
+  - Data parallelism implicitly becomes 32 / 2 = 16.
+- In expert layers:
+  - There is no tensor parallelism.
+  - Expert parallelism is 4. (There are 32 / 4 = 8 expert parallel groups, each of size 4.)
+  - Data parallelism implicitly becomes 32 / 4 = 8.  Some cross-node token routing happens as a result of this redivision of data parallelism between 16 and 8.  To avoid it, ensure that `expert_parallel_size == model_parallel_size`.
+Setting `enable_expert_tensor_parallelism` enables tensor-expert-data (TED) parallelism. The way to interpret the above would then be:
+- In non-expert layers: same as before.
+- In expert layers:
+  - Tensor parallelism is 2. (There are 32 / 2 = 16 tensor parallel groups, each of size 2.)
+  - Expert parallelism is 4. (There are 32 / 4 = 8 expert parallel groups, each of size 4.)
+  - Data parallelism implicitly becomes 32 / (2 * 4) = 4.  Again, cross-node token routing happens.  To avoid it, ensure `expert_parallel_size == 1` or `model_parallel_size == 1`.
+So note that DP must be divisible by (MP * EP).  For more details, see the [TED paper].
+Pipeline parallelism is not yet supported - coming soon!
+[TED paper]: https://arxiv.org/abs/2303.06318
+# Datasets
+## Preconfigured Datasets
+Several preconfigured datasets are available, including most components from [the Pile](https://arxiv.org/abs/2101.00027), as well as the Pile train set itself, for straightforward tokenization using the `prepare_data.py` entry point.
+E.g., to download and tokenize the enwik8 dataset with the GPT2 Tokenizer, saving the output to `./data`, you can run:
+```
+python prepare_data.py -d ./data
+```
+or a single shard of the pile (`pile_subset`) with the GPT-NeoX-20B tokenizer (assuming you have it saved at `./20B_checkpoints/20B_tokenizer.json`):
+```
+python prepare_data.py -d ./data -t HFTokenizer --vocab-file ./20B_checkpoints/20B_tokenizer.json pile_subset
+```
+The tokenized data will be saved out to two files: `[data-dir]/[dataset-name]/[dataset-name]_text_document.bin` and `[data-dir]/[dataset-name]/[dataset-name]_text_document.idx`. You will need to add the prefix that both these files share to your training configuration file under the `data-path` field. E.g.:
+```yaml
+  "data-path": "./data/enwik8/enwik8_text_document",
+```
+## Using Custom Data
+To prepare your own dataset for training with custom data, format it as one large [jsonl](https://jsonlines.org/)-formatted file with each item in the list of dictionaries being a separate document. The document text should be grouped under one JSON key, i.e. `"text"`. Any auxiliary data stored in other fields will not be used.
+Next, make sure to download the GPT2 tokenizer vocab and merge files from the following links:
+- Vocab: https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
+- Merge: https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
+Or use the 20B tokenizer (for which only a single Vocab file is needed):
+- Vocab: https://the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights/20B_tokenizer.json
+(alternatively, you can provide any tokenizer file that can be loaded by Hugging Face's tokenizers library with the `Tokenizer.from_pretrained()` command)
+You can now pretokenize your data using `tools/datasets/preprocess_data.py`, the arguments for which are detailed below:
+```
+usage: preprocess_data.py [-h] --input INPUT [--jsonl-keys JSONL_KEYS [JSONL_KEYS ...]] [--num-docs NUM_DOCS] --tokenizer-type {HFGPT2Tokenizer,HFTokenizer,GPT2BPETokenizer,CharLevelTokenizer} [--vocab-file VOCAB_FILE] [--merge-file MERGE_FILE] [--append-eod] [--ftfy] --output-prefix OUTPUT_PREFIX
+                          [--dataset-impl {lazy,cached,mmap}] [--workers WORKERS] [--log-interval LOG_INTERVAL]
+optional arguments:
+  -h, --help            show this help message and exit
+input data:
+  --input INPUT         Path to input jsonl files or lmd archive(s) - if using multiple archives, put them in a comma separated list
+  --jsonl-keys JSONL_KEYS [JSONL_KEYS ...]
+                        space separate listed of keys to extract from jsonl. Default: text
+  --num-docs NUM_DOCS   Optional: Number of documents in the input data (if known) for an accurate progress bar.
+tokenizer:
+  --tokenizer-type {HFGPT2Tokenizer,HFTokenizer,GPT2BPETokenizer,CharLevelTokenizer}
+                        What type of tokenizer to use.
+  --vocab-file VOCAB_FILE
+                        Path to the vocab file
+  --merge-file MERGE_FILE
+                        Path to the BPE merge file (if necessary).
+  --append-eod          Append an <eod> token to the end of a document.
+  --ftfy                Use ftfy to clean text
+output data:
+  --output-prefix OUTPUT_PREFIX
+                        Path to binary output file without suffix
+  --dataset-impl {lazy,cached,mmap}
+                        Dataset implementation to use. Default: mmap
+runtime:
+  --workers WORKERS     Number of worker processes to launch
+  --log-interval LOG_INTERVAL
+                        Interval between progress updates
+```
+For example:
+```bash
+python tools/datasets/preprocess_data.py \
+            --input ./data/mydataset.jsonl.zst \
+            --output-prefix ./data/mydataset \
+            --vocab ./data/gpt2-vocab.json \
+            --merge-file gpt2-merges.txt \
+            --dataset-impl mmap \
+            --tokenizer-type GPT2BPETokenizer \
+            --append-eod
+```
+You would then run training with the following settings added to your configuration file:
+```yaml
+  "data-path": "data/mydataset_text_document",
+```
+# Training and Finetuning
+Training is launched using `deepy.py`, a wrapper around DeepSpeed's launcher, which launches the same script in parallel across many GPUs / nodes.
+The general usage pattern is:
+```bash
+python ./deepy.py train.py [path/to/config1.yml] [path/to/config2.yml] ...
+```
+You can pass in an arbitrary number of configs which will all be merged at runtime.
+You can also optionally pass in a config prefix, which will assume all your configs are in the same folder and prepend that prefix to their path.
+For example:
+```bash
+python ./deepy.py train.py -d configs 125M.yml local_setup.yml
+```
+This will deploy the `train.py` script on all nodes with one process per GPU. The worker nodes and number of GPUs are specified in the `/job/hostfile` file (see [parameter documentation](configs/README.md)), or can simply be passed in as the `num_gpus` arg if running on a single node setup.
+Although this is not strictly necessary, we find it useful to define the model parameters in one config file (e.g. `configs/125M.yml`) and the data path parameters in another (e.g. `configs/local_setup.yml`).
+## Pretrained Models
+### GPT-NeoX-20B
+GPT-NeoX-20B is a 20 billion parameter autoregressive language model trained on [the Pile](https://arxiv.org/abs/2101.00027). Technical details about GPT-NeoX-20B can be found in [the associated paper](https://arxiv.org/abs/2204.06745). The configuration file for this model is both available at [`./configs/20B.yml`](./configs/20B.yml) and included in the download links below.
+[Slim weights](https://the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights/) - (No optimizer states, for inference or finetuning, 39GB)
+To download from the command line to a folder named `20B_checkpoints`, use the following command:
+```bash
+wget --cut-dirs=5 -nH -r --no-parent --reject "index.html*" https://the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights/ -P 20B_checkpoints
+```
+[Full weights](https://the-eye.eu/public/AI/models/GPT-NeoX-20B/full_weights/) - (Including optimizer states, 268GB)
+To download from the command line to a folder named `20B_checkpoints`, use the following command:
+```bash
+wget --cut-dirs=5 -nH -r --no-parent --reject "index.html*" https://the-eye.eu/public/AI/models/GPT-NeoX-20B/full_weights/ -P 20B_checkpoints
+```
+Weights can alternatively be downloaded using a BitTorrent client. Torrent files can be downloaded here: [slim weights](https://the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights.torrent), [full weights](https://the-eye.eu/public/AI/models/GPT-NeoX-20B/full_weights.torrent).
+We additionally have 150 checkpoints saved throughout training, one every 1,000 steps. We are working on figuring out how to best serve these at scale, but in the meanwhile people interested in working with the partially trained checkpoints can email us at contact@eleuther.ai to arrange access.
+### Pythia
+The Pythia Scaling Suite is a suite of models ranging from 70M parameters to 12B parameters trained on [the Pile](https://pile.eleuther.ai) intended to promote research on interpretability and training dynamics of large language models. Further details about the project and links to the models can be found [in the paper](https://arxiv.org/abs/2304.01373) and [on the project's GitHub](https://github.com/EleutherAI/pythia).
+### Polyglot
+The Polyglot Project is an effort to train powerful non-English pretrained language models to promote the accessibility of this technology to researchers outside the dominant powerhouses of machine learning. EleutherAI has trained and released 1.3B, 3.8B, and 5.8B parameter Korean language models, the largest of which outperforms all other publicly available language models on Korean language tasks. Further details about the project and links to the models can be found [here](https://github.com/EleutherAI/polyglot).
+# Inference
+**For most uses we recommend deploying models trained using the GPT-NeoX library via the Hugging Face Transformers library which is better optimized for inference.**
+We support three types of generation from a pretrained model:
+1. Unconditional generation
+2. Conditional generation based on an input read from a file
+3. Interactive generation, which allows for multiple rounds of back-and-forth between a user and the language model via a command line interface
+All three types of text generation can be launched via `python ./deepy.py generate.py -d configs 125M.yml local_setup.yml text_generation.yml` with the appropriate values set in `configs/text_generation.yml`.
+# Evaluation
+GPT-NeoX supports evaluation on downstream tasks through the [language model evaluation harness](https://github.com/EleutherAI/lm-evaluation-harness).
+To evaluate a trained model on the evaluation harness, simply run:
+```bash
+python ./deepy.py eval.py -d configs your_configs.yml --eval_tasks task1 task2 ... taskn
+```
+where `--eval_tasks` is a list of evaluation tasks separated by spaces, e.g. `--eval_tasks lambada hellaswag piqa sciq`. For details of all tasks available, refer to the [lm-evaluation-harness repo](https://github.com/EleutherAI/lm-evaluation-harness).
+# Exporting to Hugging Face
+GPT-NeoX is optimized heavily for training only, and GPT-NeoX model checkpoints are not compatible out of the box with other deep learning libraries. To make models easily loadable and shareable with end users, and for further exporting to various other frameworks, GPT-NeoX supports checkpoint conversion to the [Hugging Face Transformers](https://arxiv.org/abs/1910.03771) format.
+Though NeoX supports a number of different architectural configurations, including AliBi positional embeddings, not all of these configurations map cleanly onto the supported configurations within Hugging Face Transformers.
+NeoX supports export of compatible models into the following architectures:
+- GPTNeoXForCausalLM
+- LlamaForCausalLM
+- MistralForCausalLM
+Training a model which does not fit into one of these Hugging Face Transformers architectures cleanly will require writing custom modeling code for the exported model.
+To convert a GPT-NeoX library checkpoint to Hugging Face-loadable format, run:
+```bash
+python ./tools/ckpts/convert_neox_to_hf.py --input_dir /path/to/model/global_stepXXX --config_file your_config.yml --output_dir hf_model/save/location --precision {auto,fp16,bf16,fp32} --architecture {neox,mistral,llama}
+```
+Then to upload a model to [the Hugging Face Hub](https://huggingface.co/), run:
+```bash
+huggingface-cli login
+python ./tools/ckpts/upload.py
+```
+and input the requested information, including HF hub user token.
+### Importing Models Into GPT-NeoX
+NeoX supplies several utilities for converting a pretrained model checkpoint into a format that can be trained within the library.
+The following models or model families can be loaded in GPT-NeoX:
+- Llama 1
+- Llama 2
+- CodeLlama
+- Mistral-7b-v0.1
+We provide two utilities for converting from two different checkpoint formats into a format compatible with GPT-NeoX.
+To convert a Llama 1 or Llama 2 checkpoint distributed by Meta AI from its original file format (downloadable [here](https://github.com/facebookresearch/llama) or [here](https://huggingface.co/meta-llama/Llama-2-7b)) into the GPT-NeoX library, run
+```
+python tools/ckpts/convert_raw_llama_weights_to_neox.py --input_dir /path/to/model/parent/dir/7B --model_size 7B --output_dir /path/to/save/ckpt --num_output_shards <TENSOR_PARALLEL_SIZE> (--pipeline_parallel if pipeline-parallel-size >= 1)
+```
+To convert from a Hugging Face model into a NeoX-loadable format, run `tools/ckpts/convert_hf_to_sequential.py`. See documentation within that file for further options.
+# Monitoring
+In addition to storing logs locally, we provide built-in support for three popular experiment monitoring frameworks: [Weights & Biases](https://wandb.ai/site), [TensorBoard](https://www.tensorflow.org/tensorboard/), and [Comet](https://www.comet.com/site).
+## Weights and Biases
+[Weights & Biases](https://wandb.ai/site) is a machine learning monitoring platform, which [we use to record our experiments](https://wandb.ai/eleutherai/neox). To use wandb to monitor your gpt-neox experiments:
+1. Create an account at https://wandb.ai/site to generate your API key
+2. Log into Weights & Biases on your machine&mdash;you can do this by executing `wandb login`&mdash;your runs will automatically be recorded.
+3. Dependencies required for wandb monitoring can be found in and installed from `./requirements/requirements-wandb.txt`. An example config is provided in `./configs/local_setup_wandb.yml`.
+4. There are two optional fields associated with Weights & Biases: <code><var>wandb_group</var></code> allows you to name the run group and <code><var>wandb_team</var></code> allows you to assign your runs to an organization or team account. An example config is provided in `./configs/local_setup_wandb.yml`.
+## TensorBoard
+We support using TensorBoard via the <code><var>tensorboard-dir</var></code> field. Dependencies required for TensorBoard monitoring can be found in and installed from  `./requirements/requirements-tensorboard.txt`.
+## Comet
+[Comet](https://www.comet.com/site) is a machine learning monitoring platform. To use comet to monitor your gpt-neox experiments:
+1. Create an account at https://www.comet.com/login to generate your API key.
+2. Once generated, link your API key at runtime by running `comet login` or passing `export COMET_API_KEY=<your-key-here>`
+3. Install `comet_ml` and any dependency libraries via `pip install -r requirements/requirements-comet.txt`
+4. Enable Comet with `use_comet: True`. You can also customize where data is being logged with `comet_workspace` and `comet_project`. A full example config with comet enabled is provided in `configs/local_setup_comet.yml`.
+5. Run your experiment, and monitor metrics in the Comet workspace that you passed!
+# Running on multi-node
+If you need to supply a hostfile for use with the MPI-based DeepSpeed launcher, you can set the environment variable `DLTS_HOSTFILE` to point to the hostfile.
+# Profiling
+We support profiling with Nsight Systems, the PyTorch Profiler, and PyTorch Memory Profiling.
+## Nsight Systems Profiling
+To use the Nsight Systems profiling, set config options `profile`, `profile_step_start`, and `profile_step_stop` (see [here](https://github.com/EleutherAI/gpt-neox/blob/main/configs/neox_arguments.md) for argument usage, and [here](https://github.com/EleutherAI/gpt-neox/blob/main/configs/prof.yml) for a sample config).
+To populate nsys metrics, launch training with:
+```
+nsys profile -s none -t nvtx,cuda -o <path/to/profiling/output> --force-overwrite true \
+--capture-range=cudaProfilerApi --capture-range-end=stop python $TRAIN_PATH/deepy.py \
+$TRAIN_PATH/train.py --conf_dir configs <config files>
+```
+The generated output file can then be viewed with the Nsight Systems GUI:
+![nsight-prof](images/nsight_profiling.png)
+## PyTorch Profiling
+To use the built-in PyTorch profiler, set config options `profile`, `profile_step_start`, and `profile_step_stop` (see [here](https://github.com/EleutherAI/gpt-neox/blob/main/configs/neox_arguments.md) for argument usage, and [here](https://github.com/EleutherAI/gpt-neox/blob/main/configs/prof.yml) for a sample config).
+The PyTorch profiler will save traces to your `tensorboard` log directory.  You can view these traces within
+TensorBoard by following the steps [here](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html).
+![torch-prof](images/pytorch_profiling.png)
+## PyTorch Memory Profiling
+To use PyTorch Memory Profiling, set config options `memory_profiling` and `memory_profiling_path` (see [here](https://github.com/EleutherAI/gpt-neox/blob/main/configs/neox_arguments.md) for argument usage, and [here](https://github.com/EleutherAI/gpt-neox/blob/main/configs/prof.yml) for a sample config).
+![mem-prof](images/memory_profiling.png)
+View the generated profile with the [memory_viz.py](https://github.com/pytorch/pytorch/blob/main/torch/cuda/_memory_viz.py) script. Run with:
+```
+python _memory_viz.py trace_plot <generated_profile> -o trace.html
+```
+# Adoption and Publications
+The GPT-NeoX library has been widely adopted by academic and industry researchers and ported on to many HPC systems.
+If you have found this library useful in your research, please reach out and let us know! We would love to add you to our lists.
+## Publications
+EleutherAI and our collaborators have used it in the following publications:
+ - **Sid Black**, **Stella Biderman**, **Eric Hallahan**, **Quentin Anthony**, **Leo Gao**, **Laurence Golding**, **Horace He**, **Connor Leahy**, **Kyle McDonell**, **Jason Phang**, **Michael Pieler**, **Shivanshu Purohit**, **Laria Reynolds**, **Jon Tow**, **Ben Wang**, and **Samuel Weinbach**. "[GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745)." In *Proceedings of the ACL Workshop on Challenges \& Perspectives in Creating Large Language Models*, 2022.
+ - **Stella Biderman**, **Hailey Schoelkopf**, **Quentin Anthony**, **Herbie Bradley**, **Kyle O'Brien**, **Eric Hallahan**, **Mohammad Aflah Khan**, **Shivanshu Purohit**, **USVSN Sai Prashanth**, Edward Raff, **Aviya Skowron**, **Lintang Sutawika**, **Oskar van der Wal**. "[Pythia: A suite for analyzing large language models across training and scaling](https://arxiv.org/abs/2304.01373)." In _International Conference on Machine Learning_, pp. 2397-2430. _PMLR_, 2023.
+ - Zhangir Azerbayev, Bartosz Piotrowski, **Hailey Schoelkopf**, Edward W. Ayers, Dragomir Radev, and Jeremy Avigad. "[Proofnet: Autoformalizing and formally proving undergraduate-level mathematics](https://arxiv.org/abs/2302.12433)." *arXiv preprint arXiv:2302.12433*, 2023.
+ - **Stella Biderman**, **USVSN Sai Prashanth**, **Lintang Sutawika**, **Hailey Schoelkopf**, **Quentin Anthony**, **Shivanshu Purohit**, and Edward Raff. "[Emergent and predictable memorization in large language models.](https://arxiv.org/abs/2304.11158)" In _Neural Information Processing Systems_, 2023.
+ - **Hyunwoong Ko**, **Kichang Yang**, **Minho Ryu**, **Taekyoon Choi**, **Seungmu Yang,** and Sungho Park. "[A Technical Report for Polyglot-Ko: Open-Source Large-Scale Korean Language Models](https://arxiv.org/abs/2306.02254)." *arXiv preprint arXiv:2306.02254*, 2023.
+ - Kshitij Gupta, Benjamin Thérien, Adam Ibrahim, Mats Leon Richter, **Quentin Anthony**, Eugene Belilovsky, Irina Rish, and Timothée Lesort. "[Continual Pre-Training of Large Language Models: How to re-warm your model?](https://arxiv.org/abs/2308.04014)" In _Workshop on Efficient Systems for Foundation Models @ ICML_, 2023.
+ - **Zhangir Azerbayev**, **Hailey Schoelkopf**, Keiran Paster, Marco Dos Santos, Stephen McAleer, Albert Q Jiang, Jia Deng, **Stella Biderman**, and Sean Welleck. "[Llemma: An open language model for mathematics](https://arxiv.org/abs/2310.10631)." In _Math-AI Workshop @ NeurIPS_, 2023.
+ - Alexander Havrilla, Maksym Zhuravinskyi, Duy Phung, Aman Tiwari, Jonathan Tow, **Stella Biderman**, **Quentin Anthony**, and **Louis Castricato**. "[trlX: A Framework for Large Scale Reinforcement Learning from Human Feedback](https://aclanthology.org/2023.emnlp-main.530/)." In _Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing_, 2023.
+ -  **Quentin Anthony**, **Jacob Hatef**, Deepak Narayanan, **Stella Biderman**, Stas Bekman, Junqi Yin, Aamir Shafi, Hari Subramoni, and Dhabaleswar Panda. "[The Case for Co-Designing Model Architectures with Hardware](https://arxiv.org/abs/2401.14489)." In _arXiv preprint_, 2024.
+ - Adam Ibrahim, Benjamin Thérien, Kshitij Gupta, Mats L. Richter, **Quentin Anthony**, Timothée Lesort, Eugene Belilovsky, Irina Rish. "[Simple and Scalable Strategies to Continually Pre-train Large Language Models](https://arxiv.org/abs/2403.08763)." In _arXiv preprint_, 2024.
+ - Junqi Yin, Avishek Bose, Guojing Cong, Isaac Lyngaas, **Quentin Anthony**. "[Comparative Study of Large Language Model Architectures on Frontier](https://arxiv.org/abs/2402.00691)." In _arXiv preprint_, 2024.
+The following publications by other research groups use this library:
+- Ta-Chung Chi, Ting-Han Fan, Peter J. Ramadge, and Alexander Rudnicky. "[KERPLE: Kernelized Relative Positional Embedding for Length Extrapolation](https://arxiv.org/abs/2205.09921)." In *Advances in Neural Information Processing Systems* 35, 2022.
+- Sameera Horawalavithana, Ellyn Ayton, Shivam Sharma, Scott Howland, Megha Subramanian, Scott Vasquez, Robin Cosbey, Maria Glenski, and Svitlana Volkova. "[Foundation Models of Scientific Knowledge for Chemistry: Opportunities, Challenges and Lessons Learned](https://aclanthology.org/2022.bigscience-1.12/)." In *Proceedings of the ACL Workshop on Challenges \& Perspectives in Creating Large Language Models*, 2022.
+- Sophia Kolak, Ruben Martins, Claire Le Goues, and Vincent J. Hellendoorn. "[Patch Generation with Language Models: Feasibility and Scaling Behavior](https://par.nsf.gov/biblio/10340618)." In *Proceedings of the Deep Learning for Code Workshop at ICLR*, 2022.
+- Frank F. Xu, Uri Alon, Graham Neubig, and Vincent J. Hellendoorn. "[A Systematic Evaluation of Large Language Models of Code](https://arxiv.org/abs/2202.13169)." In *Proceedings of the ICLR Workshop on Deep Learning For Code*, 2022.
+- Byung-Doh Oh and William Schuler. "[Transformer-Based LM Surprisal Predicts Human Reading Times Best with About Two Billion Training Tokens](https://arxiv.org/abs/2304.11389)." In *Findings of the Association for Computational Linguistics*, 2023.
+- Ta-Chung Chi, Ting-Han Fan, Alexander Rudnicky, and Peter Ramadge. "[Dissecting Transformer Length Extrapolation via the Lens of Receptive Field Analysis](https://aclanthology.org/2023.acl-long.756/)." In _Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)_, pp. 13522-13537, 2023.
+- Ta-Chung Chi, Ting-Han Fan, Li-Wei Chen, Alexander Rudnicky, and Peter Ramadge. "[Latent Positional Information is in the Self-Attention Variance of Transformer Language Models Without Positional Embeddings](https://aclanthology.org/2023.acl-short.102/)." In _Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)_, pp. 13522-13537, 2023.
+- Xidong Feng, Yicheng Luo, Ziyan Wang, Hongrui Tang, Mengyue Yang, Kun Shao, David Mguni, Yali Du, and Jun Wang. "[ChessGPT: Bridging Policy Learning and Language Modeling.](https://arxiv.org/abs/2306.09200)" _arXiv preprint arXiv:2306.09200_, 2023.
+- Orion Walker Dollar, Sameera Horawalavithana, Scott Vasquez, W. James Pfaendtner, and Svitlana Volkova. "[MolJET: Multimodal Joint Embedding Transformer for Conditional de novo Molecular Design and Multi-Property Optimization.](https://openreview.net/pdf?id=7UudBVsIrr)" _preprint under review_, 2023.
+- Jean Kaddour and Qi Liu. "[Text Data Augmentation in Low-Resource Settings via Fine-Tuning of Large Language Models](https://arxiv.org/abs/2310.01119)." _arXiv:2310.01119_, 2023.
+- Alon Albalak, Liangming Pan, Colin Raffel, and William Yang Wang. "[Efficient Online Data Mixing For Language Model Pre-Training](https://arxiv.org/abs/2312.02406)." In _NeurIPS Workshop on R0-FoMo: Robustness of Few-shot and Zero-shot Learning in Large Foundation Models_, 2023.
+- Eghbal A. Hosseini and Evelina Fedorenko. "[Large language models implicitly learn to straighten neural sentence trajectories to construct a predictive representation of natural language](https://www.biorxiv.org/content/10.1101/2023.11.05.564832v1)." In _Neural Information Processing Systems_, 2023.
+- Junqi Yin, Sajal Dash, Feiyi Wang, and Mallikarjun Shankar. "[FORGE: Pre-Training Open Foundation Models for Science](https://dl.acm.org/doi/abs/10.1145/3581784.3613215)." In _Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis_, 1-13, 2023.
+- Jean Kaddour and Qi Liu. "[Text Data Augmentation in Low-Resource Settings via Fine-Tuning of Large Language Models](https://arxiv.org/abs/2310.01119)." In _arXiv preprint arXiv:2310.01119_, 2023.
+- Peng Di, Jianguo Li, Hang Yu, Wei Jiang, Wenting Cai, Yang Cao, Chaoyu Chen, Dajun Chen, Hongwei Chen, Liang Chen, Gang Fan, Jie Gong, Zi Gong, Wen Hu, Tingting Guo, Zhichao Lei, Ting Li, Zheng Li, Ming Liang, Cong Liao, Bingchang Liu, Jiachen Liu, Zhiwei Liu, Shaojun Lu, Min Shen, Guangpei Wang, Huan Wang, Zhi Wang, Zhaogui Xu, Jiawei Yang, Qing Ye, Gehao Zhang, Yu Zhang, Zelin Zhao, Xunjin Zheng, Hailian Zhou, Lifu Zhu, and Xianying Zhu. "[CodeFuse-13B: A Pretrained Multi-lingual Code Large Language Model](https://arxiv.org/abs/2310.06266)." In _arXiv preprint arXiv:2310.06266_, 2023.
+- Nikitha Rao, Kush Jain, Uri Alon, Claire Le Goues, and Vincent J Hellendoorn. "[CAT-LM Training Language Models on Aligned Code And Tests](https://arxiv.org/abs/2310.01602)." In _38th IEEE/ACM International Conference on Automated Software Engineering (ASE)_, pp. 409-420. IEEE, 2023.
+- Pratyush Patel, Esha Choukse, Chaojie Zhang, Íñigo Goiri, Brijesh Warrier, Nithish Mahalingam, Ricardo Bianchini. "[POLCA: Power Oversubscription in LLM Cloud Providers](https://arxiv.org/abs/2308.12908)." In _arXiv preprint_, 2023.
+- Junqi Yin, Sajal Dash, John Gounley, Feiyi Wang, and Georgia Tourassi. "[Evaluation of pre-training large language models on leadership-class supercomputers](https://link.springer.com/article/10.1007/s11227-023-05479-7)." In _the Journal of Supercomputing_ 79, no. 18, 2023.
+- Tal Kadosh, Niranjan Hasabnis, Vy A. Vo, Nadav Schneider, Neva Krien, Mihai Capota, Abdul Wasay, Nesreen Ahmed, Ted Willke, Guy Tamir, Yuval Pinter, Timothy Mattson, and Gal Oren. "[Domain-Specific Code Language Models: Unraveling the Potential for HPC Codes and Tasks](https://arxiv.org/abs/2312.13322)." In _arXiv preprint_, 2023.
+- Guobin Shen, Dongcheng Zhao, Yiting Dong, Yang Li, Jindong Li, Kang Sun, and Yi Zeng. "[Astrocyte-Enabled Advancements in Spiking Neural Networks for Large Language Modeling](https://arxiv.org/abs/2312.07625)." In _arXiv preprint_, 2023.
+- Eghbal A. Hosseini, Martin A. Schrimpf, Yian Zhang, Samuel Bowman, Noga Zaslavsky, and Evelina Fedorenko. "[Artificial neural network language models align neurally and behaviorally with humans even after a developmentally realistic amount of training.](https://www.biorxiv.org/content/10.1101/2022.10.04.510681)" In _Neurobiology of Language_, 2024.
+- Xiongye Xiao, Chenyu Zhou, Heng Ping, Defu Cao, Yaxing Li, Yizhuo Zhou, Shixuan Li, and Paul Bogdan. "[Exploring Neuron Interactions and Emergence in LLMs: From the Multifractal Analysis Perspective](https://arxiv.org/abs/2402.09099)." In _arXiv preprint_, 2024.
+- Zhiyuan Zeng, Qipeng Guo, Zhaoye Fei, Zhangyue Yin, Yunhua Zhou, Linyang Li, Tianxiang Sun, Hang Yan, Dahua Lin, and Xipeng Qiu. "[Turn Waste into Worth: Rectifying Top-k Router of MoE](https://arxiv.org/abs/2402.12399)." In _arXiv preprint_, 2024.
+## Models
+The following models were trained using this library:
+### English LLMs
+- EleutherAI's [GPT-NeoX-20B](https://huggingface.co/EleutherAI/gpt-neox-20b) and [Pythia (70M through 13B)](https://github.com/EleutherAI/pythia)
+- CarperAI's [FIM-NeoX-1.3B](https://huggingface.co/CarperAI/FIM-NeoX-1.3B)
+- StabilityAI's [StableLM (3B and 7B)](https://github.com/Stability-AI/StableLM)
+- Together.ai's [RedPajama-INCITE (3B and 7B)](https://together.ai/blog/redpajama-models-v1)
+- Carnegie Mellon University's [proofGPT (1.3B and 6.7B)](https://huggingface.co/hoskinson-center/proofGPT-v0.1-6.7B)
+- Dampish's [StellarX (2.8B and 4B)](https://huggingface.co/Dampish/StellarX-4B-V0.2)
+- Chinese Academy of Sciences's [AstroSNN (1.5B)](https://arxiv.org/abs/2312.07625)
+### Non-English LLMs
+- EleutherAI's [Polyglot-Ko (1.3B through 12.8B)](https://github.com/EleutherAI/polyglot) (Korean)
+- Korea University's [KULLM-Polyglot (5.8B and 12.8B)](https://github.com/nlpai-lab/KULLM) (Korean)
+- Stability AI's [Japanese Stable LM (7B)](https://huggingface.co/stabilityai/japanese-stablelm-base-alpha-7b) (Japanese)
+- LearnItAnyway's [LLaVA-Polyglot-Ko (1.3B)](https://huggingface.co/LearnItAnyway/llava-polyglot-ko-1.3b-hf) (Korean)
+- Rinna Co.'s [japanese-gpt-neox-3.6b](https://huggingface.co/rinna/japanese-gpt-neox-3.6b) (Japanese) and [bilingual-gpt-neox-4b](https://huggingface.co/rinna/bilingual-gpt-neox-4b) (English / Japanese)
+- CyberAgent's [OpenCALM (125M through 7B)](https://huggingface.co/cyberagent/open-calm-7b) (Japanese)
+- The Hungarian Research Centre for Linguistics's [PULI GPTrio (6.7B)](https://huggingface.co/NYTK/PULI-GPTrio) (Hungarian / English / Chinese)
+- The University of Tokyo's [weblab-10b](https://huggingface.co/Kojima777/weblab-10b) and [weblab-10b-instruct](https://huggingface.co/Kojima777/weblab-10b-instruction-sft) (Japanese)
+- nolano.ai's [Hi-NOLIN (9B)](https://blog.nolano.ai/Hi-NOLIN/) (English, Hindi)
+- Renmin University of China's [YuLan (12B)](https://huggingface.co/yulan-team/YuLan-Base-12b) (English, Chinese)
+- The Basque Center for Language Technology's [Latxa (70B)](https://huggingface.co/HiTZ/latxa-70b-v1.2) (Basque)
+### Code Models
+- Carnegie Mellon University's [PolyCoder (160M through 2.7B)](https://github.com/VHellendoorn/Code-LMs) and [CAT-LM (2.7B)](https://huggingface.co/nikitharao/catlm)
+- StabilityAI's [StableCode (1.3B)](https://stability.ai/blog/stablecode-llm-generative-ai-coding) and [StableCode-Completion-Alpha (3B)](https://stability.ai/blog/stablecode-llm-generative-ai-coding)
+- CodeFuse AI's [CodeFuse (13B)](https://huggingface.co/codefuse-ai/CodeFuse-13B)
+### AI for Science
+- EleutherAI's [LLeMMA (34B)](https://arxiv.org/abs/2310.10631)
+- Oak Ridge National Lab's [FORGE (26B)](https://github.com/at-aaims/forge)
+- Oak Ridge National Lab's [Unnamed Material Science Domain Models (7B)](https://arxiv.org/abs/2402.00691)
+- Pacific Northwest National Lab's [MolJet (undisclosed size)](https://openreview.net/pdf?id=7UudBVsIrr)
+### Other Modalities
+-  Rinna Co.'s [PSLM (7B)](https://arxiv.org/abs/2406.12428) (speech / text)
+-  University College London's [ChessGPT-3B](https://huggingface.co/Waterhorse/chessgpt-base-v1)
+-  Gretel's [Text-to-Table (3B)](https://huggingface.co/gretelai/text2table)
+# Administrative Notes
+## Citing GPT-NeoX
+If you have found the GPT-NeoX library helpful in your work, you can cite this repository as
+```bibtex
+@software{gpt-neox-library,
+  title = {{GPT-NeoX: Large Scale Autoregressive Language Modeling in PyTorch}},
+  author = {Andonian, Alex and Anthony, Quentin and Biderman, Stella and Black, Sid and Gali, Preetham and Gao, Leo and Hallahan, Eric and Levy-Kramer, Josh and Leahy, Connor and Nestler, Lucas and Parker, Kip and Pieler, Michael and Phang, Jason and Purohit, Shivanshu and Schoelkopf, Hailey and Stander, Dashiell and Songz, Tri and Tigges, Curt and Thérien, Benjamin and Wang, Phil and Weinbach, Samuel},
+  url = {https://www.github.com/eleutherai/gpt-neox},
+  doi = {10.5281/zenodo.5879544},
+  month = {9},
+  year = {2023},
+  version = {2.0.0},
+}
+```
+To cite the 20 billion parameter model named `GPT-NeoX-20B`, please use
+```bibtex
+@inproceedings{gpt-neox-20b,
+  title={{GPT-NeoX-20B}: An Open-Source Autoregressive Language Model},
+  author={Black, Sid and Biderman, Stella and Hallahan, Eric and Anthony, Quentin and Gao, Leo and Golding, Laurence and He, Horace and Leahy, Connor and McDonell, Kyle and Phang, Jason and Pieler, Michael and Prashanth, USVSN Sai and Purohit, Shivanshu and Reynolds, Laria and Tow, Jonathan and Wang, Ben and Weinbach, Samuel},
+  booktitle={Proceedings of the ACL Workshop on Challenges \& Perspectives in Creating Large Language Models},
+  url={https://arxiv.org/abs/2204.06745},
+  year={2022}
+}
+```
+## Contributing
+GPT-NeoX is built by the open-source AI community, and relies on our amazing contributors! Please see our
+[contributing](CONTRIBUTING.md) guide for more details on our CLA, code formatting, testing,
+etc.
+## Licensing
+This repository hosts code that is part of EleutherAI's GPT-NeoX project. Copyright (c) 2024, EleutherAI. Licensed under the Apache License:
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+        http://www.apache.org/licenses/LICENSE-2.0
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+This repository is based off code written by NVIDIA that is licensed under the Apache License, Version 2.0. In accordance with the Apache License, all files that are modifications of code originally written by NVIDIA maintain an NVIDIA copyright header. All files that do not contain such a header are the exclusive copyright of EleutherAI. When the NVIDIA code has been modified from its original version, that fact is noted in the copyright header. All derivative works of this repository must preserve these headers under the terms of the Apache License.
+This repository also contains code written by a number of other authors. Such contributions are marked and the relevant licensing is included where appropriate.
+For full terms, see the `LICENSE` file. If you have any questions, comments, or concerns about licensing please email us at licensing@eleuther.ai.
+## Acknowledgements
+We run our experiments on a Kubernetes cluster provided by [CoreWeave](https://coreweave.com/) and a Slurm cluster provided by [Stability AI](https://stability.ai). We are thankful to the DeepSpeed team for their advice and consultation.

configs/1-3B.yml ADDED Viewed

	@@ -0,0 +1,93 @@

+# GPT-2 pretraining setup
+{
+   # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
+   # across the node boundaries )
+   "pipe_parallel_size": 1,
+   "model_parallel_size": 1,
+   # model settings
+   "num_layers": 24,
+   "hidden_size": 2048,
+   "num_attention_heads": 16,
+   "seq_length": 2048,
+   "max_position_embeddings": 2048,
+   "norm": "layernorm",
+   "pos_emb": "rotary",
+   "no_weight_tying": true,
+   "gpt_j_residual": false,
+   "output_layer_parallelism": "column",
+   # these should provide some speedup but take a while to build; set to true if desired
+   "scaled_upper_triang_masked_softmax_fusion": false,
+   "bias_gelu_fusion": false,
+   "rope_fusion": false,
+   "layernorm_fusion": false,
+   # init methods
+   "init_method": "small_init",
+   "output_layer_init_method": "wang_init",
+   # optimizer settings
+   "optimizer": {
+     "type": "Adam",
+     "params": {
+       "lr": 0.0002,
+       "betas": [0.9, 0.95],
+       "eps":  1.0e-8,
+     }
+   },
+   "min_lr": 0.00002,
+   # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
+   "zero_optimization": {
+    "stage": 1,
+    "allgather_partitions": True,
+    "allgather_bucket_size": 500000000,
+    "overlap_comm": True,
+    "reduce_scatter": True,
+    "reduce_bucket_size": 500000000,
+    "contiguous_gradients": True,
+  },
+   # batch / data settings
+   "train_micro_batch_size_per_gpu": 4,
+   "data_impl": "mmap",
+   # activation checkpointing
+   "checkpoint_activations": true,
+   "checkpoint_num_layers": 1,
+   "partition_activations": true,
+   "synchronize_each_layer": true,
+   # regularization
+   "gradient_clipping": 1.0,
+   "weight_decay": 0.1,
+   "hidden_dropout": 0,
+   "attention_dropout": 0,
+   # precision settings
+   "fp16": {
+     "fp16": true,
+     "enabled": true,
+     "loss_scale": 0,
+     "loss_scale_window": 1000,
+     "hysteresis": 2,
+     "min_loss_scale": 1
+   },
+   # misc. training settings
+   "train_iters": 320000,
+   "lr_decay_iters": 320000,
+   "distributed_backend": "nccl",
+   "lr_decay_style": "cosine",
+   "warmup": 0.01,
+   "checkpoint_factor": 10000,
+   "eval_interval": 1000,
+   "eval_iters": 10,
+   # logging
+   "log_interval": 100,
+   "steps_per_print": 10,
+   "keep_last_n_checkpoints": 4,
+   "wall_clock_breakdown": true,
+}

configs/125M-dmoe.yml ADDED Viewed

	@@ -0,0 +1,101 @@

+# GPT-2 pretraining setup
+{
+   # See README for MoE config docs!
+   "moe_type": "megablocks",
+   "moe_token_dropping": false,
+   # Have 4 experts per layer (every 2 layers by default)
+   "moe_num_experts": 4,
+   # parallelism settings
+   "enable_expert_tensor_parallelism": true,
+   "pipe_parallel_size": 1, # not yet supported for MoE
+   "model_parallel_size": 1,
+   "moe_expert_parallel_size": 1,
+   # model settings
+   "num_layers": 12,
+   "hidden_size": 768,
+   "num_attention_heads": 12,
+   "seq_length": 2048,
+   "max_position_embeddings": 2048,
+   "norm": "layernorm",
+   "pos_emb": "rotary",
+   "no_weight_tying": true,
+   "gpt_j_residual": false,
+   "output_layer_parallelism": "column",
+   # these should provide some speedup but take a while to build; set to true if desired
+   "scaled_upper_triang_masked_softmax_fusion": false,
+   "bias_gelu_fusion": false,
+   "rope_fusion": false,
+   # init methods
+   "init_method": "small_init",
+   "output_layer_init_method": "wang_init",
+   # optimizer settings
+   "optimizer": {
+     "type": "Adam",
+     "params": {
+       "lr": 0.0006,
+       "betas": [0.9, 0.95],
+       "eps": 1.0e-8,
+     }
+   },
+   "min_lr": 0.00006,
+   # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
+   "zero_optimization": {
+    "stage": 0,
+    "allgather_partitions": True,
+    "allgather_bucket_size": 500000000,
+    "overlap_comm": True,
+    "reduce_scatter": True,
+    "reduce_bucket_size": 500000000,
+    "contiguous_gradients": True,
+  },
+   # batch / data settings
+   "train_micro_batch_size_per_gpu": 4,
+   "data_impl": "mmap",
+   # activation checkpointing
+   "checkpoint_activations": true,
+   "checkpoint_num_layers": 1,
+   "partition_activations": true,
+   "synchronize_each_layer": true,
+   # regularization
+   "gradient_clipping": 1.0,
+   "weight_decay": 0.1,
+   "hidden_dropout": 0.0,
+   "attention_dropout": 0.0,
+   # precision settings
+   "fp16": {
+     "enabled": true,
+     "loss_scale": 0,
+     "loss_scale_window": 1000,
+     "hysteresis": 2,
+     "min_loss_scale": 1
+   },
+   # misc. training settings
+   "train_iters": 320000,
+   "lr_decay_iters": 320000,
+   "distributed_backend": "nccl",
+   "lr_decay_style": "cosine",
+   "warmup": 0.01,
+   "checkpoint_factor": 10000,
+   "eval_interval": 1000,
+   "eval_iters": 10,
+   # logging
+   "log_interval": 10,
+   "steps_per_print": 10,
+   "keep_last_n_checkpoints": 4,
+   "wall_clock_breakdown": true,
+  #  networking
+  "hostfile": "/mock_path"
+}

configs/125M-json.yml ADDED Viewed

	@@ -0,0 +1,80 @@

+{
+  "pipe_parallel_size": 1,
+  "model_parallel_size": 1,
+  "num_layers": 12,
+  "hidden_size": 768,
+  "num_attention_heads": 12,
+  "seq_length": 2048,
+  "max_position_embeddings": 2048,
+  "norm": "layernorm",
+  "pos_emb": "rotary",
+  "no_weight_tying": true,
+  "gpt_j_residual": false,
+  "output_layer_parallelism": "column",
+  "scaled_upper_triang_masked_softmax_fusion": false,
+  "bias_gelu_fusion": false,
+  "rope_fusion": false,
+  "layernorm_fusion": false,
+  "init_method": "small_init",
+  "output_layer_init_method": "wang_init",
+  "optimizer": {
+    "type": "Adam",
+    "params": {
+      "lr": 0.0006,
+      "betas": [0.9, 0.95],
+      "eps": 1.0e-8
+    }
+  },
+  "min_lr": 0.00006,
+  "zero_optimization": {
+    "stage": 1,
+    "allgather_partitions": true,
+    "allgather_bucket_size": 500000000,
+    "overlap_comm": true,
+    "reduce_scatter": true,
+    "reduce_bucket_size": 500000000,
+    "contiguous_gradients": true
+  },
+  "train_micro_batch_size_per_gpu": 4,
+  "data_impl": "mmap",
+  "checkpoint_activations": true,
+  "checkpoint_num_layers": 1,
+  "partition_activations": true,
+  "synchronize_each_layer": true,
+  "gradient_clipping": 1.0,
+  "weight_decay": 0.1,
+  "hidden_dropout": 0.0,
+  "attention_dropout": 0.0,
+  "fp16": {
+    "enabled": true,
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "train_iters": 320000,
+  "lr_decay_iters": 320000,
+  "distributed_backend": "nccl",
+  "lr_decay_style": "cosine",
+  "warmup": 0.01,
+  "checkpoint_factor": 10000,
+  "eval_interval": 1000,
+  "eval_iters": 10,
+  "log_interval": 100,
+  "steps_per_print": 10,
+  "keep_last_n_checkpoints": 4,
+  "wall_clock_breakdown": true,
+  "hostfile": "/mock_path"
+}

configs/125M-moe.yml ADDED Viewed

	@@ -0,0 +1,101 @@

+# GPT-2 pretraining setup
+{
+   # See README for MoE config docs!
+   "moe_type": "deepspeed",
+   "moe_token_dropping": true,
+   # Have 4 experts per layer (every 2 layers by default)
+   "moe_num_experts": 4,
+   # parallelism settings
+   "enable_expert_tensor_parallelism": true,
+   "pipe_parallel_size": 1, # not yet supported for MoE
+   "model_parallel_size": 1,
+   "moe_expert_parallel_size": 1,
+   # model settings
+   "num_layers": 12,
+   "hidden_size": 768,
+   "num_attention_heads": 12,
+   "seq_length": 2048,
+   "max_position_embeddings": 2048,
+   "norm": "layernorm",
+   "pos_emb": "rotary",
+   "no_weight_tying": true,
+   "gpt_j_residual": false,
+   "output_layer_parallelism": "column",
+   # these should provide some speedup but take a while to build; set to true if desired
+   "scaled_upper_triang_masked_softmax_fusion": false,
+   "bias_gelu_fusion": false,
+   "rope_fusion": false,
+   # init methods
+   "init_method": "small_init",
+   "output_layer_init_method": "wang_init",
+   # optimizer settings
+   "optimizer": {
+     "type": "Adam",
+     "params": {
+       "lr": 0.0006,
+       "betas": [0.9, 0.95],
+       "eps": 1.0e-8,
+     }
+   },
+   "min_lr": 0.00006,
+   # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
+   "zero_optimization": {
+    "stage": 1,
+    "allgather_partitions": True,
+    "allgather_bucket_size": 500000000,
+    "overlap_comm": True,
+    "reduce_scatter": True,
+    "reduce_bucket_size": 500000000,
+    "contiguous_gradients": True,
+  },
+   # batch / data settings
+   "train_micro_batch_size_per_gpu": 4,
+   "data_impl": "mmap",
+   # activation checkpointing
+   "checkpoint_activations": true,
+   "checkpoint_num_layers": 1,
+   "partition_activations": true,
+   "synchronize_each_layer": true,
+   # regularization
+   "gradient_clipping": 1.0,
+   "weight_decay": 0.1,
+   "hidden_dropout": 0.0,
+   "attention_dropout": 0.0,
+   # precision settings
+   "fp16": {
+     "enabled": true,
+     "loss_scale": 0,
+     "loss_scale_window": 1000,
+     "hysteresis": 2,
+     "min_loss_scale": 1
+   },
+   # misc. training settings
+   "train_iters": 320000,
+   "lr_decay_iters": 320000,
+   "distributed_backend": "nccl",
+   "lr_decay_style": "cosine",
+   "warmup": 0.01,
+   "checkpoint_factor": 10000,
+   "eval_interval": 1000,
+   "eval_iters": 10,
+   # logging
+   "log_interval": 10,
+   "steps_per_print": 10,
+   "keep_last_n_checkpoints": 4,
+   "wall_clock_breakdown": true,
+  #  networking
+  "hostfile": "/mock_path"
+}

configs/125M.yml ADDED Viewed

	@@ -0,0 +1,96 @@

+# GPT-2 pretraining setup
+{
+   # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
+   # across the node boundaries )
+   "pipe_parallel_size": 1,
+   "model_parallel_size": 1,
+   # model settings
+   "num_layers": 12,
+   "hidden_size": 768,
+   "num_attention_heads": 12,
+   "seq_length": 2048,
+   "max_position_embeddings": 2048,
+   "norm": "layernorm",
+   "pos_emb": "rotary",
+   "no_weight_tying": true,
+   "gpt_j_residual": false,
+   "output_layer_parallelism": "column",
+   # these should provide some speedup but take a while to build; set to true if desired
+   "scaled_upper_triang_masked_softmax_fusion": false,
+   "bias_gelu_fusion": false,
+   "rope_fusion": false,
+   "layernorm_fusion": false,
+   # init methods
+   "init_method": "small_init",
+   "output_layer_init_method": "wang_init",
+   # optimizer settings
+   "optimizer": {
+     "type": "Adam",
+     "params": {
+       "lr": 0.0006,
+       "betas": [0.9, 0.95],
+       "eps": 1.0e-8,
+     }
+   },
+   "min_lr": 0.00006,
+   # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
+   "zero_optimization": {
+    "stage": 1,
+    "allgather_partitions": True,
+    "allgather_bucket_size": 500000000,
+    "overlap_comm": True,
+    "reduce_scatter": True,
+    "reduce_bucket_size": 500000000,
+    "contiguous_gradients": True,
+  },
+   # batch / data settings
+   "train_micro_batch_size_per_gpu": 4,
+   "data_impl": "mmap",
+   # activation checkpointing
+   "checkpoint_activations": true,
+   "checkpoint_num_layers": 1,
+   "partition_activations": true,
+   "synchronize_each_layer": true,
+   # regularization
+   "gradient_clipping": 1.0,
+   "weight_decay": 0.1,
+   "hidden_dropout": 0.0,
+   "attention_dropout": 0.0,
+   # precision settings
+   "fp16": {
+     "enabled": true,
+     "loss_scale": 0,
+     "loss_scale_window": 1000,
+     "hysteresis": 2,
+     "min_loss_scale": 1
+   },
+   # misc. training settings
+   "train_iters": 320000,
+   "lr_decay_iters": 320000,
+   "distributed_backend": "nccl",
+   "lr_decay_style": "cosine",
+   "warmup": 0.01,
+   "checkpoint_factor": 10000,
+   "eval_interval": 1000,
+   "eval_iters": 10,
+   # logging
+   "log_interval": 100,
+   "steps_per_print": 10,
+   "keep_last_n_checkpoints": 4,
+   "wall_clock_breakdown": true,
+  #  networking
+  "hostfile": "/mock_path"
+}

configs/13B.yml ADDED Viewed

	@@ -0,0 +1,94 @@

+# GPT-2 pretraining setup
+{
+   # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
+   # across the node boundaries )
+   "pipe_parallel_size": 1,
+   "model_parallel_size": 1,
+   # model settings
+   "num_layers": 40,
+   "hidden_size": 5120,
+   "num_attention_heads": 40,
+   "seq_length": 2048,
+   "max_position_embeddings": 2048,
+   "norm": "layernorm",
+   "pos_emb": "rotary",
+   "no_weight_tying": true,
+   "gpt_j_residual": false,
+   "output_layer_parallelism": "column",
+   # these should provide some speedup but takes a while to build, set to true if desired
+   "scaled_upper_triang_masked_softmax_fusion": false,
+   "bias_gelu_fusion": false,
+   "rope_fusion": false,
+   "layernorm_fusion": false,
+   # init methods
+   "init_method": "small_init",
+   "output_layer_init_method": "wang_init",
+   # optimizer settings
+   "optimizer": {
+     "type": "Adam",
+     "params": {
+       "lr": 0.0001,
+       "betas": [0.9, 0.95],
+       "eps": 1.0e-8,
+     }
+   },
+   # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
+   "zero_optimization": {
+    "stage": 1,
+    "allgather_partitions": True,
+    "allgather_bucket_size": 500000000,
+    "overlap_comm": True,
+    "reduce_scatter": True,
+    "reduce_bucket_size": 500000000,
+    "contiguous_gradients": True,
+  },
+   "min_lr": 0.00001,
+   # batch / data settings
+   "train_micro_batch_size_per_gpu": 4,
+   "data_impl": "mmap",
+   # activation checkpointing
+   "checkpoint_activations": true,
+   "checkpoint_num_layers": 1,
+   "partition_activations": true,
+   "synchronize_each_layer": true,
+   # regularization
+   "gradient_clipping": 1.0,
+   "weight_decay": 0.1,
+   "hidden_dropout": 0,
+   "attention_dropout": 0,
+   # precision settings
+   "fp16": {
+     "fp16": true,
+     "enabled": true,
+     "loss_scale": 0,
+     "loss_scale_window": 1000,
+     "hysteresis": 2,
+     "min_loss_scale": 1
+   },
+   # misc. training settings
+   "train_iters": 320000,
+   "lr_decay_iters": 320000,
+   "distributed_backend": "nccl",
+   "lr_decay_style": "cosine",
+   "warmup": 0.01,
+   "checkpoint_factor": 10000,
+   "eval_interval": 1000,
+   "eval_iters": 10,
+   # logging
+   "log_interval": 100,
+   "steps_per_print": 10,
+   "keep_last_n_checkpoints": 4,
+   "wall_clock_breakdown": true,
+}

configs/175B.yml ADDED Viewed

	@@ -0,0 +1,92 @@

+# GPT-2 pretraining setup
+{
+   # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
+   # across the node boundaries )
+   "pipe_parallel_size": 1,
+   "model_parallel_size": 1,
+   # model settings
+   "num_layers": 96,
+   "hidden_size": 12288,
+   "num_attention_heads": 96,
+   "seq_length": 2048,
+   "max_position_embeddings": 2048,
+   "norm": "layernorm",
+   "pos_emb": "rotary",
+   "no_weight_tying": true,
+   "gpt_j_residual": false,
+   "output_layer_parallelism": "column",
+   # these should provide some speedup but takes a while to build, set to true if desired
+   "scaled_upper_triang_masked_softmax_fusion": false,
+   "bias_gelu_fusion": false,
+   "rope_fusion": false,
+   "layernorm_fusion": false,
+   # init methods
+   "init_method": "small_init",
+   "output_layer_init_method": "wang_init",
+   # optimizer settings
+   "optimizer": {
+     "type": "Adam",
+     "params": {
+       "lr": 0.00006,
+       "betas": [0.9, 0.95],
+       "eps": 1.0e-8,
+     }
+   },
+   "min_lr": 0.000006,
+   # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
+   "zero_optimization": {
+    "stage": 1,
+    "allgather_partitions": True,
+    "allgather_bucket_size": 500000000,
+    "overlap_comm": True,
+    "reduce_scatter": True,
+    "reduce_bucket_size": 500000000,
+    "contiguous_gradients": True,
+  },
+   # batch / data settings
+   "train_micro_batch_size_per_gpu": 4,
+   "data_impl": "mmap",
+   # activation checkpointing
+   "checkpoint_activations": true,
+   "checkpoint_num_layers": 1,
+   "partition_activations": true,
+   "synchronize_each_layer": true,
+   # regularization
+   "gradient_clipping": 1.0,
+   "weight_decay": 0.1,
+   "hidden_dropout": 0,
+   "attention_dropout": 0,
+   # precision settings
+   "fp16": {
+     "fp16": true,
+     "enabled": true,
+     "loss_scale": 0,
+     "loss_scale_window": 1000,
+     "hysteresis": 2,
+     "min_loss_scale": 1
+   },
+   # misc. training settings
+   "train_iters": 320000,
+   "lr_decay_iters": 320000,
+   "distributed_backend": "nccl",
+   "lr_decay_style": "cosine",
+   "warmup": 0.01,
+   "checkpoint_factor": 10000,
+   "eval_interval": 1000,
+   "eval_iters": 10,
+   # logging
+   "log_interval": 100,
+   "steps_per_print": 10,
+   "keep_last_n_checkpoints": 4,
+   "wall_clock_breakdown": true,
+}

configs/19M.yml ADDED Viewed

	@@ -0,0 +1,97 @@

+{
+  "pipe_parallel_size": 1,
+  "model_parallel_size": 1,
+  # model settings
+  "num_layers": 6,
+  "hidden_size": 512,
+  "num_attention_heads": 8,
+  "seq_length": 2048,
+  "max_position_embeddings": 2048,
+  "pos_emb": "rotary",
+  "no_weight_tying": true,
+  "gpt_j_residual": false,
+  "output_layer_parallelism": "column",
+  "scaled_upper_triang_masked_softmax_fusion": false,
+  "bias_gelu_fusion": false,
+  "rope_fusion": false,
+  "layernorm_fusion": false,
+  # init methods
+  "init_method": "small_init",
+  "output_layer_init_method": "wang_init",
+  "optimizer": {
+    "type": "Adam",
+    "params": {
+      "lr": 0.001,
+      "betas": [0.9, 0.95],
+      "eps": 1.0e-8,
+    }
+  },
+  "min_lr": 0.0001,
+  # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
+  "zero_optimization": {
+    "stage": 1,
+    "allgather_partitions": True,
+    "allgather_bucket_size": 500000000,
+    "overlap_comm": True,
+    "reduce_scatter": True,
+    "reduce_bucket_size": 500000000,
+    "contiguous_gradients": True,
+  },
+  "train_micro_batch_size_per_gpu": 4, #32,
+  "gradient_accumulation_steps": 1,
+  "data_impl": "mmap",
+  "num_workers": 1,
+  # activation checkpointing
+  "checkpoint_activations": true,
+  "checkpoint_num_layers": 1,
+  "partition_activations": true,
+  "synchronize_each_layer": true,
+  # regularization
+  "gradient_clipping": 1.0,
+  "weight_decay": 0.1,
+  "hidden_dropout": 0,
+  "attention_dropout": 0,
+  # precision settings
+  "fp16": {
+    "fp16": true,
+    "enabled": true,
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "initial_scale_power": 12,
+    "hysteresis": 2,
+    "min_loss_scale": 1,
+  },
+  "train_iters": 143000,
+  "lr_decay_iters": 143000,
+  "distributed_backend": "nccl",
+  "lr_decay_style": "cosine",
+  "warmup": 0.01,
+  "checkpoint_factor": 1000,
+  "eval_interval": 100000,
+  "eval_iters": 10,
+  "log_interval": 10,
+  "steps_per_print": 10,
+  "wall_clock_breakdown": true,
+  # additional deepspeed args not specified above
+  "deepspeed_extra_args": {
+    "comms_logger": {
+        "enabled": true,
+        "verbose": true,
+        "prof_all": true,
+        "debug": false
+    },
+  }
+}

configs/2-7B.yml ADDED Viewed

	@@ -0,0 +1,93 @@

+# GPT-2 pretraining setup
+{
+   # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
+   # across the node boundaries )
+   "pipe_parallel_size": 1,
+   "model_parallel_size": 1,
+   # model settings
+   "num_layers": 32,
+   "hidden_size": 2560,
+   "num_attention_heads": 32,
+   "seq_length": 2048,
+   "max_position_embeddings": 2048,
+   "norm": "layernorm",
+   "pos_emb": "rotary",
+   "no_weight_tying": true,
+   "gpt_j_residual": false,
+   "output_layer_parallelism": "column",
+   # these should provide some speedup but takes a while to build, set to true if desired
+   "scaled_upper_triang_masked_softmax_fusion": false,
+   "bias_gelu_fusion": false,
+   "rope_fusion": false,
+   "layernorm_fusion": false,
+   # init methods
+   "init_method": "small_init",
+   "output_layer_init_method": "wang_init",
+   # optimizer settings
+   "optimizer": {
+     "type": "Adam",
+     "params": {
+       "lr": 0.00016,
+       "betas": [0.9, 0.95],
+       "eps": 1.0e-8,
+     }
+   },
+   "min_lr": 0.000016,
+   # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
+   "zero_optimization": {
+    "stage": 1,
+    "allgather_partitions": True,
+    "allgather_bucket_size": 500000000,
+    "overlap_comm": True,
+    "reduce_scatter": True,
+    "reduce_bucket_size": 500000000,
+    "contiguous_gradients": True,
+  },
+   # batch / data settings
+   "train_micro_batch_size_per_gpu": 4,
+   "data_impl": "mmap",
+   # activation checkpointing
+   "checkpoint_activations": true,
+   "checkpoint_num_layers": 1,
+   "partition_activations": true,
+   "synchronize_each_layer": true,
+   # regularization
+   "gradient_clipping": 1.0,
+   "weight_decay": 0.1,
+   "hidden_dropout": 0,
+   "attention_dropout": 0,
+   # precision settings
+   "fp16": {
+     "fp16": true,
+     "enabled": true,
+     "loss_scale": 0,
+     "loss_scale_window": 1000,
+     "hysteresis": 2,
+     "min_loss_scale": 1
+   },
+   # misc. training settings
+   "train_iters": 320000,
+   "lr_decay_iters": 320000,
+   "distributed_backend": "nccl",
+   "lr_decay_style": "cosine",
+   "warmup": 0.01,
+   "checkpoint_factor": 10000,
+   "eval_interval": 1000,
+   "eval_iters": 10,
+   # logging
+   "log_interval": 100,
+   "steps_per_print": 10,
+   "keep_last_n_checkpoints": 4,
+   "wall_clock_breakdown": true,
+}

configs/20B.yml ADDED Viewed

	@@ -0,0 +1,113 @@

+# DISCLAIMER: This is the configuration file for the GPT-NeoX-20B model as it was trained on 96x 40GB A100
+# GPUs. Depending on your system configuration, you may need to change some parameters in order to fit
+# the model in memory.
+{
+  # Tokenizer /  checkpoint settings - you will need to change these to the location you have them saved in
+  "vocab_file": "./20B_checkpoints/20B_tokenizer.json",
+  "save": "./20B_checkpoints",
+  "load": "./20B_checkpoints",
+  # If finetuning, edit the following to the location of your finetuning dataset:
+  "data_path": "./data/pile_20B_tokenizer/pile_20B_tokenizer_text_document",
+  # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
+  # across the node boundaries )
+  "pipe_parallel_size": 4,
+  "model_parallel_size": 2,
+  # model settings
+  "num_layers": 44,
+  "hidden_size": 6144,
+  "num_attention_heads": 64,
+  "seq_length": 2048,
+  "max_position_embeddings": 2048,
+  "norm": "layernorm",
+  "pos_emb": "rotary",
+  "rotary_pct": 0.25,
+  "no_weight_tying": true,
+  "gpt_j_residual": true,
+  "output_layer_parallelism": "column",
+  "scaled_upper_triang_masked_softmax_fusion": true,
+  "bias_gelu_fusion": true,
+  "rope_fusion": false,
+  "layernorm_fusion": false,
+  # init methods
+  "init_method": "small_init",
+  "output_layer_init_method": "wang_init",
+  # optimizer settings
+  "optimizer": {
+    "type": "Adam",
+    "params": {
+      "lr": 0.97e-4,
+      "betas": [0.9, 0.95],
+      "eps": 1.0e-8,
+      }
+      },
+  "min_lr": 0.97e-5,
+  # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
+  "zero_optimization": {
+  "stage": 1,
+  "allgather_partitions": True,
+  "allgather_bucket_size": 1260000000,
+  "overlap_comm": True,
+  "reduce_scatter": True,
+  "reduce_bucket_size": 1260000000,
+  "contiguous_gradients": True,
+  },
+  # batch / data settings (assuming 96 GPUs)
+  "train_micro_batch_size_per_gpu": 4,
+  "gradient_accumulation_steps": 32,
+  "data_impl": "mmap",
+  "split": "995,4,1",
+  # activation checkpointing
+  "checkpoint_activations": true,
+  "checkpoint_num_layers": 1,
+  "partition_activations": false,
+  "synchronize_each_layer": true,
+  # regularization
+  "gradient_clipping": 1.0,
+  "weight_decay": 0.01,
+  "hidden_dropout": 0,
+  "attention_dropout": 0,
+  # precision settings
+  "fp16": {
+    "fp16": true,
+    "enabled": true,
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "initial_scale_power": 12,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+    },
+  # misc. training settings
+  "train_iters": 150000,
+  "lr_decay_iters": 150000,
+  "distributed_backend": "nccl",
+  "lr_decay_style": "cosine",
+  "warmup": 0.01,
+  "checkpoint_factor": 500, # this variable previously called `save-interval`
+  "eval_interval": 1000,
+  "eval_iters": 10,
+  # logging
+  "log_interval": 2,
+  "steps_per_print": 2,
+  "wall_clock_breakdown": false,
+  ### NEW DATA: ####
+  "tokenizer_type": "HFTokenizer",
+  "tensorboard-dir": "./tensorboard",
+  "log_dir": "./logs",
+}

configs/350M.yml ADDED Viewed

	@@ -0,0 +1,92 @@

+# GPT-2 pretraining setup
+{
+   # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
+   # across the node boundaries )
+   "pipe_parallel_size": 1,
+   "model_parallel_size": 1,
+   # model settings
+   "num_layers": 24,
+   "hidden_size": 1024,
+   "num_attention_heads": 16,
+   "seq_length": 2048,
+   "max_position_embeddings": 2048,
+   "norm": "layernorm",
+   "pos_emb": "rotary",
+   "no_weight_tying": true,
+   "gpt_j_residual": false,
+   "output_layer_parallelism": "column",
+   # these should provide some speedup but takes a while to build, set to true if desired
+   "scaled_upper_triang_masked_softmax_fusion": false,
+   "bias_gelu_fusion": false,
+   "rope_fusion": false,
+   "layernorm_fusion": false,
+   # init methods
+   "init_method": "small_init",
+   "output_layer_init_method": "wang_init",
+   # optimizer settings
+   "optimizer": {
+     "type": "Adam",
+     "params": {
+       "lr": 0.0003,
+       "betas": [0.9, 0.95],
+       "eps": 1.0e-8,
+     }
+   },
+   "min_lr": 0.00003,
+   # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
+   "zero_optimization": {
+    "stage": 1,
+    "allgather_partitions": True,
+    "allgather_bucket_size": 500000000,
+    "overlap_comm": True,
+    "reduce_scatter": True,
+    "reduce_bucket_size": 500000000,
+    "contiguous_gradients": True,
+  },
+   # batch / data settings
+   "train_micro_batch_size_per_gpu": 4,
+   "data_impl": "mmap",
+   # activation checkpointing
+   "checkpoint_activations": true,
+   "checkpoint_num_layers": 1,
+   "partition_activations": true,
+   "synchronize_each_layer": true,
+   # regularization
+   "gradient_clipping": 1.0,
+   "weight_decay": 0.1,
+   "hidden_dropout": 0,
+   "attention_dropout": 0,
+   # precision settings
+   "fp16": {
+     "fp16": true,
+     "enabled": true,
+     "loss_scale": 0,
+     "loss_scale_window": 1000,
+     "hysteresis": 2,
+     "min_loss_scale": 1
+   },
+   # misc. training settings
+   "train_iters": 320000,
+   "lr_decay_iters": 320000,
+   "distributed_backend": "nccl",
+   "lr_decay_style": "cosine",
+   "warmup": 0.01,
+   "checkpoint_factor": 10000,
+   "eval_interval": 1000,
+   "eval_iters": 10,
+   # logging
+   "log_interval": 100,
+   "steps_per_print": 10,
+   "keep_last_n_checkpoints": 4,
+   "wall_clock_breakdown": true,
+}

configs/49M.yml ADDED Viewed

	@@ -0,0 +1,93 @@

+{
+  # parallelism settings
+  "pipe_parallel_size": 1,
+  "model_parallel_size": 1,
+  # model settings
+  "num_layers": 10,
+  "hidden_size": 640,
+  "num_attention_heads": 10,
+  "seq_length": 2048,
+  "max_position_embeddings": 2048,
+  "pos_emb": "rotary",
+  "rotary_pct": 0.25,
+  "no_weight_tying": true,
+  "gpt_j_residual": true,
+  "output_layer_parallelism": "column",
+  # these should provide some speedup but takes a while to build, set to true if desired
+  "scaled_upper_triang_masked_softmax_fusion": false,
+  "bias_gelu_fusion": false,
+  "rope_fusion": false,
+  "layernorm_fusion": false,
+  # init methods
+  "init_method": "small_init",
+  "output_layer_init_method": "wang_init",
+  # optimizer settings
+  "optimizer": {
+    "type": "Adam",
+    "params": {
+      "lr": 0.0008,
+      "betas": [0.9, 0.95],
+      "eps": 1.0e-8,
+    }
+  },
+  "min_lr": 0.00008,
+  # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
+  "zero_optimization": {
+    "stage": 1,
+    "allgather_partitions": True,
+    "allgather_bucket_size": 500000000,
+    "overlap_comm": True,
+    "reduce_scatter": True,
+    "reduce_bucket_size": 500000000,
+    "contiguous_gradients": True,
+  },
+  # batch / data settings
+  "train_micro_batch_size_per_gpu": 32,
+  "gradient_accumulation_steps": 1,
+  "data_impl": "mmap",
+  "num_workers": 1,
+  # activation checkpointing
+  "checkpoint_activations": true,
+  "checkpoint_num_layers": 1,
+  "partition_activations": true,
+  "synchronize_each_layer": true,
+  # regularization
+  "gradient_clipping": 1.0,
+  "weight_decay": 0.1,
+  "hidden_dropout": 0,
+  "attention_dropout": 0,
+  # precision settings
+  "fp16": {
+    "fp16": true,
+    "enabled": true,
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "initial_scale_power": 12,
+    "hysteresis": 2,
+    "min_loss_scale": 1,
+  },
+  # misc. training settings
+  "train_iters": 143000,
+  "lr_decay_iters": 143000,
+  "distributed_backend": "nccl",
+  "lr_decay_style": "cosine",
+  "warmup": 0.01,
+  "checkpoint_factor": 1000,
+  "eval_interval": 100000,
+  "eval_iters": 10,
+  # logging
+  "log_interval": 10,
+  "steps_per_print": 10,
+  "wall_clock_breakdown": true,
+}

configs/6-7B.yml ADDED Viewed

	@@ -0,0 +1,93 @@

+# GPT-2 pretraining setup
+{
+   # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
+   # across the node boundaries )
+   "pipe_parallel_size": 1,
+   "model_parallel_size": 1,
+   # model settings
+   "num_layers": 32,
+   "hidden_size": 4096,
+   "num_attention_heads": 32,
+   "seq_length": 2048,
+   "max_position_embeddings": 2048,
+   "norm": "layernorm",
+   "pos_emb": "rotary",
+   "no_weight_tying": true,
+   "gpt_j_residual": false,
+   "output_layer_parallelism": "column",
+   # these should provide some speedup but takes a while to build, set to true if desired
+   "scaled_upper_triang_masked_softmax_fusion": false,
+   "bias_gelu_fusion": false,
+   "rope_fusion": false,
+   "layernorm_fusion": false,
+   # init methods
+   "init_method": "small_init",
+   "output_layer_init_method": "wang_init",
+   # optimizer settings
+   "optimizer": {
+     "type": "Adam",
+     "params": {
+       "lr": 0.00012,
+       "betas": [0.9, 0.95],
+       "eps": 1.0e-8,
+     }
+   },
+   # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
+   "zero_optimization": {
+    "stage": 1,
+    "allgather_partitions": True,
+    "allgather_bucket_size": 500000000,
+    "overlap_comm": True,
+    "reduce_scatter": True,
+    "reduce_bucket_size": 500000000,
+    "contiguous_gradients": True,
+  },
+   "min_lr": 0.000012,
+   # batch / data settings
+   "train_micro_batch_size_per_gpu": 4,
+   "data_impl": "mmap",
+   # activation checkpointing
+   "checkpoint_activations": true,
+   "checkpoint_num_layers": 1,
+   "partition_activations": true,
+   "synchronize_each_layer": true,
+   # regularization
+   "gradient_clipping": 1.0,
+   "weight_decay": 0.1,
+   "hidden_dropout": 0,
+   "attention_dropout": 0,
+   # precision settings
+   "fp16": {
+     "fp16": true,
+     "enabled": true,
+     "loss_scale": 0,
+     "loss_scale_window": 1000,
+     "hysteresis": 2,
+     "min_loss_scale": 1
+   },
+   # misc. training settings
+   "train_iters": 320000,
+   "lr_decay_iters": 320000,
+   "distributed_backend": "nccl",
+   "lr_decay_style": "cosine",
+   "warmup": 0.01,
+   "checkpoint_factor": 10000,
+   "eval_interval": 1000,
+   "eval_iters": 10,
+   # logging
+   "log_interval": 100,
+   "steps_per_print": 10,
+   "keep_last_n_checkpoints": 4,
+   "wall_clock_breakdown": true,
+}

configs/760M.yml ADDED Viewed

	@@ -0,0 +1,93 @@

+# GPT-2 pretraining setup
+{
+   # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
+   # across the node boundaries )
+   "pipe_parallel_size": 1,
+   "model_parallel_size": 1,
+   # model settings
+   "num_layers": 24,
+   "hidden_size": 1536,
+   "num_attention_heads": 16,
+   "seq_length": 2048,
+   "max_position_embeddings": 2048,
+   "norm": "layernorm",
+   "pos_emb": "rotary",
+   "no_weight_tying": true,
+   "gpt_j_residual": false,
+   "output_layer_parallelism": "column",
+   # these should provide some speedup but takes a while to build, set to true if desired
+   "scaled_upper_triang_masked_softmax_fusion": false,
+   "bias_gelu_fusion": false,
+   "rope_fusion": false,
+   "layernorm_fusion": false,
+   # init methods
+   "init_method": "small_init",
+   "output_layer_init_method": "wang_init",
+   # optimizer settings
+   "optimizer": {
+     "type": "Adam",
+     "params": {
+       "lr": 0.00025,
+       "betas": [0.9, 0.999],
+       "eps": 1.0e-8,
+     }
+   },
+   "min_lr": 0.000025,
+   # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
+   "zero_optimization": {
+    "stage": 1,
+    "allgather_partitions": True,
+    "allgather_bucket_size": 500000000,
+    "overlap_comm": True,
+    "reduce_scatter": True,
+    "reduce_bucket_size": 500000000,
+    "contiguous_gradients": True,
+  },
+   # batch / data settings
+   "train_micro_batch_size_per_gpu": 4,
+   "data_impl": "mmap",
+   # activation checkpointing
+   "checkpoint_activations": true,
+   "checkpoint_num_layers": 1,
+   "partition_activations": true,
+   "synchronize_each_layer": true,
+   # regularization
+   "gradient_clipping": 1.0,
+   "weight_decay": 0.1,
+   "hidden_dropout": 0,
+   "attention_dropout": 0,
+   # precision settings
+   "fp16": {
+     "fp16": true,
+     "enabled": true,
+     "loss_scale": 0,
+     "loss_scale_window": 1000,
+     "hysteresis": 2,
+     "min_loss_scale": 1
+   },
+   # misc. training settings
+   "train_iters": 320000,
+   "lr_decay_iters": 320000,
+   "distributed_backend": "nccl",
+   "lr_decay_style": "cosine",
+   "warmup": 0.01,
+   "checkpoint_factor": 10000,
+   "eval_interval": 1000,
+   "eval_iters": 10,
+   # logging
+   "log_interval": 100,
+   "steps_per_print": 10,
+   "keep_last_n_checkpoints": 4,
+   "wall_clock_breakdown": true,
+}

configs/800M.yml ADDED Viewed

	@@ -0,0 +1,86 @@

+{
+  "pipe_parallel_size": 1,
+  "model_parallel_size": 1,
+  # model settings
+  "num_layers": 16,
+  "hidden_size": 2048,
+  "num_attention_heads": 8,
+  "seq_length": 2048,
+  "max_position_embeddings": 2048,
+  "pos_emb": "rotary",
+  "no_weight_tying": true,
+  "gpt_j_residual": false,
+  "output_layer_parallelism": "column",
+  "scaled_upper_triang_masked_softmax_fusion": false,
+  "bias_gelu_fusion": false,
+  "rope_fusion": false,
+  "layernorm_fusion": false,
+  # init methods
+  "init_method": "small_init",
+  "output_layer_init_method": "wang_init",
+  "optimizer": {
+    "type": "Adam",
+    "params": {
+      "lr": 0.00025,
+      "betas": [0.9, 0.95],
+      "eps": 1.0e-8,
+    }
+  },
+  "min_lr": 0.000025,
+  # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
+  "zero_optimization": {
+    "stage": 1,
+    "allgather_partitions": True,
+    "allgather_bucket_size": 500000000,
+    "overlap_comm": True,
+    "reduce_scatter": True,
+    "reduce_bucket_size": 500000000,
+    "contiguous_gradients": True,
+  },
+  "train_micro_batch_size_per_gpu": 16,
+  "gradient_accumulation_steps": 1,
+  "data_impl": "mmap",
+  "num_workers": 1,
+  # activation checkpointing
+  "checkpoint_activations": true,
+  "checkpoint_num_layers": 1,
+  "partition_activations": true,
+  "synchronize_each_layer": true,
+  # regularization
+  "gradient_clipping": 1.0,
+  "weight_decay": 0.1,
+  "hidden_dropout": 0,
+  "attention_dropout": 0,
+  # precision settings
+  "fp16": {
+    "fp16": true,
+    "enabled": true,
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "initial_scale_power": 12,
+    "hysteresis": 2,
+    "min_loss_scale": 1,
+  },
+  "train_iters": 143000,
+  "lr_decay_iters": 143000,
+  "distributed_backend": "nccl",
+  "lr_decay_style": "cosine",
+  "warmup": 0.01,
+  "checkpoint_factor": 1000,
+  "eval_interval": 40000,
+  "eval_iters": 10,
+  "log_interval": 10,
+  "steps_per_print": 10,
+  "wall_clock_breakdown": true,
+}

configs/README.md ADDED Viewed

	@@ -0,0 +1,368 @@

+# Configuration and parameters
+GPT-NeoX parameters are defined in a YAML configuration file which is passed to the `deepy.py` launcher - for examples see the files contained in this folder.
+Parameters originate from either the [DeepSpeed runner CLI (DSL)](https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/launcher/runner.py#L33), [DeepSpeed configuration file (DSC)](https://www.deepspeed.ai/docs/config-json/), [Megatron-LM CLI (Meg)](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/arguments.py#L224) or are GPT-NeoX (NeoX) modifications.
+## Example Configuration (GPT3 Small):
+Below is an example configuration `.yaml` to train a ~160M parameter GPT model. This readme will go through each section in the configuration and the options available.
+For a detailed list of all the arguments available for neox, see [neox_arguments.md](neox_arguments.md)
+Note: yaml arguments may be formatted with either '-' or '\_'. The standard separator used is a '\_' as shown in the example configurations below. However, the use of '-' as a separator may be deprecated in the future.
+```yaml
+# GPT-3 pretraining setup
+{
+   # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
+   # across the node boundaries )
+   "pipe_parallel_size": 1,
+   "model_parallel_size": 1,
+   # model settings
+   "num_layers": 12,
+   "hidden_size": 768,
+   "num_attention_heads": 12,
+   "seq_length": 2048,
+   "max_position_embeddings": 2048,
+   "norm": "rmsnorm",
+   "pos_emb": "none",
+   "no_weight_tying": true,
+    # this should provide some speedup but takes a while to build, set to true if desired
+   "scaled_upper_triang_masked_softmax_fusion": false,
+   "train_iters": 320000,
+   # optimizer settings
+   "optimizer": {
+     "type": "Adam",
+     "params": {
+       "lr": 0.0006,
+       "max_grad_norm": 1.0,
+       "betas": [0.9, 0.95]
+     }
+   },
+   # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
+   "zero_optimization": {
+    "stage": 0,
+    "allgather_partitions": True,
+    "allgather_bucket_size": 500000000,
+    "overlap_comm": True,
+    "reduce_scatter": True,
+    "reduce_bucket_size": 500000000,
+    "contiguous_gradients": True,
+  },
+   # batch / data settings
+   "train_micro_batch_size_per_gpu": 4,
+   "gradient_accumulation_steps": 1,
+   "data_impl": "mmap",
+   "split": "949,50,1",
+   # activation checkpointing
+   "checkpoint_activations": true,
+   "checkpoint_num_layers": 1,
+   "partition_activations": true,
+   "synchronize_each_layer": true,
+   # regularization
+   "gradient_clipping": 1.0,
+   "weight_decay": 0,
+   "hidden_dropout": 0,
+   "attention_dropout": 0,
+   # precision settings
+   "fp16": {
+     "enabled": true,
+     "loss_scale": 0,
+     "loss_scale_window": 1000,
+     "hysteresis": 2,
+     "min_loss_scale": 1
+   },
+   # lr decay settings
+   "lr_decay_iters": 320000,
+   "lr_decay_style": "cosine",
+   "warmup": 0.01,
+   # misc. training settings
+   "distributed_backend": "nccl",
+   "checkpoint_factor": 10000,
+   "eval_interval": 1000,
+   "eval_iters": 10,
+   # logging
+   "log_interval": 100,
+   "steps_per_print": 10,
+   "keep_last_n_checkpoints": 4,
+   "wall_clock_breakdown": true,
+}
+```
+### Parallelism Settings:
+The parallelism settings are left at 1 in all configs, as the settings you want will be highly dependent on your compute setup and network topology.
+We have found it best to do model parallelism within a node, and schedule pipeline stages across node boundaries.
+```yaml
+   "pipe_parallel_size": 1,
+   "model_parallel_size": 1,
+```
+These can be set to any integer between `0` and `num_gpus`, and `num_gpus` must be divisible by `pipe_parallel_size` * `model_parallel_size`.
+### Model Settings:
+```yaml
+   # model settings
+   "num_layers": 12,
+   "hidden_size": 768,
+   "num_attention_heads": 12,
+   "seq_length": 2048,
+   "max_position_embeddings": 2048,
+   "norm": "rmsnorm",
+   "pos_emb": "none",
+   "no_weight_tying": true,
+    # this should provide some speedup but takes a while to build, set to true if desired
+   "scaled_upper_triang_masked_softmax_fusion": false,
+   "train_iters": 320000,
+    # alternatively, use train_epochs to automatically determine the number of training iterations
+    #"train_epochs": 1,
+```
+An example of some basic settings used to configure your model's architecture and number of training steps.
+### Optimizer Settings:
+Our optimizer configuration has a similar syntax to deepspeed's. Different optimizers will have different arguments for "params".
+Learning rate should be configured from here using the `"lr"` field of `optimizer["params"]`.
+```yaml
+  # optimizer settings
+   "optimizer": {
+     "type": "Adam",
+     "params": {
+       "lr": 0.0006,
+       "max_grad_norm": 1.0,
+       "betas": [0.9, 0.95]
+     }
+   }
+   ```
+Available optimizer types are:
+- `"Adam"`: regular Adam optimizer
+- `"OneBitAdam"`: Deepspeed's [OneBitAdam optimizer](https://www.deepspeed.ai/docs/config-json/#optimizer-parameters). To use 1-bit adam, you'll also need to add the `freeze_step`, `cuda_aware`, and `comm_backend_name` fields, like so:
+```yaml
+   "optimizer": {
+     "type": "OneBitAdam",
+     "params": {
+       "lr": 0.0001,
+       "freeze_step": 23000,
+       "betas": [0.9, 0.95],
+       "cuda_aware": false,
+       "comm_backend_name": "nccl"
+     }
+   }
+```
+- `"CPU_Adam"`/`"CPU_torch_adam"`: Adam optimizer on CPU. Either megatron's version ("CPU_Adam") or torch's ("CPU_torch_adam")
+- `"SM3"`: SM3 or [Memory adaptive efficient optimization optimizer](https://arxiv.org/pdf/1901.11150.pdf). We have found this doesn't work well with fp16 training.
+- `"madgrad_wd"`: MADGRAD, or [A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic Optimization](https://arxiv.org/abs/2101.11075). Weight decay has been implemented AdamW style instead of the original madgrad Adam style.
+### ZeRO Optimization:
+```yaml
+# for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
+  "zero_optimization": {
+        "stage": 0,
+        "allgather_partitions": True,
+        "allgather_bucket_size": 500000000,
+        "overlap_comm": True,
+        "reduce_scatter": True,
+        "reduce_bucket_size": 500000000,
+        "contiguous_gradients": True,
+  },
+  "zero_allow_untested_optimizer": false,
+```
+ZeRO optimization in NeoX is currently configured identically to how deepspeed configures it, please see [the deepspeed docs](https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training) for more information.
+If you want to combine an optimizer untested by DeepSpeed with ZeRO (i.e., not ADAM or LAMB), you must pass `"zero_allow_untested_optimizer": true` *outside* of the `"zero_optimization"` dictionary (see above).
+N.B - ZeRO stages 2+ are incompatible with pipeline parallelism. Please set `"pipe-parallel-size"` to 0 if you want to use ZeRO stage 2 or more.
+### Batch Size Settings:
+```yaml
+   # batch / data settings
+   "train_micro_batch_size_per_gpu": 4,
+   "gradient_accumulation_steps": 1,
+```
+Our global batch size configuration follows deepspeed's and can be configured in a number of ways. At least one of `"train_batch_size"` and `"train_micro_batch_size_per_gpu"` must be provided.
+- `"train_batch_size"`: The effective training batch size. This is the amount of data samples that leads to one step of model update. train_batch_size is aggregated by the batch size that a single GPU processes in one forward/backward pass (a.k.a., train_step_batch_size), the gradient accumulation steps (a.k.a., gradient_accumulation_steps), and the number of GPUs.
+- `"train_micro_batch_size_per_gpu"`: Batch size to be processed by one GPU in one step (without gradient accumulation). When specified, `gradient_accumulation_steps` is automatically calculated using train_batch_size and number of GPUs.
+- `"gradient_accumulation_steps"`: Number of training steps to accumulate gradients before averaging and applying them. This feature is sometimes useful to improve scalability since it results in less frequent communication of gradients between steps. Another impact of this feature is the ability to train with larger batch sizes per GPU. When specified, train_step_batch_size is automatically calculated using train_batch_size and number of GPUs.
+### Extra DeepSpeed Settings
+```yaml
+# additional deepspeed args not specified above
+"deepspeed_extra_args": {
+    "comms_logger": {
+        "enabled": true,
+        "verbose": true,
+        "prof_all": true,
+        "debug": false
+    },
+}
+```
+Additional DeepSpeed settings besides those mentioned above should be wrapped in the `"deepspeed_extra_args"` argument, as in the example above. This functionality is designed to allow arguments not specified by existing dataclasses to be passed to DeepSpeed (e.g. when new functionalities are implemented). If any settings are duplicated here from elsewhere in the YAML, the system will throw an exception and notify the user.
+### Dataset / Tokenizer / Checkpoint / Logging Settings:
+```yaml
+   "data_impl": "mmap",
+   "split": "949,50,1",
+   # Suggested data paths when using GPT-NeoX locally
+   "data_path": "data/enwik8/enwik8_text_document",
+   #"train_data_path": "data/enwik8/enwik8_text_document",
+   #"test_data_path": "data/enwik8/enwik8_text_document",
+   #"valid_data_path": "data/enwik8/enwik8_text_document",
+   "vocab_file": "data/gpt2-vocab.json",
+   "merge_file": "data/gpt2-merges.txt",
+   "save": "checkpoints",
+   "load": "checkpoints",
+   "tensorboard_dir": "tensorboard",
+   "log_dir": "logs",
+   "checkpoint_factor": 10000,
+   "eval_interval": 1000,
+   "eval_iters": 10,
+```
+For KTO style training, you'll need to add the reward & label data path, e.g.:
+```yaml
+   "data_impl": "mmap",
+   # Suggested data paths when using GPT-NeoX locally
+   "train_data_path": "data/enwik8/enwik8_text_document",
+   "train_label_data_path": "data/enwik8/enwik8_text_label_document",
+   "train_reward_data_path": "data/enwik8/enwik8_text_reward_document",
+   "test_data_path": "data/enwik8/enwik8_text_document",
+   "test_label_data_path": "data/enwik8/enwik8_text_label_document",
+   "test_reward_data_path": "data/enwik8/enwik8_text_reward_document",
+   "valid_data_path": "data/enwik8/enwik8_text_document",
+   "valid_label_data_path": "data/enwik8/enwik8_text_label_document",
+   "valid_reward_data_path": "data/enwik8/enwik8_text_reward_document",
+   "vocab_file": "data/gpt2-vocab.json",
+   "merge_file": "data/gpt2-merges.txt",
+   "save": "checkpoints",
+   "load": "checkpoints",
+   "tensorboard_dir": "tensorboard",
+   "log_dir": "logs",
+   "checkpoint_factor": 10000,
+   "eval_interval": 1000,
+   "eval_iters": 10,
+```
+For DPO style training, you'll need to set pos/neg data paths instead of a single one, e.g.
+```yaml
+   "dataset_impl": "pairwise",
+   "train_impl": "dpo",
+   "pack_impl": "unpacked",
+   "dpo_beta": 0.1,
+   "dpo_fp32": true,
+   "pos_train_data_path": "data/enwik8/enwik8_text_pos_document",
+   "pos_valid_data_path": "data/enwik8/enwik8_text_pos_document",
+   "pos_test_data_path": "data/enwik8/enwik8_text_pos_document",
+   "neg_train_data_path": "data/enwik8/enwik8_text_neg_document",
+   "neg_valid_data_path": "data/enwik8/enwik8_text_neg_document",
+   "neg_test_data_path": "data/enwik8/enwik8_text_neg_document",
+   ## If you have labels... (likely to mask out user turns)
+   "pos_train_label_data_path": "data/enwik8/enwik8_text_pos_label_document",
+   "pos_valid_label_data_path": "data/enwik8/enwik8_text_pos_label_document",
+   "pos_test_label_data_path": "data/enwik8/enwik8_text_pos_label_document",
+   "neg_train_label_data_path": "data/enwik8/enwik8_text_neg_label_document",
+   "neg_valid_label_data_path": "data/enwik8/enwik8_text_neg_label_document",
+   "neg_test_label_data_path": "data/enwik8/enwik8_text_neg_label_document",
+   ## If you want to precompute the logits over your dataset...
+   "precompute_model_name": "gpt2",
+   ## Needed for the generation.py step, if precomputing
+   "text_gen_type": "precompute"
+```
+### LR Scheduler settings
+```yaml
+   "lr_decay_iters": 320000,
+   "lr_decay_style": "cosine",
+   "warmup": 0.01,
+```
+Settings used to modify the learning rate over time.
+N.B - `OneBitAdam` requires you to use deepspeed's internal lr scheduler because reasons. Currently the lr decay style defaults to deepspeed's `WarmupDecay`.
+### Activation Checkpointing Settings:
+```yaml
+   "checkpoint_activations": true,
+   "checkpoint_num_layers": 1,
+   "partition_activations": true,
+   "synchronize_each_layer": true,
+```
+Checkpointing works by trading compute for memory. Rather than storing all intermediate activations of the entire computation graph for computing backward, the checkpointed part does not save intermediate activations, and instead recomputes them in backward pass.
+### Mixed Precision Training Settings:
+gpt-neox's fp16 training is configured identically to DeepSpeed's, please see [their documentation](https://www.deepspeed.ai/docs/config-json/#fp16-training-options) for more information.
+An example config for fp16 training:
+```yaml
+   "fp16": {
+     "enabled": true,
+     "loss_scale": 0,
+     "loss_scale_window": 1000,
+     "hysteresis": 2,
+     "min_loss_scale": 1
+   },
+```
+Alternatively you can use the `precision` config which can be set to `fp16`, `bfloat16`, or `fp32`. If you set `"precision": "fp16"` without adding a `"fp16": {...}` dict, then it will simply use DeepSpeed's defaults for fp16 training.
+### SLURM Settings
+If you are running GPT-NeoX on a SLURM cluster and wish to use SLURM to coordinate nodes, then you must set the following variables in your config:
+```yaml
+    "launcher": "slurm",
+    "deepspeed_slurm": true
+```
+Additionally, you need to modify _all_ of your configs to conform to JSON. When launching a GPT-NeoX job you can specify multiple YAML config files. Internally, all of these files are merged into one config and then passed as a single long command line argument to Deep(er)Speed. When using SLURM and its internal command `srun`, python fails to parse this long command line argument unless it is in the more restrictive JSON format. In practice, the example NeoX configs are already very close to JSON. As an example, this is a snippet of a YAML-compatible config, N.B. the comment and the capital-F `False`:
+```yaml
+    # optimizer settings
+   "optimizer": {
+     "type": "OneBitAdam",
+     "params": {
+       "lr": 0.0001,
+       "freeze_step": 23000,
+       "betas": [0.9, 0.95],
+       "cuda_aware": False,
+       "comm_backend_name": "nccl"
+     }
+```
+To make this JSON just remove the comment and use all lowercase for the boolean:
+```yaml
+   "optimizer": {
+     "type": "OneBitAdam",
+     "params": {
+       "lr": 0.0001,
+       "freeze_step": 23000,
+       "betas": [0.9, 0.95],
+       "cuda_aware": false,
+       "comm_backend_name": "nccl"
+     }
+```

configs/autotuning_configs/small_tune.json ADDED Viewed

	@@ -0,0 +1,78 @@

+{
+   "pipe-parallel-size": 1,
+   "model-parallel-size": 1,
+   "num-layers": 12,
+   "hidden-size": 768,
+   "num-attention-heads": 12,
+   "seq-length": 2048,
+   "max-position-embeddings": 2048,
+   "norm": "layernorm",
+   "pos-emb": "rotary",
+   "no-weight-tying": true,
+   "scaled-upper-triang-masked-softmax-fusion": false,
+   "bias-gelu-fusion": false,
+   "optimizer": {
+     "type": "Adam",
+     "params": {
+       "lr": 0.0006,
+       "betas": [0.9, 0.999],
+       "eps": 1.0e-8
+     }
+   },
+   "train_micro_batch_size_per_gpu": 1,
+   "data-impl": "mmap",
+   "split": "949,50,1",
+   "checkpoint-activations": true,
+   "checkpoint-num-layers": 1,
+   "partition-activations": true,
+   "synchronize-each-layer": true,
+   "gradient_clipping": 1.0,
+   "weight-decay": 0.0,
+   "hidden-dropout": 0.0,
+   "attention-dropout": 0.0,
+   "fp16": {
+     "enabled": true,
+     "loss_scale": 0,
+     "loss_scale_window": 1000,
+     "hysteresis": 2,
+     "min_loss_scale": 1
+   },
+   "train-iters": 320000,
+   "lr-decay-iters": 320000,
+   "distributed-backend": "nccl",
+   "lr-decay-style": "cosine",
+   "warmup": 0.01,
+   "save-interval": 10000,
+   "eval-interval": 1000,
+   "eval-iters": 10,
+   "log-interval": 100,
+   "steps_per_print": 10,
+   "keep-last-n-checkpoints": 4,
+   "wall_clock_breakdown": true,
+   "launcher": "slurm",
+   "deepspeed_slurm": true,
+   "comment": "neox",
+   "autotuning": {
+       "enabled": true,
+       "arg_mappings": {
+       "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu",
+       "gradient_accumulation_steps ": "--gradient_accumulation_steps"
+     }
+   },
+   "zero_optimization": {
+      "stage": [0, 1, 2, 3]
+   },
+  "train-data-paths": ["/fsx/pile_deduped/pile_0.87_deduped_text_document"],
+  "valid-data-paths": ["/fsx/pile_deduped/pile_0.87_deduped_text_document"],
+  "test-data-paths": ["/fsx/pile_deduped/pile_0.87_deduped_text_document"]
+}

configs/autotuning_configs/tune.json ADDED Viewed

	@@ -0,0 +1,72 @@

+{
+   "pipe-parallel-size": 1,
+   "model-parallel-size": 1,
+   "num-layers": 12,
+   "hidden-size": 768,
+   "num-attention-heads": 12,
+   "seq-length": 2048,
+   "max-position-embeddings": 2048,
+   "norm": "layernorm",
+   "pos-emb": "rotary",
+   "no-weight-tying": true,
+   "scaled-upper-triang-masked-softmax-fusion": true,
+   "bias-gelu-fusion": true,
+   "optimizer": {
+     "type": "Adam",
+     "params": {
+       "lr": 0.0006,
+       "betas": [0.9, 0.999],
+       "eps": 1.0e-8
+     }
+   },
+   "zero_optimization": {
+    "stage": 0,
+    "allgather_partitions": true,
+    "allgather_bucket_size": 500000000,
+    "overlap_comm": true,
+    "reduce_scatter": true,
+    "reduce_bucket_size": 500000000,
+    "contiguous_gradients": true,
+    "cpu_offload": false
+  },
+   "train_micro_batch_size_per_gpu": 1,
+   "autotuning_config": {
+     "enabled": true,
+     "arg_mappings": {
+       "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu",
+       "gradient_accumulation_steps ": "--gradient_accumulation_steps"
+     }
+   },
+   "data-impl": "mmap",
+   "split": "949,50,1",
+   "checkpoint-activations": true,
+   "checkpoint-num-layers": 1,
+   "partition-activations": true,
+   "synchronize-each-layer": true,
+   "gradient_clipping": 1.0,
+   "weight-decay": 0.0,
+   "hidden-dropout": 0.0,
+   "attention-dropout": 0.0,
+   "fp16": {
+     "enabled": true,
+     "loss_scale": 0,
+     "loss_scale_window": 1000,
+     "hysteresis": 2,
+     "min_loss_scale": 1
+   },
+   "train-iters": 200,
+   "lr-decay-iters": 320000,
+   "distributed-backend": "nccl",
+   "lr-decay-style": "cosine",
+   "warmup": 0.01,
+   "save-interval": 10000,
+   "eval-interval": 1000,
+   "eval-iters": 10,
+   "log-interval": 100,
+   "steps_per_print": 10,
+   "keep-last-n-checkpoints": 4,
+   "wall_clock_breakdown": true,
+   "launcher": "slurm",
+   "deepspeed_slurm": true,
+   "comment": "neox"
+}

configs/autotuning_configs/tune_1-3B.json ADDED Viewed

	@@ -0,0 +1,86 @@

+{
+   "pipe-parallel-size": 1,
+   "model-parallel-size": 1,
+   "num-layers": 24,
+   "hidden-size": 2048,
+   "num-attention-heads": 16,
+   "seq-length": 2048,
+   "max-position-embeddings": 2048,
+   "norm": "layernorm",
+   "pos-emb": "rotary",
+   "no-weight-tying": true,
+   "gpt_j_residual": false,
+   "output_layer_parallelism": "column",
+   "attention_config": [[["flash"], 24]],
+   "scaled-upper-triang-masked-softmax-fusion": false,
+   "bias-gelu-fusion": false,
+   "init_method": "small_init",
+   "output_layer_init_method": "wang_init",
+   "optimizer": {
+     "type": "Adam",
+     "params": {
+       "lr": 0.0002,
+       "betas": [0.9, 0.95],
+       "eps":  1.0e-8
+     }
+   },
+   "min_lr": 0.00002,
+   "zero_optimization": {
+    "stage": 1,
+    "allgather_partitions": true,
+    "allgather_bucket_size": 500000000,
+    "overlap_comm": true,
+    "reduce_scatter": true,
+    "reduce_bucket_size": 500000000,
+    "contiguous_gradients": true
+  },
+  "train_micro_batch_size_per_gpu": 1,
+   "autotuning": {
+     "enabled": true,
+     "arg_mappings": {
+       "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu",
+       "gradient_accumulation_steps ": "--gradient_accumulation_steps"
+     }
+   },
+   "data-impl": "mmap",
+   "checkpoint-activations": false,
+   "checkpoint-num-layers": 1,
+   "partition-activations": true,
+   "synchronize-each-layer": true,
+   "gradient_clipping": 1.0,
+   "weight-decay": 0.1,
+   "hidden-dropout": 0,
+   "attention-dropout": 0,
+   "fp16": {
+     "fp16": true,
+     "enabled": true,
+     "loss_scale": 0,
+     "loss_scale_window": 1000,
+     "hysteresis": 2,
+     "min_loss_scale": 1
+   },
+   "train-iters": 320000,
+   "lr-decay-iters": 320000,
+   "distributed-backend": "nccl",
+   "lr-decay-style": "cosine",
+   "warmup": 0.01,
+   "checkpoint-factor": 10000,
+   "eval-interval": 1000,
+   "eval-iters": 10,
+   "launcher": "slurm",
+   "deepspeed_slurm": true,
+   "no_ssh_check": true,
+   "log-interval": 10,
+   "steps_per_print": 10,
+   "keep-last-n-checkpoints": 1,
+   "wall_clock_breakdown": true
+}

configs/autotuning_configs/tune_6-7B.json ADDED Viewed

	@@ -0,0 +1,77 @@

+{
+   "pipe-parallel-size": 1,
+   "model-parallel-size": 8,
+   "num-layers": 32,
+   "hidden-size": 4096,
+   "num-attention-heads": 32,
+   "seq-length": 2048,
+   "max-position-embeddings": 2048,
+   "norm": "layernorm",
+   "pos-emb": "rotary",
+   "no-weight-tying": true,
+   "scaled-upper-triang-masked-softmax-fusion": false,
+   "bias-gelu-fusion": false,
+   "optimizer": {
+     "type": "Adam",
+     "params": {
+       "lr": 0.00012,
+       "betas": [0.9, 0.999],
+       "eps": 1.0e-8
+     }
+   },
+   "train_micro_batch_size_per_gpu": 1,
+   "zero_optimization": {
+      "stage": [0, 1, 2, 3]
+   },
+   "data-impl": "mmap",
+   "split": "949,50,1",
+   "checkpoint-activations": true,
+   "checkpoint-num-layers": 1,
+   "partition-activations": true,
+   "synchronize-each-layer": true,
+   "gradient_clipping": 1.0,
+   "weight-decay": 0,
+   "hidden-dropout": 0,
+   "attention-dropout": 0,
+   "fp16": {
+     "fp16": true,
+     "enabled": true,
+     "loss_scale": 0,
+     "loss_scale_window": 1000,
+     "hysteresis": 2,
+     "min_loss_scale": 1
+   },
+   "train-iters": 100,
+   "lr-decay-iters": 320000,
+   "distributed-backend": "nccl",
+   "lr-decay-style": "cosine",
+   "warmup": 0.01,
+   "checkpoint-factor": 10000,
+   "eval-interval": 1000,
+   "eval-iters": 10,
+   "log-interval": 100,
+   "steps_per_print": 10,
+   "keep-last-n-checkpoints": 4,
+   "wall_clock_breakdown": true,
+   "launcher": "slurm",
+   "deepspeed_slurm": true,
+   "no_ssh_check": true,
+   "comment": "neox",
+   "autotuning": {
+       "enabled": true,
+       "mp_size": 8,
+       "arg_mappings": {
+       "train_micro_batch_size_per_gpu": "--train_micro_batch_size_per_gpu",
+       "gradient_accumulation_steps ": "--gradient_accumulation_steps"
+     }
+   }
+}

configs/bf16_125M.yml ADDED Viewed

	@@ -0,0 +1,80 @@

+# GPT-2 pretraining setup
+{
+   # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
+   # across the node boundaries )
+   "pipe_parallel_size": 1,
+   "model_parallel_size": 1,
+   # model settings
+   "num_layers": 12,
+   "hidden_size": 768,
+   "num_attention_heads": 12,
+   "seq_length": 2048,
+   "max_position_embeddings": 2048,
+   "norm": "layernorm",
+   "pos_emb": "rotary",
+   "no_weight_tying": true,
+   # these should provide some speedup but takes a while to build, set to true if desired
+   "scaled_upper_triang_masked_softmax_fusion": false,
+   "bias_gelu_fusion": false,
+   "rope_fusion": false,
+   "layernorm_fusion": false,
+   # optimizer settings
+   "optimizer": {
+     "type": "Adam",
+     "params": {
+       "lr": 0.0006,
+       "betas": [0.9, 0.999],
+       "eps": 1.0e-8,
+     }
+   },
+   # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
+   "zero_optimization": {
+    "stage": 0,
+    "allgather_partitions": True,
+    "allgather_bucket_size": 500000000,
+    "overlap_comm": True,
+    "reduce_scatter": True,
+    "reduce_bucket_size": 500000000,
+    "contiguous_gradients": True,
+  },
+   # batch / data settings
+   "train_micro_batch_size_per_gpu": 4,
+   "data_impl": "mmap",
+   "split": "949,50,1",
+   # activation checkpointing
+   "checkpoint_activations": true,
+   "checkpoint_num_layers": 1,
+   "partition_activations": true,
+   "synchronize_each_layer": true,
+   # regularization
+   "gradient_clipping": 1.0,
+   "weight_decay": 0.0,
+   "hidden_dropout": 0.0,
+   "attention_dropout": 0.0,
+   "precision": "bfloat16",
+   "fp32_allreduce": True, # without a patch to torch, bf16 models have to do the allreduce in fp32
+   # misc. training settings
+   "train_iters": 320000,
+   "lr_decay_iters": 320000,
+   "distributed_backend": "nccl",
+   "lr_decay_style": "cosine",
+   "warmup": 0.01,
+   "checkpoint_factor": 10000,
+   "eval_interval": 1000,
+   "eval_iters": 10,
+   # logging
+   "log_interval": 100,
+   "steps_per_print": 10,
+   "keep_last_n_checkpoints": 4,
+   "wall_clock_breakdown": true,
+}

configs/bnb_125M.yml ADDED Viewed

	@@ -0,0 +1,87 @@

+# GPT-2 pretraining setup
+{
+   # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
+   # across the node boundaries )
+   "pipe_parallel_size": 1,
+   "model_parallel_size": 1,
+   # model settings
+   "num_layers": 12,
+   "hidden_size": 768,
+   "num_attention_heads": 12,
+   "seq_length": 2048,
+   "max_position_embeddings": 2048,
+   "norm": "layernorm",
+   "pos_emb": "rotary",
+   "no_weight_tying": true,
+   "use_bnb_optimizer": true,
+   # these should provide some speedup but takes a while to build, set to true if desired
+   "scaled_upper_triang_masked_softmax_fusion": false,
+   "bias_gelu_fusion": false,
+   "rope_fusion": false,
+   "layernorm_fusion": false,
+   # optimizer settings
+   "optimizer": {
+     "type": "Adam",
+     "params": {
+       "lr": 0.0006,
+       "betas": [0.9, 0.999],
+       "eps": 1.0e-8,
+     }
+   },
+   # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
+   "zero_optimization": {
+    "stage": 0,
+    "allgather_partitions": True,
+    "allgather_bucket_size": 500000000,
+    "overlap_comm": True,
+    "reduce_scatter": True,
+    "reduce_bucket_size": 500000000,
+    "contiguous_gradients": True,
+  },
+   # batch / data settings
+   "train_micro_batch_size_per_gpu": 4,
+   "data_impl": "mmap",
+   "split": "949,50,1",
+   # activation checkpointing
+   "checkpoint_activations": true,
+   "checkpoint_num_layers": 1,
+   "partition_activations": true,
+   "synchronize_each_layer": true,
+   # regularization
+   "gradient_clipping": 1.0,
+   "weight_decay": 0.0,
+   "hidden_dropout": 0.0,
+   "attention_dropout": 0.0,
+   # precision settings
+   "fp16": {
+     "enabled": true,
+     "loss_scale": 0,
+     "loss_scale_window": 1000,
+     "hysteresis": 2,
+     "min_loss_scale": 1
+   },
+   # misc. training settings
+   "train_iters": 320000,
+   "lr_decay_iters": 320000,
+   "distributed_backend": "nccl",
+   "lr_decay_style": "cosine",
+   "warmup": 0.01,
+   "checkpoint_factor": 10000,
+   "eval_interval": 1000,
+   "eval_iters": 10,
+   # logging
+   "log_interval": 100,
+   "steps_per_print": 10,
+   "keep_last_n_checkpoints": 4,
+   "wall_clock_breakdown": true,
+}

configs/cpu_mock_config.yml ADDED Viewed

	@@ -0,0 +1,5 @@

+# CPU unit tests should be independent of the presence of GPUs on the test server
+# host. This configuration mocks these GPU resources and other dependencies.
+{
+  "global_num_gpus": 1
+}

configs/docker/pythia-paths.yml ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "train-data-paths": ["/home/mchorse/data/pile_deduped/pile_0.87_deduped_text_document"],
+  "valid-data-paths": ["/home/mchorse/data/pile_deduped/pile_0.87_deduped_text_document"],
+  "test-data-paths": ["/home/mchorse/data/pile_deduped/pile_0.87_deduped_text_document"],
+  "tokenizer-type": "HFTokenizer",
+  "vocab-file": "/home/mchorse/data/tokenizers/20B_tokenizer.json",
+  "save": "/home/mchorse/chk/",
+  "load": "/home/mchorse/chk/",
+  "checkpoint_validation_with_forward_pass": False
+}

configs/eleutherai_cluster.yml ADDED Viewed

	@@ -0,0 +1,29 @@

+# Data paths and options when using EleutherAI cluster
+{
+  # you may include multiple distinct datasets if desired
+  "train_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_text_document"],
+  "valid_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_val_text_document"],
+  "test_data_paths": ["/mnt/ssd-1/data/enwik8/enwik8_test_text_document"],
+  # if using multiple datasets, provide weights for them to be sampled with
+  # "train-data-weights": [1., 2.],
+  # "test-data-weights": [2., 1.],
+  # "valid-data-weights": [0.5, 0.4],
+  # If you would like the code to create val and test datasets from your training set use the following instead
+  # "split" determines the relative size of train, val, and test
+  # "split": "995,4,1",
+  # "data_path": "/mnt/ssd-1/data/enwik8/enwik8_text_document",
+  "vocab_file": "/mnt/ssd-1/data/gpt2-vocab.json",
+  "merge_file": "/mnt/ssd-1/data/gpt2-merges.txt",
+  "save": "/mnt/ssd-1/checkpoints",
+  "load": "/mnt/ssd-1/checkpoints",
+  "tensorboard_dir": "/mnt/ssd-1/tensorboard",
+  "log_dir": "/mnt/ssd-1/logs",
+  "wandb_team": "eleutherai",
+  "wandb_project": "neox",
+  "wandb_group": "example"
+}

configs/finetuning_configs/6-9B.yml ADDED Viewed

	@@ -0,0 +1,89 @@

+{
+  # finetuning option
+  "load": "/path/to/checkpoint",
+  "finetune": true,
+  "pipe-parallel-size": 1,
+  "model-parallel-size": 2,
+   "num-layers": 32,
+   "hidden-size": 4096,
+   "num-attention-heads": 32,
+   "seq-length": 2048,
+   "max-position-embeddings": 2048,
+   "norm": "layernorm",
+   "pos-emb": "rotary",
+   "rotary_pct": 0.25,
+   "no-weight-tying": true,
+   "gpt_j_residual": true,
+   "output_layer_parallelism": "column",
+   "attention-config": [[["flash"], 32]],
+   "scaled-upper-triang-masked-softmax-fusion": true,
+   "bias-gelu-fusion": true,
+   "optimizer": {
+     "type": "Adam",
+     "params": {
+       "lr": 0.00012,
+       "betas": [0.9, 0.95],
+       "eps": 1.0e-8
+     }
+   },
+   "min_lr": 0.000012,
+   "zero_optimization": {
+    "stage": 1,
+    "allgather_partitions": true,
+    "allgather_bucket_size": 1260000000,
+    "overlap_comm": true,
+    "reduce_scatter": true,
+    "reduce_bucket_size": 1260000000,
+    "contiguous_gradients": true,
+    "cpu_offload": false,
+    "load_from_fp32_weights": False, # if checkpoint has fp16/bf16 params
+  },
+   "train_micro_batch_size_per_gpu": 8,
+   "gradient_accumulation_steps": 2,
+   "data-impl": "mmap",
+   "checkpoint-activations": true,
+   "checkpoint-num-layers": 1,
+   "partition-activations": true,
+   "synchronize-each-layer": true,
+   "gradient_clipping": 1.0,
+   "weight-decay": 0.1,
+   "hidden-dropout": 0,
+   "attention-dropout": 0,
+   "fp16": {
+     "fp16": true,
+     "enabled": true,
+     "loss_scale": 0,
+     "loss_scale_window": 1000,
+     "initial_scale_power": 12,
+     "hysteresis": 2,
+     "min_loss_scale": 1
+   },
+   "train-iters": 143000,
+   "lr-decay-iters": 143000,
+   "distributed-backend": "nccl",
+   "lr-decay-style": "cosine",
+   "warmup": 0.01,
+   "checkpoint-factor": 1000,
+   "extra-save-iters": [0,1,2,4,8,16,32,64,128,256,512],
+   "eval-interval": 143000,
+   "eval-iters": 10,
+   "log-interval": 10,
+   "steps_per_print": 10,
+   "wall_clock_breakdown": true,
+   "tokenizer_type": "HFTokenizer"
+}

configs/gen_docs.py ADDED Viewed

	@@ -0,0 +1,96 @@

+import sys
+import os
+sys.path.append(
+    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))
+)
+from megatron.neox_arguments import neox_args, deepspeed_args
+from inspect import getmembers, getsource
+from dataclasses import field, is_dataclass
+from itertools import tee, zip_longest
+import pathlib
+def pairwise(iterable):
+    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
+    a, b = tee(iterable)
+    next(b, None)
+    return zip_longest(a, b)
+def get_docs(module):
+    """
+    Collect documentation for every dataclass member of ``module``.
+
+    Returns a dict keyed by member name. Each value holds the stripped
+    ``__doc__`` of an instance of the class under "doc", and under
+    "attributes" a dict keyed by field name with that field's "name",
+    "type", "default" and a "doc" string scraped from the class source text.
+    """
+    # every member of the module for which is_dataclass() is true
+    ARGS_CLASSES = getmembers(module, is_dataclass)
+    results = {}
+    for name, dcls in ARGS_CLASSES:
+        assert is_dataclass(dcls)
+        src = getsource(dcls)
+        # instantiated with no arguments
+        d = dcls()
+        # search cursor into `src`; moved forward as each field is located
+        loc = 0
+        results[name] = {"doc": d.__doc__.strip(), "attributes": {}}
+        # pairwise() yields (field, next_field); next_field is None for the last field
+        for cur, _next in pairwise(d.__dataclass_fields__.items()):
+            field_name, field_def = cur
+            field_type = field_def.type
+            # Literal / Union types are kept as the type object itself (the
+            # assignment below is a no-op); other types exposing __name__ are
+            # reduced to that name, and anything else is stringified.
+            if hasattr(field_type, "__name__"):
+                if field_type.__name__ == "Literal" or field_type.__name__ == "Union":
+                    field_type = field_type
+                else:
+                    field_type = str(field_type.__name__)
+            else:
+                field_type = str(field_type)
+            field_default = field_def.default
+            # try to find the field definition
+            # (str.find returns -1 on no match; that case is not handled specially)
+            loc = src.find(f" {field_name}:", loc + len(field_name) + 1)
+            if _next is not None:
+                next_field_name, _ = _next
+                # try to find the next field definition
+                next_loc = src.find(f"{next_field_name}:", loc + len(field_name))
+            else:
+                # last field: its slice runs to the end of the class source
+                next_loc = len(src)
+            # try to get the docstring
+            # i.e. the text between the first pair of triple-quote delimiters
+            # found in the source slice between this field and the next one
+            _src = src[loc:next_loc].strip()
+            if '"""' in _src:
+                doc = _src.split('"""')[1].strip()
+            elif "'''" in _src:
+                doc = _src.split("'''")[1].strip()
+            else:
+                doc = ""
+            results[name]["attributes"][field_name] = {
+                "name": field_name,
+                "type": field_type,
+                "default": field_default,
+                "doc": doc,
+            }
+    return results
+def to_md(docs, intro_str=""):
+    """
+    Writes the docs dictionary to markdown format
+    """
+    lines = []
+    lines.append(intro_str)
+    for name, doc in docs.items():
+        lines.append(f"## {name}")
+        lines.append(f"{doc['doc']}")
+        lines.append("")
+        for field_name, field_def in doc["attributes"].items():
+            # attribute name and type
+            lines.append(f"- **{field_name}**: {field_def['type']}")
+            # default value
+            lines.append(f"    Default = {str(field_def['default'])}")
+            lines.append(f"    {field_def['doc']}")
+            lines.append("")
+    return "\n\n".join(lines)
+if __name__ == "__main__":
+    docs = get_docs(neox_args)
+    docs.update(get_docs(deepspeed_args))
+    intro_str = """Arguments for gpt-neox. All of the following can be specified in your .yml config file(s):\n"""
+    md = to_md(docs, intro_str=intro_str)
+    with open(f"{pathlib.Path(__file__).parent.resolve()}/neox_arguments.md", "w") as f:
+        f.write(md)

configs/gmlp_small.yml ADDED Viewed

	@@ -0,0 +1,72 @@

+# GPT-2 pretraining setup
+{
+   # parallelism settings ( you will want to change these based on your cluster setup, ideally scheduling pipeline stages
+   # across the node boundaries )
+   "pipe_parallel_size": 1,
+   "model_parallel_size": 1,
+   "attention_config": [[["gmlp"], "all"]],
+   # model settings
+   "num_layers": 12,
+   "hidden_size": 768, # gmlp d_ff defaults to hidden_size * 4
+   "gmlp_attn_dim": 64,
+   "num_attention_heads": 12, # this has no effect with gmlp - and amlp defaults to single head attention.
+   "seq_length": 2048,
+   "max_position_embeddings": 2048,
+   "norm": "layernorm",
+   "pos_emb": "none",
+   "no_weight_tying": true,
+   # optimizer settings
+   "optimizer": {
+     "type": "Adam",
+     "params": {
+       "lr": 0.0006,
+       "betas": [0.9, 0.999],
+       "eps": 1.0e-8,
+     }
+   },
+   # batch / data settings
+   "train_micro_batch_size_per_gpu": 4,
+   "data_impl": "mmap",
+   "split": "949,50,1",
+   # activation checkpointing
+   "checkpoint_activations": true,
+   "checkpoint_num_layers": 1,
+   "partition_activations": false,
+   "synchronize_each_layer": true,
+   # regularization
+   "gradient_clipping": 1.0,
+   "weight_decay": 0.1,
+   "hidden_dropout": 0.0,
+   "attention_dropout": 0.0,
+   # precision settings
+   "fp16": {
+     "enabled": true,
+     "loss_scale": 0,
+     "loss_scale_window": 1000,
+     "hysteresis": 2,
+     "min_loss_scale": 1
+   },
+   # misc. training settings
+   "train_iters": 320000,
+   "lr_decay_iters": 320000,
+   "distributed_backend": "nccl",
+   "lr_decay_style": "cosine",
+   "warmup": 0.01,
+   "checkpoint_factor": 10000,
+   "eval_interval": 1000,
+   "eval_iters": 10,
+   # logging
+   "log_interval": 100,
+   "steps_per_print": 10,
+   "keep_last_n_checkpoints": 4,
+   "wall_clock_breakdown": true,
+}

configs/llama/13B.yml ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "pipe_parallel_size": 1,
+  "model_parallel_size": 2,
+  "make_vocab_size_divisible_by": 1,
+  # model settings
+  "num_layers": 40,
+  "hidden_size": 5120,
+  "num_attention_heads": 40,
+  "seq_length": 2048,
+  "max_position_embeddings": 2048,
+  "pos_emb": "rotary",
+  "rotary_pct": 1,
+  "no_weight_tying": true,
+  "gpt_j_residual": false,
+  "output_layer_parallelism": "column",
+  "norm": "rmsnorm",
+  "rms_norm_epsilon": 1.0e-6,
+  "scaled_upper_triang_masked_softmax_fusion": true,
+  "bias_gelu_fusion": false,
+  "use_bias_in_norms": false,
+  "use_bias_in_attn_linear": false,
+  "activation": "swiglu",
+  "mlp_multiple_of": 256,
+}