From 032e160c3cf0dd068a5e100090883b0e17dc6817 Mon Sep 17 00:00:00 2001 From: Steboss Date: Wed, 18 Mar 2026 17:07:11 +0000 Subject: [PATCH 01/15] use actions/cache to retrieve the cache --- .github/actions/build-container/action.yml | 83 ++++++++++++++++++++++ .github/container/Dockerfile.jax | 45 +++++++++++- .github/workflows/_build_base.yaml | 4 +- .github/workflows/_ci.yaml | 2 + .github/workflows/ci.yaml | 10 ++- 5 files changed, 134 insertions(+), 10 deletions(-) diff --git a/.github/actions/build-container/action.yml b/.github/actions/build-container/action.yml index f53e14bea..297fbb738 100644 --- a/.github/actions/build-container/action.yml +++ b/.github/actions/build-container/action.yml @@ -54,6 +54,14 @@ inputs: description: "URL of the Bazel remote cache to use for building the image" required: true default: "" + ENABLE_BAZEL_DISK_CACHE: + description: "Enable Bazel disk cache via actions/cache" + required: false + default: "false" + ENABLE_BAZEL_REPO_CACHE: + description: "Enable Bazel repository cache via actions/cache" + required: false + default: "false" outputs: DOCKER_TAG_MEALKIT: @@ -106,6 +114,32 @@ runs: mv version.py .github/container/nsys_jax/nsys_jax/ cat .github/container/nsys_jax/nsys_jax/version.py + # BAZEL CACHE RESTORE + - name: Restore Bazel disk cache + if: inputs.ENABLE_BAZEL_DISK_CACHE == 'true' + uses: actions/cache/restore@v4 + with: + path: /tmp/bazel-disk-cache + key: bazel-disk-cache-${{ inputs.ARCHITECTURE }}-${{ github.run_id }} + restore-keys: | + bazel-disk-cache-${{ inputs.ARCHITECTURE }}- + + - name: Restore Bazel repo cache + if: inputs.ENABLE_BAZEL_REPO_CACHE == 'true' + uses: actions/cache/restore@v4 + with: + path: /tmp/bazel-repo-cache + key: bazel-repo-cache-${{ inputs.ARCHITECTURE }}-${{ github.run_id }} + restore-keys: | + bazel-repo-cache-${{ inputs.ARCHITECTURE }}- + + # Always create the seed dirs so --build-context is always valid (no-op if empty) + - name: Prepare Bazel cache seed directories + shell: bash 
+ run: | + mkdir -p /tmp/bazel-disk-cache + mkdir -p /tmp/bazel-repo-cache + # MEALKIT BUILD - name: Set docker metadata - mealkit id: mealkit-metadata @@ -134,6 +168,8 @@ runs: ssh: default secret-files: | "SSH_KNOWN_HOSTS=${{ steps.setup-ssh.outputs.known-hosts-file }}" + build-contexts: | + bazel-disk-seed=/tmp/bazel-disk-cache build-args: | BASE_IMAGE=${{ inputs.BASE_IMAGE }} BAZEL_CACHE=${{ inputs.bazel-remote-cache-url }} @@ -173,6 +209,53 @@ runs: BUILD_DATE=${{ inputs.BUILD_DATE }} ${{ inputs.EXTRA_BUILD_ARGS }} + # BAZEL CACHE EXPORT + - name: Export Bazel disk cache + if: inputs.ENABLE_BAZEL_DISK_CACHE == 'true' + uses: docker/build-push-action@v5 + with: + context: ${{ inputs.DOCKER_CONTEXT }} + push: false + file: ${{ inputs.DOCKERFILE }} + platforms: linux/${{ inputs.ARCHITECTURE }} + target: bazel-disk-export + outputs: type=local,dest=/tmp/bazel-disk-cache-new + build-contexts: | + bazel-disk-seed=/tmp/bazel-disk-cache + build-args: | + BASE_IMAGE=${{ inputs.BASE_IMAGE }} + BUILD_DATE=${{ inputs.BUILD_DATE }} + ${{ inputs.EXTRA_BUILD_ARGS }} + + - name: Save Bazel disk cache + if: inputs.ENABLE_BAZEL_DISK_CACHE == 'true' + uses: actions/cache/save@v4 + with: + path: /tmp/bazel-disk-cache-new + key: bazel-disk-cache-${{ inputs.ARCHITECTURE }}-${{ github.run_id }} + + - name: Export Bazel repo cache + if: inputs.ENABLE_BAZEL_REPO_CACHE == 'true' + uses: docker/build-push-action@v5 + with: + context: ${{ inputs.DOCKER_CONTEXT }} + push: false + file: ${{ inputs.DOCKERFILE }} + platforms: linux/${{ inputs.ARCHITECTURE }} + target: bazel-repo-export + outputs: type=local,dest=/tmp/bazel-repo-cache-new + build-args: | + BASE_IMAGE=${{ inputs.BASE_IMAGE }} + BUILD_DATE=${{ inputs.BUILD_DATE }} + ${{ inputs.EXTRA_BUILD_ARGS }} + + - name: Save Bazel repo cache + if: inputs.ENABLE_BAZEL_REPO_CACHE == 'true' + uses: actions/cache/save@v4 + with: + path: /tmp/bazel-repo-cache-new + key: bazel-repo-cache-${{ inputs.ARCHITECTURE }}-${{ github.run_id }} + # SITREP 
GENERATION - name: Generate sitrep if: "!cancelled()" diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index da7c2a29e..f964df995 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -19,14 +19,24 @@ ARG SRC_PATH_TRANSFORMER_ENGINE=/opt/transformer-engine ARG GIT_USER_NAME="JAX Toolbox" ARG GIT_USER_EMAIL=jax@nvidia.com -ARG BAZEL_CACHE=/tmp +ARG BAZEL_CACHE=/cache/bazel-disk ARG BUILD_DATE +############################################################################### +## Bazel disk cache seed (overridden via --build-context on cache hit) +############################################################################### + +# On first run this is empty (FROM scratch). When actions/cache restores a +# previous disk cache to /tmp/bazel-disk-cache on the runner, the caller passes +# --build-context bazel-disk-seed=/tmp/bazel-disk-cache to inject it. +FROM scratch AS bazel-disk-seed + ############################################################################### ## Build JAX ############################################################################### FROM ${BASE_IMAGE} AS builder +ARG TARGETARCH ARG URLREF_JAX ARG URLREF_TRANSFORMER_ENGINE ARG URLREF_XLA @@ -54,9 +64,14 @@ RUN ARCH="$(dpkg --print-architecture)" && \ chmod +x /usr/local/bin/bazel # Populate ${BUILD_PATH_JAXLIB} with editable wheels; --no-install because # (a) this is the builder stage, and (b) pip-finalize.sh does the install -RUN mkdir -p /builder/extra-targets/{bin,python} && \ +RUN --mount=type=cache,id=bazel-disk-${TARGETARCH},target=/cache/bazel-disk,sharing=locked \ + --mount=type=cache,id=bazel-repo-${TARGETARCH},target=/cache/bazel-repo,sharing=locked \ + --mount=type=bind,from=bazel-disk-seed,source=.,target=/tmp/bazel-disk-seed,readonly \ + cp -a /tmp/bazel-disk-seed/. 
/cache/bazel-disk/ 2>/dev/null || true && \ + mkdir -p /builder/extra-targets/{bin,python} && \ build-jax.sh \ --bazel-cache ${BAZEL_CACHE} \ + --build-param --bazel_options=--repository_cache=/cache/bazel-repo \ --build-path-jaxlib ${BUILD_PATH_JAXLIB} \ --extra-targets "${EXTRA_BAZEL_TARGETS}" \ --extra-target-dest /builder/extra-targets \ @@ -148,3 +163,29 @@ RUN install-nsys-jax.sh ${SRC_PATH_NSYS_JAX} FROM mealkit AS final RUN pip-finalize.sh + +############################################################################### +## Bazel cache export stages (used by CI to persist caches via actions/cache) +############################################################################### + +# ARG BUILD_DATE ensures this always re-executes (never a registry cache hit), +# so the snapshot always reflects the current run's cache mount content. +FROM ${BASE_IMAGE} AS bazel-disk-snapshot +ARG TARGETARCH +ARG BUILD_DATE +RUN --mount=type=cache,id=bazel-disk-${TARGETARCH},target=/cache/bazel-disk,sharing=locked,readonly \ + mkdir -p /bazel-disk-snapshot && \ + cp -rp /cache/bazel-disk/. /bazel-disk-snapshot/ + +FROM scratch AS bazel-disk-export +COPY --from=bazel-disk-snapshot /bazel-disk-snapshot / + +FROM ${BASE_IMAGE} AS bazel-repo-snapshot +ARG TARGETARCH +ARG BUILD_DATE +RUN --mount=type=cache,id=bazel-repo-${TARGETARCH},target=/cache/bazel-repo,sharing=locked,readonly \ + mkdir -p /bazel-repo-snapshot && \ + cp -rp /cache/bazel-repo/. 
/bazel-repo-snapshot/ + +FROM scratch AS bazel-repo-export +COPY --from=bazel-repo-snapshot /bazel-repo-snapshot / diff --git a/.github/workflows/_build_base.yaml b/.github/workflows/_build_base.yaml index 930ae9cfa..4774b6bb8 100644 --- a/.github/workflows/_build_base.yaml +++ b/.github/workflows/_build_base.yaml @@ -58,7 +58,7 @@ permissions: jobs: build-base: - runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", small] + runs-on: ${{ inputs.ARCHITECTURE == 'amd64' && 'linux-amd64-cpu32m' || 'linux-arm64-cpu32m' }} env: BADGE_FILENAME_FULL: ${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json outputs: @@ -137,7 +137,7 @@ jobs: BUILD_DATE=${{ inputs.BUILD_DATE }} JAX_TOOLBOX_REF=${{ github.head_ref || github.sha }} ${{ inputs.BASE_IMAGE != 'latest' && format('BASE_IMAGE={0}', inputs.BASE_IMAGE) || '' }} - + - name: Generate sitrep if: "!cancelled()" shell: bash -x -e {0} diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index ca6bd3c41..e5fefb633 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -83,6 +83,8 @@ jobs: ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} github-token: ${{ secrets.GITHUB_TOKEN }} bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }} + ENABLE_BAZEL_DISK_CACHE: 'true' + ENABLE_BAZEL_REPO_CACHE: 'true' EXTRA_BUILD_ARGS: | URLREF_JAX=${{ fromJson(inputs.SOURCE_URLREFS).JAX }} URLREF_XLA=${{ fromJson(inputs.SOURCE_URLREFS).XLA }} diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index d711da63d..f199635e8 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -4,12 +4,10 @@ on: schedule: - cron: '30 9 * * *' # Pacific Time 01:30 AM in UTC - cron: '0 0 * * 6' #midnight every Saturday UTC for scale-training - pull_request: - types: - - opened - - reopened - - ready_for_review - - synchronize + push: + # we need this to allow nv-gha-runners to run + branches: + - "**" paths-ignore: - '**.md' - '.github/triage/**' From 
476e1029e18f0ace913d2135ebd814a4bbe3ce85 Mon Sep 17 00:00:00 2001 From: Steboss Date: Wed, 18 Mar 2026 17:24:23 +0000 Subject: [PATCH 02/15] fix docker setup build --- .github/actions/build-container/action.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/actions/build-container/action.yml b/.github/actions/build-container/action.yml index 297fbb738..72bc6970a 100644 --- a/.github/actions/build-container/action.yml +++ b/.github/actions/build-container/action.yml @@ -100,7 +100,6 @@ runs: with: driver-opts: | image=moby/buildkit:v0.12.1 - version: v0.30.1 - name: Download nsys-jax version.py uses: actions/download-artifact@v4 From c8029b3df2920b46374bcbf2a1de5f9596043852 Mon Sep 17 00:00:00 2001 From: Steboss Date: Wed, 18 Mar 2026 18:15:27 +0000 Subject: [PATCH 03/15] version fix --- .github/actions/build-container/action.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/actions/build-container/action.yml b/.github/actions/build-container/action.yml index 72bc6970a..563f6077d 100644 --- a/.github/actions/build-container/action.yml +++ b/.github/actions/build-container/action.yml @@ -97,6 +97,8 @@ runs: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 + env: + DOCKER_API_VERSION: '1.43' with: driver-opts: | image=moby/buildkit:v0.12.1 From 0feebc33d98b33c2e04c2a896da34f9e03b54d26 Mon Sep 17 00:00:00 2001 From: Steboss Date: Wed, 18 Mar 2026 18:25:17 +0000 Subject: [PATCH 04/15] version error again? 
--- .github/actions/build-container/action.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/actions/build-container/action.yml b/.github/actions/build-container/action.yml index 563f6077d..a4b463685 100644 --- a/.github/actions/build-container/action.yml +++ b/.github/actions/build-container/action.yml @@ -80,6 +80,8 @@ runs: run: | echo 'UPLD_IMAGE=ghcr.io/nvidia/jax-toolbox-internal' >> $GITHUB_ENV echo "BADGE_FILENAME_FULL=${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json" >> $GITHUB_ENV + # Cap Docker client API version to match the daemon on NVKS runners + echo 'DOCKER_API_VERSION=1.43' >> $GITHUB_ENV - name: Setup SSH id: setup-ssh @@ -97,8 +99,6 @@ runs: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - env: - DOCKER_API_VERSION: '1.43' with: driver-opts: | image=moby/buildkit:v0.12.1 From 98cde85fa8a3183a9c936d7cff19a888511028a8 Mon Sep 17 00:00:00 2001 From: Steboss Date: Wed, 18 Mar 2026 19:34:04 +0000 Subject: [PATCH 05/15] avoid using grpc --- .github/actions/build-container/action.yml | 11 +- .github/workflows/_ci.yaml | 1190 ++++++++++---------- 2 files changed, 604 insertions(+), 597 deletions(-) diff --git a/.github/actions/build-container/action.yml b/.github/actions/build-container/action.yml index a4b463685..e711fdcd5 100644 --- a/.github/actions/build-container/action.yml +++ b/.github/actions/build-container/action.yml @@ -82,6 +82,13 @@ runs: echo "BADGE_FILENAME_FULL=${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json" >> $GITHUB_ENV # Cap Docker client API version to match the daemon on NVKS runners echo 'DOCKER_API_VERSION=1.43' >> $GITHUB_ENV + # When disk cache is enabled use the BuildKit cache mount path; + # otherwise fall back to the remote cache URL (internal infra runners). 
+ if [[ "${{ inputs.ENABLE_BAZEL_DISK_CACHE }}" == "true" ]]; then + echo 'BAZEL_CACHE_ARG=BAZEL_CACHE=/cache/bazel-disk' >> $GITHUB_ENV + else + echo 'BAZEL_CACHE_ARG=BAZEL_CACHE=${{ inputs.bazel-remote-cache-url }}' >> $GITHUB_ENV + fi - name: Setup SSH id: setup-ssh @@ -173,7 +180,7 @@ runs: bazel-disk-seed=/tmp/bazel-disk-cache build-args: | BASE_IMAGE=${{ inputs.BASE_IMAGE }} - BAZEL_CACHE=${{ inputs.bazel-remote-cache-url }} + ${{ env.BAZEL_CACHE_ARG }} BUILD_DATE=${{ inputs.BUILD_DATE }} ${{ inputs.EXTRA_BUILD_ARGS }} # FINAL IMAGE BUILD @@ -206,7 +213,7 @@ runs: "SSH_KNOWN_HOSTS=${{ steps.setup-ssh.outputs.known-hosts-file }}" build-args: | BASE_IMAGE=${{ inputs.BASE_IMAGE }} - BAZEL_CACHE=${{ inputs.bazel-remote-cache-url }} + ${{ env.BAZEL_CACHE_ARG }} BUILD_DATE=${{ inputs.BUILD_DATE }} ${{ inputs.EXTRA_BUILD_ARGS }} diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index e5fefb633..893d4b04a 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -94,598 +94,598 @@ jobs: DOCKER_TAG_MEALKIT: ${{ steps.build-jax.outputs.DOCKER_TAG_MEALKIT }} DOCKER_TAG_FINAL: ${{ steps.build-jax.outputs.DOCKER_TAG_FINAL }} - build-equinox: - needs: build-jax - runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"] - outputs: - DOCKER_TAG_MEALKIT: ${{ steps.build-equinox.outputs.DOCKER_TAG_MEALKIT }} - DOCKER_TAG_FINAL: ${{ steps.build-equinox.outputs.DOCKER_TAG_FINAL }} - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - name: Build Equinox container - id: build-equinox - uses: ./.github/actions/build-container - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-equinox-build - BADGE_FILENAME: badge-equinox-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: equinox - DOCKERFILE: .github/container/Dockerfile.equinox - RUNNER_SIZE: small - ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} - ssh-known-hosts: ${{ 
vars.SSH_KNOWN_HOSTS }} - github-token: ${{ secrets.GITHUB_TOKEN }} - bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }} - EXTRA_BUILD_ARGS: | - URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} - - build-maxtext: - needs: build-jax - runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"] - outputs: - DOCKER_TAG_MEALKIT: ${{ steps.build-maxtext.outputs.DOCKER_TAG_MEALKIT }} - DOCKER_TAG_FINAL: ${{ steps.build-maxtext.outputs.DOCKER_TAG_FINAL }} - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - name: Build MaxText container - id: build-maxtext - uses: ./.github/actions/build-container - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-maxtext-build - BADGE_FILENAME: badge-maxtext-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: maxtext - DOCKERFILE: .github/container/Dockerfile.maxtext - RUNNER_SIZE: small - ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} - ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} - github-token: ${{ secrets.GITHUB_TOKEN }} - bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }} - EXTRA_BUILD_ARGS: | - URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} - - build-torchax: - needs: build-jax - runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"] - outputs: - DOCKER_TAG_MEALKIT: ${{ steps.build-torchax.outputs.DOCKER_TAG_MEALKIT }} - DOCKER_TAG_FINAL: ${{ steps.build-torchax.outputs.DOCKER_TAG_FINAL }} - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - name: Build TorchAX container - id: build-torchax - uses: ./.github/actions/build-container - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-torchax-build - BADGE_FILENAME: badge-torchax-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: torchax - DOCKERFILE: .github/container/Dockerfile.torchax - 
RUNNER_SIZE: small - ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} - ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} - github-token: ${{ secrets.GITHUB_TOKEN }} - bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }} - EXTRA_BUILD_ARGS: | - URLREF_TORCHAX=${{ fromJson(inputs.SOURCE_URLREFS).TORCHAX }} - - build-axlearn: - needs: build-jax - runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "large"] - outputs: - DOCKER_TAG_MEALKIT: ${{ steps.build-axlearn.outputs.DOCKER_TAG_MEALKIT }} - DOCKER_TAG_FINAL: ${{ steps.build-axlearn.outputs.DOCKER_TAG_FINAL }} - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - name: Build AxLearn container - id: build-axlearn - uses: ./.github/actions/build-container - with: - ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - ARTIFACT_NAME: artifact-axlearn-build - BADGE_FILENAME: badge-axlearn-build - BUILD_DATE: ${{ inputs.BUILD_DATE }} - BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - CONTAINER_NAME: axlearn - DOCKERFILE: .github/container/Dockerfile.axlearn - RUNNER_SIZE: large - ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} - ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} - github-token: ${{ secrets.GITHUB_TOKEN }} - bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }} - EXTRA_BUILD_ARGS: | - URLREF_AXLEARN=${{ fromJson(inputs.SOURCE_URLREFS).AXLEARN }} - - collect-docker-tags: - runs-on: ubuntu-22.04 - if: ${{ !cancelled() }} - needs: - - build-base - - build-jax - - build-equinox - - build-maxtext - - build-axlearn - outputs: - TAGS: ${{ steps.collect-tags.outputs.TAGS }} - steps: - - name: Save docker tags as a JSON object - id: collect-tags - run: | - TAGS=$(cat <> $GITHUB_OUTPUT - - test-jax: - needs: build-jax - if: >- - inputs.ARCHITECTURE == 'amd64' && - ( - inputs.MODE == 'full' || - inputs.MODE == 'jax' - ) - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: jax - EXECUTE: | - docker run -i --shm-size=1g --gpus all \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ 
- bash <<"EOF" |& tee test-backend-independent.log - test-jax.sh -b backend-independent - EOF - docker run -i --shm-size=1g --gpus all \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-single-gpu.log - nvidia-cuda-mps-control -d - test-jax.sh -b single-gpu - EOF - docker run -i --shm-size=1g --gpus all \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-multi-gpu.log - nvidia-cuda-mps-control -d - test-jax.sh -b multi-gpu - EOF - STATISTICS_SCRIPT: | - errors=$(cat test-*.log | grep -c 'ERROR:' || true) - failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) - passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - test-backend-independent.log - test-multi-gpu.log - test-single-gpu.log - secrets: inherit - - test-nsys-jax: - needs: build-jax - if: >- - inputs.ARCHITECTURE == 'amd64' && - ( - inputs.MODE == 'full' || - inputs.MODE == 'jax' - ) - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: nsys-jax - EXECUTE: | - set -o pipefail - mkdir -p output-results - docker run -i --shm-size=1g --gpus all \ - -v $PWD/output-results:/opt/output \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-nsys-jax.log - # nsys-jax is already installed, this is just adding the test dependencies - pip install pytest-reportlog nsys-jax[test] - # abuse knowledge that nsys-jax is installed editable, so the tests exist - test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') - pytest --basetemp=/opt/output/pytest-tmp --report-log=/opt/output/pytest-report.jsonl "${test_path}" - chmod -R a+rwX /opt/output - EOF - STATISTICS_SCRIPT: | 
- summary_line=$(tail -n1 test-nsys-jax.log) - num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(cat output-results/pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - failed_tests=$(cat output-results/pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - total_tests=$(( passed_tests + failed_tests )) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - ARTIFACTS: | - # pytest-driven part - test-nsys-jax.log - output-results/pytest-report.jsonl - output-results/pytest-tmp/ - secrets: inherit - - # test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test - # runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does - # not already have nsys-jax installed - test-nsys-jax-archive: - needs: test-nsys-jax - if: >- - inputs.ARCHITECTURE == 'amd64' && - ( - inputs.MODE == 'full' || - inputs.MODE == 'jax' - ) - strategy: - matrix: - os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] - runs-on: ${{ matrix.os }} - steps: - - name: Download nsys-jax output .zip files - uses: actions/download-artifact@v4 - with: - name: nsys-jax-unit-test-A100 - - name: Extract archives and execute install scripts - run: | - pip install virtualenv # for install.sh - for zip in $(ls *.zip); do - ZIP="${PWD}/${zip}" - pushd $(mktemp -d) - unzip "${ZIP}" - ls -l - # TODO: verify this isn't needed, or make sure it isn't needed - chmod 755 install.sh - # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout - # Skip executing Jupyter lab - 
NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh - popd - done - - test-nsys-jax-eks: - needs: build-jax - if: >- - inputs.ARCHITECTURE == 'amd64' && - ( - inputs.MODE == 'full' || - inputs.MODE == 'jax' - ) - runs-on: eks - env: - JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} - JOB_NAME: ${{ github.run_id }}-nsys-jax - # Service name cannot start with a number - SERVICE_NAME: svc-${{ github.run_id}}-nsys-jax - POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess - steps: - - name: Check out the repository - uses: actions/checkout@v4 - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: K8s GHCR store and delete token - id: store-token - uses: ./.github/actions/store-delete-k8s-ghcr - - name: Configure Kubernetes job - run: | - yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) - | select(di == 0).metadata.name = strenv(SERVICE_NAME) - | select(di == 1).metadata.name = strenv(JOB_NAME) - | select(di == 1).spec.template.spec.subdomain = strenv(SERVICE_NAME) - | select(di == 1).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}" - | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) - | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME) - | select(di == 1).spec.template.spec.containers[0].env[1].value = strenv(SERVICE_NAME)' \ - .github/eks-workflow-files/job.yml - git diff .github/eks-workflow-files/job.yml - - name: Submit Kubernetes job - uses: ./.github/actions/submit-delete-k8s-job - with: - job-config-file: .github/eks-workflow-files/job.yml - job-name: ${{ env.JOB_NAME }} - - name: Configure post-processing job - run: | - export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" - yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) - | .spec.template.spec.containers[].image = 
strenv(JAX_DOCKER_IMAGE) - | .spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}" - | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ - .github/eks-workflow-files/post-process-job.yml - git diff .github/eks-workflow-files/post-process-job.yml - - name: Submit post process Kubernetes job - uses: ./.github/actions/submit-delete-k8s-job - with: - job-config-file: .github/eks-workflow-files/post-process-job.yml - job-name: ${{ env.POSTPROCESS_JOB_NAME }} - - test-te-h100: - needs: build-jax - if: >- - inputs.ARCHITECTURE == 'amd64' && - ( - inputs.MODE == 'full' || - inputs.MODE == 'te' - ) - uses: ./.github/workflows/_transformer_engine_eks.yaml - with: - JAX_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} - JOB_NAME: transformerengine-${{ github.run_id }} - S3_BUCKET: jax-toolbox-eks-output - CI_NAME: transformer-engine - secrets: inherit - - test-jax-cutlass-h100: - needs: build-jax - if: >- - inputs.ARCHITECTURE == 'amd64' && - ( - inputs.MODE == 'full' || - inputs.MODE == 'jax-cutlass' - ) - uses: ./.github/workflows/_jax_cutlass_eks.yaml - with: - JAX_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} - JOB_NAME: jax-cutlass-${{ github.run_id }} - S3_BUCKET: jax-toolbox-eks-output - CI_NAME: jax-cutlass - secrets: inherit - - test-te-a100: - needs: build-jax - secrets: inherit - if: >- - inputs.ARCHITECTURE == 'amd64' && - ( - inputs.MODE == 'full' || - inputs.MODE == 'te' - ) - uses: ./.github/workflows/_test_unit.yaml - with: - TEST_NAME: te - EXECUTE: | - docker run -i --gpus all --shm-size=1g -v $PWD:/log \ - ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - bash <<"EOF" |& tee test-te.log - set -xu -o pipefail - - LOG_DIR=/log - - pip install pytest-reportlog pytest-xdist - # Start MPS daemon - nvidia-cuda-mps-control -d - # TE's default is slightly different, without the hyphen - export TE_PATH=${SRC_PATH_TRANSFORMER_ENGINE} - # 1 GPU per worker, 3 workers per GPU - 
pytest-xdist.sh 1 3 ${LOG_DIR}/pytest-report-L0-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_unittest/test.sh - ## 8 GPUs per worker, 1 worker per GPU. pytest-xdist.sh allows aggregation - ## into a single .jsonl file of results from multiple pytest invocations - ## inside the test.sh script, so it's useful even with a single worker per - ## device. - pytest-xdist.sh 8 1 ${LOG_DIR}/pytest-report-L0-distributed-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_distributed_unittest/test.sh - - # merge the log files - cat \ - ${LOG_DIR}/pytest-report-L0-unittest.jsonl \ - ${LOG_DIR}/pytest-report-L0-distributed-unittest.jsonl \ - > ${LOG_DIR}/pytest-report.jsonl - - EOF - STATISTICS_SCRIPT: | - report_json=pytest-report.jsonl - summary_line=$(tail -n1 test-te.log) - errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - passed_tests=$(cat $report_json | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - failed_tests=$(cat $report_json | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - total_tests=$((failed_tests + passed_tests)) - echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - - echo "$failed_tests tests failed" - if [[ $failed_tests -gt 0 ]]; then - exit 1 - else - exit 0 - fi - - TIMEOUT_MINUTES: 120 - ARTIFACTS: | - test-te.log - pytest-report.jsonl - pytest-report-L0-unittest.jsonl - pytest-report-L0-distributed-unittest.jsonl - - test-maxtext: - needs: build-maxtext - if: >- - inputs.ARCHITECTURE == 'amd64' && - ( - inputs.MODE == 'full' || - inputs.MODE == 'maxtext' - ) - uses: ./.github/workflows/_test_maxtext.yaml - with: - MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} - secrets: inherit - - test-maxtext-gke: - needs: 
build-maxtext - if: >- - inputs.ARCHITECTURE == 'amd64' && - ( - inputs.MODE == 'full' || - inputs.MODE == 'maxtext' - ) - uses: ./.github/workflows/_test_maxtext_gke_xpk.yaml - with: - MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} - secrets: inherit - - test-axlearn-eks: - needs: build-axlearn - if: >- - inputs.ARCHITECTURE == 'amd64' && - ( - inputs.MODE == 'full' || - inputs.MODE == 'axlearn' - ) - runs-on: eks - env: - AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} - JOB_NAME: axlearn-${{ github.run_id }} - steps: - - name: Check out the repository - uses: actions/checkout@v4 - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: K8s GHCR store and delete token - id: store-token - uses: ./.github/actions/store-delete-k8s-ghcr - - name: Configure axlearn test job - run: | - # Replace placeholders in axlearn-job.yml with environment variables - yq -i ea ' - select(di == 0).metadata.name = strenv(JOB_NAME) - | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) - | select(di == 0).spec.template.spec.containers[0].env[0].value = "${{ github.run_id }}" - | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \ - .github/eks-workflow-files/axlearn/axlearn-job.yml - git diff .github/eks-workflow-files/axlearn/axlearn-job.yml - - name: Submit & delete axlearn test - uses: ./.github/actions/submit-delete-k8s-job - with: - job-config-file: ".github/eks-workflow-files/axlearn/axlearn-job.yml" - job-name: ${{ env.JOB_NAME }} - - name: Download logs from S3 - id: log-s3 - if: ${{ !cancelled() }} - run: | - mkdir -p axlearn-output - aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/summary.txt axlearn-output/ - aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id 
}}/logs/ axlearn-output/ --recursive --exclude "*" --include "*.log" - aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/ axlearn-output/ --recursive --exclude "*" --include "*.xml" - - - passed_tests=$(grep -Eo 'PASSED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' ) - failed_tests=$(grep -Eo 'FAILED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' ) - skipped_tests=$(grep -Eo 'SKIPPED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' ) - total_tests=$((failed_tests + passed_tests + skipped_tests)) - - echo "Passed tests: $passed_tests" - echo "Failed tests: $failed_tests" - echo "Skipped tests: $skipped_tests" - echo "Total tests: $total_tests" - echo "PASSED_TESTS=$passed_tests" >> $GITHUB_OUTPUT - echo "FAILED_TESTS=$failed_tests" >> $GITHUB_OUTPUT - echo "TOTAL_TESTS=$total_tests" >> $GITHUB_OUTPUT - - name: Generate sitrep - id: sitrep - if: ${{ !cancelled() }} - shell: bash -x -e {0} - run: | - # bring in utility functions - source .github/workflows/scripts/to_json.sh - - badge_label='Axlearn EKS Unit' - - total_tests=${{ steps.log-s3.outputs.TOTAL_TESTS }} \ - failed_tests=${{ steps.log-s3.outputs.FAILED_TESTS }} \ - passed_tests=${{ steps.log-s3.outputs.PASSED_TESTS }} \ - errors="0" \ - summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests." \ - badge_message="Passed $passed_tests out of $total_tests." \ - badge_color="brightgreen" - if [ "$failed_tests" -gt 0 ]; then - badge_color="red" - fi \ - - to_json \ - summary \ - errors total_tests passed_tests failed_tests \ - badge_label badge_color badge_message \ - > sitrep.json - - schemaVersion=1 \ - label="${badge_label}" \ - message="Passed $passed_tests out of $total_tests." 
\ - color=$badge_color \ - to_json schemaVersion label message color \ - > badge-axlearn-test.json - - - name: Upload artifacts - if: ${{ !cancelled() }} - uses: actions/upload-artifact@v4 - with: - name: "artifact-axlearn-test" - path: | - sitrep.json - badge-axlearn-test.json - axlearn-unittests.jsonl - axlearn-output/* - - test-axlearn-fuji-models-eks: - needs: build-axlearn - if: >- - inputs.ARCHITECTURE == 'amd64' && - ( - inputs.MODE == 'full' || - inputs.MODE == 'axlearn' - ) - runs-on: eks - env: - AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} - JOB_NAME: axlearn-fuji-3b-${{ github.run_id }} - steps: - - name: Check out the repository - uses: actions/checkout@v4 - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - name: K8s GHCR store and delete token - id: store-token - uses: ./.github/actions/store-delete-k8s-ghcr - - name: Configure axlearn test job - run: | - yq -i ea ' - select(di == 0).metadata.name = strenv(JOB_NAME) - | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) - | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \ - .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml - git diff .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml - - - name: Submit & delete axlearn fuji model test - uses: ./.github/actions/submit-delete-k8s-job - with: - job-config-file: ".github/eks-workflow-files/axlearn/axlearn-fuji-model.yml" - job-name: ${{ env.JOB_NAME }} + # build-equinox: + # needs: build-jax + # runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"] + # outputs: + # DOCKER_TAG_MEALKIT: ${{ steps.build-equinox.outputs.DOCKER_TAG_MEALKIT }} + # DOCKER_TAG_FINAL: ${{ steps.build-equinox.outputs.DOCKER_TAG_FINAL }} + # steps: + # - name: Checkout repository + # uses: 
actions/checkout@v4 + # - name: Build Equinox container + # id: build-equinox + # uses: ./.github/actions/build-container + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-equinox-build + # BADGE_FILENAME: badge-equinox-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: equinox + # DOCKERFILE: .github/container/Dockerfile.equinox + # RUNNER_SIZE: small + # ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + # ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} + # github-token: ${{ secrets.GITHUB_TOKEN }} + # bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }} + # EXTRA_BUILD_ARGS: | + # URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} + + # build-maxtext: + # needs: build-jax + # runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"] + # outputs: + # DOCKER_TAG_MEALKIT: ${{ steps.build-maxtext.outputs.DOCKER_TAG_MEALKIT }} + # DOCKER_TAG_FINAL: ${{ steps.build-maxtext.outputs.DOCKER_TAG_FINAL }} + # steps: + # - name: Checkout repository + # uses: actions/checkout@v4 + # - name: Build MaxText container + # id: build-maxtext + # uses: ./.github/actions/build-container + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-maxtext-build + # BADGE_FILENAME: badge-maxtext-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: maxtext + # DOCKERFILE: .github/container/Dockerfile.maxtext + # RUNNER_SIZE: small + # ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + # ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} + # github-token: ${{ secrets.GITHUB_TOKEN }} + # bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }} + # EXTRA_BUILD_ARGS: | + # URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} + + # build-torchax: + # needs: build-jax + # runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"] + # outputs: + # 
DOCKER_TAG_MEALKIT: ${{ steps.build-torchax.outputs.DOCKER_TAG_MEALKIT }} + # DOCKER_TAG_FINAL: ${{ steps.build-torchax.outputs.DOCKER_TAG_FINAL }} + # steps: + # - name: Checkout repository + # uses: actions/checkout@v4 + # - name: Build TorchAX container + # id: build-torchax + # uses: ./.github/actions/build-container + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-torchax-build + # BADGE_FILENAME: badge-torchax-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: torchax + # DOCKERFILE: .github/container/Dockerfile.torchax + # RUNNER_SIZE: small + # ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + # ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} + # github-token: ${{ secrets.GITHUB_TOKEN }} + # bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }} + # EXTRA_BUILD_ARGS: | + # URLREF_TORCHAX=${{ fromJson(inputs.SOURCE_URLREFS).TORCHAX }} + + # build-axlearn: + # needs: build-jax + # runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "large"] + # outputs: + # DOCKER_TAG_MEALKIT: ${{ steps.build-axlearn.outputs.DOCKER_TAG_MEALKIT }} + # DOCKER_TAG_FINAL: ${{ steps.build-axlearn.outputs.DOCKER_TAG_FINAL }} + # steps: + # - name: Checkout repository + # uses: actions/checkout@v4 + # - name: Build AxLearn container + # id: build-axlearn + # uses: ./.github/actions/build-container + # with: + # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + # ARTIFACT_NAME: artifact-axlearn-build + # BADGE_FILENAME: badge-axlearn-build + # BUILD_DATE: ${{ inputs.BUILD_DATE }} + # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + # CONTAINER_NAME: axlearn + # DOCKERFILE: .github/container/Dockerfile.axlearn + # RUNNER_SIZE: large + # ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + # ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} + # github-token: ${{ secrets.GITHUB_TOKEN }} + # bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }} + # EXTRA_BUILD_ARGS: 
| + # URLREF_AXLEARN=${{ fromJson(inputs.SOURCE_URLREFS).AXLEARN }} + + # collect-docker-tags: + # runs-on: ubuntu-22.04 + # if: ${{ !cancelled() }} + # needs: + # - build-base + # - build-jax + # - build-equinox + # - build-maxtext + # - build-axlearn + # outputs: + # TAGS: ${{ steps.collect-tags.outputs.TAGS }} + # steps: + # - name: Save docker tags as a JSON object + # id: collect-tags + # run: | + # TAGS=$(cat <> $GITHUB_OUTPUT + + # test-jax: + # needs: build-jax + # if: >- + # inputs.ARCHITECTURE == 'amd64' && + # ( + # inputs.MODE == 'full' || + # inputs.MODE == 'jax' + # ) + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: jax + # EXECUTE: | + # docker run -i --shm-size=1g --gpus all \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-backend-independent.log + # test-jax.sh -b backend-independent + # EOF + # docker run -i --shm-size=1g --gpus all \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-single-gpu.log + # nvidia-cuda-mps-control -d + # test-jax.sh -b single-gpu + # EOF + # docker run -i --shm-size=1g --gpus all \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-multi-gpu.log + # nvidia-cuda-mps-control -d + # test-jax.sh -b multi-gpu + # EOF + # STATISTICS_SCRIPT: | + # errors=$(cat test-*.log | grep -c 'ERROR:' || true) + # failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) + # passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # test-backend-independent.log + # test-multi-gpu.log + # test-single-gpu.log + # secrets: inherit + + # test-nsys-jax: + # needs: build-jax + # if: >- + # inputs.ARCHITECTURE == 'amd64' && + # ( + # 
inputs.MODE == 'full' || + # inputs.MODE == 'jax' + # ) + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: nsys-jax + # EXECUTE: | + # set -o pipefail + # mkdir -p output-results + # docker run -i --shm-size=1g --gpus all \ + # -v $PWD/output-results:/opt/output \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-nsys-jax.log + # # nsys-jax is already installed, this is just adding the test dependencies + # pip install pytest-reportlog nsys-jax[test] + # # abuse knowledge that nsys-jax is installed editable, so the tests exist + # test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') + # pytest --basetemp=/opt/output/pytest-tmp --report-log=/opt/output/pytest-report.jsonl "${test_path}" + # chmod -R a+rwX /opt/output + # EOF + # STATISTICS_SCRIPT: | + # summary_line=$(tail -n1 test-nsys-jax.log) + # num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(cat output-results/pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + # failed_tests=$(cat output-results/pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + # total_tests=$(( passed_tests + failed_tests )) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + # ARTIFACTS: | + # # pytest-driven part + # test-nsys-jax.log + # output-results/pytest-report.jsonl + # output-results/pytest-tmp/ + # secrets: inherit + + # # test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test + # # runs on a regular GitHub Actions runner and checks 
that offline post-processing works in an environment that does + # # not already have nsys-jax installed + # test-nsys-jax-archive: + # needs: test-nsys-jax + # if: >- + # inputs.ARCHITECTURE == 'amd64' && + # ( + # inputs.MODE == 'full' || + # inputs.MODE == 'jax' + # ) + # strategy: + # matrix: + # os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] + # runs-on: ${{ matrix.os }} + # steps: + # - name: Download nsys-jax output .zip files + # uses: actions/download-artifact@v4 + # with: + # name: nsys-jax-unit-test-A100 + # - name: Extract archives and execute install scripts + # run: | + # pip install virtualenv # for install.sh + # for zip in $(ls *.zip); do + # ZIP="${PWD}/${zip}" + # pushd $(mktemp -d) + # unzip "${ZIP}" + # ls -l + # # TODO: verify this isn't needed, or make sure it isn't needed + # chmod 755 install.sh + # # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout + # # Skip executing Jupyter lab + # NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh + # popd + # done + + # test-nsys-jax-eks: + # needs: build-jax + # if: >- + # inputs.ARCHITECTURE == 'amd64' && + # ( + # inputs.MODE == 'full' || + # inputs.MODE == 'jax' + # ) + # runs-on: eks + # env: + # JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} + # JOB_NAME: ${{ github.run_id }}-nsys-jax + # # Service name cannot start with a number + # SERVICE_NAME: svc-${{ github.run_id}}-nsys-jax + # POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess + # steps: + # - name: Check out the repository + # uses: actions/checkout@v4 + # - name: Login to GitHub Container Registry + # uses: docker/login-action@v3 + # with: + # registry: ghcr.io + # username: ${{ github.repository_owner }} + # password: ${{ secrets.GITHUB_TOKEN }} + # - name: K8s GHCR store and delete token + # id: store-token + # uses: ./.github/actions/store-delete-k8s-ghcr + # - name: Configure Kubernetes job + # run: | + # yq -i ea 'select(di == 
0).spec.selector.job-name = strenv(JOB_NAME) + # | select(di == 0).metadata.name = strenv(SERVICE_NAME) + # | select(di == 1).metadata.name = strenv(JOB_NAME) + # | select(di == 1).spec.template.spec.subdomain = strenv(SERVICE_NAME) + # | select(di == 1).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}" + # | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) + # | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME) + # | select(di == 1).spec.template.spec.containers[0].env[1].value = strenv(SERVICE_NAME)' \ + # .github/eks-workflow-files/job.yml + # git diff .github/eks-workflow-files/job.yml + # - name: Submit Kubernetes job + # uses: ./.github/actions/submit-delete-k8s-job + # with: + # job-config-file: .github/eks-workflow-files/job.yml + # job-name: ${{ env.JOB_NAME }} + # - name: Configure post-processing job + # run: | + # export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" + # yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) + # | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) + # | .spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}" + # | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ + # .github/eks-workflow-files/post-process-job.yml + # git diff .github/eks-workflow-files/post-process-job.yml + # - name: Submit post process Kubernetes job + # uses: ./.github/actions/submit-delete-k8s-job + # with: + # job-config-file: .github/eks-workflow-files/post-process-job.yml + # job-name: ${{ env.POSTPROCESS_JOB_NAME }} + + # test-te-h100: + # needs: build-jax + # if: >- + # inputs.ARCHITECTURE == 'amd64' && + # ( + # inputs.MODE == 'full' || + # inputs.MODE == 'te' + # ) + # uses: ./.github/workflows/_transformer_engine_eks.yaml + # with: + # JAX_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} + # JOB_NAME: transformerengine-${{ github.run_id }} + # S3_BUCKET: 
jax-toolbox-eks-output + # CI_NAME: transformer-engine + # secrets: inherit + + # test-jax-cutlass-h100: + # needs: build-jax + # if: >- + # inputs.ARCHITECTURE == 'amd64' && + # ( + # inputs.MODE == 'full' || + # inputs.MODE == 'jax-cutlass' + # ) + # uses: ./.github/workflows/_jax_cutlass_eks.yaml + # with: + # JAX_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} + # JOB_NAME: jax-cutlass-${{ github.run_id }} + # S3_BUCKET: jax-toolbox-eks-output + # CI_NAME: jax-cutlass + # secrets: inherit + + # test-te-a100: + # needs: build-jax + # secrets: inherit + # if: >- + # inputs.ARCHITECTURE == 'amd64' && + # ( + # inputs.MODE == 'full' || + # inputs.MODE == 'te' + # ) + # uses: ./.github/workflows/_test_unit.yaml + # with: + # TEST_NAME: te + # EXECUTE: | + # docker run -i --gpus all --shm-size=1g -v $PWD:/log \ + # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ + # bash <<"EOF" |& tee test-te.log + # set -xu -o pipefail + + # LOG_DIR=/log + + # pip install pytest-reportlog pytest-xdist + # # Start MPS daemon + # nvidia-cuda-mps-control -d + # # TE's default is slightly different, without the hyphen + # export TE_PATH=${SRC_PATH_TRANSFORMER_ENGINE} + # # 1 GPU per worker, 3 workers per GPU + # pytest-xdist.sh 1 3 ${LOG_DIR}/pytest-report-L0-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_unittest/test.sh + # ## 8 GPUs per worker, 1 worker per GPU. pytest-xdist.sh allows aggregation + # ## into a single .jsonl file of results from multiple pytest invocations + # ## inside the test.sh script, so it's useful even with a single worker per + # ## device. 
+ # pytest-xdist.sh 8 1 ${LOG_DIR}/pytest-report-L0-distributed-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_distributed_unittest/test.sh + + # # merge the log files + # cat \ + # ${LOG_DIR}/pytest-report-L0-unittest.jsonl \ + # ${LOG_DIR}/pytest-report-L0-distributed-unittest.jsonl \ + # > ${LOG_DIR}/pytest-report.jsonl + + # EOF + # STATISTICS_SCRIPT: | + # report_json=pytest-report.jsonl + # summary_line=$(tail -n1 test-te.log) + # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') + # passed_tests=$(cat $report_json | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) + # failed_tests=$(cat $report_json | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) + # total_tests=$((failed_tests + passed_tests)) + # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT + # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT + # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT + + # echo "$failed_tests tests failed" + # if [[ $failed_tests -gt 0 ]]; then + # exit 1 + # else + # exit 0 + # fi + + # TIMEOUT_MINUTES: 120 + # ARTIFACTS: | + # test-te.log + # pytest-report.jsonl + # pytest-report-L0-unittest.jsonl + # pytest-report-L0-distributed-unittest.jsonl + + # test-maxtext: + # needs: build-maxtext + # if: >- + # inputs.ARCHITECTURE == 'amd64' && + # ( + # inputs.MODE == 'full' || + # inputs.MODE == 'maxtext' + # ) + # uses: ./.github/workflows/_test_maxtext.yaml + # with: + # MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit + + # test-maxtext-gke: + # needs: build-maxtext + # if: >- + # inputs.ARCHITECTURE == 'amd64' && + # ( + # inputs.MODE == 'full' || + # inputs.MODE == 'maxtext' + # ) + # uses: ./.github/workflows/_test_maxtext_gke_xpk.yaml + # with: + # MAXTEXT_IMAGE: ${{ 
needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} + # secrets: inherit + + # test-axlearn-eks: + # needs: build-axlearn + # if: >- + # inputs.ARCHITECTURE == 'amd64' && + # ( + # inputs.MODE == 'full' || + # inputs.MODE == 'axlearn' + # ) + # runs-on: eks + # env: + # AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} + # JOB_NAME: axlearn-${{ github.run_id }} + # steps: + # - name: Check out the repository + # uses: actions/checkout@v4 + # - name: Login to GitHub Container Registry + # uses: docker/login-action@v3 + # with: + # registry: ghcr.io + # username: ${{ github.repository_owner }} + # password: ${{ secrets.GITHUB_TOKEN }} + # - name: K8s GHCR store and delete token + # id: store-token + # uses: ./.github/actions/store-delete-k8s-ghcr + # - name: Configure axlearn test job + # run: | + # # Replace placeholders in axlearn-job.yml with environment variables + # yq -i ea ' + # select(di == 0).metadata.name = strenv(JOB_NAME) + # | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) + # | select(di == 0).spec.template.spec.containers[0].env[0].value = "${{ github.run_id }}" + # | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \ + # .github/eks-workflow-files/axlearn/axlearn-job.yml + # git diff .github/eks-workflow-files/axlearn/axlearn-job.yml + # - name: Submit & delete axlearn test + # uses: ./.github/actions/submit-delete-k8s-job + # with: + # job-config-file: ".github/eks-workflow-files/axlearn/axlearn-job.yml" + # job-name: ${{ env.JOB_NAME }} + # - name: Download logs from S3 + # id: log-s3 + # if: ${{ !cancelled() }} + # run: | + # mkdir -p axlearn-output + # aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/summary.txt axlearn-output/ + # aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/ axlearn-output/ --recursive --exclude "*" --include "*.log" + # aws s3 cp 
s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/ axlearn-output/ --recursive --exclude "*" --include "*.xml" + + + # passed_tests=$(grep -Eo 'PASSED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' ) + # failed_tests=$(grep -Eo 'FAILED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' ) + # skipped_tests=$(grep -Eo 'SKIPPED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' ) + # total_tests=$((failed_tests + passed_tests + skipped_tests)) + + # echo "Passed tests: $passed_tests" + # echo "Failed tests: $failed_tests" + # echo "Skipped tests: $skipped_tests" + # echo "Total tests: $total_tests" + # echo "PASSED_TESTS=$passed_tests" >> $GITHUB_OUTPUT + # echo "FAILED_TESTS=$failed_tests" >> $GITHUB_OUTPUT + # echo "TOTAL_TESTS=$total_tests" >> $GITHUB_OUTPUT + # - name: Generate sitrep + # id: sitrep + # if: ${{ !cancelled() }} + # shell: bash -x -e {0} + # run: | + # # bring in utility functions + # source .github/workflows/scripts/to_json.sh + + # badge_label='Axlearn EKS Unit' + + # total_tests=${{ steps.log-s3.outputs.TOTAL_TESTS }} \ + # failed_tests=${{ steps.log-s3.outputs.FAILED_TESTS }} \ + # passed_tests=${{ steps.log-s3.outputs.PASSED_TESTS }} \ + # errors="0" \ + # summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests." \ + # badge_message="Passed $passed_tests out of $total_tests." \ + # badge_color="brightgreen" + # if [ "$failed_tests" -gt 0 ]; then + # badge_color="red" + # fi \ + + # to_json \ + # summary \ + # errors total_tests passed_tests failed_tests \ + # badge_label badge_color badge_message \ + # > sitrep.json + + # schemaVersion=1 \ + # label="${badge_label}" \ + # message="Passed $passed_tests out of $total_tests." 
\ + # color=$badge_color \ + # to_json schemaVersion label message color \ + # > badge-axlearn-test.json + + # - name: Upload artifacts + # if: ${{ !cancelled() }} + # uses: actions/upload-artifact@v4 + # with: + # name: "artifact-axlearn-test" + # path: | + # sitrep.json + # badge-axlearn-test.json + # axlearn-unittests.jsonl + # axlearn-output/* + + # test-axlearn-fuji-models-eks: + # needs: build-axlearn + # if: >- + # inputs.ARCHITECTURE == 'amd64' && + # ( + # inputs.MODE == 'full' || + # inputs.MODE == 'axlearn' + # ) + # runs-on: eks + # env: + # AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} + # JOB_NAME: axlearn-fuji-3b-${{ github.run_id }} + # steps: + # - name: Check out the repository + # uses: actions/checkout@v4 + # - name: Login to GitHub Container Registry + # uses: docker/login-action@v3 + # with: + # registry: ghcr.io + # username: ${{ github.repository_owner }} + # password: ${{ secrets.GITHUB_TOKEN }} + # - name: K8s GHCR store and delete token + # id: store-token + # uses: ./.github/actions/store-delete-k8s-ghcr + # - name: Configure axlearn test job + # run: | + # yq -i ea ' + # select(di == 0).metadata.name = strenv(JOB_NAME) + # | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) + # | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \ + # .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml + # git diff .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml + + # - name: Submit & delete axlearn fuji model test + # uses: ./.github/actions/submit-delete-k8s-job + # with: + # job-config-file: ".github/eks-workflow-files/axlearn/axlearn-fuji-model.yml" + # job-name: ${{ env.JOB_NAME }} From ddcdc489e091f0c9b1a71f18c5f9dddf8af8aab2 Mon Sep 17 00:00:00 2001 From: Steboss Date: Thu, 19 Mar 2026 09:37:35 +0000 Subject: [PATCH 06/15] test without grpc --- .github/workflows/_ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 893d4b04a..509b5ea99 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -82,7 +82,7 @@ jobs: ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} github-token: ${{ secrets.GITHUB_TOKEN }} - bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }} + bazel-remote-cache-url: "" ENABLE_BAZEL_DISK_CACHE: 'true' ENABLE_BAZEL_REPO_CACHE: 'true' EXTRA_BUILD_ARGS: | From 066f5fcfbb0a756390707dc9c96d36b768d44cb1 Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 20 Mar 2026 08:51:17 +0000 Subject: [PATCH 07/15] impose liimits to the size of the cache, compress the cache to avoid having lots of files being pushed and timing out --- .github/actions/build-container/action.yml | 23 +++++++++++++++------- .github/container/Dockerfile.jax | 1 + 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/.github/actions/build-container/action.yml b/.github/actions/build-container/action.yml index e711fdcd5..b6d253ccc 100644 --- a/.github/actions/build-container/action.yml +++ b/.github/actions/build-container/action.yml @@ -127,7 +127,7 @@ runs: if: inputs.ENABLE_BAZEL_DISK_CACHE == 'true' uses: actions/cache/restore@v4 with: - path: /tmp/bazel-disk-cache + path: /tmp/bazel-disk.tar key: bazel-disk-cache-${{ inputs.ARCHITECTURE }}-${{ github.run_id }} restore-keys: | bazel-disk-cache-${{ inputs.ARCHITECTURE }}- @@ -136,17 +136,23 @@ runs: if: inputs.ENABLE_BAZEL_REPO_CACHE == 'true' uses: actions/cache/restore@v4 with: - path: /tmp/bazel-repo-cache + path: /tmp/bazel-repo.tar key: bazel-repo-cache-${{ inputs.ARCHITECTURE }}-${{ github.run_id }} restore-keys: | bazel-repo-cache-${{ inputs.ARCHITECTURE }}- - # Always create the seed dirs so --build-context is always valid (no-op if empty) + # Extract restored tars into seed dirs; create empty dirs on first run - name: Prepare Bazel cache seed directories shell: bash run: | mkdir 
-p /tmp/bazel-disk-cache + if [[ -f /tmp/bazel-disk.tar ]]; then + tar -xf /tmp/bazel-disk.tar -C /tmp/bazel-disk-cache + fi mkdir -p /tmp/bazel-repo-cache + if [[ -f /tmp/bazel-repo.tar ]]; then + tar -xf /tmp/bazel-repo.tar -C /tmp/bazel-repo-cache + fi # MEALKIT BUILD - name: Set docker metadata - mealkit @@ -218,6 +224,9 @@ runs: ${{ inputs.EXTRA_BUILD_ARGS }} # BAZEL CACHE EXPORT + # type=tar,compression=zstd streams a single archive instead of per-file + # copies — avoids the O(N-files) overhead that caused 3h+ timeouts with + # type=local on a large Bazel disk cache. - name: Export Bazel disk cache if: inputs.ENABLE_BAZEL_DISK_CACHE == 'true' uses: docker/build-push-action@v5 @@ -227,7 +236,7 @@ runs: file: ${{ inputs.DOCKERFILE }} platforms: linux/${{ inputs.ARCHITECTURE }} target: bazel-disk-export - outputs: type=local,dest=/tmp/bazel-disk-cache-new + outputs: type=tar,dest=/tmp/bazel-disk.tar,compression=zstd,compression-level=3 build-contexts: | bazel-disk-seed=/tmp/bazel-disk-cache build-args: | @@ -239,7 +248,7 @@ runs: if: inputs.ENABLE_BAZEL_DISK_CACHE == 'true' uses: actions/cache/save@v4 with: - path: /tmp/bazel-disk-cache-new + path: /tmp/bazel-disk.tar key: bazel-disk-cache-${{ inputs.ARCHITECTURE }}-${{ github.run_id }} - name: Export Bazel repo cache @@ -251,7 +260,7 @@ runs: file: ${{ inputs.DOCKERFILE }} platforms: linux/${{ inputs.ARCHITECTURE }} target: bazel-repo-export - outputs: type=local,dest=/tmp/bazel-repo-cache-new + outputs: type=tar,dest=/tmp/bazel-repo.tar,compression=zstd,compression-level=3 build-args: | BASE_IMAGE=${{ inputs.BASE_IMAGE }} BUILD_DATE=${{ inputs.BUILD_DATE }} @@ -261,7 +270,7 @@ runs: if: inputs.ENABLE_BAZEL_REPO_CACHE == 'true' uses: actions/cache/save@v4 with: - path: /tmp/bazel-repo-cache-new + path: /tmp/bazel-repo.tar key: bazel-repo-cache-${{ inputs.ARCHITECTURE }}-${{ github.run_id }} # SITREP GENERATION diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index 
f964df995..4247c2624 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -72,6 +72,7 @@ RUN --mount=type=cache,id=bazel-disk-${TARGETARCH},target=/cache/bazel-disk,shar build-jax.sh \ --bazel-cache ${BAZEL_CACHE} \ --build-param --bazel_options=--repository_cache=/cache/bazel-repo \ + --build-param --bazel_options=--disk_cache_max_size=32212254720 \ --build-path-jaxlib ${BUILD_PATH_JAXLIB} \ --extra-targets "${EXTRA_BAZEL_TARGETS}" \ --extra-target-dest /builder/extra-targets \ From 52fee504f5bbd58d4f0eb5b29c67c5b85c7f124a Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 20 Mar 2026 09:17:09 +0000 Subject: [PATCH 08/15] i thought we have a bazel disk size --- .github/container/Dockerfile.jax | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax index 4247c2624..f964df995 100644 --- a/.github/container/Dockerfile.jax +++ b/.github/container/Dockerfile.jax @@ -72,7 +72,6 @@ RUN --mount=type=cache,id=bazel-disk-${TARGETARCH},target=/cache/bazel-disk,shar build-jax.sh \ --bazel-cache ${BAZEL_CACHE} \ --build-param --bazel_options=--repository_cache=/cache/bazel-repo \ - --build-param --bazel_options=--disk_cache_max_size=32212254720 \ --build-path-jaxlib ${BUILD_PATH_JAXLIB} \ --extra-targets "${EXTRA_BAZEL_TARGETS}" \ --extra-target-dest /builder/extra-targets \ From 5d6838e7fab758e3eaeb5b069049b1d0d37e1bc0 Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 20 Mar 2026 11:25:57 +0000 Subject: [PATCH 09/15] fix version of buildkit and compression, to allow the artifact compression over limits --- .github/actions/build-container/action.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/actions/build-container/action.yml b/.github/actions/build-container/action.yml index b6d253ccc..5f5eb409b 100644 --- a/.github/actions/build-container/action.yml +++ b/.github/actions/build-container/action.yml @@ -108,7 +108,7 @@ runs: uses: 
docker/setup-buildx-action@v3 with: driver-opts: | - image=moby/buildkit:v0.12.1 + image=moby/buildkit:v0.19.0 - name: Download nsys-jax version.py uses: actions/download-artifact@v4 @@ -236,7 +236,7 @@ runs: file: ${{ inputs.DOCKERFILE }} platforms: linux/${{ inputs.ARCHITECTURE }} target: bazel-disk-export - outputs: type=tar,dest=/tmp/bazel-disk.tar,compression=zstd,compression-level=3 + outputs: type=tar,dest=/tmp/bazel-disk.tar build-contexts: | bazel-disk-seed=/tmp/bazel-disk-cache build-args: | @@ -260,7 +260,7 @@ runs: file: ${{ inputs.DOCKERFILE }} platforms: linux/${{ inputs.ARCHITECTURE }} target: bazel-repo-export - outputs: type=tar,dest=/tmp/bazel-repo.tar,compression=zstd,compression-level=3 + outputs: type=tar,dest=/tmp/bazel-repo.tar build-args: | BASE_IMAGE=${{ inputs.BASE_IMAGE }} BUILD_DATE=${{ inputs.BUILD_DATE }} From d17ef5e4e407475047f6b473b81883869436002d Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 20 Mar 2026 15:00:02 +0000 Subject: [PATCH 10/15] fix teh platform --- .github/workflows/_build_base.yaml | 2 +- .github/workflows/_ci.yaml | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/_build_base.yaml b/.github/workflows/_build_base.yaml index 4774b6bb8..3251d5dd9 100644 --- a/.github/workflows/_build_base.yaml +++ b/.github/workflows/_build_base.yaml @@ -58,7 +58,7 @@ permissions: jobs: build-base: - runs-on: ${{ inputs.ARCHITECTURE == 'amd64' && 'linux-amd64-cpu32m' || 'linux-arm64-cpu32m' }} + runs-on: ${{ inputs.ARCHITECTURE == 'amd64' && 'linux-amd64-cpu16m' || 'linux-arm64-cpu16m' }} env: BADGE_FILENAME_FULL: ${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json outputs: diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index 509b5ea99..a91ed0a6a 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -53,17 +53,17 @@ jobs: MANIFEST_ARTIFACT_NAME: ${{ inputs.MANIFEST_ARTIFACT_NAME }} secrets: inherit - test-nccl: - if: 
inputs.ARCHITECTURE == 'amd64' # build only amd64 - needs: build-base - uses: ./.github/workflows/_test_nccl.yaml - with: - CONTAINER: ${{ needs.build-base.outputs.DOCKER_TAG }} - secrets: inherit + # test-nccl: + # if: inputs.ARCHITECTURE == 'amd64' # build only amd64 + # needs: build-base + # uses: ./.github/workflows/_test_nccl.yaml + # with: + # CONTAINER: ${{ needs.build-base.outputs.DOCKER_TAG }} + # secrets: inherit build-jax: needs: build-base - runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "large"] + runs-on: ${{ inputs.ARCHITECTURE == 'amd64' && 'linux-amd64-cpu32m' || 'linux-arm64-cpu32m' }} steps: - name: Checkout repository uses: actions/checkout@v4 From 1a04c26f6422c8ad1087e2f67d41ac7ae8e87c57 Mon Sep 17 00:00:00 2001 From: Steboss Date: Sat, 21 Mar 2026 09:52:22 +0000 Subject: [PATCH 11/15] trigger a ci build --- simplefile | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 simplefile diff --git a/simplefile b/simplefile new file mode 100644 index 000000000..e69de29bb From 57481ee650f662dee25be8ee216fb15211fb84fa Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 23 Mar 2026 10:35:42 +0000 Subject: [PATCH 12/15] trigger again the pipeline --- simplefile2trigger | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 simplefile2trigger diff --git a/simplefile2trigger b/simplefile2trigger new file mode 100644 index 000000000..e69de29bb From 8429d77efef10a1999072534726924c5026058d3 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 23 Mar 2026 16:52:13 +0000 Subject: [PATCH 13/15] pruning on --- .github/actions/build-container/action.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/actions/build-container/action.yml b/.github/actions/build-container/action.yml index 5f5eb409b..e00ab5d35 100644 --- a/.github/actions/build-container/action.yml +++ b/.github/actions/build-container/action.yml @@ -224,9 +224,12 @@ runs: ${{ inputs.EXTRA_BUILD_ARGS }} # BAZEL CACHE EXPORT - # 
type=tar,compression=zstd streams a single archive instead of per-file - # copies — avoids the O(N-files) overhead that caused 3h+ timeouts with - # type=local on a large Bazel disk cache. + - name: Prune BuildKit cache to free space for export + if: inputs.ENABLE_BAZEL_DISK_CACHE == 'true' || inputs.ENABLE_BAZEL_REPO_CACHE == 'true' + shell: bash + run: docker buildx prune --force + + # type=tar streams a single archive instead of per-file copies - name: Export Bazel disk cache if: inputs.ENABLE_BAZEL_DISK_CACHE == 'true' uses: docker/build-push-action@v5 From a422308b67f324641f9779f8f13c63dd291b9042 Mon Sep 17 00:00:00 2001 From: Steboss Date: Wed, 25 Mar 2026 14:09:14 +0000 Subject: [PATCH 14/15] try new workflow and move jobs on eks --- .github/eks-workflow-files/jax/test.yml | 48 + .github/eks-workflow-files/maxtext/test.yml | 69 + .github/eks-workflow-files/nsys-jax/test.yml | 48 + .github/workflows/_ci.yaml | 1208 +++++++++--------- 4 files changed, 771 insertions(+), 602 deletions(-) create mode 100644 .github/eks-workflow-files/jax/test.yml create mode 100644 .github/eks-workflow-files/maxtext/test.yml create mode 100644 .github/eks-workflow-files/nsys-jax/test.yml diff --git a/.github/eks-workflow-files/jax/test.yml b/.github/eks-workflow-files/jax/test.yml new file mode 100644 index 000000000..c6b6c4bbe --- /dev/null +++ b/.github/eks-workflow-files/jax/test.yml @@ -0,0 +1,48 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: PLACEHOLDER + labels: + kueue.x-k8s.io/queue-name: p5-queue + kueue.x-k8s.io/max-exec-time-seconds: "10800" +spec: + template: + spec: + restartPolicy: Never + containers: + - name: jax + image: PLACEHOLDER + command: + - bash + - -c + - | + set -exo pipefail + + LOG_DIR="/output/${RUN_ID}" + mkdir -p ${LOG_DIR} + + # backend-independent tests + test-jax.sh -b backend-independent 2>&1 | tee ${LOG_DIR}/test-backend-independent.log + + # single-gpu tests + nvidia-cuda-mps-control -d + test-jax.sh -b single-gpu 2>&1 | tee 
${LOG_DIR}/test-single-gpu.log + + # multi-gpu tests + test-jax.sh -b multi-gpu 2>&1 | tee ${LOG_DIR}/test-multi-gpu.log + env: + - name: RUN_ID + value: PLACEHOLDER + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: s3-storage + mountPath: /output + subPath: jax + imagePullSecrets: + - name: PLACEHOLDER + volumes: + - name: s3-storage + persistentVolumeClaim: + claimName: s3-pvc diff --git a/.github/eks-workflow-files/maxtext/test.yml b/.github/eks-workflow-files/maxtext/test.yml new file mode 100644 index 000000000..455cf2f4d --- /dev/null +++ b/.github/eks-workflow-files/maxtext/test.yml @@ -0,0 +1,69 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: PLACEHOLDER + labels: + kueue.x-k8s.io/queue-name: p5-queue + kueue.x-k8s.io/max-exec-time-seconds: "10800" +spec: + template: + spec: + restartPolicy: Never + containers: + - name: maxtext + image: PLACEHOLDER + command: + - bash + - -c + - | + set -exo pipefail + + LOG_DIR="/output/${RUN_ID}" + mkdir -p ${LOG_DIR} + + # single-process-multi-device: PP=1, DP=1, FSDP=2, TP=4 + test-maxtext.sh \ + --output ${LOG_DIR}/1DP2FSDP4TP1PP_single_process \ + --dtype bfloat16 \ + --mem-fraction 0.65 \ + --decoder-block default \ + --attn-type dot_product \ + --batch-per-gpu 2 \ + --steps 10 \ + --pipeline-parallel 1 \ + --data-parallel 1 \ + --fsdp 2 \ + --tensor-parallel 4 \ + --nodes 1 + + # multi-process: PP=1, DP=2, FSDP=2, TP=2 + test-maxtext.sh \ + --output ${LOG_DIR}/2DP2FSDP2TP1PP \ + --dtype bfloat16 \ + --mem-fraction 0.65 \ + --decoder-block default \ + --attn-type dot_product \ + --batch-per-gpu 2 \ + --steps 10 \ + --pipeline-parallel 1 \ + --data-parallel 2 \ + --fsdp 2 \ + --tensor-parallel 2 \ + --nodes 1 \ + --multiprocess + env: + - name: RUN_ID + value: PLACEHOLDER + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: s3-storage + mountPath: /output + subPath: maxtext + imagePullSecrets: + - name: PLACEHOLDER + volumes: + - name: s3-storage + persistentVolumeClaim: + 
claimName: s3-pvc diff --git a/.github/eks-workflow-files/nsys-jax/test.yml b/.github/eks-workflow-files/nsys-jax/test.yml new file mode 100644 index 000000000..789eca17c --- /dev/null +++ b/.github/eks-workflow-files/nsys-jax/test.yml @@ -0,0 +1,48 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: PLACEHOLDER + labels: + kueue.x-k8s.io/queue-name: p5-queue + kueue.x-k8s.io/max-exec-time-seconds: "10800" +spec: + template: + spec: + restartPolicy: Never + containers: + - name: nsys-jax + image: PLACEHOLDER + command: + - bash + - -c + - | + set -exo pipefail + + LOG_DIR="/output/${RUN_ID}" + mkdir -p ${LOG_DIR} + + # nsys-jax is already installed, this is just adding the test dependencies + pip install pytest-reportlog nsys-jax[test] + # abuse knowledge that nsys-jax is installed editable, so the tests exist + test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') + pytest \ + --basetemp=${LOG_DIR}/pytest-tmp \ + --report-log=${LOG_DIR}/pytest-report.jsonl \ + "${test_path}" \ + 2>&1 | tee ${LOG_DIR}/test-nsys-jax.log + env: + - name: RUN_ID + value: PLACEHOLDER + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: s3-storage + mountPath: /output + subPath: nsys-jax + imagePullSecrets: + - name: PLACEHOLDER + volumes: + - name: s3-storage + persistentVolumeClaim: + claimName: s3-pvc diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index a91ed0a6a..251352f68 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -53,13 +53,13 @@ jobs: MANIFEST_ARTIFACT_NAME: ${{ inputs.MANIFEST_ARTIFACT_NAME }} secrets: inherit - # test-nccl: - # if: inputs.ARCHITECTURE == 'amd64' # build only amd64 - # needs: build-base - # uses: ./.github/workflows/_test_nccl.yaml - # with: - # CONTAINER: ${{ needs.build-base.outputs.DOCKER_TAG }} - # secrets: inherit + test-nccl: + if: inputs.ARCHITECTURE == 'amd64' # build only amd64 + needs: build-base + 
uses: ./.github/workflows/_test_nccl.yaml + with: + CONTAINER: ${{ needs.build-base.outputs.DOCKER_TAG }} + secrets: inherit build-jax: needs: build-base @@ -94,598 +94,602 @@ jobs: DOCKER_TAG_MEALKIT: ${{ steps.build-jax.outputs.DOCKER_TAG_MEALKIT }} DOCKER_TAG_FINAL: ${{ steps.build-jax.outputs.DOCKER_TAG_FINAL }} - # build-equinox: - # needs: build-jax - # runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"] - # outputs: - # DOCKER_TAG_MEALKIT: ${{ steps.build-equinox.outputs.DOCKER_TAG_MEALKIT }} - # DOCKER_TAG_FINAL: ${{ steps.build-equinox.outputs.DOCKER_TAG_FINAL }} - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # - name: Build Equinox container - # id: build-equinox - # uses: ./.github/actions/build-container - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-equinox-build - # BADGE_FILENAME: badge-equinox-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: equinox - # DOCKERFILE: .github/container/Dockerfile.equinox - # RUNNER_SIZE: small - # ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} - # ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} - # github-token: ${{ secrets.GITHUB_TOKEN }} - # bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }} - # EXTRA_BUILD_ARGS: | - # URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} - - # build-maxtext: - # needs: build-jax - # runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"] - # outputs: - # DOCKER_TAG_MEALKIT: ${{ steps.build-maxtext.outputs.DOCKER_TAG_MEALKIT }} - # DOCKER_TAG_FINAL: ${{ steps.build-maxtext.outputs.DOCKER_TAG_FINAL }} - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # - name: Build MaxText container - # id: build-maxtext - # uses: ./.github/actions/build-container - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-maxtext-build - # BADGE_FILENAME: badge-maxtext-build - 
# BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: maxtext - # DOCKERFILE: .github/container/Dockerfile.maxtext - # RUNNER_SIZE: small - # ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} - # ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} - # github-token: ${{ secrets.GITHUB_TOKEN }} - # bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }} - # EXTRA_BUILD_ARGS: | - # URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} - - # build-torchax: - # needs: build-jax - # runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"] - # outputs: - # DOCKER_TAG_MEALKIT: ${{ steps.build-torchax.outputs.DOCKER_TAG_MEALKIT }} - # DOCKER_TAG_FINAL: ${{ steps.build-torchax.outputs.DOCKER_TAG_FINAL }} - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # - name: Build TorchAX container - # id: build-torchax - # uses: ./.github/actions/build-container - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-torchax-build - # BADGE_FILENAME: badge-torchax-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: torchax - # DOCKERFILE: .github/container/Dockerfile.torchax - # RUNNER_SIZE: small - # ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} - # ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} - # github-token: ${{ secrets.GITHUB_TOKEN }} - # bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }} - # EXTRA_BUILD_ARGS: | - # URLREF_TORCHAX=${{ fromJson(inputs.SOURCE_URLREFS).TORCHAX }} - - # build-axlearn: - # needs: build-jax - # runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "large"] - # outputs: - # DOCKER_TAG_MEALKIT: ${{ steps.build-axlearn.outputs.DOCKER_TAG_MEALKIT }} - # DOCKER_TAG_FINAL: ${{ steps.build-axlearn.outputs.DOCKER_TAG_FINAL }} - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # - name: Build AxLearn container - # id: 
build-axlearn - # uses: ./.github/actions/build-container - # with: - # ARCHITECTURE: ${{ inputs.ARCHITECTURE }} - # ARTIFACT_NAME: artifact-axlearn-build - # BADGE_FILENAME: badge-axlearn-build - # BUILD_DATE: ${{ inputs.BUILD_DATE }} - # BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} - # CONTAINER_NAME: axlearn - # DOCKERFILE: .github/container/Dockerfile.axlearn - # RUNNER_SIZE: large - # ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} - # ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} - # github-token: ${{ secrets.GITHUB_TOKEN }} - # bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }} - # EXTRA_BUILD_ARGS: | - # URLREF_AXLEARN=${{ fromJson(inputs.SOURCE_URLREFS).AXLEARN }} - - # collect-docker-tags: - # runs-on: ubuntu-22.04 - # if: ${{ !cancelled() }} - # needs: - # - build-base - # - build-jax - # - build-equinox - # - build-maxtext - # - build-axlearn - # outputs: - # TAGS: ${{ steps.collect-tags.outputs.TAGS }} - # steps: - # - name: Save docker tags as a JSON object - # id: collect-tags - # run: | - # TAGS=$(cat <> $GITHUB_OUTPUT - - # test-jax: - # needs: build-jax - # if: >- - # inputs.ARCHITECTURE == 'amd64' && - # ( - # inputs.MODE == 'full' || - # inputs.MODE == 'jax' - # ) - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: jax - # EXECUTE: | - # docker run -i --shm-size=1g --gpus all \ - # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-backend-independent.log - # test-jax.sh -b backend-independent - # EOF - # docker run -i --shm-size=1g --gpus all \ - # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-single-gpu.log - # nvidia-cuda-mps-control -d - # test-jax.sh -b single-gpu - # EOF - # docker run -i --shm-size=1g --gpus all \ - # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-multi-gpu.log - # nvidia-cuda-mps-control -d - # test-jax.sh -b multi-gpu - # EOF - # STATISTICS_SCRIPT: | - # errors=$(cat test-*.log | grep 
-c 'ERROR:' || true) - # failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true) - # passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true) - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # test-backend-independent.log - # test-multi-gpu.log - # test-single-gpu.log - # secrets: inherit - - # test-nsys-jax: - # needs: build-jax - # if: >- - # inputs.ARCHITECTURE == 'amd64' && - # ( - # inputs.MODE == 'full' || - # inputs.MODE == 'jax' - # ) - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: nsys-jax - # EXECUTE: | - # set -o pipefail - # mkdir -p output-results - # docker run -i --shm-size=1g --gpus all \ - # -v $PWD/output-results:/opt/output \ - # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-nsys-jax.log - # # nsys-jax is already installed, this is just adding the test dependencies - # pip install pytest-reportlog nsys-jax[test] - # # abuse knowledge that nsys-jax is installed editable, so the tests exist - # test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())') - # pytest --basetemp=/opt/output/pytest-tmp --report-log=/opt/output/pytest-report.jsonl "${test_path}" - # chmod -R a+rwX /opt/output - # EOF - # STATISTICS_SCRIPT: | - # summary_line=$(tail -n1 test-nsys-jax.log) - # num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(cat output-results/pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - # failed_tests=$(cat output-results/pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == 
"failed") | .outcome' | wc -l) - # total_tests=$(( passed_tests + failed_tests )) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - # ARTIFACTS: | - # # pytest-driven part - # test-nsys-jax.log - # output-results/pytest-report.jsonl - # output-results/pytest-tmp/ - # secrets: inherit - - # # test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test - # # runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does - # # not already have nsys-jax installed - # test-nsys-jax-archive: - # needs: test-nsys-jax - # if: >- - # inputs.ARCHITECTURE == 'amd64' && - # ( - # inputs.MODE == 'full' || - # inputs.MODE == 'jax' - # ) - # strategy: - # matrix: - # os: [ubuntu-22.04, ubuntu-24.04, macOS-latest] - # runs-on: ${{ matrix.os }} - # steps: - # - name: Download nsys-jax output .zip files - # uses: actions/download-artifact@v4 - # with: - # name: nsys-jax-unit-test-A100 - # - name: Extract archives and execute install scripts - # run: | - # pip install virtualenv # for install.sh - # for zip in $(ls *.zip); do - # ZIP="${PWD}/${zip}" - # pushd $(mktemp -d) - # unzip "${ZIP}" - # ls -l - # # TODO: verify this isn't needed, or make sure it isn't needed - # chmod 755 install.sh - # # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout - # # Skip executing Jupyter lab - # NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh - # popd - # done - - # test-nsys-jax-eks: - # needs: build-jax - # if: >- - # inputs.ARCHITECTURE == 'amd64' && - # ( - # inputs.MODE == 'full' || - # inputs.MODE == 'jax' - # ) - # runs-on: eks - # env: - # JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} - # JOB_NAME: ${{ github.run_id }}-nsys-jax - # # Service 
name cannot start with a number - # SERVICE_NAME: svc-${{ github.run_id}}-nsys-jax - # POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess - # steps: - # - name: Check out the repository - # uses: actions/checkout@v4 - # - name: Login to GitHub Container Registry - # uses: docker/login-action@v3 - # with: - # registry: ghcr.io - # username: ${{ github.repository_owner }} - # password: ${{ secrets.GITHUB_TOKEN }} - # - name: K8s GHCR store and delete token - # id: store-token - # uses: ./.github/actions/store-delete-k8s-ghcr - # - name: Configure Kubernetes job - # run: | - # yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME) - # | select(di == 0).metadata.name = strenv(SERVICE_NAME) - # | select(di == 1).metadata.name = strenv(JOB_NAME) - # | select(di == 1).spec.template.spec.subdomain = strenv(SERVICE_NAME) - # | select(di == 1).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}" - # | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE) - # | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME) - # | select(di == 1).spec.template.spec.containers[0].env[1].value = strenv(SERVICE_NAME)' \ - # .github/eks-workflow-files/job.yml - # git diff .github/eks-workflow-files/job.yml - # - name: Submit Kubernetes job - # uses: ./.github/actions/submit-delete-k8s-job - # with: - # job-config-file: .github/eks-workflow-files/job.yml - # job-name: ${{ env.JOB_NAME }} - # - name: Configure post-processing job - # run: | - # export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip" - # yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME) - # | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE) - # | .spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}" - # | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \ - # .github/eks-workflow-files/post-process-job.yml - # git diff 
.github/eks-workflow-files/post-process-job.yml - # - name: Submit post process Kubernetes job - # uses: ./.github/actions/submit-delete-k8s-job - # with: - # job-config-file: .github/eks-workflow-files/post-process-job.yml - # job-name: ${{ env.POSTPROCESS_JOB_NAME }} - - # test-te-h100: - # needs: build-jax - # if: >- - # inputs.ARCHITECTURE == 'amd64' && - # ( - # inputs.MODE == 'full' || - # inputs.MODE == 'te' - # ) - # uses: ./.github/workflows/_transformer_engine_eks.yaml - # with: - # JAX_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} - # JOB_NAME: transformerengine-${{ github.run_id }} - # S3_BUCKET: jax-toolbox-eks-output - # CI_NAME: transformer-engine - # secrets: inherit - - # test-jax-cutlass-h100: - # needs: build-jax - # if: >- - # inputs.ARCHITECTURE == 'amd64' && - # ( - # inputs.MODE == 'full' || - # inputs.MODE == 'jax-cutlass' - # ) - # uses: ./.github/workflows/_jax_cutlass_eks.yaml - # with: - # JAX_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} - # JOB_NAME: jax-cutlass-${{ github.run_id }} - # S3_BUCKET: jax-toolbox-eks-output - # CI_NAME: jax-cutlass - # secrets: inherit - - # test-te-a100: - # needs: build-jax - # secrets: inherit - # if: >- - # inputs.ARCHITECTURE == 'amd64' && - # ( - # inputs.MODE == 'full' || - # inputs.MODE == 'te' - # ) - # uses: ./.github/workflows/_test_unit.yaml - # with: - # TEST_NAME: te - # EXECUTE: | - # docker run -i --gpus all --shm-size=1g -v $PWD:/log \ - # ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \ - # bash <<"EOF" |& tee test-te.log - # set -xu -o pipefail - - # LOG_DIR=/log - - # pip install pytest-reportlog pytest-xdist - # # Start MPS daemon - # nvidia-cuda-mps-control -d - # # TE's default is slightly different, without the hyphen - # export TE_PATH=${SRC_PATH_TRANSFORMER_ENGINE} - # # 1 GPU per worker, 3 workers per GPU - # pytest-xdist.sh 1 3 ${LOG_DIR}/pytest-report-L0-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_unittest/test.sh - # ## 8 GPUs per worker, 1 worker per GPU. 
pytest-xdist.sh allows aggregation - # ## into a single .jsonl file of results from multiple pytest invocations - # ## inside the test.sh script, so it's useful even with a single worker per - # ## device. - # pytest-xdist.sh 8 1 ${LOG_DIR}/pytest-report-L0-distributed-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_distributed_unittest/test.sh - - # # merge the log files - # cat \ - # ${LOG_DIR}/pytest-report-L0-unittest.jsonl \ - # ${LOG_DIR}/pytest-report-L0-distributed-unittest.jsonl \ - # > ${LOG_DIR}/pytest-report.jsonl - - # EOF - # STATISTICS_SCRIPT: | - # report_json=pytest-report.jsonl - # summary_line=$(tail -n1 test-te.log) - # errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}') - # passed_tests=$(cat $report_json | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l) - # failed_tests=$(cat $report_json | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l) - # total_tests=$((failed_tests + passed_tests)) - # echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT - # echo "ERRORS=${errors}" >> $GITHUB_OUTPUT - # echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT - - # echo "$failed_tests tests failed" - # if [[ $failed_tests -gt 0 ]]; then - # exit 1 - # else - # exit 0 - # fi - - # TIMEOUT_MINUTES: 120 - # ARTIFACTS: | - # test-te.log - # pytest-report.jsonl - # pytest-report-L0-unittest.jsonl - # pytest-report-L0-distributed-unittest.jsonl - - # test-maxtext: - # needs: build-maxtext - # if: >- - # inputs.ARCHITECTURE == 'amd64' && - # ( - # inputs.MODE == 'full' || - # inputs.MODE == 'maxtext' - # ) - # uses: ./.github/workflows/_test_maxtext.yaml - # with: - # MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit - - # test-maxtext-gke: - # needs: build-maxtext - # if: >- - # inputs.ARCHITECTURE == 
'amd64' && - # ( - # inputs.MODE == 'full' || - # inputs.MODE == 'maxtext' - # ) - # uses: ./.github/workflows/_test_maxtext_gke_xpk.yaml - # with: - # MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }} - # secrets: inherit - - # test-axlearn-eks: - # needs: build-axlearn - # if: >- - # inputs.ARCHITECTURE == 'amd64' && - # ( - # inputs.MODE == 'full' || - # inputs.MODE == 'axlearn' - # ) - # runs-on: eks - # env: - # AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} - # JOB_NAME: axlearn-${{ github.run_id }} - # steps: - # - name: Check out the repository - # uses: actions/checkout@v4 - # - name: Login to GitHub Container Registry - # uses: docker/login-action@v3 - # with: - # registry: ghcr.io - # username: ${{ github.repository_owner }} - # password: ${{ secrets.GITHUB_TOKEN }} - # - name: K8s GHCR store and delete token - # id: store-token - # uses: ./.github/actions/store-delete-k8s-ghcr - # - name: Configure axlearn test job - # run: | - # # Replace placeholders in axlearn-job.yml with environment variables - # yq -i ea ' - # select(di == 0).metadata.name = strenv(JOB_NAME) - # | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) - # | select(di == 0).spec.template.spec.containers[0].env[0].value = "${{ github.run_id }}" - # | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \ - # .github/eks-workflow-files/axlearn/axlearn-job.yml - # git diff .github/eks-workflow-files/axlearn/axlearn-job.yml - # - name: Submit & delete axlearn test - # uses: ./.github/actions/submit-delete-k8s-job - # with: - # job-config-file: ".github/eks-workflow-files/axlearn/axlearn-job.yml" - # job-name: ${{ env.JOB_NAME }} - # - name: Download logs from S3 - # id: log-s3 - # if: ${{ !cancelled() }} - # run: | - # mkdir -p axlearn-output - # aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/summary.txt axlearn-output/ - # aws s3 cp 
s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/ axlearn-output/ --recursive --exclude "*" --include "*.log" - # aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/ axlearn-output/ --recursive --exclude "*" --include "*.xml" - - - # passed_tests=$(grep -Eo 'PASSED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' ) - # failed_tests=$(grep -Eo 'FAILED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' ) - # skipped_tests=$(grep -Eo 'SKIPPED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' ) - # total_tests=$((failed_tests + passed_tests + skipped_tests)) - - # echo "Passed tests: $passed_tests" - # echo "Failed tests: $failed_tests" - # echo "Skipped tests: $skipped_tests" - # echo "Total tests: $total_tests" - # echo "PASSED_TESTS=$passed_tests" >> $GITHUB_OUTPUT - # echo "FAILED_TESTS=$failed_tests" >> $GITHUB_OUTPUT - # echo "TOTAL_TESTS=$total_tests" >> $GITHUB_OUTPUT - # - name: Generate sitrep - # id: sitrep - # if: ${{ !cancelled() }} - # shell: bash -x -e {0} - # run: | - # # bring in utility functions - # source .github/workflows/scripts/to_json.sh - - # badge_label='Axlearn EKS Unit' - - # total_tests=${{ steps.log-s3.outputs.TOTAL_TESTS }} \ - # failed_tests=${{ steps.log-s3.outputs.FAILED_TESTS }} \ - # passed_tests=${{ steps.log-s3.outputs.PASSED_TESTS }} \ - # errors="0" \ - # summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests." \ - # badge_message="Passed $passed_tests out of $total_tests." \ - # badge_color="brightgreen" - # if [ "$failed_tests" -gt 0 ]; then - # badge_color="red" - # fi \ - - # to_json \ - # summary \ - # errors total_tests passed_tests failed_tests \ - # badge_label badge_color badge_message \ - # > sitrep.json - - # schemaVersion=1 \ - # label="${badge_label}" \ - # message="Passed $passed_tests out of $total_tests." 
\ - # color=$badge_color \ - # to_json schemaVersion label message color \ - # > badge-axlearn-test.json - - # - name: Upload artifacts - # if: ${{ !cancelled() }} - # uses: actions/upload-artifact@v4 - # with: - # name: "artifact-axlearn-test" - # path: | - # sitrep.json - # badge-axlearn-test.json - # axlearn-unittests.jsonl - # axlearn-output/* - - # test-axlearn-fuji-models-eks: - # needs: build-axlearn - # if: >- - # inputs.ARCHITECTURE == 'amd64' && - # ( - # inputs.MODE == 'full' || - # inputs.MODE == 'axlearn' - # ) - # runs-on: eks - # env: - # AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }} - # JOB_NAME: axlearn-fuji-3b-${{ github.run_id }} - # steps: - # - name: Check out the repository - # uses: actions/checkout@v4 - # - name: Login to GitHub Container Registry - # uses: docker/login-action@v3 - # with: - # registry: ghcr.io - # username: ${{ github.repository_owner }} - # password: ${{ secrets.GITHUB_TOKEN }} - # - name: K8s GHCR store and delete token - # id: store-token - # uses: ./.github/actions/store-delete-k8s-ghcr - # - name: Configure axlearn test job - # run: | - # yq -i ea ' - # select(di == 0).metadata.name = strenv(JOB_NAME) - # | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE) - # | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \ - # .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml - # git diff .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml - - # - name: Submit & delete axlearn fuji model test - # uses: ./.github/actions/submit-delete-k8s-job - # with: - # job-config-file: ".github/eks-workflow-files/axlearn/axlearn-fuji-model.yml" - # job-name: ${{ env.JOB_NAME }} + build-equinox: + needs: build-jax + runs-on: ${{ inputs.ARCHITECTURE == 'amd64' && 'linux-amd64-cpu16m' || 'linux-arm64-cpu16m' }} + outputs: + DOCKER_TAG_MEALKIT: ${{ steps.build-equinox.outputs.DOCKER_TAG_MEALKIT }} + 
DOCKER_TAG_FINAL: ${{ steps.build-equinox.outputs.DOCKER_TAG_FINAL }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Build Equinox container + id: build-equinox + uses: ./.github/actions/build-container + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-equinox-build + BADGE_FILENAME: badge-equinox-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: equinox + DOCKERFILE: .github/container/Dockerfile.equinox + RUNNER_SIZE: small + ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} + github-token: ${{ secrets.GITHUB_TOKEN }} + bazel-remote-cache-url: "" + EXTRA_BUILD_ARGS: | + URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }} + + build-maxtext: + needs: build-jax + runs-on: ${{ inputs.ARCHITECTURE == 'amd64' && 'linux-amd64-cpu16m' || 'linux-arm64-cpu16m' }} + outputs: + DOCKER_TAG_MEALKIT: ${{ steps.build-maxtext.outputs.DOCKER_TAG_MEALKIT }} + DOCKER_TAG_FINAL: ${{ steps.build-maxtext.outputs.DOCKER_TAG_FINAL }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Build MaxText container + id: build-maxtext + uses: ./.github/actions/build-container + with: + ARCHITECTURE: ${{ inputs.ARCHITECTURE }} + ARTIFACT_NAME: artifact-maxtext-build + BADGE_FILENAME: badge-maxtext-build + BUILD_DATE: ${{ inputs.BUILD_DATE }} + BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }} + CONTAINER_NAME: maxtext + DOCKERFILE: .github/container/Dockerfile.maxtext + RUNNER_SIZE: small + ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }} + github-token: ${{ secrets.GITHUB_TOKEN }} + bazel-remote-cache-url: "" + EXTRA_BUILD_ARGS: | + URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }} + + build-torchax: + needs: build-jax + runs-on: ${{ inputs.ARCHITECTURE == 'amd64' && 'linux-amd64-cpu16m' || 'linux-arm64-cpu16m' }} + 
outputs:
+      DOCKER_TAG_MEALKIT: ${{ steps.build-torchax.outputs.DOCKER_TAG_MEALKIT }}
+      DOCKER_TAG_FINAL: ${{ steps.build-torchax.outputs.DOCKER_TAG_FINAL }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - name: Build TorchAX container
+        id: build-torchax
+        uses: ./.github/actions/build-container
+        with:
+          ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
+          ARTIFACT_NAME: artifact-torchax-build
+          BADGE_FILENAME: badge-torchax-build
+          BUILD_DATE: ${{ inputs.BUILD_DATE }}
+          BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
+          CONTAINER_NAME: torchax
+          DOCKERFILE: .github/container/Dockerfile.torchax
+          RUNNER_SIZE: small
+          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
+          ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          bazel-remote-cache-url: ""
+          EXTRA_BUILD_ARGS: |
+            URLREF_TORCHAX=${{ fromJson(inputs.SOURCE_URLREFS).TORCHAX }}
+
+  # Builds the AxLearn image from Dockerfile.axlearn on top of the JAX mealkit image.
+  build-axlearn:
+    needs: build-jax
+    runs-on: ${{ inputs.ARCHITECTURE == 'amd64' && 'linux-amd64-cpu16m' || 'linux-arm64-cpu16m' }}
+    outputs:
+      DOCKER_TAG_MEALKIT: ${{ steps.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}
+      DOCKER_TAG_FINAL: ${{ steps.build-axlearn.outputs.DOCKER_TAG_FINAL }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - name: Build AxLearn container
+        id: build-axlearn
+        uses: ./.github/actions/build-container
+        with:
+          ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
+          ARTIFACT_NAME: artifact-axlearn-build
+          BADGE_FILENAME: badge-axlearn-build
+          BUILD_DATE: ${{ inputs.BUILD_DATE }}
+          BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
+          CONTAINER_NAME: axlearn
+          DOCKERFILE: .github/container/Dockerfile.axlearn
+          RUNNER_SIZE: large
+          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
+          ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          bazel-remote-cache-url: ""
+          EXTRA_BUILD_ARGS: |
+            URLREF_AXLEARN=${{ fromJson(inputs.SOURCE_URLREFS).AXLEARN }}
+
+  # Publishes the docker tags of the build jobs as the TAGS output.
+  # Runs unless the workflow was cancelled, so it still reports after a failed build.
+  collect-docker-tags:
+    runs-on: ubuntu-22.04
+    if: ${{ !cancelled() }}
+    needs:
+      - build-base
+      - build-jax
+      - build-equinox
+      - build-maxtext
+      - build-axlearn
+    outputs:
+      TAGS: ${{ steps.collect-tags.outputs.TAGS }}
+    steps:
+      - name: Save docker tags as a JSON object
+        id: collect-tags
+        # NOTE(review): the script below is truncated in this copy of the patch; everything
+        # between `cat <` and `> $GITHUB_OUTPUT` (the heredoc body) is missing. Restore it
+        # from the repository before applying this patch.
+        run: |
+          TAGS=$(cat <> $GITHUB_OUTPUT
+
+  # amd64 only, MODE full|jax: runs the JAX unit-test job on EKS, pulls its logs
+  # from S3, and publishes a sitrep plus a badge.
+  test-jax-eks:
+    needs: build-jax
+    if: >-
+      inputs.ARCHITECTURE == 'amd64' &&
+      (
+        inputs.MODE == 'full' ||
+        inputs.MODE == 'jax'
+      )
+    runs-on: eks
+    env:
+      JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
+      JOB_NAME: jax-${{ github.run_id }}
+    steps:
+      - name: Check out the repository
+        uses: actions/checkout@v6
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v4
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: K8s GHCR store and delete token
+        id: store-token
+        uses: ./.github/actions/store-delete-k8s-ghcr
+      - name: Configure JAX test job
+        run: |
+          yq -i ea '
+            select(di == 0).metadata.name = strenv(JOB_NAME)
+            | select(di == 0).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE)
+            | select(di == 0).spec.template.spec.containers[0].env[0].value = "${{ github.run_id }}"
+            | select(di == 0).spec.template.spec.imagePullSecrets[0].name = "${{ steps.store-token.outputs.token-name }}"' \
+          .github/eks-workflow-files/jax/test.yml
+          git diff .github/eks-workflow-files/jax/test.yml
+      - name: Submit & delete JAX unit test job
+        uses: ./.github/actions/submit-delete-k8s-job
+        with:
+          job-config-file: ".github/eks-workflow-files/jax/test.yml"
+          job-name: ${{ env.JOB_NAME }}
+      - name: Download logs from S3
+        id: log-s3
+        if: ${{ !cancelled() }}
+        run: |
+          mkdir -p jax-output
+          aws s3 cp s3://jax-toolbox-eks-output/jax/${{ github.run_id }}/ jax-output/ --recursive
+
+          # grep -c exits non-zero when the count is 0; `|| true` keeps the step alive
+          errors=$(cat jax-output/test-*.log | grep -c 'ERROR:' || true)
+          failed_tests=$(cat jax-output/test-*.log | grep -c 'FAILED in' || true)
+          passed_tests=$(cat jax-output/test-*.log | grep -c 'PASSED in' || true)
+          total_tests=$((failed_tests + passed_tests))
+
+          echo "Passed tests: $passed_tests"
+          echo "Failed tests: $failed_tests"
+          echo "Total tests: $total_tests"
+          echo "PASSED_TESTS=$passed_tests" >> "$GITHUB_OUTPUT"
+          echo "FAILED_TESTS=$failed_tests" >> "$GITHUB_OUTPUT"
+          echo "TOTAL_TESTS=$total_tests" >> "$GITHUB_OUTPUT"
+          echo "ERRORS=$errors" >> "$GITHUB_OUTPUT"
+
+          if [[ $failed_tests -gt 0 ]] || [[ $errors -gt 0 ]]; then
+            exit 1
+          fi
+      - name: Generate sitrep
+        id: sitrep
+        if: ${{ !cancelled() }}
+        shell: bash -x -e {0}
+        run: |
+          # bring in utility functions
+          source .github/workflows/scripts/to_json.sh
+
+          badge_label='JAX EKS unittest (8)'
+
+          total_tests=${{ steps.log-s3.outputs.TOTAL_TESTS }} \
+          failed_tests=${{ steps.log-s3.outputs.FAILED_TESTS }} \
+          passed_tests=${{ steps.log-s3.outputs.PASSED_TESTS }} \
+          errors=${{ steps.log-s3.outputs.ERRORS }} \
+          summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests." \
+          badge_message="Passed $passed_tests out of $total_tests." \
+          badge_color="brightgreen"
+          if [ "$failed_tests" -gt 0 ] || [ "$errors" -gt 0 ]; then
+            badge_color="red"
+          fi
+
+          to_json \
+            summary \
+            errors total_tests passed_tests failed_tests \
+            badge_label badge_color badge_message \
+          > sitrep.json
+
+          schemaVersion=1 \
+          label="${badge_label}" \
+          message="Passed $passed_tests out of $total_tests." \
+          color=$badge_color \
+          to_json schemaVersion label message color \
+          > badge-jax-unit-test-eks.json
+
+      - name: Upload artifacts
+        if: ${{ !cancelled() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: "jax-unit-test-H100-eks"
+          path: |
+            sitrep.json
+            badge-jax-unit-test-eks.json
+            jax-output/*
+
+  # amd64 only, MODE full|jax: submits the nsys-jax Kubernetes job, then a
+  # post-processing job that consumes its "<JOB_NAME>-rank*.zip" outputs.
+  test-nsys-jax-eks:
+    needs: build-jax
+    if: >-
+      inputs.ARCHITECTURE == 'amd64' &&
+      (
+        inputs.MODE == 'full' ||
+        inputs.MODE == 'jax'
+      )
+    runs-on: eks
+    env:
+      JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
+      JOB_NAME: ${{ github.run_id }}-nsys-jax
+      # Service name cannot start with a number
+      SERVICE_NAME: svc-${{ github.run_id }}-nsys-jax
+      POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess
+    steps:
+      - name: Check out the repository
+        uses: actions/checkout@v4
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: K8s GHCR store and delete token
+        id: store-token
+        uses: ./.github/actions/store-delete-k8s-ghcr
+      - name: Configure Kubernetes job
+        run: |
+          yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME)
+            | select(di == 0).metadata.name = strenv(SERVICE_NAME)
+            | select(di == 1).metadata.name = strenv(JOB_NAME)
+            | select(di == 1).spec.template.spec.subdomain = strenv(SERVICE_NAME)
+            | select(di == 1).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
+            | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE)
+            | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)
+            | select(di == 1).spec.template.spec.containers[0].env[1].value = strenv(SERVICE_NAME)' \
+          .github/eks-workflow-files/job.yml
+          git diff .github/eks-workflow-files/job.yml
+      - name: Submit Kubernetes job
+        uses: ./.github/actions/submit-delete-k8s-job
+        with:
+          job-config-file: .github/eks-workflow-files/job.yml
+          job-name: ${{ env.JOB_NAME }}
+      - name: Configure post-processing job
+        run: |
+          export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip"
+          yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME)
+            | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE)
+            | .spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
+            | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \
+            .github/eks-workflow-files/post-process-job.yml
+          git diff .github/eks-workflow-files/post-process-job.yml
+      - name: Submit post process Kubernetes job
+        uses: ./.github/actions/submit-delete-k8s-job
+        with:
+          job-config-file: .github/eks-workflow-files/post-process-job.yml
+          job-name: ${{ env.POSTPROCESS_JOB_NAME }}
+
+  # amd64 only, MODE full|te: delegates to the reusable Transformer Engine EKS workflow.
+  test-te-h100:
+    needs: build-jax
+    if: >-
+      inputs.ARCHITECTURE == 'amd64' &&
+      (
+        inputs.MODE == 'full' ||
+        inputs.MODE == 'te'
+      )
+    uses: ./.github/workflows/_transformer_engine_eks.yaml
+    with:
+      JAX_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
+      JOB_NAME: transformerengine-${{ github.run_id }}
+      S3_BUCKET: jax-toolbox-eks-output
+      CI_NAME: transformer-engine
+    secrets: inherit
+
+  # amd64 only, MODE full|jax-cutlass: delegates to the reusable JAX CUTLASS EKS workflow.
+  test-jax-cutlass-h100:
+    needs: build-jax
+    if: >-
+      inputs.ARCHITECTURE == 'amd64' &&
+      (
+        inputs.MODE == 'full' ||
+        inputs.MODE == 'jax-cutlass'
+      )
+    uses: ./.github/workflows/_jax_cutlass_eks.yaml
+    with:
+      JAX_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
+      JOB_NAME: jax-cutlass-${{ github.run_id }}
+      S3_BUCKET: jax-toolbox-eks-output
+      CI_NAME: jax-cutlass
+    secrets: inherit
+
+  # amd64 only, MODE full|maxtext: runs the MaxText job on EKS, downloads results
+  # from S3, runs the metrics pytest suite, and publishes a sitrep plus a badge.
+  test-maxtext-eks:
+    needs: build-maxtext
+    if: >-
+      inputs.ARCHITECTURE == 'amd64' &&
+      (
+        inputs.MODE == 'full' ||
+        inputs.MODE == 'maxtext'
+      )
+    runs-on: eks
+    env:
+      MAXTEXT_DOCKER_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}
+      JOB_NAME: maxtext-${{ github.run_id }}
+    steps:
+      - name: Check out the repository
+        uses: actions/checkout@v4
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: K8s GHCR store and delete token
+        id: store-token
+        uses: ./.github/actions/store-delete-k8s-ghcr
+      - name: Configure maxtext test job
+        run: |
+          yq -i ea '
+            select(di == 0).metadata.name = strenv(JOB_NAME)
+            | select(di == 0).spec.template.spec.containers[0].image = strenv(MAXTEXT_DOCKER_IMAGE)
+            | select(di == 0).spec.template.spec.containers[0].env[0].value = "${{ github.run_id }}"
+            | select(di == 0).spec.template.spec.imagePullSecrets[0].name = "${{ steps.store-token.outputs.token-name }}"' \
+          .github/eks-workflow-files/maxtext/test.yml
+          git diff .github/eks-workflow-files/maxtext/test.yml
+      - name: Submit & delete maxtext test job
+        uses: ./.github/actions/submit-delete-k8s-job
+        with:
+          job-config-file: ".github/eks-workflow-files/maxtext/test.yml"
+          job-name: ${{ env.JOB_NAME }}
+      - name: Download results from S3
+        id: s3-download
+        if: ${{ !cancelled() }}
+        run: |
+          mkdir -p maxtext-output
+          aws s3 cp s3://jax-toolbox-eks-output/maxtext/${{ github.run_id }}/ maxtext-output/ --recursive
+      - name: Run metrics
+        id: metrics
+        if: ${{ !cancelled() }}
+        run: |
+          pip install 'numpy<2.0.0' pytest pytest-reportlog tensorboard
+          RESULTS_DIR=maxtext-output BASELINES_DIR=MAXTEXT/upstream \
+          pytest --report-log=report.jsonl .github/workflows/baselines/test_maxtext_metrics.py || true
+      - name: Generate sitrep
+        id: sitrep
+        if: ${{ !cancelled() }}
+        shell: bash -x -e {0}
+        run: |
+          # bring in utility functions
+          source .github/workflows/scripts/to_json.sh
+
+          badge_label='MaxText EKS'
+
+          passed_tests=$(cat report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l)
+          failed_tests=$(cat report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l)
+          total_tests=$(( passed_tests + failed_tests ))
+          errors=0
+          badge_color="brightgreen"
+          if [ "$failed_tests" -gt 0 ]; then
+            badge_color="red"
+          fi
+
+          total_tests=$total_tests \
+          failed_tests=$failed_tests \
+          passed_tests=$passed_tests \
+          errors=$errors \
+          summary="All metrics tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests." \
+          badge_message="Passed $passed_tests out of $total_tests." \
+          badge_color=$badge_color \
+          to_json \
+            summary errors total_tests passed_tests failed_tests \
+            badge_label badge_color badge_message \
+          > sitrep.json
+
+          schemaVersion=1 \
+          label="${badge_label}" \
+          message="Passed $passed_tests out of $total_tests." \
+          color=$badge_color \
+          to_json schemaVersion label message color \
+          > badge-maxtext-test-eks.json
+
+      - name: Upload artifacts
+        if: ${{ !cancelled() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: "maxtext-test-H100-eks"
+          path: |
+            sitrep.json
+            badge-maxtext-test-eks.json
+            maxtext-output/
+            report.jsonl
+
+  # amd64 only, MODE full|maxtext: delegates to the reusable MaxText GKE/XPK workflow.
+  test-maxtext-gke:
+    needs: build-maxtext
+    if: >-
+      inputs.ARCHITECTURE == 'amd64' &&
+      (
+        inputs.MODE == 'full' ||
+        inputs.MODE == 'maxtext'
+      )
+    uses: ./.github/workflows/_test_maxtext_gke_xpk.yaml
+    with:
+      MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}
+    secrets: inherit
+
+  # amd64 only, MODE full|axlearn: runs the AxLearn unit-test job on EKS, reads the
+  # pass/fail/skip counts from summary.txt in S3, and publishes a sitrep plus a badge.
+  test-axlearn-eks:
+    needs: build-axlearn
+    if: >-
+      inputs.ARCHITECTURE == 'amd64' &&
+      (
+        inputs.MODE == 'full' ||
+        inputs.MODE == 'axlearn'
+      )
+    runs-on: eks
+    env:
+      AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
+      JOB_NAME: axlearn-${{ github.run_id }}
+    steps:
+      - name: Check out the repository
+        uses: actions/checkout@v4
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: K8s GHCR store and delete token
+        id: store-token
+        uses: ./.github/actions/store-delete-k8s-ghcr
+      - name: Configure axlearn test job
+        run: |
+          # Replace placeholders in axlearn-job.yml with environment variables
+          yq -i ea '
+            select(di == 0).metadata.name = strenv(JOB_NAME)
+            | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
+            | select(di == 0).spec.template.spec.containers[0].env[0].value = "${{ github.run_id }}"
+            | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
+          .github/eks-workflow-files/axlearn/axlearn-job.yml
+          git diff .github/eks-workflow-files/axlearn/axlearn-job.yml
+      - name: Submit & delete axlearn test
+        uses: ./.github/actions/submit-delete-k8s-job
+        with:
+          job-config-file: ".github/eks-workflow-files/axlearn/axlearn-job.yml"
+          job-name: ${{ env.JOB_NAME }}
+      - name: Download logs from S3
+        id: log-s3
+        if: ${{ !cancelled() }}
+        run: |
+          mkdir -p axlearn-output
+          aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/summary.txt axlearn-output/
+          aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/ axlearn-output/ --recursive --exclude "*" --include "*.log"
+          aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/ axlearn-output/ --recursive --exclude "*" --include "*.xml"
+
+
+          passed_tests=$(grep -Eo 'PASSED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' )
+          failed_tests=$(grep -Eo 'FAILED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' )
+          skipped_tests=$(grep -Eo 'SKIPPED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' )
+          total_tests=$((failed_tests + passed_tests + skipped_tests))
+
+          echo "Passed tests: $passed_tests"
+          echo "Failed tests: $failed_tests"
+          echo "Skipped tests: $skipped_tests"
+          echo "Total tests: $total_tests"
+          echo "PASSED_TESTS=$passed_tests" >> "$GITHUB_OUTPUT"
+          echo "FAILED_TESTS=$failed_tests" >> "$GITHUB_OUTPUT"
+          echo "TOTAL_TESTS=$total_tests" >> "$GITHUB_OUTPUT"
+      - name: Generate sitrep
+        id: sitrep
+        if: ${{ !cancelled() }}
+        shell: bash -x -e {0}
+        run: |
+          # bring in utility functions
+          source .github/workflows/scripts/to_json.sh
+
+          badge_label='Axlearn EKS Unit'
+
+          total_tests=${{ steps.log-s3.outputs.TOTAL_TESTS }} \
+          failed_tests=${{ steps.log-s3.outputs.FAILED_TESTS }} \
+          passed_tests=${{ steps.log-s3.outputs.PASSED_TESTS }} \
+          errors="0" \
+          summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests." \
+          badge_message="Passed $passed_tests out of $total_tests." \
+          badge_color="brightgreen"
+          if [ "$failed_tests" -gt 0 ]; then
+            badge_color="red"
+          fi
+
+          to_json \
+            summary \
+            errors total_tests passed_tests failed_tests \
+            badge_label badge_color badge_message \
+          > sitrep.json
+
+          schemaVersion=1 \
+          label="${badge_label}" \
+          message="Passed $passed_tests out of $total_tests." \
+          color=$badge_color \
+          to_json schemaVersion label message color \
+          > badge-axlearn-test.json
+
+      - name: Upload artifacts
+        if: ${{ !cancelled() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: "artifact-axlearn-test"
+          path: |
+            sitrep.json
+            badge-axlearn-test.json
+            axlearn-unittests.jsonl
+            axlearn-output/*
+
+  # amd64 only, MODE full|axlearn: submits the AxLearn fuji model job to EKS.
+  test-axlearn-fuji-models-eks:
+    needs: build-axlearn
+    if: >-
+      inputs.ARCHITECTURE == 'amd64' &&
+      (
+        inputs.MODE == 'full' ||
+        inputs.MODE == 'axlearn'
+      )
+    runs-on: eks
+    env:
+      AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
+      JOB_NAME: axlearn-fuji-3b-${{ github.run_id }}
+    steps:
+      - name: Check out the repository
+        uses: actions/checkout@v4
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: K8s GHCR store and delete token
+        id: store-token
+        uses: ./.github/actions/store-delete-k8s-ghcr
+      - name: Configure axlearn test job
+        run: |
+          yq -i ea '
+            select(di == 0).metadata.name = strenv(JOB_NAME)
+            | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
+            | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
+          .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
+          git diff .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
+
+      - name: Submit & delete axlearn fuji model test
+        uses: ./.github/actions/submit-delete-k8s-job
+        with:
+          job-config-file: ".github/eks-workflow-files/axlearn/axlearn-fuji-model.yml"
+          job-name: ${{ env.JOB_NAME }}

From 4ade0f5eab0e7acd4bf8b40a9c50457485eeb5da Mon Sep 17 00:00:00 2001
From: Steboss
Date: Wed, 1 Apr 2026 10:23:08 +0200
Subject: [PATCH 15/15] fix actioN

---
 .github/actions/build-container/action.yml | 26 ++++++++++++----------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/.github/actions/build-container/action.yml b/.github/actions/build-container/action.yml
index e00ab5d35..b945ca10f 100644
--- a/.github/actions/build-container/action.yml
+++ b/.github/actions/build-container/action.yml
@@ -224,11 +224,7 @@ runs:
           ${{ inputs.EXTRA_BUILD_ARGS }}
 
     # BAZEL CACHE EXPORT
-    - name: Prune BuildKit cache to free space for export
-      if: inputs.ENABLE_BAZEL_DISK_CACHE == 'true' || inputs.ENABLE_BAZEL_REPO_CACHE == 'true'
-      shell: bash
-      run: docker buildx prune --force
-
+    # Snapshots are captured first; prune runs after to free space before upload.
     # type=tar streams a single archive instead of per-file copies
     - name: Export Bazel disk cache
       if: inputs.ENABLE_BAZEL_DISK_CACHE == 'true'
@@ -247,13 +243,6 @@ runs:
           BUILD_DATE=${{ inputs.BUILD_DATE }}
           ${{ inputs.EXTRA_BUILD_ARGS }}
 
-    - name: Save Bazel disk cache
-      if: inputs.ENABLE_BAZEL_DISK_CACHE == 'true'
-      uses: actions/cache/save@v4
-      with:
-        path: /tmp/bazel-disk.tar
-        key: bazel-disk-cache-${{ inputs.ARCHITECTURE }}-${{ github.run_id }}
-
     - name: Export Bazel repo cache
       if: inputs.ENABLE_BAZEL_REPO_CACHE == 'true'
       uses: docker/build-push-action@v5
@@ -269,6 +258,19 @@ runs:
           BUILD_DATE=${{ inputs.BUILD_DATE }}
           ${{ inputs.EXTRA_BUILD_ARGS }}
 
+    # Prune layer cache after snapshots are captured to free disk space before upload
+    - name: Prune BuildKit layer cache before upload
+      if: inputs.ENABLE_BAZEL_DISK_CACHE == 'true' || inputs.ENABLE_BAZEL_REPO_CACHE == 'true'
+      shell: bash
+      run: docker buildx prune --force
+
+    - name: Save Bazel disk cache
+      if: inputs.ENABLE_BAZEL_DISK_CACHE == 'true'
+      uses: actions/cache/save@v4
+      with:
+        path: /tmp/bazel-disk.tar
+        key: bazel-disk-cache-${{ inputs.ARCHITECTURE }}-${{ github.run_id }}
+
     - name: Save Bazel repo cache
       if: inputs.ENABLE_BAZEL_REPO_CACHE == 'true'
       uses: actions/cache/save@v4