From 032e160c3cf0dd068a5e100090883b0e17dc6817 Mon Sep 17 00:00:00 2001
From: Steboss <stefanobosisio1@gmail.com>
Date: Wed, 18 Mar 2026 17:07:11 +0000
Subject: [PATCH 01/15] use actions/cache to retrieve the cache

---
 .github/actions/build-container/action.yml | 83 ++++++++++++++++++++++
 .github/container/Dockerfile.jax           | 45 +++++++++++-
 .github/workflows/_build_base.yaml         |  4 +-
 .github/workflows/_ci.yaml                 |  2 +
 .github/workflows/ci.yaml                  | 10 ++-
 5 files changed, 134 insertions(+), 10 deletions(-)

diff --git a/.github/actions/build-container/action.yml b/.github/actions/build-container/action.yml
index f53e14bea..297fbb738 100644
--- a/.github/actions/build-container/action.yml
+++ b/.github/actions/build-container/action.yml
@@ -54,6 +54,14 @@ inputs:
     description: "URL of the Bazel remote cache to use for building the image"
     required: true
     default: ""
+  ENABLE_BAZEL_DISK_CACHE:
+    description: "Enable Bazel disk cache via actions/cache"
+    required: false
+    default: "false"
+  ENABLE_BAZEL_REPO_CACHE:
+    description: "Enable Bazel repository cache via actions/cache"
+    required: false
+    default: "false"
 
 outputs:
   DOCKER_TAG_MEALKIT:
@@ -106,6 +114,32 @@ runs:
         mv version.py .github/container/nsys_jax/nsys_jax/
         cat .github/container/nsys_jax/nsys_jax/version.py
 
+    # BAZEL CACHE RESTORE: the primary key ends in github.run_id, so reuse across runs comes from the restore-keys prefix
+    - name: Restore Bazel disk cache
+      if: inputs.ENABLE_BAZEL_DISK_CACHE == 'true'
+      uses: actions/cache/restore@v4
+      with:
+        path: /tmp/bazel-disk-cache
+        key: bazel-disk-cache-${{ inputs.ARCHITECTURE }}-${{ github.run_id }}
+        restore-keys: |
+          bazel-disk-cache-${{ inputs.ARCHITECTURE }}-
+
+    - name: Restore Bazel repo cache
+      if: inputs.ENABLE_BAZEL_REPO_CACHE == 'true'
+      uses: actions/cache/restore@v4
+      with:
+        path: /tmp/bazel-repo-cache
+        key: bazel-repo-cache-${{ inputs.ARCHITECTURE }}-${{ github.run_id }}
+        restore-keys: |
+          bazel-repo-cache-${{ inputs.ARCHITECTURE }}-
+
+    # Always create the seed dirs so --build-context is always valid (no-op if empty). NOTE(review): only the disk dir is passed as a build context in this patch; confirm what consumes /tmp/bazel-repo-cache
+    - name: Prepare Bazel cache seed directories
+      shell: bash
+      run: |
+        mkdir -p /tmp/bazel-disk-cache
+        mkdir -p /tmp/bazel-repo-cache
+
     # MEALKIT BUILD
     - name: Set docker metadata - mealkit
       id: mealkit-metadata
@@ -134,6 +168,8 @@ runs:
         ssh: default
         secret-files: |
           "SSH_KNOWN_HOSTS=${{ steps.setup-ssh.outputs.known-hosts-file }}"
+        build-contexts: |
+          bazel-disk-seed=/tmp/bazel-disk-cache
         build-args: |
           BASE_IMAGE=${{ inputs.BASE_IMAGE }}
           BAZEL_CACHE=${{ inputs.bazel-remote-cache-url }}
@@ -173,6 +209,53 @@ runs:
           BUILD_DATE=${{ inputs.BUILD_DATE }}
           ${{ inputs.EXTRA_BUILD_ARGS }}
 
+    # BAZEL CACHE EXPORT: written to the restore path, because actions/cache only matches caches saved with the same `path`
+    - name: Export Bazel disk cache
+      if: inputs.ENABLE_BAZEL_DISK_CACHE == 'true'
+      uses: docker/build-push-action@v5
+      with:
+        context: ${{ inputs.DOCKER_CONTEXT }}
+        push: false
+        file: ${{ inputs.DOCKERFILE }}
+        platforms: linux/${{ inputs.ARCHITECTURE }}
+        target: bazel-disk-export
+        outputs: type=local,dest=/tmp/bazel-disk-cache
+        # No bazel-disk-seed build context here: the export target only snapshots the
+        # BuildKit cache mount, and dest above would otherwise double as a build input.
+        build-args: |
+          BASE_IMAGE=${{ inputs.BASE_IMAGE }}
+          BUILD_DATE=${{ inputs.BUILD_DATE }}
+          ${{ inputs.EXTRA_BUILD_ARGS }}
+
+    - name: Save Bazel disk cache
+      if: inputs.ENABLE_BAZEL_DISK_CACHE == 'true'
+      uses: actions/cache/save@v4
+      with:
+        path: /tmp/bazel-disk-cache
+        key: bazel-disk-cache-${{ inputs.ARCHITECTURE }}-${{ github.run_id }}
+
+    - name: Export Bazel repo cache
+      if: inputs.ENABLE_BAZEL_REPO_CACHE == 'true'
+      uses: docker/build-push-action@v5
+      with:
+        context: ${{ inputs.DOCKER_CONTEXT }}
+        push: false
+        file: ${{ inputs.DOCKERFILE }}
+        platforms: linux/${{ inputs.ARCHITECTURE }}
+        target: bazel-repo-export
+        outputs: type=local,dest=/tmp/bazel-repo-cache
+        build-args: |
+          BASE_IMAGE=${{ inputs.BASE_IMAGE }}
+          BUILD_DATE=${{ inputs.BUILD_DATE }}
+          ${{ inputs.EXTRA_BUILD_ARGS }}
+
+    - name: Save Bazel repo cache
+      if: inputs.ENABLE_BAZEL_REPO_CACHE == 'true'
+      uses: actions/cache/save@v4
+      with:
+        path: /tmp/bazel-repo-cache
+        key: bazel-repo-cache-${{ inputs.ARCHITECTURE }}-${{ github.run_id }}
+
     # SITREP GENERATION
     - name: Generate sitrep
       if: "!cancelled()"
diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax
index da7c2a29e..f964df995 100644
--- a/.github/container/Dockerfile.jax
+++ b/.github/container/Dockerfile.jax
@@ -19,14 +19,24 @@ ARG SRC_PATH_TRANSFORMER_ENGINE=/opt/transformer-engine
 ARG GIT_USER_NAME="JAX Toolbox"
 ARG GIT_USER_EMAIL=jax@nvidia.com
 
-ARG BAZEL_CACHE=/tmp
+ARG BAZEL_CACHE=/cache/bazel-disk
 ARG BUILD_DATE
 
+###############################################################################
+## Bazel disk cache seed (replaced by a named --build-context when one is passed)
+###############################################################################
+
+# Without a build context this stage is empty (FROM scratch). The build-container
+# action passes --build-context bazel-disk-seed=/tmp/bazel-disk-cache, the runner
+# directory that actions/cache restores into (empty when nothing was restored).
+FROM scratch AS bazel-disk-seed
+
 ###############################################################################
 ## Build JAX
 ###############################################################################
 
 FROM ${BASE_IMAGE} AS builder
+ARG TARGETARCH
 ARG URLREF_JAX
 ARG URLREF_TRANSFORMER_ENGINE
 ARG URLREF_XLA
@@ -54,9 +64,14 @@ RUN ARCH="$(dpkg --print-architecture)" && \
     chmod +x /usr/local/bin/bazel
 # Populate ${BUILD_PATH_JAXLIB} with editable wheels; --no-install because
 # (a) this is the builder stage, and (b) pip-finalize.sh does the install
-RUN mkdir -p /builder/extra-targets/{bin,python} && \
+RUN --mount=type=cache,id=bazel-disk-${TARGETARCH},target=/cache/bazel-disk,sharing=locked \
+    --mount=type=cache,id=bazel-repo-${TARGETARCH},target=/cache/bazel-repo,sharing=locked \
+    --mount=type=bind,from=bazel-disk-seed,source=.,target=/tmp/bazel-disk-seed,readonly \
+    cp -a /tmp/bazel-disk-seed/. /cache/bazel-disk/ 2>/dev/null || true && \
+    mkdir -p /builder/extra-targets/{bin,python} && \
     build-jax.sh \
     --bazel-cache ${BAZEL_CACHE} \
+    --build-param --bazel_options=--repository_cache=/cache/bazel-repo \
     --build-path-jaxlib ${BUILD_PATH_JAXLIB} \
     --extra-targets "${EXTRA_BAZEL_TARGETS}" \
     --extra-target-dest /builder/extra-targets \
@@ -148,3 +163,29 @@ RUN install-nsys-jax.sh ${SRC_PATH_NSYS_JAX}
 
 FROM mealkit AS final
 RUN pip-finalize.sh
+
+###############################################################################
+## Bazel cache export stages (used by CI to persist caches via actions/cache)
+###############################################################################
+
+# ARG BUILD_DATE is declared so that a changed BUILD_DATE value invalidates the cached RUN
+# layer and the snapshot is re-taken from the current cache mount (assumes the value varies per run; confirm).
+FROM ${BASE_IMAGE} AS bazel-disk-snapshot
+ARG TARGETARCH
+ARG BUILD_DATE
+RUN --mount=type=cache,id=bazel-disk-${TARGETARCH},target=/cache/bazel-disk,sharing=locked,readonly \
+    mkdir -p /bazel-disk-snapshot && \
+    cp -rp /cache/bazel-disk/. /bazel-disk-snapshot/
+
+FROM scratch AS bazel-disk-export
+COPY --from=bazel-disk-snapshot /bazel-disk-snapshot /
+
+FROM ${BASE_IMAGE} AS bazel-repo-snapshot
+ARG TARGETARCH
+ARG BUILD_DATE
+RUN --mount=type=cache,id=bazel-repo-${TARGETARCH},target=/cache/bazel-repo,sharing=locked,readonly \
+    mkdir -p /bazel-repo-snapshot && \
+    cp -rp /cache/bazel-repo/. /bazel-repo-snapshot/
+
+FROM scratch AS bazel-repo-export
+COPY --from=bazel-repo-snapshot /bazel-repo-snapshot /
diff --git a/.github/workflows/_build_base.yaml b/.github/workflows/_build_base.yaml
index 930ae9cfa..4774b6bb8 100644
--- a/.github/workflows/_build_base.yaml
+++ b/.github/workflows/_build_base.yaml
@@ -58,7 +58,7 @@ permissions:
 jobs:
 
   build-base:
-    runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", small]
+    runs-on: ${{ inputs.ARCHITECTURE == 'amd64' && 'linux-amd64-cpu32m' || 'linux-arm64-cpu32m' }}
     env:
       BADGE_FILENAME_FULL: ${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json
     outputs:
@@ -137,7 +137,7 @@ jobs:
             BUILD_DATE=${{ inputs.BUILD_DATE }}
             JAX_TOOLBOX_REF=${{ github.head_ref || github.sha }}
             ${{ inputs.BASE_IMAGE != 'latest' && format('BASE_IMAGE={0}', inputs.BASE_IMAGE) || '' }}
-        
+
       - name: Generate sitrep
         if: "!cancelled()"
         shell: bash -x -e {0}
diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml
index ca6bd3c41..e5fefb633 100644
--- a/.github/workflows/_ci.yaml
+++ b/.github/workflows/_ci.yaml
@@ -83,6 +83,8 @@ jobs:
             ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
             github-token: ${{ secrets.GITHUB_TOKEN }}
             bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
+            ENABLE_BAZEL_DISK_CACHE: 'true'
+            ENABLE_BAZEL_REPO_CACHE: 'true'
             EXTRA_BUILD_ARGS: |
               URLREF_JAX=${{ fromJson(inputs.SOURCE_URLREFS).JAX }}
               URLREF_XLA=${{ fromJson(inputs.SOURCE_URLREFS).XLA }}
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index d711da63d..f199635e8 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -4,12 +4,10 @@ on:
   schedule:
     - cron: '30 9 * * *' # Pacific Time 01:30 AM in UTC
     - cron: '0 0 * * 6' #midnight every Saturday UTC for scale-training
-  pull_request:
-    types:
-      - opened
-      - reopened
-      - ready_for_review
-      - synchronize
+  push:
+    # Trigger on pushes to every branch (replaces the pull_request trigger); needed to allow nv-gha-runners to run
+    branches:
+      - "**"
     paths-ignore:
       - '**.md'
       - '.github/triage/**'

From 476e1029e18f0ace913d2135ebd814a4bbe3ce85 Mon Sep 17 00:00:00 2001
From: Steboss <stefanobosisio1@gmail.com>
Date: Wed, 18 Mar 2026 17:24:23 +0000
Subject: [PATCH 02/15] fix docker setup build

---
 .github/actions/build-container/action.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/actions/build-container/action.yml b/.github/actions/build-container/action.yml
index 297fbb738..72bc6970a 100644
--- a/.github/actions/build-container/action.yml
+++ b/.github/actions/build-container/action.yml
@@ -100,7 +100,6 @@ runs:
       with:
         driver-opts: |
           image=moby/buildkit:v0.12.1
-        version: v0.30.1
 
     - name: Download nsys-jax version.py
       uses: actions/download-artifact@v4

From c8029b3df2920b46374bcbf2a1de5f9596043852 Mon Sep 17 00:00:00 2001
From: Steboss <stefanobosisio1@gmail.com>
Date: Wed, 18 Mar 2026 18:15:27 +0000
Subject: [PATCH 03/15] version fix

---
 .github/actions/build-container/action.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/actions/build-container/action.yml b/.github/actions/build-container/action.yml
index 72bc6970a..563f6077d 100644
--- a/.github/actions/build-container/action.yml
+++ b/.github/actions/build-container/action.yml
@@ -97,6 +97,8 @@ runs:
 
     - name: Set up Docker Buildx
       uses: docker/setup-buildx-action@v3
+      env:
+        DOCKER_API_VERSION: '1.43'
       with:
         driver-opts: |
           image=moby/buildkit:v0.12.1

From 0feebc33d98b33c2e04c2a896da34f9e03b54d26 Mon Sep 17 00:00:00 2001
From: Steboss <stefanobosisio1@gmail.com>
Date: Wed, 18 Mar 2026 18:25:17 +0000
Subject: [PATCH 04/15] version error again?

---
 .github/actions/build-container/action.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/actions/build-container/action.yml b/.github/actions/build-container/action.yml
index 563f6077d..a4b463685 100644
--- a/.github/actions/build-container/action.yml
+++ b/.github/actions/build-container/action.yml
@@ -80,6 +80,8 @@ runs:
       run: |
         echo 'UPLD_IMAGE=ghcr.io/nvidia/jax-toolbox-internal' >> $GITHUB_ENV
         echo "BADGE_FILENAME_FULL=${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json" >> $GITHUB_ENV
+        # Cap Docker client API version to match the daemon on NVKS runners
+        echo 'DOCKER_API_VERSION=1.43' >> $GITHUB_ENV
 
     - name: Setup SSH
       id: setup-ssh
@@ -97,8 +99,6 @@ runs:
 
     - name: Set up Docker Buildx
       uses: docker/setup-buildx-action@v3
-      env:
-        DOCKER_API_VERSION: '1.43'
       with:
         driver-opts: |
           image=moby/buildkit:v0.12.1

From 98cde85fa8a3183a9c936d7cff19a888511028a8 Mon Sep 17 00:00:00 2001
From: Steboss <stefanobosisio1@gmail.com>
Date: Wed, 18 Mar 2026 19:34:04 +0000
Subject: [PATCH 05/15] avoid using grpc

---
 .github/actions/build-container/action.yml |   11 +-
 .github/workflows/_ci.yaml                 | 1190 ++++++++++----------
 2 files changed, 604 insertions(+), 597 deletions(-)

diff --git a/.github/actions/build-container/action.yml b/.github/actions/build-container/action.yml
index a4b463685..e711fdcd5 100644
--- a/.github/actions/build-container/action.yml
+++ b/.github/actions/build-container/action.yml
@@ -82,6 +82,13 @@ runs:
         echo "BADGE_FILENAME_FULL=${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json" >> $GITHUB_ENV
         # Cap Docker client API version to match the daemon on NVKS runners
         echo 'DOCKER_API_VERSION=1.43' >> $GITHUB_ENV
+        # When disk cache is enabled use the BuildKit cache mount path;
+        # otherwise fall back to the remote cache URL (internal infra runners).
+        if [[ "${{ inputs.ENABLE_BAZEL_DISK_CACHE }}" == "true" ]]; then
+          echo 'BAZEL_CACHE_ARG=BAZEL_CACHE=/cache/bazel-disk' >> $GITHUB_ENV
+        else
+          echo 'BAZEL_CACHE_ARG=BAZEL_CACHE=${{ inputs.bazel-remote-cache-url }}' >> $GITHUB_ENV
+        fi
 
     - name: Setup SSH
       id: setup-ssh
@@ -173,7 +180,7 @@ runs:
           bazel-disk-seed=/tmp/bazel-disk-cache
         build-args: |
           BASE_IMAGE=${{ inputs.BASE_IMAGE }}
-          BAZEL_CACHE=${{ inputs.bazel-remote-cache-url }}
+          ${{ env.BAZEL_CACHE_ARG }}
           BUILD_DATE=${{ inputs.BUILD_DATE }}
           ${{ inputs.EXTRA_BUILD_ARGS }}
     # FINAL IMAGE BUILD
@@ -206,7 +213,7 @@ runs:
           "SSH_KNOWN_HOSTS=${{ steps.setup-ssh.outputs.known-hosts-file }}"
         build-args: |
           BASE_IMAGE=${{ inputs.BASE_IMAGE }}
-          BAZEL_CACHE=${{ inputs.bazel-remote-cache-url }}
+          ${{ env.BAZEL_CACHE_ARG }}
           BUILD_DATE=${{ inputs.BUILD_DATE }}
           ${{ inputs.EXTRA_BUILD_ARGS }}
 
diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml
index e5fefb633..893d4b04a 100644
--- a/.github/workflows/_ci.yaml
+++ b/.github/workflows/_ci.yaml
@@ -94,598 +94,598 @@ jobs:
       DOCKER_TAG_MEALKIT: ${{ steps.build-jax.outputs.DOCKER_TAG_MEALKIT }}
       DOCKER_TAG_FINAL:   ${{ steps.build-jax.outputs.DOCKER_TAG_FINAL }}
 
-  build-equinox:
-    needs: build-jax
-    runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"]
-    outputs:
-      DOCKER_TAG_MEALKIT: ${{ steps.build-equinox.outputs.DOCKER_TAG_MEALKIT }}
-      DOCKER_TAG_FINAL:   ${{ steps.build-equinox.outputs.DOCKER_TAG_FINAL }}
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - name: Build Equinox container
-        id: build-equinox
-        uses: ./.github/actions/build-container
-        with:
-          ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
-          ARTIFACT_NAME: artifact-equinox-build
-          BADGE_FILENAME: badge-equinox-build
-          BUILD_DATE: ${{ inputs.BUILD_DATE }}
-          BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
-          CONTAINER_NAME: equinox
-          DOCKERFILE: .github/container/Dockerfile.equinox
-          RUNNER_SIZE: small
-          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
-          ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
-          EXTRA_BUILD_ARGS: |
-            URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }}
-
-  build-maxtext:
-    needs: build-jax
-    runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"]
-    outputs:
-      DOCKER_TAG_MEALKIT: ${{ steps.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}
-      DOCKER_TAG_FINAL:   ${{ steps.build-maxtext.outputs.DOCKER_TAG_FINAL }}
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - name: Build MaxText container
-        id: build-maxtext
-        uses: ./.github/actions/build-container
-        with:
-          ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
-          ARTIFACT_NAME: artifact-maxtext-build
-          BADGE_FILENAME: badge-maxtext-build
-          BUILD_DATE: ${{ inputs.BUILD_DATE }}
-          BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
-          CONTAINER_NAME: maxtext
-          DOCKERFILE: .github/container/Dockerfile.maxtext
-          RUNNER_SIZE: small
-          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
-          ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
-          EXTRA_BUILD_ARGS: |
-            URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }}
-
-  build-torchax:
-    needs: build-jax
-    runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"]
-    outputs:
-      DOCKER_TAG_MEALKIT: ${{ steps.build-torchax.outputs.DOCKER_TAG_MEALKIT }}
-      DOCKER_TAG_FINAL:   ${{ steps.build-torchax.outputs.DOCKER_TAG_FINAL }}
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - name: Build TorchAX container
-        id: build-torchax
-        uses: ./.github/actions/build-container
-        with:
-          ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
-          ARTIFACT_NAME: artifact-torchax-build
-          BADGE_FILENAME: badge-torchax-build
-          BUILD_DATE: ${{ inputs.BUILD_DATE }}
-          BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
-          CONTAINER_NAME: torchax
-          DOCKERFILE: .github/container/Dockerfile.torchax
-          RUNNER_SIZE: small
-          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
-          ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
-          EXTRA_BUILD_ARGS: |
-            URLREF_TORCHAX=${{ fromJson(inputs.SOURCE_URLREFS).TORCHAX }}
-
-  build-axlearn:
-    needs: build-jax
-    runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "large"]
-    outputs:
-      DOCKER_TAG_MEALKIT: ${{ steps.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}
-      DOCKER_TAG_FINAL:   ${{ steps.build-axlearn.outputs.DOCKER_TAG_FINAL }}
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - name: Build AxLearn container
-        id: build-axlearn
-        uses: ./.github/actions/build-container
-        with:
-          ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
-          ARTIFACT_NAME: artifact-axlearn-build
-          BADGE_FILENAME: badge-axlearn-build
-          BUILD_DATE: ${{ inputs.BUILD_DATE }}
-          BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
-          CONTAINER_NAME: axlearn
-          DOCKERFILE: .github/container/Dockerfile.axlearn
-          RUNNER_SIZE: large
-          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
-          ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
-          EXTRA_BUILD_ARGS: |
-            URLREF_AXLEARN=${{ fromJson(inputs.SOURCE_URLREFS).AXLEARN }}
-
-  collect-docker-tags:
-    runs-on: ubuntu-22.04
-    if: ${{ !cancelled() }}
-    needs:
-      - build-base
-      - build-jax
-      - build-equinox
-      - build-maxtext
-      - build-axlearn
-    outputs:
-      TAGS: ${{ steps.collect-tags.outputs.TAGS }}
-    steps:
-      - name: Save docker tags as a JSON object
-        id: collect-tags
-        run: |
-          TAGS=$(cat <<EOF | jq -c
-          [\
-            {"flavor": "base",         "stage": "final",   "priority": 800,  "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\
-            {"flavor": "jax",          "stage": "final",   "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\
-            {"flavor": "equinox",      "stage": "final",   "priority": 900,  "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\
-            {"flavor": "maxtext",      "stage": "final",   "priority": 900,  "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\
-            {"flavor": "axlearn",      "stage": "final",   "priority": 900,  "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\
-            {"flavor": "jax",          "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\
-            {"flavor": "equinox",      "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\
-            {"flavor": "maxtext",      "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\
-            {"flavor": "axlearn",      "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\
-
-            {}\
-          ]
-          EOF
-          )
-
-          echo "TAGS=${TAGS}" >> $GITHUB_OUTPUT
-
-  test-jax:
-    needs: build-jax
-    if: >-
-      inputs.ARCHITECTURE == 'amd64' &&
-      (
-        inputs.MODE == 'full' ||
-        inputs.MODE == 'jax'
-      )
-    uses: ./.github/workflows/_test_unit.yaml
-    with:
-      TEST_NAME: jax
-      EXECUTE: |
-        docker run -i --shm-size=1g --gpus all \
-        ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
-        bash <<"EOF" |& tee test-backend-independent.log
-          test-jax.sh -b backend-independent
-        EOF
-        docker run -i --shm-size=1g --gpus all \
-        ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
-        bash <<"EOF" |& tee test-single-gpu.log
-          nvidia-cuda-mps-control -d
-          test-jax.sh -b single-gpu
-        EOF
-        docker run -i --shm-size=1g --gpus all \
-        ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
-        bash <<"EOF" |& tee test-multi-gpu.log
-          nvidia-cuda-mps-control -d
-          test-jax.sh -b multi-gpu
-        EOF
-      STATISTICS_SCRIPT: |
-        errors=$(cat test-*.log | grep -c 'ERROR:' || true)
-        failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true)
-        passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true)
-        total_tests=$((failed_tests + passed_tests))
-        echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
-        echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
-        echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
-        echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
-      ARTIFACTS: |
-        test-backend-independent.log
-        test-multi-gpu.log
-        test-single-gpu.log
-    secrets: inherit
-
-  test-nsys-jax:
-    needs: build-jax
-    if: >-
-      inputs.ARCHITECTURE == 'amd64' &&
-      (
-        inputs.MODE == 'full' ||
-        inputs.MODE == 'jax'
-      )
-    uses: ./.github/workflows/_test_unit.yaml
-    with:
-      TEST_NAME: nsys-jax
-      EXECUTE: |
-        set -o pipefail
-        mkdir -p output-results
-        docker run -i --shm-size=1g --gpus all \
-          -v $PWD/output-results:/opt/output \
-          ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
-          bash <<"EOF" |& tee test-nsys-jax.log
-            # nsys-jax is already installed, this is just adding the test dependencies
-            pip install pytest-reportlog nsys-jax[test]
-            # abuse knowledge that nsys-jax is installed editable, so the tests exist
-            test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())')
-            pytest --basetemp=/opt/output/pytest-tmp --report-log=/opt/output/pytest-report.jsonl "${test_path}"
-            chmod -R a+rwX /opt/output
-        EOF
-      STATISTICS_SCRIPT: |
-        summary_line=$(tail -n1 test-nsys-jax.log)
-        num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
-        passed_tests=$(cat output-results/pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l)
-        failed_tests=$(cat output-results/pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l)
-        total_tests=$(( passed_tests + failed_tests ))
-        echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
-        echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT
-        echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
-        echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
-      ARTIFACTS: |
-        # pytest-driven part
-        test-nsys-jax.log
-        output-results/pytest-report.jsonl
-        output-results/pytest-tmp/
-    secrets: inherit
-
-  # test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test
-  # runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does
-  # not already have nsys-jax installed
-  test-nsys-jax-archive:
-    needs: test-nsys-jax
-    if: >-
-      inputs.ARCHITECTURE == 'amd64' &&
-      (
-        inputs.MODE == 'full' ||
-        inputs.MODE == 'jax'
-      )
-    strategy:
-      matrix:
-        os: [ubuntu-22.04, ubuntu-24.04, macOS-latest]
-    runs-on: ${{ matrix.os }}
-    steps:
-    - name: Download nsys-jax output .zip files
-      uses: actions/download-artifact@v4
-      with:
-        name: nsys-jax-unit-test-A100
-    - name: Extract archives and execute install scripts
-      run: |
-        pip install virtualenv # for install.sh
-        for zip in $(ls *.zip); do
-          ZIP="${PWD}/${zip}"
-          pushd $(mktemp -d)
-          unzip "${ZIP}"
-          ls -l
-          # TODO: verify this isn't needed, or make sure it isn't needed
-          chmod 755 install.sh
-          # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout
-          # Skip executing Jupyter lab
-          NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh
-          popd
-        done
-
-  test-nsys-jax-eks:
-    needs: build-jax
-    if: >-
-      inputs.ARCHITECTURE == 'amd64' &&
-      (
-        inputs.MODE == 'full' ||
-        inputs.MODE == 'jax'
-      )
-    runs-on: eks
-    env:
-      JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
-      JOB_NAME: ${{ github.run_id }}-nsys-jax
-      # Service name cannot start with a number
-      SERVICE_NAME: svc-${{ github.run_id}}-nsys-jax
-      POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess
-    steps:
-    - name: Check out the repository
-      uses: actions/checkout@v4
-    - name: Login to GitHub Container Registry
-      uses: docker/login-action@v3
-      with:
-        registry: ghcr.io
-        username: ${{ github.repository_owner }}
-        password: ${{ secrets.GITHUB_TOKEN }}
-    - name: K8s GHCR store and delete token
-      id: store-token
-      uses: ./.github/actions/store-delete-k8s-ghcr
-    - name: Configure Kubernetes job
-      run: |
-        yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME)
-          | select(di == 0).metadata.name = strenv(SERVICE_NAME)
-          | select(di == 1).metadata.name = strenv(JOB_NAME)
-          | select(di == 1).spec.template.spec.subdomain = strenv(SERVICE_NAME)
-          | select(di == 1).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
-          | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE)
-          | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)
-          | select(di == 1).spec.template.spec.containers[0].env[1].value = strenv(SERVICE_NAME)' \
-          .github/eks-workflow-files/job.yml
-        git diff .github/eks-workflow-files/job.yml
-    - name: Submit Kubernetes job
-      uses: ./.github/actions/submit-delete-k8s-job
-      with:
-        job-config-file: .github/eks-workflow-files/job.yml
-        job-name: ${{ env.JOB_NAME }}
-    - name: Configure post-processing job
-      run: |
-        export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip"
-        yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME)
-          | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE)
-          | .spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
-          | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \
-          .github/eks-workflow-files/post-process-job.yml
-        git diff .github/eks-workflow-files/post-process-job.yml
-    - name: Submit post process Kubernetes job
-      uses: ./.github/actions/submit-delete-k8s-job
-      with:
-        job-config-file: .github/eks-workflow-files/post-process-job.yml
-        job-name: ${{ env.POSTPROCESS_JOB_NAME }}
-
-  test-te-h100:
-    needs: build-jax
-    if: >-
-      inputs.ARCHITECTURE == 'amd64' &&
-      (
-        inputs.MODE == 'full' ||
-        inputs.MODE == 'te'
-      )
-    uses: ./.github/workflows/_transformer_engine_eks.yaml
-    with:
-      JAX_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
-      JOB_NAME: transformerengine-${{ github.run_id }}
-      S3_BUCKET: jax-toolbox-eks-output
-      CI_NAME: transformer-engine
-    secrets: inherit
-
-  test-jax-cutlass-h100:
-    needs: build-jax
-    if: >-
-      inputs.ARCHITECTURE == 'amd64' &&
-      (
-        inputs.MODE == 'full' ||
-        inputs.MODE == 'jax-cutlass'
-      )
-    uses: ./.github/workflows/_jax_cutlass_eks.yaml
-    with:
-      JAX_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
-      JOB_NAME: jax-cutlass-${{ github.run_id }}
-      S3_BUCKET: jax-toolbox-eks-output
-      CI_NAME: jax-cutlass
-    secrets: inherit
-
-  test-te-a100:
-    needs: build-jax
-    secrets: inherit
-    if: >-
-      inputs.ARCHITECTURE == 'amd64' &&
-      (
-        inputs.MODE == 'full' ||
-        inputs.MODE == 'te'
-      )
-    uses: ./.github/workflows/_test_unit.yaml
-    with:
-      TEST_NAME: te
-      EXECUTE: |
-        docker run -i --gpus all --shm-size=1g -v $PWD:/log \
-        ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
-        bash <<"EOF" |& tee test-te.log
-          set -xu -o pipefail
-
-          LOG_DIR=/log
-
-          pip install pytest-reportlog pytest-xdist
-          # Start MPS daemon
-          nvidia-cuda-mps-control -d
-          # TE's default is slightly different, without the hyphen
-          export TE_PATH=${SRC_PATH_TRANSFORMER_ENGINE}
-          # 1 GPU per worker, 3 workers per GPU
-          pytest-xdist.sh 1 3 ${LOG_DIR}/pytest-report-L0-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_unittest/test.sh
-          ## 8 GPUs per worker, 1 worker per GPU. pytest-xdist.sh allows aggregation
-          ## into a single .jsonl file of results from multiple pytest invocations
-          ## inside the test.sh script, so it's useful even with a single worker per
-          ## device.
-          pytest-xdist.sh 8 1 ${LOG_DIR}/pytest-report-L0-distributed-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_distributed_unittest/test.sh
-
-          # merge the log files
-          cat \
-            ${LOG_DIR}/pytest-report-L0-unittest.jsonl \
-            ${LOG_DIR}/pytest-report-L0-distributed-unittest.jsonl \
-            > ${LOG_DIR}/pytest-report.jsonl
-
-        EOF
-      STATISTICS_SCRIPT: |
-        report_json=pytest-report.jsonl
-        summary_line=$(tail -n1 test-te.log)
-        errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
-        passed_tests=$(cat $report_json | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l)
-        failed_tests=$(cat $report_json | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l)
-        total_tests=$((failed_tests + passed_tests))
-        echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
-        echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
-        echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
-        echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
-
-        echo "$failed_tests tests failed"
-        if [[ $failed_tests -gt 0 ]]; then
-            exit 1
-        else
-            exit 0
-        fi
-
-      TIMEOUT_MINUTES: 120
-      ARTIFACTS: |
-        test-te.log
-        pytest-report.jsonl
-        pytest-report-L0-unittest.jsonl
-        pytest-report-L0-distributed-unittest.jsonl
-
-  test-maxtext:
-    needs: build-maxtext
-    if: >-
-      inputs.ARCHITECTURE == 'amd64' &&
-      (
-        inputs.MODE == 'full' ||
-        inputs.MODE == 'maxtext'
-      )
-    uses: ./.github/workflows/_test_maxtext.yaml
-    with:
-      MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}
-    secrets: inherit
-
-  test-maxtext-gke:
-    needs: build-maxtext
-    if: >-
-      inputs.ARCHITECTURE == 'amd64' &&
-      (
-        inputs.MODE == 'full' ||
-        inputs.MODE == 'maxtext'
-      )
-    uses: ./.github/workflows/_test_maxtext_gke_xpk.yaml
-    with:
-      MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}
-    secrets: inherit
-
-  test-axlearn-eks:
-    needs: build-axlearn
-    if: >-
-      inputs.ARCHITECTURE == 'amd64' &&
-      (
-        inputs.MODE == 'full' ||
-        inputs.MODE == 'axlearn'
-      )
-    runs-on: eks
-    env:
-      AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
-      JOB_NAME: axlearn-${{ github.run_id }}
-    steps:
-    - name: Check out the repository
-      uses: actions/checkout@v4
-    - name: Login to GitHub Container Registry
-      uses: docker/login-action@v3
-      with:
-        registry: ghcr.io
-        username: ${{ github.repository_owner }}
-        password: ${{ secrets.GITHUB_TOKEN }}
-    - name: K8s GHCR store and delete token
-      id: store-token
-      uses: ./.github/actions/store-delete-k8s-ghcr
-    - name: Configure axlearn test job
-      run: |
-        # Replace placeholders in axlearn-job.yml with environment variables
-        yq -i ea '
-           select(di == 0).metadata.name = strenv(JOB_NAME)
-          | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
-          | select(di == 0).spec.template.spec.containers[0].env[0].value = "${{ github.run_id }}"
-          | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
-        .github/eks-workflow-files/axlearn/axlearn-job.yml
-        git diff .github/eks-workflow-files/axlearn/axlearn-job.yml
-    - name: Submit & delete axlearn test
-      uses: ./.github/actions/submit-delete-k8s-job
-      with:
-        job-config-file: ".github/eks-workflow-files/axlearn/axlearn-job.yml"
-        job-name: ${{ env.JOB_NAME }}
-    - name: Download logs from S3
-      id: log-s3
-      if: ${{ !cancelled() }}
-      run: |
-        mkdir -p axlearn-output
-        aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/summary.txt axlearn-output/
-        aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/ axlearn-output/ --recursive --exclude "*" --include "*.log"
-        aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/ axlearn-output/ --recursive --exclude "*" --include "*.xml"
-
-
-        passed_tests=$(grep -Eo 'PASSED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' )
-        failed_tests=$(grep -Eo 'FAILED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' )
-        skipped_tests=$(grep -Eo 'SKIPPED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' )
-        total_tests=$((failed_tests + passed_tests + skipped_tests))
-
-        echo "Passed tests: $passed_tests"
-        echo "Failed tests: $failed_tests"
-        echo "Skipped tests: $skipped_tests"
-        echo "Total tests: $total_tests"
-        echo "PASSED_TESTS=$passed_tests" >> $GITHUB_OUTPUT
-        echo "FAILED_TESTS=$failed_tests" >> $GITHUB_OUTPUT
-        echo "TOTAL_TESTS=$total_tests" >> $GITHUB_OUTPUT
-    - name: Generate sitrep
-      id: sitrep
-      if: ${{ !cancelled() }}
-      shell: bash -x -e {0}
-      run: |
-        # bring in utility functions
-        source .github/workflows/scripts/to_json.sh
-
-        badge_label='Axlearn EKS Unit'
-
-        total_tests=${{ steps.log-s3.outputs.TOTAL_TESTS }} \
-        failed_tests=${{ steps.log-s3.outputs.FAILED_TESTS }} \
-        passed_tests=${{ steps.log-s3.outputs.PASSED_TESTS }} \
-        errors="0" \
-        summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests." \
-        badge_message="Passed $passed_tests out of $total_tests." \
-        badge_color="brightgreen"
-        if [ "$failed_tests" -gt 0 ]; then
-          badge_color="red"
-        fi \
-
-        to_json \
-          summary \
-          errors total_tests passed_tests failed_tests \
-          badge_label badge_color badge_message \
-        > sitrep.json
-
-        schemaVersion=1 \
-        label="${badge_label}" \
-        message="Passed $passed_tests out of $total_tests." \
-        color=$badge_color \
-        to_json schemaVersion label message color \
-        > badge-axlearn-test.json
-
-    - name: Upload artifacts
-      if: ${{ !cancelled() }}
-      uses: actions/upload-artifact@v4
-      with:
-        name: "artifact-axlearn-test"
-        path: |
-          sitrep.json
-          badge-axlearn-test.json
-          axlearn-unittests.jsonl
-          axlearn-output/*
-
-  test-axlearn-fuji-models-eks:
-    needs: build-axlearn
-    if: >-
-      inputs.ARCHITECTURE == 'amd64' &&
-      (
-        inputs.MODE == 'full' ||
-        inputs.MODE == 'axlearn'
-      )
-    runs-on: eks
-    env:
-      AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
-      JOB_NAME: axlearn-fuji-3b-${{ github.run_id }}
-    steps:
-    - name: Check out the repository
-      uses: actions/checkout@v4
-    - name: Login to GitHub Container Registry
-      uses: docker/login-action@v3
-      with:
-        registry: ghcr.io
-        username: ${{ github.repository_owner }}
-        password: ${{ secrets.GITHUB_TOKEN }}
-    - name: K8s GHCR store and delete token
-      id: store-token
-      uses: ./.github/actions/store-delete-k8s-ghcr
-    - name: Configure axlearn test job
-      run: |
-        yq -i ea '
-           select(di == 0).metadata.name = strenv(JOB_NAME)
-          | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
-          | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
-        .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
-        git diff .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
-
-    - name: Submit & delete axlearn fuji model test
-      uses: ./.github/actions/submit-delete-k8s-job
-      with:
-        job-config-file:  ".github/eks-workflow-files/axlearn/axlearn-fuji-model.yml"
-        job-name: ${{ env.JOB_NAME }}
+  # build-equinox:
+  #   needs: build-jax
+  #   runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"]
+  #   outputs:
+  #     DOCKER_TAG_MEALKIT: ${{ steps.build-equinox.outputs.DOCKER_TAG_MEALKIT }}
+  #     DOCKER_TAG_FINAL:   ${{ steps.build-equinox.outputs.DOCKER_TAG_FINAL }}
+  #   steps:
+  #     - name: Checkout repository
+  #       uses: actions/checkout@v4
+  #     - name: Build Equinox container
+  #       id: build-equinox
+  #       uses: ./.github/actions/build-container
+  #       with:
+  #         ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
+  #         ARTIFACT_NAME: artifact-equinox-build
+  #         BADGE_FILENAME: badge-equinox-build
+  #         BUILD_DATE: ${{ inputs.BUILD_DATE }}
+  #         BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
+  #         CONTAINER_NAME: equinox
+  #         DOCKERFILE: .github/container/Dockerfile.equinox
+  #         RUNNER_SIZE: small
+  #         ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
+  #         ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
+  #         github-token: ${{ secrets.GITHUB_TOKEN }}
+  #         bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
+  #         EXTRA_BUILD_ARGS: |
+  #           URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }}
+
+  # build-maxtext:
+  #   needs: build-jax
+  #   runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"]
+  #   outputs:
+  #     DOCKER_TAG_MEALKIT: ${{ steps.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}
+  #     DOCKER_TAG_FINAL:   ${{ steps.build-maxtext.outputs.DOCKER_TAG_FINAL }}
+  #   steps:
+  #     - name: Checkout repository
+  #       uses: actions/checkout@v4
+  #     - name: Build MaxText container
+  #       id: build-maxtext
+  #       uses: ./.github/actions/build-container
+  #       with:
+  #         ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
+  #         ARTIFACT_NAME: artifact-maxtext-build
+  #         BADGE_FILENAME: badge-maxtext-build
+  #         BUILD_DATE: ${{ inputs.BUILD_DATE }}
+  #         BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
+  #         CONTAINER_NAME: maxtext
+  #         DOCKERFILE: .github/container/Dockerfile.maxtext
+  #         RUNNER_SIZE: small
+  #         ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
+  #         ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
+  #         github-token: ${{ secrets.GITHUB_TOKEN }}
+  #         bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
+  #         EXTRA_BUILD_ARGS: |
+  #           URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }}
+
+  # build-torchax:
+  #   needs: build-jax
+  #   runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"]
+  #   outputs:
+  #     DOCKER_TAG_MEALKIT: ${{ steps.build-torchax.outputs.DOCKER_TAG_MEALKIT }}
+  #     DOCKER_TAG_FINAL:   ${{ steps.build-torchax.outputs.DOCKER_TAG_FINAL }}
+  #   steps:
+  #     - name: Checkout repository
+  #       uses: actions/checkout@v4
+  #     - name: Build TorchAX container
+  #       id: build-torchax
+  #       uses: ./.github/actions/build-container
+  #       with:
+  #         ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
+  #         ARTIFACT_NAME: artifact-torchax-build
+  #         BADGE_FILENAME: badge-torchax-build
+  #         BUILD_DATE: ${{ inputs.BUILD_DATE }}
+  #         BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
+  #         CONTAINER_NAME: torchax
+  #         DOCKERFILE: .github/container/Dockerfile.torchax
+  #         RUNNER_SIZE: small
+  #         ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
+  #         ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
+  #         github-token: ${{ secrets.GITHUB_TOKEN }}
+  #         bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
+  #         EXTRA_BUILD_ARGS: |
+  #           URLREF_TORCHAX=${{ fromJson(inputs.SOURCE_URLREFS).TORCHAX }}
+
+  # build-axlearn:
+  #   needs: build-jax
+  #   runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "large"]
+  #   outputs:
+  #     DOCKER_TAG_MEALKIT: ${{ steps.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}
+  #     DOCKER_TAG_FINAL:   ${{ steps.build-axlearn.outputs.DOCKER_TAG_FINAL }}
+  #   steps:
+  #     - name: Checkout repository
+  #       uses: actions/checkout@v4
+  #     - name: Build AxLearn container
+  #       id: build-axlearn
+  #       uses: ./.github/actions/build-container
+  #       with:
+  #         ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
+  #         ARTIFACT_NAME: artifact-axlearn-build
+  #         BADGE_FILENAME: badge-axlearn-build
+  #         BUILD_DATE: ${{ inputs.BUILD_DATE }}
+  #         BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
+  #         CONTAINER_NAME: axlearn
+  #         DOCKERFILE: .github/container/Dockerfile.axlearn
+  #         RUNNER_SIZE: large
+  #         ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
+  #         ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
+  #         github-token: ${{ secrets.GITHUB_TOKEN }}
+  #         bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
+  #         EXTRA_BUILD_ARGS: |
+  #           URLREF_AXLEARN=${{ fromJson(inputs.SOURCE_URLREFS).AXLEARN }}
+
+  # collect-docker-tags:
+  #   runs-on: ubuntu-22.04
+  #   if: ${{ !cancelled() }}
+  #   needs:
+  #     - build-base
+  #     - build-jax
+  #     - build-equinox
+  #     - build-maxtext
+  #     - build-axlearn
+  #   outputs:
+  #     TAGS: ${{ steps.collect-tags.outputs.TAGS }}
+  #   steps:
+  #     - name: Save docker tags as a JSON object
+  #       id: collect-tags
+  #       run: |
+  #         TAGS=$(cat <<EOF | jq -c
+  #         [\
+  #           {"flavor": "base",         "stage": "final",   "priority": 800,  "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\
+  #           {"flavor": "jax",          "stage": "final",   "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\
+  #           {"flavor": "equinox",      "stage": "final",   "priority": 900,  "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\
+  #           {"flavor": "maxtext",      "stage": "final",   "priority": 900,  "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\
+  #           {"flavor": "axlearn",      "stage": "final",   "priority": 900,  "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\
+  #           {"flavor": "jax",          "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\
+  #           {"flavor": "equinox",      "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\
+  #           {"flavor": "maxtext",      "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\
+  #           {"flavor": "axlearn",      "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\
+
+  #           {}\
+  #         ]
+  #         EOF
+  #         )
+
+  #         echo "TAGS=${TAGS}" >> $GITHUB_OUTPUT
+
+  # test-jax:
+  #   needs: build-jax
+  #   if: >-
+  #     inputs.ARCHITECTURE == 'amd64' &&
+  #     (
+  #       inputs.MODE == 'full' ||
+  #       inputs.MODE == 'jax'
+  #     )
+  #   uses: ./.github/workflows/_test_unit.yaml
+  #   with:
+  #     TEST_NAME: jax
+  #     EXECUTE: |
+  #       docker run -i --shm-size=1g --gpus all \
+  #       ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
+  #       bash <<"EOF" |& tee test-backend-independent.log
+  #         test-jax.sh -b backend-independent
+  #       EOF
+  #       docker run -i --shm-size=1g --gpus all \
+  #       ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
+  #       bash <<"EOF" |& tee test-single-gpu.log
+  #         nvidia-cuda-mps-control -d
+  #         test-jax.sh -b single-gpu
+  #       EOF
+  #       docker run -i --shm-size=1g --gpus all \
+  #       ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
+  #       bash <<"EOF" |& tee test-multi-gpu.log
+  #         nvidia-cuda-mps-control -d
+  #         test-jax.sh -b multi-gpu
+  #       EOF
+  #     STATISTICS_SCRIPT: |
+  #       errors=$(cat test-*.log | grep -c 'ERROR:' || true)
+  #       failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true)
+  #       passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true)
+  #       total_tests=$((failed_tests + passed_tests))
+  #       echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
+  #       echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
+  #       echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
+  #       echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
+  #     ARTIFACTS: |
+  #       test-backend-independent.log
+  #       test-multi-gpu.log
+  #       test-single-gpu.log
+  #   secrets: inherit
+
+  # test-nsys-jax:
+  #   needs: build-jax
+  #   if: >-
+  #     inputs.ARCHITECTURE == 'amd64' &&
+  #     (
+  #       inputs.MODE == 'full' ||
+  #       inputs.MODE == 'jax'
+  #     )
+  #   uses: ./.github/workflows/_test_unit.yaml
+  #   with:
+  #     TEST_NAME: nsys-jax
+  #     EXECUTE: |
+  #       set -o pipefail
+  #       mkdir -p output-results
+  #       docker run -i --shm-size=1g --gpus all \
+  #         -v $PWD/output-results:/opt/output \
+  #         ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
+  #         bash <<"EOF" |& tee test-nsys-jax.log
+  #           # nsys-jax is already installed, this is just adding the test dependencies
+  #           pip install pytest-reportlog nsys-jax[test]
+  #           # abuse knowledge that nsys-jax is installed editable, so the tests exist
+  #           test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())')
+  #           pytest --basetemp=/opt/output/pytest-tmp --report-log=/opt/output/pytest-report.jsonl "${test_path}"
+  #           chmod -R a+rwX /opt/output
+  #       EOF
+  #     STATISTICS_SCRIPT: |
+  #       summary_line=$(tail -n1 test-nsys-jax.log)
+  #       num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
+  #       passed_tests=$(cat output-results/pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l)
+  #       failed_tests=$(cat output-results/pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l)
+  #       total_tests=$(( passed_tests + failed_tests ))
+  #       echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
+  #       echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT
+  #       echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
+  #       echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
+  #     ARTIFACTS: |
+  #       # pytest-driven part
+  #       test-nsys-jax.log
+  #       output-results/pytest-report.jsonl
+  #       output-results/pytest-tmp/
+  #   secrets: inherit
+
+  # # test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test
+  # # runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does
+  # # not already have nsys-jax installed
+  # test-nsys-jax-archive:
+  #   needs: test-nsys-jax
+  #   if: >-
+  #     inputs.ARCHITECTURE == 'amd64' &&
+  #     (
+  #       inputs.MODE == 'full' ||
+  #       inputs.MODE == 'jax'
+  #     )
+  #   strategy:
+  #     matrix:
+  #       os: [ubuntu-22.04, ubuntu-24.04, macOS-latest]
+  #   runs-on: ${{ matrix.os }}
+  #   steps:
+  #   - name: Download nsys-jax output .zip files
+  #     uses: actions/download-artifact@v4
+  #     with:
+  #       name: nsys-jax-unit-test-A100
+  #   - name: Extract archives and execute install scripts
+  #     run: |
+  #       pip install virtualenv # for install.sh
+  #       for zip in $(ls *.zip); do
+  #         ZIP="${PWD}/${zip}"
+  #         pushd $(mktemp -d)
+  #         unzip "${ZIP}"
+  #         ls -l
+  #         # TODO: verify this isn't needed, or make sure it isn't needed
+  #         chmod 755 install.sh
+  #         # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout
+  #         # Skip executing Jupyter lab
+  #         NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh
+  #         popd
+  #       done
+
+  # test-nsys-jax-eks:
+  #   needs: build-jax
+  #   if: >-
+  #     inputs.ARCHITECTURE == 'amd64' &&
+  #     (
+  #       inputs.MODE == 'full' ||
+  #       inputs.MODE == 'jax'
+  #     )
+  #   runs-on: eks
+  #   env:
+  #     JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
+  #     JOB_NAME: ${{ github.run_id }}-nsys-jax
+  #     # Service name cannot start with a number
+  #     SERVICE_NAME: svc-${{ github.run_id}}-nsys-jax
+  #     POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess
+  #   steps:
+  #   - name: Check out the repository
+  #     uses: actions/checkout@v4
+  #   - name: Login to GitHub Container Registry
+  #     uses: docker/login-action@v3
+  #     with:
+  #       registry: ghcr.io
+  #       username: ${{ github.repository_owner }}
+  #       password: ${{ secrets.GITHUB_TOKEN }}
+  #   - name: K8s GHCR store and delete token
+  #     id: store-token
+  #     uses: ./.github/actions/store-delete-k8s-ghcr
+  #   - name: Configure Kubernetes job
+  #     run: |
+  #       yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME)
+  #         | select(di == 0).metadata.name = strenv(SERVICE_NAME)
+  #         | select(di == 1).metadata.name = strenv(JOB_NAME)
+  #         | select(di == 1).spec.template.spec.subdomain = strenv(SERVICE_NAME)
+  #         | select(di == 1).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
+  #         | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE)
+  #         | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)
+  #         | select(di == 1).spec.template.spec.containers[0].env[1].value = strenv(SERVICE_NAME)' \
+  #         .github/eks-workflow-files/job.yml
+  #       git diff .github/eks-workflow-files/job.yml
+  #   - name: Submit Kubernetes job
+  #     uses: ./.github/actions/submit-delete-k8s-job
+  #     with:
+  #       job-config-file: .github/eks-workflow-files/job.yml
+  #       job-name: ${{ env.JOB_NAME }}
+  #   - name: Configure post-processing job
+  #     run: |
+  #       export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip"
+  #       yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME)
+  #         | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE)
+  #         | .spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
+  #         | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \
+  #         .github/eks-workflow-files/post-process-job.yml
+  #       git diff .github/eks-workflow-files/post-process-job.yml
+  #   - name: Submit post process Kubernetes job
+  #     uses: ./.github/actions/submit-delete-k8s-job
+  #     with:
+  #       job-config-file: .github/eks-workflow-files/post-process-job.yml
+  #       job-name: ${{ env.POSTPROCESS_JOB_NAME }}
+
+  # test-te-h100:
+  #   needs: build-jax
+  #   if: >-
+  #     inputs.ARCHITECTURE == 'amd64' &&
+  #     (
+  #       inputs.MODE == 'full' ||
+  #       inputs.MODE == 'te'
+  #     )
+  #   uses: ./.github/workflows/_transformer_engine_eks.yaml
+  #   with:
+  #     JAX_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
+  #     JOB_NAME: transformerengine-${{ github.run_id }}
+  #     S3_BUCKET: jax-toolbox-eks-output
+  #     CI_NAME: transformer-engine
+  #   secrets: inherit
+
+  # test-jax-cutlass-h100:
+  #   needs: build-jax
+  #   if: >-
+  #     inputs.ARCHITECTURE == 'amd64' &&
+  #     (
+  #       inputs.MODE == 'full' ||
+  #       inputs.MODE == 'jax-cutlass'
+  #     )
+  #   uses: ./.github/workflows/_jax_cutlass_eks.yaml
+  #   with:
+  #     JAX_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
+  #     JOB_NAME: jax-cutlass-${{ github.run_id }}
+  #     S3_BUCKET: jax-toolbox-eks-output
+  #     CI_NAME: jax-cutlass
+  #   secrets: inherit
+
+  # test-te-a100:
+  #   needs: build-jax
+  #   secrets: inherit
+  #   if: >-
+  #     inputs.ARCHITECTURE == 'amd64' &&
+  #     (
+  #       inputs.MODE == 'full' ||
+  #       inputs.MODE == 'te'
+  #     )
+  #   uses: ./.github/workflows/_test_unit.yaml
+  #   with:
+  #     TEST_NAME: te
+  #     EXECUTE: |
+  #       docker run -i --gpus all --shm-size=1g -v $PWD:/log \
+  #       ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
+  #       bash <<"EOF" |& tee test-te.log
+  #         set -xu -o pipefail
+
+  #         LOG_DIR=/log
+
+  #         pip install pytest-reportlog pytest-xdist
+  #         # Start MPS daemon
+  #         nvidia-cuda-mps-control -d
+  #         # TE's default is slightly different, without the hyphen
+  #         export TE_PATH=${SRC_PATH_TRANSFORMER_ENGINE}
+  #         # 1 GPU per worker, 3 workers per GPU
+  #         pytest-xdist.sh 1 3 ${LOG_DIR}/pytest-report-L0-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_unittest/test.sh
+  #         ## 8 GPUs per worker, 1 worker per GPU. pytest-xdist.sh allows aggregation
+  #         ## into a single .jsonl file of results from multiple pytest invocations
+  #         ## inside the test.sh script, so it's useful even with a single worker per
+  #         ## device.
+  #         pytest-xdist.sh 8 1 ${LOG_DIR}/pytest-report-L0-distributed-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_distributed_unittest/test.sh
+
+  #         # merge the log files
+  #         cat \
+  #           ${LOG_DIR}/pytest-report-L0-unittest.jsonl \
+  #           ${LOG_DIR}/pytest-report-L0-distributed-unittest.jsonl \
+  #           > ${LOG_DIR}/pytest-report.jsonl
+
+  #       EOF
+  #     STATISTICS_SCRIPT: |
+  #       report_json=pytest-report.jsonl
+  #       summary_line=$(tail -n1 test-te.log)
+  #       errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
+  #       passed_tests=$(cat $report_json | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l)
+  #       failed_tests=$(cat $report_json | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l)
+  #       total_tests=$((failed_tests + passed_tests))
+  #       echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
+  #       echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
+  #       echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
+  #       echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
+
+  #       echo "$failed_tests tests failed"
+  #       if [[ $failed_tests -gt 0 ]]; then
+  #           exit 1
+  #       else
+  #           exit 0
+  #       fi
+
+  #     TIMEOUT_MINUTES: 120
+  #     ARTIFACTS: |
+  #       test-te.log
+  #       pytest-report.jsonl
+  #       pytest-report-L0-unittest.jsonl
+  #       pytest-report-L0-distributed-unittest.jsonl
+
+  # test-maxtext:
+  #   needs: build-maxtext
+  #   if: >-
+  #     inputs.ARCHITECTURE == 'amd64' &&
+  #     (
+  #       inputs.MODE == 'full' ||
+  #       inputs.MODE == 'maxtext'
+  #     )
+  #   uses: ./.github/workflows/_test_maxtext.yaml
+  #   with:
+  #     MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}
+  #   secrets: inherit
+
+  # test-maxtext-gke:
+  #   needs: build-maxtext
+  #   if: >-
+  #     inputs.ARCHITECTURE == 'amd64' &&
+  #     (
+  #       inputs.MODE == 'full' ||
+  #       inputs.MODE == 'maxtext'
+  #     )
+  #   uses: ./.github/workflows/_test_maxtext_gke_xpk.yaml
+  #   with:
+  #     MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}
+  #   secrets: inherit
+
+  # test-axlearn-eks:
+  #   needs: build-axlearn
+  #   if: >-
+  #     inputs.ARCHITECTURE == 'amd64' &&
+  #     (
+  #       inputs.MODE == 'full' ||
+  #       inputs.MODE == 'axlearn'
+  #     )
+  #   runs-on: eks
+  #   env:
+  #     AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
+  #     JOB_NAME: axlearn-${{ github.run_id }}
+  #   steps:
+  #   - name: Check out the repository
+  #     uses: actions/checkout@v4
+  #   - name: Login to GitHub Container Registry
+  #     uses: docker/login-action@v3
+  #     with:
+  #       registry: ghcr.io
+  #       username: ${{ github.repository_owner }}
+  #       password: ${{ secrets.GITHUB_TOKEN }}
+  #   - name: K8s GHCR store and delete token
+  #     id: store-token
+  #     uses: ./.github/actions/store-delete-k8s-ghcr
+  #   - name: Configure axlearn test job
+  #     run: |
+  #       # Replace placeholders in axlearn-job.yml with environment variables
+  #       yq -i ea '
+  #          select(di == 0).metadata.name = strenv(JOB_NAME)
+  #         | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
+  #         | select(di == 0).spec.template.spec.containers[0].env[0].value = "${{ github.run_id }}"
+  #         | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
+  #       .github/eks-workflow-files/axlearn/axlearn-job.yml
+  #       git diff .github/eks-workflow-files/axlearn/axlearn-job.yml
+  #   - name: Submit & delete axlearn test
+  #     uses: ./.github/actions/submit-delete-k8s-job
+  #     with:
+  #       job-config-file: ".github/eks-workflow-files/axlearn/axlearn-job.yml"
+  #       job-name: ${{ env.JOB_NAME }}
+  #   - name: Download logs from S3
+  #     id: log-s3
+  #     if: ${{ !cancelled() }}
+  #     run: |
+  #       mkdir -p axlearn-output
+  #       aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/summary.txt axlearn-output/
+  #       aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/ axlearn-output/ --recursive --exclude "*" --include "*.log"
+  #       aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/ axlearn-output/ --recursive --exclude "*" --include "*.xml"
+
+
+  #       passed_tests=$(grep -Eo 'PASSED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' )
+  #       failed_tests=$(grep -Eo 'FAILED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' )
+  #       skipped_tests=$(grep -Eo 'SKIPPED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' )
+  #       total_tests=$((failed_tests + passed_tests + skipped_tests))
+
+  #       echo "Passed tests: $passed_tests"
+  #       echo "Failed tests: $failed_tests"
+  #       echo "Skipped tests: $skipped_tests"
+  #       echo "Total tests: $total_tests"
+  #       echo "PASSED_TESTS=$passed_tests" >> $GITHUB_OUTPUT
+  #       echo "FAILED_TESTS=$failed_tests" >> $GITHUB_OUTPUT
+  #       echo "TOTAL_TESTS=$total_tests" >> $GITHUB_OUTPUT
+  #   - name: Generate sitrep
+  #     id: sitrep
+  #     if: ${{ !cancelled() }}
+  #     shell: bash -x -e {0}
+  #     run: |
+  #       # bring in utility functions
+  #       source .github/workflows/scripts/to_json.sh
+
+  #       badge_label='Axlearn EKS Unit'
+
+  #       total_tests=${{ steps.log-s3.outputs.TOTAL_TESTS }} \
+  #       failed_tests=${{ steps.log-s3.outputs.FAILED_TESTS }} \
+  #       passed_tests=${{ steps.log-s3.outputs.PASSED_TESTS }} \
+  #       errors="0" \
+  #       summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests." \
+  #       badge_message="Passed $passed_tests out of $total_tests." \
+  #       badge_color="brightgreen"
+  #       if [ "$failed_tests" -gt 0 ]; then
+  #         badge_color="red"
+  #       fi \
+
+  #       to_json \
+  #         summary \
+  #         errors total_tests passed_tests failed_tests \
+  #         badge_label badge_color badge_message \
+  #       > sitrep.json
+
+  #       schemaVersion=1 \
+  #       label="${badge_label}" \
+  #       message="Passed $passed_tests out of $total_tests." \
+  #       color=$badge_color \
+  #       to_json schemaVersion label message color \
+  #       > badge-axlearn-test.json
+
+  #   - name: Upload artifacts
+  #     if: ${{ !cancelled() }}
+  #     uses: actions/upload-artifact@v4
+  #     with:
+  #       name: "artifact-axlearn-test"
+  #       path: |
+  #         sitrep.json
+  #         badge-axlearn-test.json
+  #         axlearn-unittests.jsonl
+  #         axlearn-output/*
+
+  # test-axlearn-fuji-models-eks:
+  #   needs: build-axlearn
+  #   if: >-
+  #     inputs.ARCHITECTURE == 'amd64' &&
+  #     (
+  #       inputs.MODE == 'full' ||
+  #       inputs.MODE == 'axlearn'
+  #     )
+  #   runs-on: eks
+  #   env:
+  #     AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
+  #     JOB_NAME: axlearn-fuji-3b-${{ github.run_id }}
+  #   steps:
+  #   - name: Check out the repository
+  #     uses: actions/checkout@v4
+  #   - name: Login to GitHub Container Registry
+  #     uses: docker/login-action@v3
+  #     with:
+  #       registry: ghcr.io
+  #       username: ${{ github.repository_owner }}
+  #       password: ${{ secrets.GITHUB_TOKEN }}
+  #   - name: K8s GHCR store and delete token
+  #     id: store-token
+  #     uses: ./.github/actions/store-delete-k8s-ghcr
+  #   - name: Configure axlearn test job
+  #     run: |
+  #       yq -i ea '
+  #          select(di == 0).metadata.name = strenv(JOB_NAME)
+  #         | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
+  #         | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
+  #       .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
+  #       git diff .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
+
+  #   - name: Submit & delete axlearn fuji model test
+  #     uses: ./.github/actions/submit-delete-k8s-job
+  #     with:
+  #       job-config-file:  ".github/eks-workflow-files/axlearn/axlearn-fuji-model.yml"
+  #       job-name: ${{ env.JOB_NAME }}

From ddcdc489e091f0c9b1a71f18c5f9dddf8af8aab2 Mon Sep 17 00:00:00 2001
From: Steboss <stefanobosisio1@gmail.com>
Date: Thu, 19 Mar 2026 09:37:35 +0000
Subject: [PATCH 06/15] test without grpc

---
 .github/workflows/_ci.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml
index 893d4b04a..509b5ea99 100644
--- a/.github/workflows/_ci.yaml
+++ b/.github/workflows/_ci.yaml
@@ -82,7 +82,7 @@ jobs:
             ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
             ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
             github-token: ${{ secrets.GITHUB_TOKEN }}
-            bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
+            bazel-remote-cache-url: ""
             ENABLE_BAZEL_DISK_CACHE: 'true'
             ENABLE_BAZEL_REPO_CACHE: 'true'
             EXTRA_BUILD_ARGS: |

From 066f5fcfbb0a756390707dc9c96d36b768d44cb1 Mon Sep 17 00:00:00 2001
From: Steboss <stefanobosisio1@gmail.com>
Date: Fri, 20 Mar 2026 08:51:17 +0000
Subject: [PATCH 07/15] impose limits to the size of the cache, compress the
 cache to avoid having lots of files being pushed and timing out

---
 .github/actions/build-container/action.yml | 23 +++++++++++++++-------
 .github/container/Dockerfile.jax           |  1 +
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/.github/actions/build-container/action.yml b/.github/actions/build-container/action.yml
index e711fdcd5..b6d253ccc 100644
--- a/.github/actions/build-container/action.yml
+++ b/.github/actions/build-container/action.yml
@@ -127,7 +127,7 @@ runs:
       if: inputs.ENABLE_BAZEL_DISK_CACHE == 'true'
       uses: actions/cache/restore@v4
       with:
-        path: /tmp/bazel-disk-cache
+        path: /tmp/bazel-disk.tar
         key: bazel-disk-cache-${{ inputs.ARCHITECTURE }}-${{ github.run_id }}
         restore-keys: |
           bazel-disk-cache-${{ inputs.ARCHITECTURE }}-
@@ -136,17 +136,23 @@ runs:
       if: inputs.ENABLE_BAZEL_REPO_CACHE == 'true'
       uses: actions/cache/restore@v4
       with:
-        path: /tmp/bazel-repo-cache
+        path: /tmp/bazel-repo.tar
         key: bazel-repo-cache-${{ inputs.ARCHITECTURE }}-${{ github.run_id }}
         restore-keys: |
           bazel-repo-cache-${{ inputs.ARCHITECTURE }}-
 
-    # Always create the seed dirs so --build-context is always valid (no-op if empty)
+    # Extract restored tars into seed dirs; create empty dirs on first run
     - name: Prepare Bazel cache seed directories
       shell: bash
       run: |
         mkdir -p /tmp/bazel-disk-cache
+        if [[ -f /tmp/bazel-disk.tar ]]; then
+          tar -xf /tmp/bazel-disk.tar -C /tmp/bazel-disk-cache
+        fi
         mkdir -p /tmp/bazel-repo-cache
+        if [[ -f /tmp/bazel-repo.tar ]]; then
+          tar -xf /tmp/bazel-repo.tar -C /tmp/bazel-repo-cache
+        fi
 
     # MEALKIT BUILD
     - name: Set docker metadata - mealkit
@@ -218,6 +224,9 @@ runs:
           ${{ inputs.EXTRA_BUILD_ARGS }}
 
     # BAZEL CACHE EXPORT
+    # type=tar,compression=zstd streams a single archive instead of per-file
+    # copies — avoids the O(N-files) overhead that caused 3h+ timeouts with
+    # type=local on a large Bazel disk cache.
     - name: Export Bazel disk cache
       if: inputs.ENABLE_BAZEL_DISK_CACHE == 'true'
       uses: docker/build-push-action@v5
@@ -227,7 +236,7 @@ runs:
         file: ${{ inputs.DOCKERFILE }}
         platforms: linux/${{ inputs.ARCHITECTURE }}
         target: bazel-disk-export
-        outputs: type=local,dest=/tmp/bazel-disk-cache-new
+        outputs: type=tar,dest=/tmp/bazel-disk.tar,compression=zstd,compression-level=3
         build-contexts: |
           bazel-disk-seed=/tmp/bazel-disk-cache
         build-args: |
@@ -239,7 +248,7 @@ runs:
       if: inputs.ENABLE_BAZEL_DISK_CACHE == 'true'
       uses: actions/cache/save@v4
       with:
-        path: /tmp/bazel-disk-cache-new
+        path: /tmp/bazel-disk.tar
         key: bazel-disk-cache-${{ inputs.ARCHITECTURE }}-${{ github.run_id }}
 
     - name: Export Bazel repo cache
@@ -251,7 +260,7 @@ runs:
         file: ${{ inputs.DOCKERFILE }}
         platforms: linux/${{ inputs.ARCHITECTURE }}
         target: bazel-repo-export
-        outputs: type=local,dest=/tmp/bazel-repo-cache-new
+        outputs: type=tar,dest=/tmp/bazel-repo.tar,compression=zstd,compression-level=3
         build-args: |
           BASE_IMAGE=${{ inputs.BASE_IMAGE }}
           BUILD_DATE=${{ inputs.BUILD_DATE }}
@@ -261,7 +270,7 @@ runs:
       if: inputs.ENABLE_BAZEL_REPO_CACHE == 'true'
       uses: actions/cache/save@v4
       with:
-        path: /tmp/bazel-repo-cache-new
+        path: /tmp/bazel-repo.tar
         key: bazel-repo-cache-${{ inputs.ARCHITECTURE }}-${{ github.run_id }}
 
     # SITREP GENERATION
diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax
index f964df995..4247c2624 100644
--- a/.github/container/Dockerfile.jax
+++ b/.github/container/Dockerfile.jax
@@ -72,6 +72,7 @@ RUN --mount=type=cache,id=bazel-disk-${TARGETARCH},target=/cache/bazel-disk,shar
     build-jax.sh \
     --bazel-cache ${BAZEL_CACHE} \
     --build-param --bazel_options=--repository_cache=/cache/bazel-repo \
+    --build-param --bazel_options=--disk_cache_max_size=32212254720 \
     --build-path-jaxlib ${BUILD_PATH_JAXLIB} \
     --extra-targets "${EXTRA_BAZEL_TARGETS}" \
     --extra-target-dest /builder/extra-targets \

From 52fee504f5bbd58d4f0eb5b29c67c5b85c7f124a Mon Sep 17 00:00:00 2001
From: Steboss <stefanobosisio1@gmail.com>
Date: Fri, 20 Mar 2026 09:17:09 +0000
Subject: [PATCH 08/15] i thought we have a bazel disk size

---
 .github/container/Dockerfile.jax | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/container/Dockerfile.jax b/.github/container/Dockerfile.jax
index 4247c2624..f964df995 100644
--- a/.github/container/Dockerfile.jax
+++ b/.github/container/Dockerfile.jax
@@ -72,7 +72,6 @@ RUN --mount=type=cache,id=bazel-disk-${TARGETARCH},target=/cache/bazel-disk,shar
     build-jax.sh \
     --bazel-cache ${BAZEL_CACHE} \
     --build-param --bazel_options=--repository_cache=/cache/bazel-repo \
-    --build-param --bazel_options=--disk_cache_max_size=32212254720 \
     --build-path-jaxlib ${BUILD_PATH_JAXLIB} \
     --extra-targets "${EXTRA_BAZEL_TARGETS}" \
     --extra-target-dest /builder/extra-targets \

From 5d6838e7fab758e3eaeb5b069049b1d0d37e1bc0 Mon Sep 17 00:00:00 2001
From: Steboss <stefanobosisio1@gmail.com>
Date: Fri, 20 Mar 2026 11:25:57 +0000
Subject: [PATCH 09/15] fix version of buildkit and compression, to allow the
 artifact compression over limits

---
 .github/actions/build-container/action.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/actions/build-container/action.yml b/.github/actions/build-container/action.yml
index b6d253ccc..5f5eb409b 100644
--- a/.github/actions/build-container/action.yml
+++ b/.github/actions/build-container/action.yml
@@ -108,7 +108,7 @@ runs:
       uses: docker/setup-buildx-action@v3
       with:
         driver-opts: |
-          image=moby/buildkit:v0.12.1
+          image=moby/buildkit:v0.19.0
 
     - name: Download nsys-jax version.py
       uses: actions/download-artifact@v4
@@ -236,7 +236,7 @@ runs:
         file: ${{ inputs.DOCKERFILE }}
         platforms: linux/${{ inputs.ARCHITECTURE }}
         target: bazel-disk-export
-        outputs: type=tar,dest=/tmp/bazel-disk.tar,compression=zstd,compression-level=3
+        outputs: type=tar,dest=/tmp/bazel-disk.tar
         build-contexts: |
           bazel-disk-seed=/tmp/bazel-disk-cache
         build-args: |
@@ -260,7 +260,7 @@ runs:
         file: ${{ inputs.DOCKERFILE }}
         platforms: linux/${{ inputs.ARCHITECTURE }}
         target: bazel-repo-export
-        outputs: type=tar,dest=/tmp/bazel-repo.tar,compression=zstd,compression-level=3
+        outputs: type=tar,dest=/tmp/bazel-repo.tar
         build-args: |
           BASE_IMAGE=${{ inputs.BASE_IMAGE }}
           BUILD_DATE=${{ inputs.BUILD_DATE }}

From d17ef5e4e407475047f6b473b81883869436002d Mon Sep 17 00:00:00 2001
From: Steboss <stefanobosisio1@gmail.com>
Date: Fri, 20 Mar 2026 15:00:02 +0000
Subject: [PATCH 10/15] fix the platform

---
 .github/workflows/_build_base.yaml |  2 +-
 .github/workflows/_ci.yaml         | 16 ++++++++--------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/_build_base.yaml b/.github/workflows/_build_base.yaml
index 4774b6bb8..3251d5dd9 100644
--- a/.github/workflows/_build_base.yaml
+++ b/.github/workflows/_build_base.yaml
@@ -58,7 +58,7 @@ permissions:
 jobs:
 
   build-base:
-    runs-on: ${{ inputs.ARCHITECTURE == 'amd64' && 'linux-amd64-cpu32m' || 'linux-arm64-cpu32m' }}
+    runs-on: ${{ inputs.ARCHITECTURE == 'amd64' && 'linux-amd64-cpu16m' || 'linux-arm64-cpu16m' }}
     env:
       BADGE_FILENAME_FULL: ${{ inputs.BADGE_FILENAME }}-${{ inputs.ARCHITECTURE }}.json
     outputs:
diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml
index 509b5ea99..a91ed0a6a 100644
--- a/.github/workflows/_ci.yaml
+++ b/.github/workflows/_ci.yaml
@@ -53,17 +53,17 @@ jobs:
       MANIFEST_ARTIFACT_NAME: ${{ inputs.MANIFEST_ARTIFACT_NAME }}
     secrets: inherit
 
-  test-nccl:
-    if: inputs.ARCHITECTURE == 'amd64' # build only amd64
-    needs: build-base
-    uses: ./.github/workflows/_test_nccl.yaml
-    with:
-      CONTAINER: ${{ needs.build-base.outputs.DOCKER_TAG }}
-    secrets: inherit
+  # test-nccl:
+  #   if: inputs.ARCHITECTURE == 'amd64' # build only amd64
+  #   needs: build-base
+  #   uses: ./.github/workflows/_test_nccl.yaml
+  #   with:
+  #     CONTAINER: ${{ needs.build-base.outputs.DOCKER_TAG }}
+  #   secrets: inherit
 
   build-jax:
     needs: build-base
-    runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "large"]
+    runs-on: ${{ inputs.ARCHITECTURE == 'amd64' && 'linux-amd64-cpu32m' || 'linux-arm64-cpu32m' }}
     steps:
         - name: Checkout repository
           uses: actions/checkout@v4

From 1a04c26f6422c8ad1087e2f67d41ac7ae8e87c57 Mon Sep 17 00:00:00 2001
From: Steboss <stefanobosisio1@gmail.com>
Date: Sat, 21 Mar 2026 09:52:22 +0000
Subject: [PATCH 11/15] trigger a ci build

---
 simplefile | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 simplefile

diff --git a/simplefile b/simplefile
new file mode 100644
index 000000000..e69de29bb

From 57481ee650f662dee25be8ee216fb15211fb84fa Mon Sep 17 00:00:00 2001
From: Steboss <stefanobosisio1@gmail.com>
Date: Mon, 23 Mar 2026 10:35:42 +0000
Subject: [PATCH 12/15] trigger again the pipeline

---
 simplefile2trigger | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 simplefile2trigger

diff --git a/simplefile2trigger b/simplefile2trigger
new file mode 100644
index 000000000..e69de29bb

From 8429d77efef10a1999072534726924c5026058d3 Mon Sep 17 00:00:00 2001
From: Steboss <stefanobosisio1@gmail.com>
Date: Mon, 23 Mar 2026 16:52:13 +0000
Subject: [PATCH 13/15] pruning on

---
 .github/actions/build-container/action.yml | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/.github/actions/build-container/action.yml b/.github/actions/build-container/action.yml
index 5f5eb409b..e00ab5d35 100644
--- a/.github/actions/build-container/action.yml
+++ b/.github/actions/build-container/action.yml
@@ -224,9 +224,12 @@ runs:
           ${{ inputs.EXTRA_BUILD_ARGS }}
 
     # BAZEL CACHE EXPORT
-    # type=tar,compression=zstd streams a single archive instead of per-file
-    # copies — avoids the O(N-files) overhead that caused 3h+ timeouts with
-    # type=local on a large Bazel disk cache.
+    - name: Prune BuildKit cache to free space for export
+      if: inputs.ENABLE_BAZEL_DISK_CACHE == 'true' || inputs.ENABLE_BAZEL_REPO_CACHE == 'true'
+      shell: bash
+      run: docker buildx prune --force
+
+    # type=tar streams a single archive instead of per-file copies
     - name: Export Bazel disk cache
       if: inputs.ENABLE_BAZEL_DISK_CACHE == 'true'
       uses: docker/build-push-action@v5

From a422308b67f324641f9779f8f13c63dd291b9042 Mon Sep 17 00:00:00 2001
From: Steboss <stefanobosisio1@gmail.com>
Date: Wed, 25 Mar 2026 14:09:14 +0000
Subject: [PATCH 14/15] try new workflow and move jobs on eks

---
 .github/eks-workflow-files/jax/test.yml      |   48 +
 .github/eks-workflow-files/maxtext/test.yml  |   69 +
 .github/eks-workflow-files/nsys-jax/test.yml |   48 +
 .github/workflows/_ci.yaml                   | 1208 +++++++++---------
 4 files changed, 771 insertions(+), 602 deletions(-)
 create mode 100644 .github/eks-workflow-files/jax/test.yml
 create mode 100644 .github/eks-workflow-files/maxtext/test.yml
 create mode 100644 .github/eks-workflow-files/nsys-jax/test.yml

diff --git a/.github/eks-workflow-files/jax/test.yml b/.github/eks-workflow-files/jax/test.yml
new file mode 100644
index 000000000..c6b6c4bbe
--- /dev/null
+++ b/.github/eks-workflow-files/jax/test.yml
@@ -0,0 +1,58 @@
+# Kubernetes Job that runs the JAX test suites (backend-independent, single-GPU,
+# multi-GPU) one after another in a single 8-GPU pod, teeing logs to /output/<RUN_ID>.
+# Every PLACEHOLDER below must be replaced before the Job is submitted
+# (presumably by the calling CI workflow -- confirm against _ci.yaml).
+apiVersion: batch/v1
+kind: Job
+metadata:
+    name: PLACEHOLDER
+    labels:
+        kueue.x-k8s.io/queue-name: p5-queue
+        # 10800 s = 3 h
+        kueue.x-k8s.io/max-exec-time-seconds: "10800"
+spec:
+    # Fail the Job on the first pod failure. Without this the Job default
+    # (backoffLimit: 6) would re-run the whole suite on fresh 8-GPU pods
+    # several times before the failure is reported, overwriting the logs.
+    backoffLimit: 0
+    template:
+        spec:
+            restartPolicy: Never
+            containers:
+                - name: jax
+                  image: PLACEHOLDER
+                  command:
+                    - bash
+                    - -c
+                    - |
+                      set -exo pipefail
+
+                      LOG_DIR="/output/${RUN_ID}"
+                      mkdir -p "${LOG_DIR}"
+
+                      # backend-independent tests
+                      test-jax.sh -b backend-independent 2>&1 | tee "${LOG_DIR}/test-backend-independent.log"
+
+                      # single-gpu tests
+                      # the MPS daemon started here is not stopped, so it is still up for the multi-gpu suite
+                      nvidia-cuda-mps-control -d
+                      test-jax.sh -b single-gpu 2>&1 | tee "${LOG_DIR}/test-single-gpu.log"
+
+                      # multi-gpu tests
+                      test-jax.sh -b multi-gpu 2>&1 | tee "${LOG_DIR}/test-multi-gpu.log"
+                  env:
+                    - name: RUN_ID
+                      value: PLACEHOLDER
+                  resources:
+                    limits:
+                        nvidia.com/gpu: 8
+                  volumeMounts:
+                    - name: s3-storage
+                      mountPath: /output
+                      subPath: jax
+            imagePullSecrets:
+                - name: PLACEHOLDER
+            volumes:
+                - name: s3-storage
+                  persistentVolumeClaim:
+                    claimName: s3-pvc
diff --git a/.github/eks-workflow-files/maxtext/test.yml b/.github/eks-workflow-files/maxtext/test.yml
new file mode 100644
index 000000000..455cf2f4d
--- /dev/null
+++ b/.github/eks-workflow-files/maxtext/test.yml
@@ -0,0 +1,78 @@
+# Kubernetes Job that runs two MaxText test configurations (test-maxtext.sh, 10 steps
+# each) in a single 8-GPU pod: one single-process run, then one --multiprocess run.
+# Results are written under /output/<RUN_ID>. Every PLACEHOLDER below must be replaced
+# before the Job is submitted (presumably by the calling CI workflow -- confirm).
+apiVersion: batch/v1
+kind: Job
+metadata:
+    name: PLACEHOLDER
+    labels:
+        kueue.x-k8s.io/queue-name: p5-queue
+        # 10800 s = 3 h
+        kueue.x-k8s.io/max-exec-time-seconds: "10800"
+spec:
+    # Fail the Job on the first pod failure. Without this the Job default
+    # (backoffLimit: 6) would re-run both configurations on fresh 8-GPU pods
+    # several times before the failure is reported, overwriting the outputs.
+    backoffLimit: 0
+    template:
+        spec:
+            restartPolicy: Never
+            containers:
+                - name: maxtext
+                  image: PLACEHOLDER
+                  command:
+                    - bash
+                    - -c
+                    - |
+                      set -exo pipefail
+
+                      LOG_DIR="/output/${RUN_ID}"
+                      mkdir -p "${LOG_DIR}"
+
+                      # single-process-multi-device: PP=1, DP=1, FSDP=2, TP=4
+                      test-maxtext.sh \
+                          --output "${LOG_DIR}/1DP2FSDP4TP1PP_single_process" \
+                          --dtype bfloat16 \
+                          --mem-fraction 0.65 \
+                          --decoder-block default \
+                          --attn-type dot_product \
+                          --batch-per-gpu 2 \
+                          --steps 10 \
+                          --pipeline-parallel 1 \
+                          --data-parallel 1 \
+                          --fsdp 2 \
+                          --tensor-parallel 4 \
+                          --nodes 1
+
+                      # multi-process: PP=1, DP=2, FSDP=2, TP=2
+                      test-maxtext.sh \
+                          --output "${LOG_DIR}/2DP2FSDP2TP1PP" \
+                          --dtype bfloat16 \
+                          --mem-fraction 0.65 \
+                          --decoder-block default \
+                          --attn-type dot_product \
+                          --batch-per-gpu 2 \
+                          --steps 10 \
+                          --pipeline-parallel 1 \
+                          --data-parallel 2 \
+                          --fsdp 2 \
+                          --tensor-parallel 2 \
+                          --nodes 1 \
+                          --multiprocess
+                  env:
+                    - name: RUN_ID
+                      value: PLACEHOLDER
+                  resources:
+                    limits:
+                        nvidia.com/gpu: 8
+                  volumeMounts:
+                    - name: s3-storage
+                      mountPath: /output
+                      subPath: maxtext
+            imagePullSecrets:
+                - name: PLACEHOLDER
+            volumes:
+                - name: s3-storage
+                  persistentVolumeClaim:
+                    claimName: s3-pvc
diff --git a/.github/eks-workflow-files/nsys-jax/test.yml b/.github/eks-workflow-files/nsys-jax/test.yml
new file mode 100644
index 000000000..789eca17c
--- /dev/null
+++ b/.github/eks-workflow-files/nsys-jax/test.yml
@@ -0,0 +1,58 @@
+# Kubernetes Job that installs the nsys-jax test dependencies and runs its pytest
+# suite in a single 8-GPU pod; the pytest report, temp dir and console log are
+# written under /output/<RUN_ID>. Every PLACEHOLDER below must be replaced before
+# the Job is submitted (presumably by the calling CI workflow -- confirm).
+apiVersion: batch/v1
+kind: Job
+metadata:
+    name: PLACEHOLDER
+    labels:
+        kueue.x-k8s.io/queue-name: p5-queue
+        # 10800 s = 3 h
+        kueue.x-k8s.io/max-exec-time-seconds: "10800"
+spec:
+    # Fail the Job on the first pod failure. Without this the Job default
+    # (backoffLimit: 6) would re-run the test suite on fresh 8-GPU pods
+    # several times before the failure is reported, overwriting the report.
+    backoffLimit: 0
+    template:
+        spec:
+            restartPolicy: Never
+            containers:
+                - name: nsys-jax
+                  image: PLACEHOLDER
+                  command:
+                    - bash
+                    - -c
+                    - |
+                      set -exo pipefail
+
+                      LOG_DIR="/output/${RUN_ID}"
+                      mkdir -p "${LOG_DIR}"
+
+                      # nsys-jax is already installed, this is just adding the test dependencies
+                      # (the extras spec is quoted so bash never treats [test] as a glob pattern)
+                      pip install pytest-reportlog 'nsys-jax[test]'
+                      # abuse knowledge that nsys-jax is installed editable, so the tests exist
+                      test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())')
+                      pytest \
+                          --basetemp="${LOG_DIR}/pytest-tmp" \
+                          --report-log="${LOG_DIR}/pytest-report.jsonl" \
+                          "${test_path}" \
+                          2>&1 | tee "${LOG_DIR}/test-nsys-jax.log"
+                  env:
+                    - name: RUN_ID
+                      value: PLACEHOLDER
+                  resources:
+                    limits:
+                        nvidia.com/gpu: 8
+                  volumeMounts:
+                    - name: s3-storage
+                      mountPath: /output
+                      subPath: nsys-jax
+            imagePullSecrets:
+                - name: PLACEHOLDER
+            volumes:
+                - name: s3-storage
+                  persistentVolumeClaim:
+                    claimName: s3-pvc
diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml
index a91ed0a6a..251352f68 100644
--- a/.github/workflows/_ci.yaml
+++ b/.github/workflows/_ci.yaml
@@ -53,13 +53,13 @@ jobs:
       MANIFEST_ARTIFACT_NAME: ${{ inputs.MANIFEST_ARTIFACT_NAME }}
     secrets: inherit
 
-  # test-nccl:
-  #   if: inputs.ARCHITECTURE == 'amd64' # build only amd64
-  #   needs: build-base
-  #   uses: ./.github/workflows/_test_nccl.yaml
-  #   with:
-  #     CONTAINER: ${{ needs.build-base.outputs.DOCKER_TAG }}
-  #   secrets: inherit
+  test-nccl:
+    if: inputs.ARCHITECTURE == 'amd64' # build only amd64
+    needs: build-base
+    uses: ./.github/workflows/_test_nccl.yaml
+    with:
+      CONTAINER: ${{ needs.build-base.outputs.DOCKER_TAG }}
+    secrets: inherit
 
   build-jax:
     needs: build-base
@@ -94,598 +94,602 @@ jobs:
       DOCKER_TAG_MEALKIT: ${{ steps.build-jax.outputs.DOCKER_TAG_MEALKIT }}
       DOCKER_TAG_FINAL:   ${{ steps.build-jax.outputs.DOCKER_TAG_FINAL }}
 
-  # build-equinox:
-  #   needs: build-jax
-  #   runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"]
-  #   outputs:
-  #     DOCKER_TAG_MEALKIT: ${{ steps.build-equinox.outputs.DOCKER_TAG_MEALKIT }}
-  #     DOCKER_TAG_FINAL:   ${{ steps.build-equinox.outputs.DOCKER_TAG_FINAL }}
-  #   steps:
-  #     - name: Checkout repository
-  #       uses: actions/checkout@v4
-  #     - name: Build Equinox container
-  #       id: build-equinox
-  #       uses: ./.github/actions/build-container
-  #       with:
-  #         ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
-  #         ARTIFACT_NAME: artifact-equinox-build
-  #         BADGE_FILENAME: badge-equinox-build
-  #         BUILD_DATE: ${{ inputs.BUILD_DATE }}
-  #         BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
-  #         CONTAINER_NAME: equinox
-  #         DOCKERFILE: .github/container/Dockerfile.equinox
-  #         RUNNER_SIZE: small
-  #         ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
-  #         ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
-  #         github-token: ${{ secrets.GITHUB_TOKEN }}
-  #         bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
-  #         EXTRA_BUILD_ARGS: |
-  #           URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }}
-
-  # build-maxtext:
-  #   needs: build-jax
-  #   runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"]
-  #   outputs:
-  #     DOCKER_TAG_MEALKIT: ${{ steps.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}
-  #     DOCKER_TAG_FINAL:   ${{ steps.build-maxtext.outputs.DOCKER_TAG_FINAL }}
-  #   steps:
-  #     - name: Checkout repository
-  #       uses: actions/checkout@v4
-  #     - name: Build MaxText container
-  #       id: build-maxtext
-  #       uses: ./.github/actions/build-container
-  #       with:
-  #         ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
-  #         ARTIFACT_NAME: artifact-maxtext-build
-  #         BADGE_FILENAME: badge-maxtext-build
-  #         BUILD_DATE: ${{ inputs.BUILD_DATE }}
-  #         BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
-  #         CONTAINER_NAME: maxtext
-  #         DOCKERFILE: .github/container/Dockerfile.maxtext
-  #         RUNNER_SIZE: small
-  #         ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
-  #         ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
-  #         github-token: ${{ secrets.GITHUB_TOKEN }}
-  #         bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
-  #         EXTRA_BUILD_ARGS: |
-  #           URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }}
-
-  # build-torchax:
-  #   needs: build-jax
-  #   runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "small"]
-  #   outputs:
-  #     DOCKER_TAG_MEALKIT: ${{ steps.build-torchax.outputs.DOCKER_TAG_MEALKIT }}
-  #     DOCKER_TAG_FINAL:   ${{ steps.build-torchax.outputs.DOCKER_TAG_FINAL }}
-  #   steps:
-  #     - name: Checkout repository
-  #       uses: actions/checkout@v4
-  #     - name: Build TorchAX container
-  #       id: build-torchax
-  #       uses: ./.github/actions/build-container
-  #       with:
-  #         ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
-  #         ARTIFACT_NAME: artifact-torchax-build
-  #         BADGE_FILENAME: badge-torchax-build
-  #         BUILD_DATE: ${{ inputs.BUILD_DATE }}
-  #         BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
-  #         CONTAINER_NAME: torchax
-  #         DOCKERFILE: .github/container/Dockerfile.torchax
-  #         RUNNER_SIZE: small
-  #         ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
-  #         ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
-  #         github-token: ${{ secrets.GITHUB_TOKEN }}
-  #         bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
-  #         EXTRA_BUILD_ARGS: |
-  #           URLREF_TORCHAX=${{ fromJson(inputs.SOURCE_URLREFS).TORCHAX }}
-
-  # build-axlearn:
-  #   needs: build-jax
-  #   runs-on: [self-hosted, "${{ inputs.ARCHITECTURE }}", "large"]
-  #   outputs:
-  #     DOCKER_TAG_MEALKIT: ${{ steps.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}
-  #     DOCKER_TAG_FINAL:   ${{ steps.build-axlearn.outputs.DOCKER_TAG_FINAL }}
-  #   steps:
-  #     - name: Checkout repository
-  #       uses: actions/checkout@v4
-  #     - name: Build AxLearn container
-  #       id: build-axlearn
-  #       uses: ./.github/actions/build-container
-  #       with:
-  #         ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
-  #         ARTIFACT_NAME: artifact-axlearn-build
-  #         BADGE_FILENAME: badge-axlearn-build
-  #         BUILD_DATE: ${{ inputs.BUILD_DATE }}
-  #         BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
-  #         CONTAINER_NAME: axlearn
-  #         DOCKERFILE: .github/container/Dockerfile.axlearn
-  #         RUNNER_SIZE: large
-  #         ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
-  #         ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
-  #         github-token: ${{ secrets.GITHUB_TOKEN }}
-  #         bazel-remote-cache-url: ${{ vars.BAZEL_REMOTE_CACHE_URL }}
-  #         EXTRA_BUILD_ARGS: |
-  #           URLREF_AXLEARN=${{ fromJson(inputs.SOURCE_URLREFS).AXLEARN }}
-
-  # collect-docker-tags:
-  #   runs-on: ubuntu-22.04
-  #   if: ${{ !cancelled() }}
-  #   needs:
-  #     - build-base
-  #     - build-jax
-  #     - build-equinox
-  #     - build-maxtext
-  #     - build-axlearn
-  #   outputs:
-  #     TAGS: ${{ steps.collect-tags.outputs.TAGS }}
-  #   steps:
-  #     - name: Save docker tags as a JSON object
-  #       id: collect-tags
-  #       run: |
-  #         TAGS=$(cat <<EOF | jq -c
-  #         [\
-  #           {"flavor": "base",         "stage": "final",   "priority": 800,  "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\
-  #           {"flavor": "jax",          "stage": "final",   "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\
-  #           {"flavor": "equinox",      "stage": "final",   "priority": 900,  "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\
-  #           {"flavor": "maxtext",      "stage": "final",   "priority": 900,  "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\
-  #           {"flavor": "axlearn",      "stage": "final",   "priority": 900,  "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\
-  #           {"flavor": "jax",          "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\
-  #           {"flavor": "equinox",      "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\
-  #           {"flavor": "maxtext",      "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\
-  #           {"flavor": "axlearn",      "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\
-
-  #           {}\
-  #         ]
-  #         EOF
-  #         )
-
-  #         echo "TAGS=${TAGS}" >> $GITHUB_OUTPUT
-
-  # test-jax:
-  #   needs: build-jax
-  #   if: >-
-  #     inputs.ARCHITECTURE == 'amd64' &&
-  #     (
-  #       inputs.MODE == 'full' ||
-  #       inputs.MODE == 'jax'
-  #     )
-  #   uses: ./.github/workflows/_test_unit.yaml
-  #   with:
-  #     TEST_NAME: jax
-  #     EXECUTE: |
-  #       docker run -i --shm-size=1g --gpus all \
-  #       ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
-  #       bash <<"EOF" |& tee test-backend-independent.log
-  #         test-jax.sh -b backend-independent
-  #       EOF
-  #       docker run -i --shm-size=1g --gpus all \
-  #       ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
-  #       bash <<"EOF" |& tee test-single-gpu.log
-  #         nvidia-cuda-mps-control -d
-  #         test-jax.sh -b single-gpu
-  #       EOF
-  #       docker run -i --shm-size=1g --gpus all \
-  #       ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
-  #       bash <<"EOF" |& tee test-multi-gpu.log
-  #         nvidia-cuda-mps-control -d
-  #         test-jax.sh -b multi-gpu
-  #       EOF
-  #     STATISTICS_SCRIPT: |
-  #       errors=$(cat test-*.log | grep -c 'ERROR:' || true)
-  #       failed_tests=$(cat test-*.log | grep -c 'FAILED in' || true)
-  #       passed_tests=$(cat test-*.log | grep -c 'PASSED in' || true)
-  #       total_tests=$((failed_tests + passed_tests))
-  #       echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
-  #       echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
-  #       echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
-  #       echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
-  #     ARTIFACTS: |
-  #       test-backend-independent.log
-  #       test-multi-gpu.log
-  #       test-single-gpu.log
-  #   secrets: inherit
-
-  # test-nsys-jax:
-  #   needs: build-jax
-  #   if: >-
-  #     inputs.ARCHITECTURE == 'amd64' &&
-  #     (
-  #       inputs.MODE == 'full' ||
-  #       inputs.MODE == 'jax'
-  #     )
-  #   uses: ./.github/workflows/_test_unit.yaml
-  #   with:
-  #     TEST_NAME: nsys-jax
-  #     EXECUTE: |
-  #       set -o pipefail
-  #       mkdir -p output-results
-  #       docker run -i --shm-size=1g --gpus all \
-  #         -v $PWD/output-results:/opt/output \
-  #         ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
-  #         bash <<"EOF" |& tee test-nsys-jax.log
-  #           # nsys-jax is already installed, this is just adding the test dependencies
-  #           pip install pytest-reportlog nsys-jax[test]
-  #           # abuse knowledge that nsys-jax is installed editable, so the tests exist
-  #           test_path=$(python -c 'import importlib.resources; print(importlib.resources.files("nsys_jax").joinpath("..", "tests").resolve())')
-  #           pytest --basetemp=/opt/output/pytest-tmp --report-log=/opt/output/pytest-report.jsonl "${test_path}"
-  #           chmod -R a+rwX /opt/output
-  #       EOF
-  #     STATISTICS_SCRIPT: |
-  #       summary_line=$(tail -n1 test-nsys-jax.log)
-  #       num_errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
-  #       passed_tests=$(cat output-results/pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l)
-  #       failed_tests=$(cat output-results/pytest-report.jsonl | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l)
-  #       total_tests=$(( passed_tests + failed_tests ))
-  #       echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
-  #       echo "ERRORS=${num_errors}" >> $GITHUB_OUTPUT
-  #       echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
-  #       echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
-  #     ARTIFACTS: |
-  #       # pytest-driven part
-  #       test-nsys-jax.log
-  #       output-results/pytest-report.jsonl
-  #       output-results/pytest-tmp/
-  #   secrets: inherit
-
-  # # test-nsys-jax generates several fresh .zip archive outputs by running nsys-jax with real GPU hardware; this test
-  # # runs on a regular GitHub Actions runner and checks that offline post-processing works in an environment that does
-  # # not already have nsys-jax installed
-  # test-nsys-jax-archive:
-  #   needs: test-nsys-jax
-  #   if: >-
-  #     inputs.ARCHITECTURE == 'amd64' &&
-  #     (
-  #       inputs.MODE == 'full' ||
-  #       inputs.MODE == 'jax'
-  #     )
-  #   strategy:
-  #     matrix:
-  #       os: [ubuntu-22.04, ubuntu-24.04, macOS-latest]
-  #   runs-on: ${{ matrix.os }}
-  #   steps:
-  #   - name: Download nsys-jax output .zip files
-  #     uses: actions/download-artifact@v4
-  #     with:
-  #       name: nsys-jax-unit-test-A100
-  #   - name: Extract archives and execute install scripts
-  #     run: |
-  #       pip install virtualenv # for install.sh
-  #       for zip in $(ls *.zip); do
-  #         ZIP="${PWD}/${zip}"
-  #         pushd $(mktemp -d)
-  #         unzip "${ZIP}"
-  #         ls -l
-  #         # TODO: verify this isn't needed, or make sure it isn't needed
-  #         chmod 755 install.sh
-  #         # Run the notebook with IPython, not Jupyter Lab, so it exits and prints something informative to stdout
-  #         # Skip executing Jupyter lab
-  #         NSYS_JAX_JUPYTER_EXECUTE_NOT_LAB=1 ./install.sh
-  #         popd
-  #       done
-
-  # test-nsys-jax-eks:
-  #   needs: build-jax
-  #   if: >-
-  #     inputs.ARCHITECTURE == 'amd64' &&
-  #     (
-  #       inputs.MODE == 'full' ||
-  #       inputs.MODE == 'jax'
-  #     )
-  #   runs-on: eks
-  #   env:
-  #     JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
-  #     JOB_NAME: ${{ github.run_id }}-nsys-jax
-  #     # Service name cannot start with a number
-  #     SERVICE_NAME: svc-${{ github.run_id}}-nsys-jax
-  #     POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess
-  #   steps:
-  #   - name: Check out the repository
-  #     uses: actions/checkout@v4
-  #   - name: Login to GitHub Container Registry
-  #     uses: docker/login-action@v3
-  #     with:
-  #       registry: ghcr.io
-  #       username: ${{ github.repository_owner }}
-  #       password: ${{ secrets.GITHUB_TOKEN }}
-  #   - name: K8s GHCR store and delete token
-  #     id: store-token
-  #     uses: ./.github/actions/store-delete-k8s-ghcr
-  #   - name: Configure Kubernetes job
-  #     run: |
-  #       yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME)
-  #         | select(di == 0).metadata.name = strenv(SERVICE_NAME)
-  #         | select(di == 1).metadata.name = strenv(JOB_NAME)
-  #         | select(di == 1).spec.template.spec.subdomain = strenv(SERVICE_NAME)
-  #         | select(di == 1).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
-  #         | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE)
-  #         | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)
-  #         | select(di == 1).spec.template.spec.containers[0].env[1].value = strenv(SERVICE_NAME)' \
-  #         .github/eks-workflow-files/job.yml
-  #       git diff .github/eks-workflow-files/job.yml
-  #   - name: Submit Kubernetes job
-  #     uses: ./.github/actions/submit-delete-k8s-job
-  #     with:
-  #       job-config-file: .github/eks-workflow-files/job.yml
-  #       job-name: ${{ env.JOB_NAME }}
-  #   - name: Configure post-processing job
-  #     run: |
-  #       export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip"
-  #       yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME)
-  #         | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE)
-  #         | .spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
-  #         | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \
-  #         .github/eks-workflow-files/post-process-job.yml
-  #       git diff .github/eks-workflow-files/post-process-job.yml
-  #   - name: Submit post process Kubernetes job
-  #     uses: ./.github/actions/submit-delete-k8s-job
-  #     with:
-  #       job-config-file: .github/eks-workflow-files/post-process-job.yml
-  #       job-name: ${{ env.POSTPROCESS_JOB_NAME }}
-
-  # test-te-h100:
-  #   needs: build-jax
-  #   if: >-
-  #     inputs.ARCHITECTURE == 'amd64' &&
-  #     (
-  #       inputs.MODE == 'full' ||
-  #       inputs.MODE == 'te'
-  #     )
-  #   uses: ./.github/workflows/_transformer_engine_eks.yaml
-  #   with:
-  #     JAX_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
-  #     JOB_NAME: transformerengine-${{ github.run_id }}
-  #     S3_BUCKET: jax-toolbox-eks-output
-  #     CI_NAME: transformer-engine
-  #   secrets: inherit
-
-  # test-jax-cutlass-h100:
-  #   needs: build-jax
-  #   if: >-
-  #     inputs.ARCHITECTURE == 'amd64' &&
-  #     (
-  #       inputs.MODE == 'full' ||
-  #       inputs.MODE == 'jax-cutlass'
-  #     )
-  #   uses: ./.github/workflows/_jax_cutlass_eks.yaml
-  #   with:
-  #     JAX_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
-  #     JOB_NAME: jax-cutlass-${{ github.run_id }}
-  #     S3_BUCKET: jax-toolbox-eks-output
-  #     CI_NAME: jax-cutlass
-  #   secrets: inherit
-
-  # test-te-a100:
-  #   needs: build-jax
-  #   secrets: inherit
-  #   if: >-
-  #     inputs.ARCHITECTURE == 'amd64' &&
-  #     (
-  #       inputs.MODE == 'full' ||
-  #       inputs.MODE == 'te'
-  #     )
-  #   uses: ./.github/workflows/_test_unit.yaml
-  #   with:
-  #     TEST_NAME: te
-  #     EXECUTE: |
-  #       docker run -i --gpus all --shm-size=1g -v $PWD:/log \
-  #       ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }} \
-  #       bash <<"EOF" |& tee test-te.log
-  #         set -xu -o pipefail
-
-  #         LOG_DIR=/log
-
-  #         pip install pytest-reportlog pytest-xdist
-  #         # Start MPS daemon
-  #         nvidia-cuda-mps-control -d
-  #         # TE's default is slightly different, without the hyphen
-  #         export TE_PATH=${SRC_PATH_TRANSFORMER_ENGINE}
-  #         # 1 GPU per worker, 3 workers per GPU
-  #         pytest-xdist.sh 1 3 ${LOG_DIR}/pytest-report-L0-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_unittest/test.sh
-  #         ## 8 GPUs per worker, 1 worker per GPU. pytest-xdist.sh allows aggregation
-  #         ## into a single .jsonl file of results from multiple pytest invocations
-  #         ## inside the test.sh script, so it's useful even with a single worker per
-  #         ## device.
-  #         pytest-xdist.sh 8 1 ${LOG_DIR}/pytest-report-L0-distributed-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_distributed_unittest/test.sh
-
-  #         # merge the log files
-  #         cat \
-  #           ${LOG_DIR}/pytest-report-L0-unittest.jsonl \
-  #           ${LOG_DIR}/pytest-report-L0-distributed-unittest.jsonl \
-  #           > ${LOG_DIR}/pytest-report.jsonl
-
-  #       EOF
-  #     STATISTICS_SCRIPT: |
-  #       report_json=pytest-report.jsonl
-  #       summary_line=$(tail -n1 test-te.log)
-  #       errors=$(echo $summary_line | grep -oE '[0-9]+ error' | awk '{print $1} END { if (!NR) print 0}')
-  #       passed_tests=$(cat $report_json | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' | wc -l)
-  #       failed_tests=$(cat $report_json | jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' | wc -l)
-  #       total_tests=$((failed_tests + passed_tests))
-  #       echo "TOTAL_TESTS=${total_tests}" >> $GITHUB_OUTPUT
-  #       echo "ERRORS=${errors}" >> $GITHUB_OUTPUT
-  #       echo "PASSED_TESTS=${passed_tests}" >> $GITHUB_OUTPUT
-  #       echo "FAILED_TESTS=${failed_tests}" >> $GITHUB_OUTPUT
-
-  #       echo "$failed_tests tests failed"
-  #       if [[ $failed_tests -gt 0 ]]; then
-  #           exit 1
-  #       else
-  #           exit 0
-  #       fi
-
-  #     TIMEOUT_MINUTES: 120
-  #     ARTIFACTS: |
-  #       test-te.log
-  #       pytest-report.jsonl
-  #       pytest-report-L0-unittest.jsonl
-  #       pytest-report-L0-distributed-unittest.jsonl
-
-  # test-maxtext:
-  #   needs: build-maxtext
-  #   if: >-
-  #     inputs.ARCHITECTURE == 'amd64' &&
-  #     (
-  #       inputs.MODE == 'full' ||
-  #       inputs.MODE == 'maxtext'
-  #     )
-  #   uses: ./.github/workflows/_test_maxtext.yaml
-  #   with:
-  #     MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}
-  #   secrets: inherit
-
-  # test-maxtext-gke:
-  #   needs: build-maxtext
-  #   if: >-
-  #     inputs.ARCHITECTURE == 'amd64' &&
-  #     (
-  #       inputs.MODE == 'full' ||
-  #       inputs.MODE == 'maxtext'
-  #     )
-  #   uses: ./.github/workflows/_test_maxtext_gke_xpk.yaml
-  #   with:
-  #     MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}
-  #   secrets: inherit
-
-  # test-axlearn-eks:
-  #   needs: build-axlearn
-  #   if: >-
-  #     inputs.ARCHITECTURE == 'amd64' &&
-  #     (
-  #       inputs.MODE == 'full' ||
-  #       inputs.MODE == 'axlearn'
-  #     )
-  #   runs-on: eks
-  #   env:
-  #     AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
-  #     JOB_NAME: axlearn-${{ github.run_id }}
-  #   steps:
-  #   - name: Check out the repository
-  #     uses: actions/checkout@v4
-  #   - name: Login to GitHub Container Registry
-  #     uses: docker/login-action@v3
-  #     with:
-  #       registry: ghcr.io
-  #       username: ${{ github.repository_owner }}
-  #       password: ${{ secrets.GITHUB_TOKEN }}
-  #   - name: K8s GHCR store and delete token
-  #     id: store-token
-  #     uses: ./.github/actions/store-delete-k8s-ghcr
-  #   - name: Configure axlearn test job
-  #     run: |
-  #       # Replace placeholders in axlearn-job.yml with environment variables
-  #       yq -i ea '
-  #          select(di == 0).metadata.name = strenv(JOB_NAME)
-  #         | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
-  #         | select(di == 0).spec.template.spec.containers[0].env[0].value = "${{ github.run_id }}"
-  #         | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
-  #       .github/eks-workflow-files/axlearn/axlearn-job.yml
-  #       git diff .github/eks-workflow-files/axlearn/axlearn-job.yml
-  #   - name: Submit & delete axlearn test
-  #     uses: ./.github/actions/submit-delete-k8s-job
-  #     with:
-  #       job-config-file: ".github/eks-workflow-files/axlearn/axlearn-job.yml"
-  #       job-name: ${{ env.JOB_NAME }}
-  #   - name: Download logs from S3
-  #     id: log-s3
-  #     if: ${{ !cancelled() }}
-  #     run: |
-  #       mkdir -p axlearn-output
-  #       aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/summary.txt axlearn-output/
-  #       aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/ axlearn-output/ --recursive --exclude "*" --include "*.log"
-  #       aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/ axlearn-output/ --recursive --exclude "*" --include "*.xml"
-
-
-  #       passed_tests=$(grep -Eo 'PASSED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' )
-  #       failed_tests=$(grep -Eo 'FAILED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' )
-  #       skipped_tests=$(grep -Eo 'SKIPPED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' )
-  #       total_tests=$((failed_tests + passed_tests + skipped_tests))
-
-  #       echo "Passed tests: $passed_tests"
-  #       echo "Failed tests: $failed_tests"
-  #       echo "Skipped tests: $skipped_tests"
-  #       echo "Total tests: $total_tests"
-  #       echo "PASSED_TESTS=$passed_tests" >> $GITHUB_OUTPUT
-  #       echo "FAILED_TESTS=$failed_tests" >> $GITHUB_OUTPUT
-  #       echo "TOTAL_TESTS=$total_tests" >> $GITHUB_OUTPUT
-  #   - name: Generate sitrep
-  #     id: sitrep
-  #     if: ${{ !cancelled() }}
-  #     shell: bash -x -e {0}
-  #     run: |
-  #       # bring in utility functions
-  #       source .github/workflows/scripts/to_json.sh
-
-  #       badge_label='Axlearn EKS Unit'
-
-  #       total_tests=${{ steps.log-s3.outputs.TOTAL_TESTS }} \
-  #       failed_tests=${{ steps.log-s3.outputs.FAILED_TESTS }} \
-  #       passed_tests=${{ steps.log-s3.outputs.PASSED_TESTS }} \
-  #       errors="0" \
-  #       summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests." \
-  #       badge_message="Passed $passed_tests out of $total_tests." \
-  #       badge_color="brightgreen"
-  #       if [ "$failed_tests" -gt 0 ]; then
-  #         badge_color="red"
-  #       fi \
-
-  #       to_json \
-  #         summary \
-  #         errors total_tests passed_tests failed_tests \
-  #         badge_label badge_color badge_message \
-  #       > sitrep.json
-
-  #       schemaVersion=1 \
-  #       label="${badge_label}" \
-  #       message="Passed $passed_tests out of $total_tests." \
-  #       color=$badge_color \
-  #       to_json schemaVersion label message color \
-  #       > badge-axlearn-test.json
-
-  #   - name: Upload artifacts
-  #     if: ${{ !cancelled() }}
-  #     uses: actions/upload-artifact@v4
-  #     with:
-  #       name: "artifact-axlearn-test"
-  #       path: |
-  #         sitrep.json
-  #         badge-axlearn-test.json
-  #         axlearn-unittests.jsonl
-  #         axlearn-output/*
-
-  # test-axlearn-fuji-models-eks:
-  #   needs: build-axlearn
-  #   if: >-
-  #     inputs.ARCHITECTURE == 'amd64' &&
-  #     (
-  #       inputs.MODE == 'full' ||
-  #       inputs.MODE == 'axlearn'
-  #     )
-  #   runs-on: eks
-  #   env:
-  #     AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
-  #     JOB_NAME: axlearn-fuji-3b-${{ github.run_id }}
-  #   steps:
-  #   - name: Check out the repository
-  #     uses: actions/checkout@v4
-  #   - name: Login to GitHub Container Registry
-  #     uses: docker/login-action@v3
-  #     with:
-  #       registry: ghcr.io
-  #       username: ${{ github.repository_owner }}
-  #       password: ${{ secrets.GITHUB_TOKEN }}
-  #   - name: K8s GHCR store and delete token
-  #     id: store-token
-  #     uses: ./.github/actions/store-delete-k8s-ghcr
-  #   - name: Configure axlearn test job
-  #     run: |
-  #       yq -i ea '
-  #          select(di == 0).metadata.name = strenv(JOB_NAME)
-  #         | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
-  #         | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
-  #       .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
-  #       git diff .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
-
-  #   - name: Submit & delete axlearn fuji model test
-  #     uses: ./.github/actions/submit-delete-k8s-job
-  #     with:
-  #       job-config-file:  ".github/eks-workflow-files/axlearn/axlearn-fuji-model.yml"
-  #       job-name: ${{ env.JOB_NAME }}
+  build-equinox:  # Equinox image, built on top of the JAX mealkit image
+    runs-on: ${{ inputs.ARCHITECTURE == 'amd64' && 'linux-amd64-cpu16m' || 'linux-arm64-cpu16m' }}
+    needs: build-jax
+    outputs:
+      DOCKER_TAG_FINAL: ${{ steps.build-equinox.outputs.DOCKER_TAG_FINAL }}
+      DOCKER_TAG_MEALKIT: ${{ steps.build-equinox.outputs.DOCKER_TAG_MEALKIT }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - id: build-equinox
+        name: Build Equinox container
+        uses: ./.github/actions/build-container
+        with:
+          CONTAINER_NAME: equinox
+          DOCKERFILE: .github/container/Dockerfile.equinox
+          BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
+          EXTRA_BUILD_ARGS: |
+            URLREF_EQUINOX=${{ fromJson(inputs.SOURCE_URLREFS).EQUINOX }}
+          ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
+          BUILD_DATE: ${{ inputs.BUILD_DATE }}
+          RUNNER_SIZE: small
+          ARTIFACT_NAME: artifact-equinox-build
+          BADGE_FILENAME: badge-equinox-build
+          bazel-remote-cache-url: ""  # no remote cache URL is passed for this build
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
+          ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
+
+  build-maxtext:  # MaxText image, built on top of the JAX mealkit image
+    runs-on: ${{ inputs.ARCHITECTURE == 'amd64' && 'linux-amd64-cpu16m' || 'linux-arm64-cpu16m' }}
+    needs: build-jax
+    outputs:
+      DOCKER_TAG_FINAL: ${{ steps.build-maxtext.outputs.DOCKER_TAG_FINAL }}
+      DOCKER_TAG_MEALKIT: ${{ steps.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - id: build-maxtext
+        name: Build MaxText container
+        uses: ./.github/actions/build-container
+        with:
+          CONTAINER_NAME: maxtext
+          DOCKERFILE: .github/container/Dockerfile.maxtext
+          BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
+          EXTRA_BUILD_ARGS: |
+            URLREF_MAXTEXT=${{ fromJson(inputs.SOURCE_URLREFS).MAXTEXT }}
+          ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
+          BUILD_DATE: ${{ inputs.BUILD_DATE }}
+          RUNNER_SIZE: small
+          ARTIFACT_NAME: artifact-maxtext-build
+          BADGE_FILENAME: badge-maxtext-build
+          bazel-remote-cache-url: ""  # no remote cache URL is passed for this build
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
+          ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
+
+  build-torchax:  # TorchAX image, built on top of the JAX mealkit image
+    runs-on: ${{ inputs.ARCHITECTURE == 'amd64' && 'linux-amd64-cpu16m' || 'linux-arm64-cpu16m' }}
+    needs: build-jax
+    outputs:
+      DOCKER_TAG_FINAL: ${{ steps.build-torchax.outputs.DOCKER_TAG_FINAL }}
+      DOCKER_TAG_MEALKIT: ${{ steps.build-torchax.outputs.DOCKER_TAG_MEALKIT }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - id: build-torchax
+        name: Build TorchAX container
+        uses: ./.github/actions/build-container
+        with:
+          CONTAINER_NAME: torchax
+          DOCKERFILE: .github/container/Dockerfile.torchax
+          BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
+          EXTRA_BUILD_ARGS: |
+            URLREF_TORCHAX=${{ fromJson(inputs.SOURCE_URLREFS).TORCHAX }}
+          ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
+          BUILD_DATE: ${{ inputs.BUILD_DATE }}
+          RUNNER_SIZE: small
+          ARTIFACT_NAME: artifact-torchax-build
+          BADGE_FILENAME: badge-torchax-build
+          bazel-remote-cache-url: ""  # no remote cache URL is passed for this build
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
+          ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
+
+  build-axlearn:  # AxLearn image, built on top of the JAX mealkit image
+    runs-on: ${{ inputs.ARCHITECTURE == 'amd64' && 'linux-amd64-cpu16m' || 'linux-arm64-cpu16m' }}
+    needs: build-jax
+    outputs:
+      DOCKER_TAG_FINAL: ${{ steps.build-axlearn.outputs.DOCKER_TAG_FINAL }}
+      DOCKER_TAG_MEALKIT: ${{ steps.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - id: build-axlearn
+        name: Build AxLearn container
+        uses: ./.github/actions/build-container
+        with:
+          CONTAINER_NAME: axlearn
+          DOCKERFILE: .github/container/Dockerfile.axlearn
+          BASE_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}
+          EXTRA_BUILD_ARGS: |
+            URLREF_AXLEARN=${{ fromJson(inputs.SOURCE_URLREFS).AXLEARN }}
+          ARCHITECTURE: ${{ inputs.ARCHITECTURE }}
+          BUILD_DATE: ${{ inputs.BUILD_DATE }}
+          RUNNER_SIZE: large
+          ARTIFACT_NAME: artifact-axlearn-build
+          BADGE_FILENAME: badge-axlearn-build
+          bazel-remote-cache-url: ""  # no remote cache URL is passed for this build
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
+          ssh-known-hosts: ${{ vars.SSH_KNOWN_HOSTS }}
+
+  collect-docker-tags:  # publishes the build jobs' image tags as one compact JSON array
+    needs:
+      - build-base
+      - build-jax
+      - build-equinox
+      - build-maxtext
+      - build-axlearn
+    if: ${{ !cancelled() }}  # skipped only when the run was cancelled
+    runs-on: ubuntu-22.04
+    outputs:
+      TAGS: ${{ steps.collect-tags.outputs.TAGS }}
+    steps:
+      - id: collect-tags
+        name: Save docker tags as a JSON object
+        run: |
+          tags_json=$(cat <<EOF | jq -c
+          [\
+            {"flavor": "base",         "stage": "final",   "priority": 800,  "tag": "${{ needs.build-base.outputs.DOCKER_TAG }}"},\
+            {"flavor": "jax",          "stage": "final",   "priority": 1000, "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}"},\
+            {"flavor": "equinox",      "stage": "final",   "priority": 900,  "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_FINAL }}"},\
+            {"flavor": "maxtext",      "stage": "final",   "priority": 900,  "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}"},\
+            {"flavor": "axlearn",      "stage": "final",   "priority": 900,  "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}"},\
+            {"flavor": "jax",          "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-jax.outputs.DOCKER_TAG_MEALKIT }}"},\
+            {"flavor": "equinox",      "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-equinox.outputs.DOCKER_TAG_MEALKIT }}"},\
+            {"flavor": "maxtext",      "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-maxtext.outputs.DOCKER_TAG_MEALKIT }}"},\
+            {"flavor": "axlearn",      "stage": "mealkit", "priority": 500,  "tag": "${{ needs.build-axlearn.outputs.DOCKER_TAG_MEALKIT }}"},\
+
+            {}\
+          ]
+          EOF
+          )
+
+          echo "TAGS=${tags_json}" >> $GITHUB_OUTPUT
+
+  test-jax-eks:  # JAX unit tests, submitted as a Kubernetes job from the `eks` runner
+    needs: build-jax
+    if: >-  # amd64 only, and only for MODE 'full' or 'jax'
+      inputs.ARCHITECTURE == 'amd64' &&
+      (
+        inputs.MODE == 'full' ||
+        inputs.MODE == 'jax'
+      )
+    runs-on: eks
+    env:
+      JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
+      JOB_NAME: jax-${{ github.run_id }}
+    steps:
+    - name: Check out the repository
+      uses: actions/checkout@v6
+    - name: Login to GitHub Container Registry
+      uses: docker/login-action@v4
+      with:
+        registry: ghcr.io
+        username: ${{ github.repository_owner }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+    - name: K8s GHCR store and delete token
+      id: store-token
+      uses: ./.github/actions/store-delete-k8s-ghcr
+    - name: Configure JAX test job  # patches job name, image, run id and pull secret into the template
+      run: |
+        yq -i ea '
+           select(di == 0).metadata.name = strenv(JOB_NAME)
+          | select(di == 0).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE)
+          | select(di == 0).spec.template.spec.containers[0].env[0].value = "${{ github.run_id }}"
+          | select(di == 0).spec.template.spec.imagePullSecrets[0].name = "${{ steps.store-token.outputs.token-name }}"' \
+        .github/eks-workflow-files/jax/test.yml
+        git diff .github/eks-workflow-files/jax/test.yml
+    - name: Submit & delete JAX unit test job
+      uses: ./.github/actions/submit-delete-k8s-job
+      with:
+        job-config-file: ".github/eks-workflow-files/jax/test.yml"
+        job-name: ${{ env.JOB_NAME }}
+    - name: Download logs from S3  # counts results from the logs and fails the step on any failure/error
+      id: log-s3
+      if: ${{ !cancelled() }}
+      run: |
+        mkdir -p jax-output
+        aws s3 cp s3://jax-toolbox-eks-output/jax/${{ github.run_id }}/ jax-output/ --recursive
+
+        errors=$(cat jax-output/test-*.log | grep -c 'ERROR:' || true)  # grep -c exits 1 on zero matches
+        failed_tests=$(cat jax-output/test-*.log | grep -c 'FAILED in' || true)
+        passed_tests=$(cat jax-output/test-*.log | grep -c 'PASSED in' || true)
+        total_tests=$((failed_tests + passed_tests))
+
+        echo "Passed tests: $passed_tests"
+        echo "Failed tests: $failed_tests"
+        echo "Total tests: $total_tests"
+        echo "PASSED_TESTS=$passed_tests" >> $GITHUB_OUTPUT
+        echo "FAILED_TESTS=$failed_tests" >> $GITHUB_OUTPUT
+        echo "TOTAL_TESTS=$total_tests" >> $GITHUB_OUTPUT
+        echo "ERRORS=$errors" >> $GITHUB_OUTPUT
+
+        if [[ $failed_tests -gt 0 ]] || [[ $errors -gt 0 ]]; then
+          exit 1
+        fi
+    - name: Generate sitrep  # writes sitrep.json and the badge JSON from the log-s3 counts
+      id: sitrep
+      if: ${{ !cancelled() }}
+      shell: bash -x -e {0}
+      run: |
+        # bring in utility functions
+        source .github/workflows/scripts/to_json.sh
+
+        badge_label='JAX EKS unittest (8)'
+
+        total_tests=${{ steps.log-s3.outputs.TOTAL_TESTS || 0 }} \
+        failed_tests=${{ steps.log-s3.outputs.FAILED_TESTS || 0 }} \
+        passed_tests=${{ steps.log-s3.outputs.PASSED_TESTS || 0 }} \
+        errors=${{ steps.log-s3.outputs.ERRORS || 0 }} \
+        summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests." \
+        badge_message="Passed $passed_tests out of $total_tests." \
+        badge_color="brightgreen"
+        if [ "$failed_tests" -gt 0 ] || [ "$errors" -gt 0 ] || [ "${{ steps.log-s3.outcome }}" != "success" ]; then  # red too when log-s3 failed before writing counts
+          badge_color="red"
+        fi
+
+        to_json \
+          summary \
+          errors total_tests passed_tests failed_tests \
+          badge_label badge_color badge_message \
+        > sitrep.json
+
+        schemaVersion=1 \
+        label="${badge_label}" \
+        message="Passed $passed_tests out of $total_tests." \
+        color=$badge_color \
+        to_json schemaVersion label message color \
+        > badge-jax-unit-test-eks.json
+
+    - name: Upload artifacts
+      if: ${{ !cancelled() }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: "jax-unit-test-H100-eks"
+        path: |
+          sitrep.json
+          badge-jax-unit-test-eks.json
+          jax-output/*
+
+  test-nsys-jax-eks:  # submits the nsys-jax Kubernetes job, then a post-processing job
+    runs-on: eks
+    needs: build-jax
+    if: >-
+      (
+        inputs.MODE == 'full' ||
+        inputs.MODE == 'jax'
+      ) &&
+      inputs.ARCHITECTURE == 'amd64'
+    env:
+      JOB_NAME: ${{ github.run_id }}-nsys-jax
+      POSTPROCESS_JOB_NAME: ${{ github.run_id }}-nsys-jax-postprocess
+      # Service name cannot start with a number
+      SERVICE_NAME: svc-${{ github.run_id }}-nsys-jax
+      JAX_DOCKER_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
+    steps:
+    - name: Check out the repository
+      uses: actions/checkout@v4
+    - name: Login to GitHub Container Registry
+      uses: docker/login-action@v3
+      with:
+        registry: ghcr.io
+        password: ${{ secrets.GITHUB_TOKEN }}
+        username: ${{ github.repository_owner }}
+    - id: store-token
+      name: K8s GHCR store and delete token
+      uses: ./.github/actions/store-delete-k8s-ghcr
+    - name: Configure Kubernetes job  # document 0 gets the service name, document 1 the job settings
+      run: |
+        yq -i ea 'select(di == 0).spec.selector.job-name = strenv(JOB_NAME)
+          | select(di == 0).metadata.name = strenv(SERVICE_NAME)
+          | select(di == 1).metadata.name = strenv(JOB_NAME)
+          | select(di == 1).spec.template.spec.subdomain = strenv(SERVICE_NAME)
+          | select(di == 1).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
+          | select(di == 1).spec.template.spec.containers[0].image = strenv(JAX_DOCKER_IMAGE)
+          | select(di == 1).spec.template.spec.containers[0].env[0].value = strenv(JOB_NAME)
+          | select(di == 1).spec.template.spec.containers[0].env[1].value = strenv(SERVICE_NAME)' \
+          .github/eks-workflow-files/job.yml
+        git diff .github/eks-workflow-files/job.yml
+    - name: Submit Kubernetes job
+      uses: ./.github/actions/submit-delete-k8s-job
+      with:
+        job-name: ${{ env.JOB_NAME }}
+        job-config-file: .github/eks-workflow-files/job.yml
+    - name: Configure post-processing job  # points the post-process template at this run's rank archives
+      run: |
+        export JOB_OUTPUT_PATTERN="${JOB_NAME}-rank*.zip"
+        yq -i '.metadata.name = strenv(POSTPROCESS_JOB_NAME)
+          | .spec.template.spec.containers[].image = strenv(JAX_DOCKER_IMAGE)
+          | .spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"
+          | .spec.template.spec.initContainers[].command[7] = strenv(JOB_OUTPUT_PATTERN)' \
+          .github/eks-workflow-files/post-process-job.yml
+        git diff .github/eks-workflow-files/post-process-job.yml
+    - name: Submit post process Kubernetes job
+      uses: ./.github/actions/submit-delete-k8s-job
+      with:
+        job-name: ${{ env.POSTPROCESS_JOB_NAME }}
+        job-config-file: .github/eks-workflow-files/post-process-job.yml
+
+  test-te-h100:  # calls the reusable workflow _transformer_engine_eks.yaml with the final JAX image
+    needs: build-jax
+    uses: ./.github/workflows/_transformer_engine_eks.yaml
+    secrets: inherit
+    if: >-
+      (
+        inputs.MODE == 'full' ||
+        inputs.MODE == 'te'
+      ) &&
+      inputs.ARCHITECTURE == 'amd64'
+    with:
+      CI_NAME: transformer-engine
+      JOB_NAME: transformerengine-${{ github.run_id }}
+      JAX_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
+      S3_BUCKET: jax-toolbox-eks-output
+
+  test-jax-cutlass-h100:  # calls the reusable workflow _jax_cutlass_eks.yaml with the final JAX image
+    needs: build-jax
+    uses: ./.github/workflows/_jax_cutlass_eks.yaml
+    secrets: inherit
+    if: >-
+      (
+        inputs.MODE == 'full' ||
+        inputs.MODE == 'jax-cutlass'
+      ) &&
+      inputs.ARCHITECTURE == 'amd64'
+    with:
+      CI_NAME: jax-cutlass
+      JOB_NAME: jax-cutlass-${{ github.run_id }}
+      JAX_IMAGE: ${{ needs.build-jax.outputs.DOCKER_TAG_FINAL }}
+      S3_BUCKET: jax-toolbox-eks-output
+
+  test-maxtext-eks:
+    needs: build-maxtext
+    if: >-
+      inputs.ARCHITECTURE == 'amd64' &&
+      (
+        inputs.MODE == 'full' ||
+        inputs.MODE == 'maxtext'
+      )
+    runs-on: [eks]
+    env:
+      MAXTEXT_DOCKER_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}
+      JOB_NAME: maxtext-${{ github.run_id }}
+    steps:
+    - name: Check out the repository
+      uses: actions/checkout@v4
+    - name: Login to GitHub Container Registry
+      uses: docker/login-action@v3
+      with:
+        registry: ghcr.io
+        username: ${{ github.repository_owner }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+    - name: K8s GHCR store and delete token
+      id: store-token
+      uses: ./.github/actions/store-delete-k8s-ghcr
+    - name: Configure maxtext test job
+      run: |
+        yq -i ea '
+           select(di == 0).metadata.name = strenv(JOB_NAME)
+          | select(di == 0).spec.template.spec.containers[0].image = strenv(MAXTEXT_DOCKER_IMAGE)
+          | select(di == 0).spec.template.spec.containers[0].env[0].value = "${{ github.run_id }}"
+          | select(di == 0).spec.template.spec.imagePullSecrets[0].name = "${{ steps.store-token.outputs.token-name }}"' \
+        .github/eks-workflow-files/maxtext/test.yml
+        git diff .github/eks-workflow-files/maxtext/test.yml
+    - name: Submit & delete maxtext test job
+      uses: ./.github/actions/submit-delete-k8s-job
+      with:
+        job-config-file: ".github/eks-workflow-files/maxtext/test.yml"
+        job-name: ${{ env.JOB_NAME }}
+    - name: Download results from S3
+      id: s3-download
+      if: ${{ !cancelled() }}
+      run: |
+        mkdir -p maxtext-output
+        aws s3 cp s3://jax-toolbox-eks-output/maxtext/${{ github.run_id }}/ maxtext-output/ --recursive
+    - name: Run metrics
+      id: metrics
+      if: ${{ !cancelled() }}
+      run: |
+        pip install 'numpy<2.0.0' pytest pytest-reportlog tensorboard
+        RESULTS_DIR=maxtext-output BASELINES_DIR=MAXTEXT/upstream \
+          pytest --report-log=report.jsonl .github/workflows/baselines/test_maxtext_metrics.py || true
+    - name: Generate sitrep
+      id: sitrep
+      if: ${{ !cancelled() }}
+      shell: bash -x -e {0}
+      run: |
+        # Summarize report.jsonl into sitrep.json and badge-maxtext-test-eks.json.
+        # bring in utility functions
+        source .github/workflows/scripts/to_json.sh
+
+        badge_label='MaxText EKS'
+
+        # Count "call"-phase TestReport records by outcome; an unreadable report yields zero matches.
+        passed_tests=$(jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "passed") | .outcome' report.jsonl | wc -l)
+        failed_tests=$(jq -r 'select(."$report_type" == "TestReport" and .when == "call" and .outcome == "failed") | .outcome' report.jsonl | wc -l)
+        total_tests=$(( passed_tests + failed_tests ))
+        errors=0
+        # Green requires at least one recorded test and no failures; an empty report must not look healthy.
+        badge_color="brightgreen"
+        if [ "$failed_tests" -gt 0 ] || [ "$total_tests" -eq 0 ]; then
+          badge_color="red"
+        fi
+
+        # to_json receives variable names; only summary and badge_message still need values here.
+        summary="All metrics tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests." \
+        badge_message="Passed $passed_tests out of $total_tests." \
+        to_json \
+          summary errors total_tests passed_tests failed_tests \
+          badge_label badge_color badge_message \
+        > sitrep.json
+
+        # Badge file: schemaVersion, label, message and color fields.
+        schemaVersion=1 \
+        label="${badge_label}" \
+        message="Passed $passed_tests out of $total_tests." \
+        color=$badge_color \
+        to_json schemaVersion label message color \
+        > badge-maxtext-test-eks.json
+
+    - name: Upload artifacts  # sitrep, badge, downloaded outputs and the pytest report log
+      uses: actions/upload-artifact@v4
+      if: ${{ !cancelled() }}
+      with:
+        name: 'maxtext-test-H100-eks'
+        path: |
+          sitrep.json
+          badge-maxtext-test-eks.json
+          maxtext-output/
+          report.jsonl
+
+  test-maxtext-gke:
+    # Delegates to the reusable workflow below; amd64 only, when MODE is 'full' or 'maxtext'.
+    needs: build-maxtext
+    if: >-
+      inputs.ARCHITECTURE == 'amd64' &&
+      (inputs.MODE == 'full' || inputs.MODE == 'maxtext')
+    uses: ./.github/workflows/_test_maxtext_gke_xpk.yaml
+    with:
+      # DOCKER_TAG_FINAL output of the build-maxtext job.
+      MAXTEXT_IMAGE: ${{ needs.build-maxtext.outputs.DOCKER_TAG_FINAL }}
+    # Forward all of the caller's secrets to the reusable workflow.
+    secrets: inherit
+
+  test-axlearn-eks:
+    # AXLearn unit tests on the EKS runner: amd64 only, when MODE is 'full' or 'axlearn'.
+    needs: build-axlearn
+    if: >-
+      inputs.ARCHITECTURE == 'amd64' &&
+      (inputs.MODE == 'full' || inputs.MODE == 'axlearn')
+    runs-on: eks
+    env:
+      AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
+      JOB_NAME: axlearn-${{ github.run_id }}
+    steps:
+    - name: Check out the repository
+      uses: actions/checkout@v4
+    - name: Login to GitHub Container Registry
+      uses: docker/login-action@v3
+      with:
+        registry: ghcr.io
+        username: ${{ github.repository_owner }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+    - name: K8s GHCR store and delete token
+      id: store-token
+      uses: ./.github/actions/store-delete-k8s-ghcr
+    - name: Configure axlearn test job
+      run: |
+        # Replace placeholders in axlearn-job.yml with environment variables
+        yq -i ea '
+           select(di == 0).metadata.name = strenv(JOB_NAME)
+          | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
+          | select(di == 0).spec.template.spec.containers[0].env[0].value = "${{ github.run_id }}"
+          | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
+        .github/eks-workflow-files/axlearn/axlearn-job.yml
+        git diff .github/eks-workflow-files/axlearn/axlearn-job.yml
+    - name: Submit & delete axlearn test
+      uses: ./.github/actions/submit-delete-k8s-job
+      with:
+        job-config-file: ".github/eks-workflow-files/axlearn/axlearn-job.yml"
+        job-name: ${{ env.JOB_NAME }}
+    - name: Download logs from S3
+      id: log-s3
+      if: ${{ !cancelled() }}
+      run: |
+        # Fetch summary.txt plus every *.log and *.xml file under this run's logs/ prefix.
+        mkdir -p axlearn-output
+        aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/summary.txt axlearn-output/
+        aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/ axlearn-output/ --recursive --exclude "*" --include "*.log"
+        aws s3 cp s3://jax-toolbox-eks-output/axlearn/${{ github.run_id }}/logs/ axlearn-output/ --recursive --exclude "*" --include "*.xml"
+
+        # Parse the PASSED/FAILED/SKIPPED counters from summary.txt and publish them as step outputs.
+        passed_tests=$(grep -Eo 'PASSED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' )
+        failed_tests=$(grep -Eo 'FAILED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' )
+        skipped_tests=$(grep -Eo 'SKIPPED:[[:space:]]*[0-9]+' axlearn-output/summary.txt | grep -Eo '[0-9]+' )
+        total_tests=$((failed_tests + passed_tests + skipped_tests))
+
+        echo "Passed tests: $passed_tests"
+        echo "Failed tests: $failed_tests"
+        echo "Skipped tests: $skipped_tests"
+        echo "Total tests: $total_tests"
+        echo "PASSED_TESTS=$passed_tests" >> "$GITHUB_OUTPUT"
+        echo "FAILED_TESTS=$failed_tests" >> "$GITHUB_OUTPUT"
+        echo "TOTAL_TESTS=$total_tests" >> "$GITHUB_OUTPUT"
+    - name: Generate sitrep
+      id: sitrep
+      if: ${{ !cancelled() }}
+      shell: bash -x -e {0}
+      run: |
+        # bring in utility functions
+        source .github/workflows/scripts/to_json.sh
+
+        badge_label='Axlearn EKS Unit'
+
+        # log-s3 outputs are empty if that step failed: quote them and treat a missing or zero total as red.
+        total_tests="${{ steps.log-s3.outputs.TOTAL_TESTS }}"
+        failed_tests="${{ steps.log-s3.outputs.FAILED_TESTS }}"
+        passed_tests="${{ steps.log-s3.outputs.PASSED_TESTS }}"
+        errors="0"
+        summary="All tests: $total_tests. Passed: $passed_tests. Failed: $failed_tests."
+        badge_message="Passed $passed_tests out of $total_tests."
+        badge_color="brightgreen"
+        if [ "${failed_tests:-0}" -gt 0 ] || [ "${total_tests:-0}" -eq 0 ]; then
+          badge_color="red"
+        fi
+
+        to_json \
+          summary errors total_tests passed_tests failed_tests \
+          badge_label badge_color badge_message \
+        > sitrep.json
+
+        schemaVersion=1 \
+        label="${badge_label}" \
+        message="Passed $passed_tests out of $total_tests." \
+        color=$badge_color \
+        to_json schemaVersion label message color \
+        > badge-axlearn-test.json
+
+    # NOTE(review): no step in this job writes axlearn-unittests.jsonl — confirm that path is still needed.
+    - name: Upload artifacts
+      if: ${{ !cancelled() }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: "artifact-axlearn-test"
+        path: |
+          sitrep.json
+          badge-axlearn-test.json
+          axlearn-unittests.jsonl
+          axlearn-output/*
+
+  test-axlearn-fuji-models-eks:
+    # AXLearn Fuji model job on the EKS runner: amd64 only, when MODE is 'full' or 'axlearn'.
+    needs: build-axlearn
+    if: >-
+      inputs.ARCHITECTURE == 'amd64' &&
+      (inputs.MODE == 'full' || inputs.MODE == 'axlearn')
+    runs-on: eks
+    env:
+      JOB_NAME: axlearn-fuji-3b-${{ github.run_id }}
+      AXLEARN_DOCKER_IMAGE: ${{ needs.build-axlearn.outputs.DOCKER_TAG_FINAL }}
+    steps:
+    - name: Check out the repository
+      uses: actions/checkout@v4
+    - name: Login to GitHub Container Registry
+      uses: docker/login-action@v3
+      with:
+        registry: ghcr.io
+        username: ${{ github.repository_owner }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+    - id: store-token
+      name: K8s GHCR store and delete token
+      uses: ./.github/actions/store-delete-k8s-ghcr
+    - name: Configure axlearn test job
+      run: |
+        # Patch the first YAML document: job name, container image and image pull secret.
+        yq --inplace eval-all '
+           select(di == 0).metadata.name = strenv(JOB_NAME)
+          | select(di == 0).spec.template.spec.containers[0].image = strenv(AXLEARN_DOCKER_IMAGE)
+          | select(di == 0).spec.template.spec.imagePullSecrets[].name = "${{ steps.store-token.outputs.token-name }}"' \
+          .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
+        # Show the resulting manifest changes in the step log.
+        git diff .github/eks-workflow-files/axlearn/axlearn-fuji-model.yml
+
+    - name: Submit & delete axlearn fuji model test
+      uses: ./.github/actions/submit-delete-k8s-job
+      with:
+        job-name: ${{ env.JOB_NAME }}
+        job-config-file: '.github/eks-workflow-files/axlearn/axlearn-fuji-model.yml'

From 4ade0f5eab0e7acd4bf8b40a9c50457485eeb5da Mon Sep 17 00:00:00 2001
From: Steboss <stefanobosisio1@gmail.com>
Date: Wed, 1 Apr 2026 10:23:08 +0200
Subject: [PATCH 15/15] fix action: prune BuildKit cache after Bazel cache export

---
 .github/actions/build-container/action.yml | 26 ++++++++++++----------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/.github/actions/build-container/action.yml b/.github/actions/build-container/action.yml
index e00ab5d35..b945ca10f 100644
--- a/.github/actions/build-container/action.yml
+++ b/.github/actions/build-container/action.yml
@@ -224,11 +224,7 @@ runs:
           ${{ inputs.EXTRA_BUILD_ARGS }}
 
     # BAZEL CACHE EXPORT
-    - name: Prune BuildKit cache to free space for export
-      if: inputs.ENABLE_BAZEL_DISK_CACHE == 'true' || inputs.ENABLE_BAZEL_REPO_CACHE == 'true'
-      shell: bash
-      run: docker buildx prune --force
-
+    # Snapshots are captured first; prune runs after to free space before upload.
     # type=tar streams a single archive instead of per-file copies
     - name: Export Bazel disk cache
       if: inputs.ENABLE_BAZEL_DISK_CACHE == 'true'
@@ -247,13 +243,6 @@ runs:
           BUILD_DATE=${{ inputs.BUILD_DATE }}
           ${{ inputs.EXTRA_BUILD_ARGS }}
 
-    - name: Save Bazel disk cache
-      if: inputs.ENABLE_BAZEL_DISK_CACHE == 'true'
-      uses: actions/cache/save@v4
-      with:
-        path: /tmp/bazel-disk.tar
-        key: bazel-disk-cache-${{ inputs.ARCHITECTURE }}-${{ github.run_id }}
-
     - name: Export Bazel repo cache
       if: inputs.ENABLE_BAZEL_REPO_CACHE == 'true'
       uses: docker/build-push-action@v5
@@ -269,6 +258,19 @@ runs:
           BUILD_DATE=${{ inputs.BUILD_DATE }}
           ${{ inputs.EXTRA_BUILD_ARGS }}
 
+    # Prune the BuildKit cache only after both export steps have run, freeing disk space before the cache upload
+    - name: Prune BuildKit layer cache before upload
+      if: inputs.ENABLE_BAZEL_DISK_CACHE == 'true' || inputs.ENABLE_BAZEL_REPO_CACHE == 'true'
+      shell: bash
+      run: docker buildx prune --force  # --force: do not prompt for confirmation
+
+    - name: Save Bazel disk cache
+      if: inputs.ENABLE_BAZEL_DISK_CACHE == 'true'
+      uses: actions/cache/save@v4
+      with:
+        path: /tmp/bazel-disk.tar  # presumably the archive from "Export Bazel disk cache" — verify
+        key: bazel-disk-cache-${{ inputs.ARCHITECTURE }}-${{ github.run_id }}  # one key per architecture and workflow run
+
     - name: Save Bazel repo cache
       if: inputs.ENABLE_BAZEL_REPO_CACHE == 'true'
       uses: actions/cache/save@v4