Skip to content

Commit 3d53140

Browse files
committed
Align GKE with EKS actions
1 parent f0603fc commit 3d53140

3 files changed

Lines changed: 79 additions & 70 deletions

File tree

.github/actions/gke-xpk/action.yml

Lines changed: 73 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ inputs:
6262
required: false
6363
default: 'exit \$EXIT_CODE'
6464
type: string
65-
WORKLOAD_NAME_PREFIX:
65+
JOBSET_NAME_PREFIX:
6666
description: 'Workload name prefix for XPK, also used to name uploaded artifact'
6767
required: false
6868
default: 'xpk'
@@ -113,11 +113,11 @@ runs:
113113
if: steps.check.outputs.online == 'true'
114114
shell: bash -x -e -u {0}
115115
run: |
116-
WORKLOAD_NAME="${{ inputs.WORKLOAD_NAME_PREFIX }}-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
116+
JOBSET_NAME="${{ inputs.JOBSET_NAME_PREFIX }}-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
117117
DATE=$(date +'%Y-%m-%d')
118-
GCS_ARTIFACT_PATH="gs://${{ inputs.GCS_BUCKET }}/${{ inputs.WORKLOAD_NAME_PREFIX }}/${DATE}/${WORKLOAD_NAME}"
118+
GCS_ARTIFACT_PATH="gs://${{ inputs.GCS_BUCKET }}/${{ inputs.JOBSET_NAME_PREFIX }}/${DATE}/${JOBSET_NAME}"
119119
120-
echo "WORKLOAD_NAME=${WORKLOAD_NAME}" >> ${GITHUB_ENV}
120+
echo "JOBSET_NAME=${JOBSET_NAME}" >> ${GITHUB_ENV}
121121
echo "DATE=${DATE}" >> ${GITHUB_ENV}
122122
echo "GCS_ARTIFACT_PATH=${GCS_ARTIFACT_PATH}" >> ${GITHUB_ENV}
123123
@@ -146,27 +146,27 @@ runs:
146146
if: steps.check.outputs.online == 'true'
147147
shell: bash -x -e -u {0}
148148
run: |
149-
mkdir -p ${WORKLOAD_NAME}
150-
uv venv --verbose --python=${{ inputs.XPK_PYTHON }} --directory=${WORKLOAD_NAME}
151-
source ${WORKLOAD_NAME}/.venv/bin/activate
149+
mkdir -p ${JOBSET_NAME}
150+
uv venv --verbose --python=${{ inputs.XPK_PYTHON }} --directory=${JOBSET_NAME}
151+
source ${JOBSET_NAME}/.venv/bin/activate
152152
153-
git clone --depth=1 --branch=${{ inputs.XPK_VERSION }} https://github.com/AI-Hypercomputer/xpk.git ${WORKLOAD_NAME}/xpk
153+
git clone --depth=1 --branch=${{ inputs.XPK_VERSION }} https://github.com/AI-Hypercomputer/xpk.git ${JOBSET_NAME}/xpk
154154
155155
# apply XPK workload patch
156156
PATCH_PATH=.github/gke-workflow/xpk/${{ inputs.XPK_VERSION}}
157-
ls ${PATCH_PATH}/*.patch | xargs -I {} git apply --unsafe-paths {} --directory ${WORKLOAD_NAME}/xpk
157+
ls ${PATCH_PATH}/*.patch | xargs -I {} git apply --unsafe-paths {} --directory ${JOBSET_NAME}/xpk
158158
159159
# install xpk
160-
sed 's@pip install -e \.@'$(which uv)' pip install \.@g' -i ${WORKLOAD_NAME}/xpk/Makefile
161-
cd ${WORKLOAD_NAME}/xpk && sudo make pip-install; cd -
160+
sed 's@pip install -e \.@'$(which uv)' pip install \.@g' -i ${JOBSET_NAME}/xpk/Makefile
161+
cd ${JOBSET_NAME}/xpk && sudo make pip-install; cd -
162162
163163
- name: Show environment
164164
if: steps.check.outputs.online == 'true'
165165
shell: bash -x -e -u {0}
166166
run: |
167167
gcloud version
168168
169-
source ${WORKLOAD_NAME}/.venv/bin/activate
169+
source ${JOBSET_NAME}/.venv/bin/activate
170170
python --version
171171
xpk version
172172
@@ -209,14 +209,14 @@ runs:
209209
if: steps.check.outputs.online == 'true'
210210
shell: bash -x -e -u {0}
211211
run: |
212-
source ${WORKLOAD_NAME}/.venv/bin/activate
213-
cd ${WORKLOAD_NAME}/xpk
212+
source ${JOBSET_NAME}/.venv/bin/activate
213+
cd ${JOBSET_NAME}/xpk
214214
215215
args=(
216216
--project=${{ inputs.GCP_PROJECT }}
217217
--cluster=${{ inputs.GKE_CLUSTER }}
218218
--zone=${{ inputs.GCP_REGION }}
219-
--workload=${WORKLOAD_NAME}
219+
--workload=${JOBSET_NAME}
220220
--docker-image=${{ inputs.IMAGE }}
221221
--device-type=${{ inputs.CLUSTER_DEVICE }}
222222
--num-nodes=${{ inputs.NUM_NODES }}
@@ -256,61 +256,70 @@ runs:
256256
fi
257257
${XPK_COMMAND} workload create ${args[@]} --command="${CMD}"
258258
259+
- name: Show JobSet manifest
260+
if: steps.check.outputs.online == 'true'
261+
shell: bash -ux {0}
262+
run: |
263+
kubectl get jobset/${JOBSET_NAME} -o yaml
264+
259265
- name: Wait for JobSet to unsuspend on cluster
260266
if: steps.check.outputs.online == 'true'
261267
shell: bash -u {0}
262268
env:
263-
POLL_TIMEOUT: 3600
269+
POLL_TIMEOUT: 10800
264270
run: |
271+
set -euo pipefail
272+
265273
START=$(date +%s)
266-
JOBSET_ACTIVE=false
267-
while ! ${JOBSET_ACTIVE} || [ -z ${JOBSET_ACTIVE} ]; do
268-
JOBSET_ACTIVE=$(kubectl get jobset -o json | jq -r '.items[] | select(.metadata.name == "'${WORKLOAD_NAME}'").status.replicatedJobsStatus[0] | .active == 1')
274+
JOBSET_ACTIVE=""
275+
276+
while [[ "${JOBSET_ACTIVE:-}" != "true" ]]; do
277+
JOBSET_ACTIVE="$(
278+
kubectl get jobset -o json | jq -r '
279+
.items[]
280+
| select(.metadata.name == "'"${JOBSET_NAME}"'")
281+
| .status.replicatedJobsStatus[0].active == 1
282+
'
283+
)"
284+
269285
NOW=$(date +%s)
270286
ELAPSED=$(( NOW - START ))
271-
if (( ELAPSED > POLL_TIMEOUT )) ; then
272-
echo "Timeout after waiting for JobSet ${WORKLOAD_NAME} to become active in cluster ${{ inputs.GKE_CLUSTER }}"
287+
if (( ELAPSED > POLL_TIMEOUT )); then
288+
echo "Timeout after waiting for JobSet ${JOBSET_NAME} to become active in the cluster"
273289
exit 1
274290
fi
275-
echo "Waiting for JobSet ${WORKLOAD_NAME} to become active in cluster ${{ inputs.GKE_CLUSTER }}"
276-
sleep 5
291+
292+
echo "Waiting for JobSet ${JOBSET_NAME} to become active in the cluster"
293+
sleep 10
277294
done
295+
296+
echo "JobSet ${JOBSET_NAME} has just become active in the cluster"
278297
279-
echo "JobSet ${WORKLOAD_NAME} has just become active in cluster ${{ inputs.GKE_CLUSTER }}"
280-
281-
- name: Set JobSet Pod name
282-
if: steps.check.outputs.online == 'true'
298+
- name: Set JobSet Pods
283299
shell: bash -u {0}
284300
run: |
285-
echo "POD=$(kubectl get pods -o json | jq -r '.items[] | select(.metadata.labels."'jobset.sigs.k8s.io/jobset-name'" == "'${WORKLOAD_NAME}'") | .metadata.name ' | sort | head -n1 )" >> ${GITHUB_ENV}
301+
echo "JOBSET_PODS=($(kubectl get pods -o json | jq -r '.items[].metadata | select(.labels."jobset.sigs.k8s.io/jobset-name" == "'${JOBSET_NAME}'") | .name' | tr '\n' ' '))" >> ${GITHUB_ENV}
286302

287-
- name: Wait for JobSet Pod readiness
303+
- name: Wait for JobSet readiness
288304
if: steps.check.outputs.online == 'true'
289-
shell: bash -u {0}
305+
shell: bash
290306
run: |
291-
POD_READY=false
292-
while ! ${POD_READY} || [ -z ${POD_READY} ]; do
293-
echo "Waiting for pod ${POD} in JobSet ${WORKLOAD_NAME} to become ready"
294-
sleep 10
307+
set -euo pipefail
295308
296-
POD_ERROR=$(kubectl get pod ${POD} -o json | jq -r '.status.containerStatuses[]? | select(.name == "'${{ inputs.MAIN_CONTAINER_NAME }}'") | .state | ( has("terminated") and (.terminated.reason == "Error" ))')
297-
if ${POD_ERROR} ; then
298-
echo "There was an issue starting the JobSet ${WORKLOAD_NAME} on ${{ inputs.GKE_CLUSTER }}"
299-
break
300-
fi
301-
302-
POD_READY=$(kubectl get pod ${POD} -o json | jq -r '.status.containerStatuses[]? | select(.name == "'${{ inputs.MAIN_CONTAINER_NAME }}'").ready')
303-
done;
309+
for jobset_pod in ${JOBSET_PODS//[()]/}; do
310+
[[ -z "$jobset_pod" ]] && continue
311+
echo "Waiting for pod $jobset_pod in JobSet $JOBSET_NAME to become ready"
312+
kubectl wait --for=condition=Ready "pod/$jobset_pod" --timeout=30m
313+
done
304314
305315
- name: Stream logs from JobSet Pods
306316
if: steps.check.outputs.online == 'true'
307317
shell: bash -u {0}
308318
run: |
309-
JOBSET_PODS=($(kubectl get pods -o json | jq -r '.items[].metadata | select(.labels."jobset.sigs.k8s.io/jobset-name" == "'${WORKLOAD_NAME}'") | .name' | tr '\n' ' '))
310-
echo "JOBSET_PODS=${JOBSET_PODS[@]}" >> ${GITHUB_ENV}
319+
mkdir -p ${JOBSET_NAME}
311320
312-
for jobset_pod in ${JOBSET_PODS[@]}; do
313-
kubectl logs --pod-running-timeout=1m -f --prefix=true --timestamps=true -c gpu-image ${jobset_pod} 2>&1 | tee -a ${WORKLOAD_NAME}/${jobset_pod}.log &
321+
for jobset_pod in ${JOBSET_PODS//[()]/}; do
322+
kubectl logs --pod-running-timeout=1m -f --prefix=true --timestamps=true -c gpu-image ${jobset_pod} 2>&1 | tee -a ${JOBSET_NAME}/${jobset_pod}.log &
314323
done
315324
wait < <(jobs -p)
316325
@@ -320,28 +329,28 @@ runs:
320329
run: |
321330
parse_pod_exit_code() {
322331
local pod=$1
323-
MAYBE_XPK_EXIT_CODE="$(tail -n 1 ${WORKLOAD_NAME}/${pod}.log | awk '{ print $3 }' )"
324-
echo ${MAYBE_XPK_EXIT_CODE} | grep -E 'EXIT\_CODE=[0-9]+$' > /dev/null
332+
MAYBE_JOBSET_EXIT_CODE="$(cat ${JOBSET_NAME}/${pod}.log | grep -oE 'EXIT\_CODE=[0-9]+$')"
333+
echo ${MAYBE_JOBSET_EXIT_CODE} | grep -E 'EXIT\_CODE=[0-9]+$' > /dev/null
325334
326335
if [ $? -ne 0 ]; then
327-
echo "The JobSet ${WORKLOAD_NAME} on ${{ inputs.GKE_CLUSTER }} did not complete as expected "
328-
echo "XPK_EXIT_CODE=1" >> ${GITHUB_ENV}
336+
echo "The JobSet ${JOBSET_NAME} did not complete as expected " >&2
337+
echo "JOBSET_EXIT_CODE=1" >> ${GITHUB_ENV}
329338
exit 1
330339
fi
331340
332-
echo "Pod ${pod} exited with ${MAYBE_XPK_EXIT_CODE}" >&2
341+
echo "Pod ${pod} exited with ${MAYBE_JOBSET_EXIT_CODE}" >&2
333342
334-
eval "export ${MAYBE_XPK_EXIT_CODE}"
343+
eval "export ${MAYBE_JOBSET_EXIT_CODE}"
335344
echo ${EXIT_CODE}
336345
}
337346
338347
ALL_EXIT_CODES=0
339-
for jobset_pod in ${JOBSET_PODS[@]}; do
348+
for jobset_pod in ${JOBSET_PODS//[()]/}; do
340349
POD_EXIT_CODE=$(parse_pod_exit_code ${jobset_pod})
341350
ALL_EXIT_CODES=$(( ALL_EXIT_CODES + POD_EXIT_CODE ))
342351
done
343352
344-
echo "XPK_EXIT_CODE=${ALL_EXIT_CODES}" >> ${GITHUB_ENV}
353+
echo "JOBSET_EXIT_CODE=${ALL_EXIT_CODES}" >> ${GITHUB_ENV}
345354
if [ ${ALL_EXIT_CODES} -gt 0 ]; then
346355
exit 1
347356
fi
@@ -351,28 +360,28 @@ runs:
351360
if: steps.check.outputs.online == 'true'
352361
shell: bash -x -u {0}
353362
run: |
354-
kubectl delete jobset --wait ${WORKLOAD_NAME} || echo "JobSet ${WORKLOAD_NAME} does not exist in ${{ inputs.GKE_CLUSTER }}"
363+
kubectl delete jobset --wait ${JOBSET_NAME} || echo "JobSet ${JOBSET_NAME} does not exist"
355364
356365
- name: Download artifacts from GCS to runner
357366
if: steps.check.outputs.online == 'true'
358367
shell: bash -x -u {0}
359368
run: |
360-
mkdir -p ${WORKLOAD_NAME}/output
361-
gsutil cp -r ${GCS_ARTIFACT_PATH} ${WORKLOAD_NAME}/output
362-
cp ${WORKLOAD_NAME}/*.log ${WORKLOAD_NAME}/output
369+
mkdir -p ${JOBSET_NAME}/output
370+
gsutil cp -r ${GCS_ARTIFACT_PATH} ${JOBSET_NAME}/output
371+
cp ${JOBSET_NAME}/*.log ${JOBSET_NAME}/output
363372
364373
- name: Upload artifacts to GitHub Actions from runner
365374
if: steps.check.outputs.online == 'true'
366375
uses: actions/upload-artifact@v6
367376
with:
368-
name: ${{ inputs.WORKLOAD_NAME_PREFIX }}
369-
path: ${{ env.WORKLOAD_NAME }}/output/*
377+
name: ${{ inputs.JOBSET_NAME_PREFIX }}
378+
path: ${{ env.JOBSET_NAME }}/output/*
370379

371380
- name: Clean up xpk environment from runner
372381
if: steps.check.outputs.online == 'true'
373382
shell: bash -x -u {0}
374383
run: |
375-
sudo rm -rf ${WORKLOAD_NAME}
384+
sudo rm -rf ${JOBSET_NAME}
376385
377386
- name: Generate sitrep
378387
if: steps.check.outputs.online == 'true'
@@ -382,9 +391,9 @@ runs:
382391
source .github/workflows/scripts/to_json.sh
383392
badge_label="${{ matrix.test }}"
384393
385-
summary="${{ inputs.WORKLOAD_NAME_PREFIX }}"
394+
summary="${{ inputs.JOBSET_NAME_PREFIX }}"
386395
outcome=success
387-
badge_label="${{ inputs.WORKLOAD_NAME_PREFIX }}"
396+
badge_label="${{ inputs.JOBSET_NAME_PREFIX }}"
388397
badge_color=brightgreen
389398
390399
if [ "${XPK_EXIT_CODE}" -gt 0 ]; then
@@ -405,6 +414,6 @@ runs:
405414
if: steps.check.outputs.online == 'true'
406415
uses: actions/upload-artifact@v6
407416
with:
408-
name: ${{ inputs.WORKLOAD_NAME_PREFIX }}-sitrep
417+
name: ${{ inputs.JOBSET_NAME_PREFIX }}-sitrep
409418
path: |
410419
sitrep.json

.github/workflows/_test_maxtext_gke_xpk.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ jobs:
1414
runs-on: gke-a3mega
1515

1616
env:
17-
WORKLOAD_NAME_PREFIX: gke-maxtext-train
17+
JOBSET_NAME_PREFIX: gke-maxtext-train
1818
MAXTEXT_MODEL: llama2-7b
1919
MAXTEXT_ATTENTION_TYPE: cudnn_flash_te
2020
MAXTEXT_REMAT_POLICY: minimal_flash
@@ -30,7 +30,7 @@ jobs:
3030
uses: ./.github/actions/gke-xpk
3131
with:
3232
IMAGE: ${{ env.MAXTEXT_IMAGE }}
33-
WORKLOAD_NAME_PREFIX: ${{ env.WORKLOAD_NAME_PREFIX }}
33+
JOBSET_NAME_PREFIX: ${{ env.JOBSET_NAME_PREFIX }}
3434
ENVS: |
3535
JAX_COORDINATOR_PORT=3389;
3636
JAX_COORDINATOR_ADDRESS=\$(JOBSET_NAME)-\$(REPLICATED_JOB_NAME)-0-0.\$(JOBSET_NAME):\$(JAX_COORDINATOR_PORT);

.github/workflows/_test_nccl_gke.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ jobs:
5252
env:
5353
BASE_IMAGE: ${{ needs.build-nccl-gke.outputs.DOCKER_TAG_FINAL }}
5454
TEST_NAME: ${{ matrix.test }}
55-
WORKLOAD_NAME_PREFIX: nccl-gke
55+
JOBSET_NAME_PREFIX: nccl-gke
5656
NHOSTS: 2
5757
NCCL_MINBYTES: 8
5858
NCCL_MAXBYTES: 16G
@@ -66,15 +66,15 @@ jobs:
6666
id: workload-name
6767
run: |
6868
TEST_NAME=$(echo "${{ matrix.test }}" | sed 's/_perf_mpi//g' | sed 's/_/-/g')
69-
WORKLOAD_PREFIX="${{ env.WORKLOAD_NAME_PREFIX }}-${TEST_NAME}"
69+
JOBSET_PREFIX="${{ env.JOBSET_NAME_PREFIX }}-${TEST_NAME}"
7070
71-
echo "WORKLOAD_PREFIX=${WORKLOAD_PREFIX}" >> ${GITHUB_OUTPUT}
71+
echo "JOBSET_PREFIX=${JOBSET_PREFIX}" >> ${GITHUB_OUTPUT}
7272
7373
- name: Run XPK workload on cluster
7474
uses: ./.github/actions/gke-xpk
7575
with:
7676
IMAGE: ${{ env.BASE_IMAGE }}
77-
WORKLOAD_NAME_PREFIX: ${{ steps.workload-name.outputs.WORKLOAD_PREFIX }}
77+
JOBSET_NAME_PREFIX: ${{ steps.workload-name.outputs.JOBSET_PREFIX }}
7878
ENVS: |
7979
JAX_COORDINATOR_PORT=3389;
8080
JAX_COORDINATOR_ADDRESS=\$(JOBSET_NAME)-\$(REPLICATED_JOB_NAME)-0-0.\$(JOBSET_NAME):\$(JAX_COORDINATOR_PORT);

0 commit comments

Comments
 (0)