@@ -62,7 +62,7 @@ inputs:
6262 required : false
6363 default : ' exit \$EXIT_CODE'
6464 type : string
65- WORKLOAD_NAME_PREFIX :
65+ JOBSET_NAME_PREFIX :
6666 description : ' Workload name prefix for XPK, also used to name uploaded artifact'
6767 required : false
6868 default : ' xpk'
@@ -113,11 +113,11 @@ runs:
113113 if : steps.check.outputs.online == 'true'
114114 shell : bash -x -e -u {0}
115115 run : |
116- WORKLOAD_NAME ="${{ inputs.WORKLOAD_NAME_PREFIX }}-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
116+ JOBSET_NAME ="${{ inputs.JOBSET_NAME_PREFIX }}-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
117117 DATE=$(date +'%Y-%m-%d')
118- GCS_ARTIFACT_PATH="gs://${{ inputs.GCS_BUCKET }}/${{ inputs.WORKLOAD_NAME_PREFIX }}/${DATE}/${WORKLOAD_NAME }"
118+ GCS_ARTIFACT_PATH="gs://${{ inputs.GCS_BUCKET }}/${{ inputs.JOBSET_NAME_PREFIX }}/${DATE}/${JOBSET_NAME }"
119119
120- echo "WORKLOAD_NAME =${WORKLOAD_NAME }" >> ${GITHUB_ENV}
120+ echo "JOBSET_NAME =${JOBSET_NAME }" >> ${GITHUB_ENV}
121121 echo "DATE=${DATE}" >> ${GITHUB_ENV}
122122 echo "GCS_ARTIFACT_PATH=${GCS_ARTIFACT_PATH}" >> ${GITHUB_ENV}
123123
@@ -146,27 +146,27 @@ runs:
146146 if : steps.check.outputs.online == 'true'
147147 shell : bash -x -e -u {0}
148148 run : |
149- mkdir -p ${WORKLOAD_NAME }
150- uv venv --verbose --python=${{ inputs.XPK_PYTHON }} --directory=${WORKLOAD_NAME }
151- source ${WORKLOAD_NAME }/.venv/bin/activate
149+ mkdir -p ${JOBSET_NAME }
150+ uv venv --verbose --python=${{ inputs.XPK_PYTHON }} --directory=${JOBSET_NAME }
151+ source ${JOBSET_NAME }/.venv/bin/activate
152152
153- git clone --depth=1 --branch=${{ inputs.XPK_VERSION }} https://github.com/AI-Hypercomputer/xpk.git ${WORKLOAD_NAME }/xpk
153+ git clone --depth=1 --branch=${{ inputs.XPK_VERSION }} https://github.com/AI-Hypercomputer/xpk.git ${JOBSET_NAME }/xpk
154154
155155 # apply XPK workload patch
156156 PATCH_PATH=.github/gke-workflow/xpk/${{ inputs.XPK_VERSION}}
157- ls ${PATCH_PATH}/*.patch | xargs -I {} git apply --unsafe-paths {} --directory ${WORKLOAD_NAME }/xpk
157+ ls ${PATCH_PATH}/*.patch | xargs -I {} git apply --unsafe-paths {} --directory ${JOBSET_NAME }/xpk
158158
159159 # install xpk
160- sed 's@pip install -e \.@'$(which uv)' pip install \.@g' -i ${WORKLOAD_NAME }/xpk/Makefile
161- cd ${WORKLOAD_NAME }/xpk && sudo make pip-install; cd -
160+ sed 's@pip install -e \.@'$(which uv)' pip install \.@g' -i ${JOBSET_NAME }/xpk/Makefile
161+ cd ${JOBSET_NAME }/xpk && sudo make pip-install; cd -
162162
163163 - name : Show environment
164164 if : steps.check.outputs.online == 'true'
165165 shell : bash -x -e -u {0}
166166 run : |
167167 gcloud version
168168
169- source ${WORKLOAD_NAME }/.venv/bin/activate
169+ source ${JOBSET_NAME }/.venv/bin/activate
170170 python --version
171171 xpk version
172172
@@ -209,14 +209,14 @@ runs:
209209 if : steps.check.outputs.online == 'true'
210210 shell : bash -x -e -u {0}
211211 run : |
212- source ${WORKLOAD_NAME }/.venv/bin/activate
213- cd ${WORKLOAD_NAME }/xpk
212+ source ${JOBSET_NAME }/.venv/bin/activate
213+ cd ${JOBSET_NAME }/xpk
214214
215215 args=(
216216 --project=${{ inputs.GCP_PROJECT }}
217217 --cluster=${{ inputs.GKE_CLUSTER }}
218218 --zone=${{ inputs.GCP_REGION }}
219- --workload=${WORKLOAD_NAME }
219+ --workload=${JOBSET_NAME }
220220 --docker-image=${{ inputs.IMAGE }}
221221 --device-type=${{ inputs.CLUSTER_DEVICE }}
222222 --num-nodes=${{ inputs.NUM_NODES }}
@@ -256,61 +256,70 @@ runs:
256256 fi
257257 ${XPK_COMMAND} workload create ${args[@]} --command="${CMD}"
258258
259+ - name : Show JobSet manifest
260+ if : steps.check.outputs.online == 'true'
261+ shell : bash -ux {0}
262+ run : |
263+ kubectl get jobset/${JOBSET_NAME} -o yaml
264+
259265 - name : Wait for JobSet to unsuspend on cluster
260266 if : steps.check.outputs.online == 'true'
261267 shell : bash -u {0}
262268 env :
263- POLL_TIMEOUT : 3600
269+ POLL_TIMEOUT : 10800
264270 run : |
271+ set -euo pipefail
272+
265273 START=$(date +%s)
266- JOBSET_ACTIVE=false
267- while ! ${JOBSET_ACTIVE} || [ -z ${JOBSET_ACTIVE} ]; do
268- JOBSET_ACTIVE=$(kubectl get jobset -o json | jq -r '.items[] | select(.metadata.name == "'${WORKLOAD_NAME}'").status.replicatedJobsStatus[0] | .active == 1')
274+ JOBSET_ACTIVE=""
275+
276+ while [[ "${JOBSET_ACTIVE:-}" != "true" ]]; do
277+ JOBSET_ACTIVE="$(
278+ kubectl get jobset -o json | jq -r '
279+ .items[]
280+ | select(.metadata.name == "'"${JOBSET_NAME}"'")
281+ | .status.replicatedJobsStatus[0].active == 1
282+ '
283+ )"
284+
269285 NOW=$(date +%s)
270286 ELAPSED=$(( NOW - START ))
271- if (( ELAPSED > POLL_TIMEOUT )) ; then
272- echo "Timeout after waiting for JobSet ${WORKLOAD_NAME } to become active in cluster ${{ inputs.GKE_CLUSTER }} "
287+ if (( ELAPSED > POLL_TIMEOUT )); then
288+ echo "Timeout after waiting for JobSet ${JOBSET_NAME } to become active in the cluster "
273289 exit 1
274290 fi
275- echo "Waiting for JobSet ${WORKLOAD_NAME} to become active in cluster ${{ inputs.GKE_CLUSTER }}"
276- sleep 5
291+
292+ echo "Waiting for JobSet ${JOBSET_NAME} to become active in the cluster"
293+ sleep 10
277294 done
295+
296+ echo "JobSet ${JOBSET_NAME} has just become active in the cluster"
278297
279- echo "JobSet ${WORKLOAD_NAME} has just become active in cluster ${{ inputs.GKE_CLUSTER }}"
280-
281- - name : Set JobSet Pod name
298+ - name: Set JobSet Pods
282299 if : steps.check.outputs.online == 'true'  # same online-cluster guard used by the surrounding steps
283300 shell: bash -u {0}
284301 run: |
285- echo "POD= $(kubectl get pods -o json | jq -r '.items[] | select(.metadata. labels."' jobset.sigs.k8s.io/jobset-name' " == "'${WORKLOAD_NAME }'") | .metadata. name ' | sort | head -n1 )" >> ${GITHUB_ENV}
302+ echo "JOBSET_PODS=( $(kubectl get pods -o json | jq -r ' .items[].metadata | select(.labels."jobset.sigs.k8s.io/jobset-name" == "'${JOBSET_NAME}'") | .name' | tr '\n' ' ') )" >> ${GITHUB_ENV}
286302
287- - name : Wait for JobSet Pod readiness
303+ - name : Wait for JobSet readiness
288304 if : steps.check.outputs.online == 'true'
289- shell : bash -u {0}
305+ shell : bash
290306 run : |
291- POD_READY=false
292- while ! ${POD_READY} || [ -z ${POD_READY} ]; do
293- echo "Waiting for pod ${POD} in JobSet ${WORKLOAD_NAME} to become ready"
294- sleep 10
307+ set -euo pipefail
295308
296- POD_ERROR=$(kubectl get pod ${POD} -o json | jq -r '.status.containerStatuses[]? | select(.name == "'${{ inputs.MAIN_CONTAINER_NAME }}'") | .state | ( has("terminated") and (.terminated.reason == "Error" ))')
297- if ${POD_ERROR} ; then
298- echo "There was an issue starting the JobSet ${WORKLOAD_NAME} on ${{ inputs.GKE_CLUSTER }}"
299- break
300- fi
301-
302- POD_READY=$(kubectl get pod ${POD} -o json | jq -r '.status.containerStatuses[]? | select(.name == "'${{ inputs.MAIN_CONTAINER_NAME }}'").ready')
303- done;
309+ for jobset_pod in ${JOBSET_PODS//[()]/}; do
310+ [[ -z "$jobset_pod" ]] && continue
311+ echo "Waiting for pod $jobset_pod in JobSet $JOBSET_NAME to become ready"
312+ kubectl wait --for=condition=Ready "pod/$jobset_pod" --timeout=30m
313+ done
304314
305315 - name : Stream logs from JobSet Pods
306316 if : steps.check.outputs.online == 'true'
307317 shell : bash -u {0}
308318 run : |
309- JOBSET_PODS=($(kubectl get pods -o json | jq -r '.items[].metadata | select(.labels."jobset.sigs.k8s.io/jobset-name" == "'${WORKLOAD_NAME}'") | .name' | tr '\n' ' '))
310- echo "JOBSET_PODS=${JOBSET_PODS[@]}" >> ${GITHUB_ENV}
319+ mkdir -p ${JOBSET_NAME}
311320
312- for jobset_pod in ${JOBSET_PODS[@] }; do
313- kubectl logs --pod-running-timeout=1m -f --prefix=true --timestamps=true -c gpu-image ${jobset_pod} 2>&1 | tee -a ${WORKLOAD_NAME }/${jobset_pod}.log &
321+ for jobset_pod in ${JOBSET_PODS//[()]/ }; do
322+ kubectl logs --pod-running-timeout=1m -f --prefix=true --timestamps=true -c gpu-image ${jobset_pod} 2>&1 | tee -a ${JOBSET_NAME }/${jobset_pod}.log &
314323 done
315324 wait < <(jobs -p)
316325
@@ -320,28 +329,28 @@ runs:
320329 run : |
321330 parse_pod_exit_code() {
322331 local pod=$1
323- MAYBE_XPK_EXIT_CODE ="$(tail -n 1 ${WORKLOAD_NAME }/${pod}.log | awk '{ print $3 }' )"
324- echo ${MAYBE_XPK_EXIT_CODE } | grep -E 'EXIT\_CODE=[0-9]+$' > /dev/null
332+ MAYBE_JOBSET_EXIT_CODE ="$(cat ${JOBSET_NAME }/${pod}.log | grep -oE 'EXIT\_CODE=[0-9]+$' )"
333+ echo ${MAYBE_JOBSET_EXIT_CODE } | grep -E 'EXIT\_CODE=[0-9]+$' > /dev/null
325334
326335 if [ $? -ne 0 ]; then
327- echo "The JobSet ${WORKLOAD_NAME} on ${{ inputs.GKE_CLUSTER }} did not complete as expected "
328- echo "XPK_EXIT_CODE =1" >> ${GITHUB_ENV}
336+ echo "The JobSet ${JOBSET_NAME} did not complete as expected " >&2
337+ echo "JOBSET_EXIT_CODE =1" >> ${GITHUB_ENV}
329338 exit 1
330339 fi
331340
332- echo "Pod ${pod} exited with ${MAYBE_XPK_EXIT_CODE }" >&2
341+ echo "Pod ${pod} exited with ${MAYBE_JOBSET_EXIT_CODE }" >&2
333342
334- eval "export ${MAYBE_XPK_EXIT_CODE }"
343+ eval "export ${MAYBE_JOBSET_EXIT_CODE }"
335344 echo ${EXIT_CODE}
336345 }
337346
338347 ALL_EXIT_CODES=0
339- for jobset_pod in ${JOBSET_PODS[@] }; do
348+ for jobset_pod in ${JOBSET_PODS//[()]/ }; do
340349 POD_EXIT_CODE=$(parse_pod_exit_code ${jobset_pod})
341350 ALL_EXIT_CODES=$(( ALL_EXIT_CODES + POD_EXIT_CODE ))
342351 done
343352
344- echo "XPK_EXIT_CODE =${ALL_EXIT_CODES}" >> ${GITHUB_ENV}
353+ echo "JOBSET_EXIT_CODE =${ALL_EXIT_CODES}" >> ${GITHUB_ENV}
345354 if [ ${ALL_EXIT_CODES} -gt 0 ]; then
346355 exit 1
347356 fi
@@ -351,28 +360,28 @@ runs:
351360 if : steps.check.outputs.online == 'true'
352361 shell : bash -x -u {0}
353362 run : |
354- kubectl delete jobset --wait ${WORKLOAD_NAME } || echo "JobSet ${WORKLOAD_NAME } does not exist in ${{ inputs.GKE_CLUSTER }} "
363+ kubectl delete jobset --wait ${JOBSET_NAME } || echo "JobSet ${JOBSET_NAME } does not exist"
355364
356365 - name : Download artifacts from GCS to runner
357366 if : steps.check.outputs.online == 'true'
358367 shell : bash -x -u {0}
359368 run : |
360- mkdir -p ${WORKLOAD_NAME }/output
361- gsutil cp -r ${GCS_ARTIFACT_PATH} ${WORKLOAD_NAME }/output
362- cp ${WORKLOAD_NAME }/*.log ${WORKLOAD_NAME }/output
369+ mkdir -p ${JOBSET_NAME }/output
370+ gsutil cp -r ${GCS_ARTIFACT_PATH} ${JOBSET_NAME }/output
371+ cp ${JOBSET_NAME }/*.log ${JOBSET_NAME }/output
363372
364373 - name : Upload artifacts to GitHub Actions from runner
365374 if : steps.check.outputs.online == 'true'
366375 uses : actions/upload-artifact@v6
367376 with :
368- name : ${{ inputs.WORKLOAD_NAME_PREFIX }}
369- path : ${{ env.WORKLOAD_NAME }}/output/*
377+ name : ${{ inputs.JOBSET_NAME_PREFIX }}
378+ path : ${{ env.JOBSET_NAME }}/output/*
370379
371380 - name : Clean up xpk environment from runner
372381 if : steps.check.outputs.online == 'true'
373382 shell : bash -x -u {0}
374383 run : |
375- sudo rm -rf ${WORKLOAD_NAME }
384+ sudo rm -rf ${JOBSET_NAME }
376385
377386 - name : Generate sitrep
378387 if : steps.check.outputs.online == 'true'
@@ -382,9 +391,9 @@ runs:
382391 source .github/workflows/scripts/to_json.sh
383392 badge_label="${{ matrix.test }}"
384393
385- summary="${{ inputs.WORKLOAD_NAME_PREFIX }}"
394+ summary="${{ inputs.JOBSET_NAME_PREFIX }}"
386395 outcome=success
387- badge_label="${{ inputs.WORKLOAD_NAME_PREFIX }}"
396+ badge_label="${{ inputs.JOBSET_NAME_PREFIX }}"
388397 badge_color=brightgreen
389398
390- if [ "${XPK_EXIT_CODE}" -gt 0 ]; then
399+ if [ "${JOBSET_EXIT_CODE}" -gt 0 ]; then  # the exit-code step now writes JOBSET_EXIT_CODE (not XPK_EXIT_CODE) to GITHUB_ENV
@@ -405,6 +414,6 @@ runs:
405414 if : steps.check.outputs.online == 'true'
406415 uses : actions/upload-artifact@v6
407416 with :
408- name : ${{ inputs.WORKLOAD_NAME_PREFIX }}-sitrep
417+ name : ${{ inputs.JOBSET_NAME_PREFIX }}-sitrep
409418 path : |
410419 sitrep.json
0 commit comments