diff --git a/.github/workflows/dex_oauth2-proxy_test.yaml b/.github/workflows/dex_oauth2-proxy_test.yaml index cdcd738580..f986e752f6 100644 --- a/.github/workflows/dex_oauth2-proxy_test.yaml +++ b/.github/workflows/dex_oauth2-proxy_test.yaml @@ -7,7 +7,7 @@ on: - common/cert-manager/** - common/oauth2-proxy/** - common/istio*/** - - experimental/security/PSS/* + - applications/profiles/pss/** - common/dex/base/** - tests/istio* - tests/dex_login_test.py diff --git a/.github/workflows/katib_test.yaml b/.github/workflows/katib_test.yaml index 921da769b1..0ad53f453b 100644 --- a/.github/workflows/katib_test.yaml +++ b/.github/workflows/katib_test.yaml @@ -3,13 +3,13 @@ on: pull_request: paths: - tests/install_KinD_create_KinD_cluster_install_kustomize.sh - - tests/katib_install.sh + - tests/katib* - .github/workflows/katib_test.yaml - applications/katib/upstream/** - common/istio*/** - tests/istio* - common/cert-manager/** - - experimental/security/PSS/* + - applications/profiles/pss/** permissions: contents: read diff --git a/.github/workflows/pipeline_run_from_notebook.yaml b/.github/workflows/pipeline_run_from_notebook.yaml index 32e524d543..3bece17b41 100644 --- a/.github/workflows/pipeline_run_from_notebook.yaml +++ b/.github/workflows/pipeline_run_from_notebook.yaml @@ -4,6 +4,7 @@ on: paths: - tests/install_KinD_create_KinD_cluster_install_kustomize.sh - .github/workflows/pipeline_run_from_notebook.yaml + - tests/pipeline* - applications/jupyter/notebook-controller/upstream/** - applications/pipeline/upstream/** - tests/istio* diff --git a/.github/workflows/pipeline_test.yaml b/.github/workflows/pipeline_test.yaml index 6c479db1b7..629da2ca0e 100644 --- a/.github/workflows/pipeline_test.yaml +++ b/.github/workflows/pipeline_test.yaml @@ -10,9 +10,8 @@ on: - common/cert-manager/** - common/oauth2-proxy/** - common/istio*/** - - tests/pipeline_v1_test.py - - tests/pipeline_v2_test.py - - experimental/security/PSS/* + - tests/pipeline* + - applications/profiles/pss/** permissions: contents: read diff --git a/.github/workflows/trainer_test.yaml b/.github/workflows/trainer_test.yaml index f23f28206b..0607de9c6a 100644 --- a/.github/workflows/trainer_test.yaml +++ b/.github/workflows/trainer_test.yaml @@ -11,7 +11,7 @@ on: - common/cert-manager/** - common/oauth2-proxy/** - common/istio*/** - - experimental/security/PSS/* + - applications/profiles/pss/** permissions: contents: read diff --git a/.github/workflows/training_operator_test.yaml b/.github/workflows/training_operator_test.yaml index 5d05f0f80e..c63be238ab 100644 --- a/.github/workflows/training_operator_test.yaml +++ b/.github/workflows/training_operator_test.yaml @@ -11,7 +11,7 @@ on: - common/cert-manager/** - common/oauth2-proxy/** - common/istio*/** - - experimental/security/PSS/* + - applications/profiles/pss/** permissions: contents: read diff --git a/applications/katib/upstream/installs/katib-standalone/katib-config.yaml b/applications/katib/upstream/installs/katib-standalone/katib-config.yaml index 666fbc87f3..55fe20a586 100644 --- a/applications/katib/upstream/installs/katib-standalone/katib-config.yaml +++ b/applications/katib/upstream/installs/katib-standalone/katib-config.yaml @@ -5,6 +5,7 @@ init: enable: true controller: webhookPort: 8443 + injectSecurityContext: true trialResources: - TrainJob.v1alpha1.trainer.kubeflow.org - Job.v1.batch @@ -16,39 +17,209 @@ runtime: metricsCollectors: - kind: StdOut image: ghcr.io/kubeflow/katib/file-metrics-collector:v0.19.0 + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsNonRoot: true + runAsUser: 1001 + seccompProfile: + type: RuntimeDefault - kind: File image: ghcr.io/kubeflow/katib/file-metrics-collector:v0.19.0 + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsNonRoot: true + runAsUser: 1001 + seccompProfile: + type: RuntimeDefault - kind: TensorFlowEvent image: ghcr.io/kubeflow/katib/tfevent-metrics-collector:v0.19.0 resources: limits: memory: 1Gi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsNonRoot: true + runAsUser: 1001 + seccompProfile: + type: RuntimeDefault suggestions: - algorithmName: random image: ghcr.io/kubeflow/katib/suggestion-hyperopt:v0.19.0 + podSecurityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsNonRoot: true + runAsUser: 1001 + seccompProfile: + type: RuntimeDefault - algorithmName: tpe image: ghcr.io/kubeflow/katib/suggestion-hyperopt:v0.19.0 + podSecurityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsNonRoot: true + runAsUser: 1001 + seccompProfile: + type: RuntimeDefault - algorithmName: grid image: ghcr.io/kubeflow/katib/suggestion-optuna:v0.19.0 + podSecurityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsNonRoot: true + runAsUser: 1001 + seccompProfile: + type: RuntimeDefault - algorithmName: hyperband image: ghcr.io/kubeflow/katib/suggestion-hyperband:v0.19.0 + podSecurityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsNonRoot: true + runAsUser: 1001 + seccompProfile: + type: RuntimeDefault - algorithmName: bayesianoptimization image: ghcr.io/kubeflow/katib/suggestion-skopt:v0.19.0 + podSecurityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsNonRoot: true + runAsUser: 1001 + seccompProfile: + type: RuntimeDefault - algorithmName: cmaes image: ghcr.io/kubeflow/katib/suggestion-goptuna:v0.19.0 + podSecurityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsNonRoot: true + runAsUser: 1001 + seccompProfile: + type: RuntimeDefault - algorithmName: sobol image: ghcr.io/kubeflow/katib/suggestion-goptuna:v0.19.0 + podSecurityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsNonRoot: true + runAsUser: 1001 + seccompProfile: + type: RuntimeDefault - algorithmName: multivariate-tpe image: ghcr.io/kubeflow/katib/suggestion-optuna:v0.19.0 + podSecurityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsNonRoot: true + runAsUser: 1001 + seccompProfile: + type: RuntimeDefault - algorithmName: enas image: ghcr.io/kubeflow/katib/suggestion-enas:v0.19.0 resources: limits: memory: 400Mi + podSecurityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsNonRoot: true + runAsUser: 1001 + seccompProfile: + type: RuntimeDefault - algorithmName: darts image: ghcr.io/kubeflow/katib/suggestion-darts:v0.19.0 + podSecurityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsNonRoot: true + runAsUser: 1001 + seccompProfile: + type: RuntimeDefault - algorithmName: pbt image: ghcr.io/kubeflow/katib/suggestion-pbt:v0.19.0 + podSecurityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsNonRoot: true + runAsUser: 1001 + seccompProfile: + type: RuntimeDefault persistentVolumeClaimSpec: accessModes: - ReadWriteMany @@ -58,3 +229,16 @@ runtime: earlyStoppings: - algorithmName: medianstop image: ghcr.io/kubeflow/katib/earlystopping-medianstop:v0.19.0 + podSecurityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsNonRoot: true + runAsUser: 1001 + seccompProfile: + type: RuntimeDefault diff --git a/applications/katib/upstream/installs/katib-standalone/kustomization.yaml b/applications/katib/upstream/installs/katib-standalone/kustomization.yaml index 166c7bd002..72ecfee8bf 100644 --- a/applications/katib/upstream/installs/katib-standalone/kustomization.yaml +++ b/applications/katib/upstream/installs/katib-standalone/kustomization.yaml @@ -39,3 +39,4 @@ secretGenerator: - name: katib-webhook-cert options: disableNameSuffixHash: true + type: kubernetes.io/tls diff --git a/applications/pipeline/upstream/third-party/argo/base/workflow-controller-configmap-patch.yaml b/applications/pipeline/upstream/third-party/argo/base/workflow-controller-configmap-patch.yaml index 7294451962..4d526c9acb 100644 --- a/applications/pipeline/upstream/third-party/argo/base/workflow-controller-configmap-patch.yaml +++ b/applications/pipeline/upstream/third-party/argo/base/workflow-controller-configmap-patch.yaml @@ -48,3 +48,13 @@ data: - ALL seccompProfile: type: RuntimeDefault + container: | + securityContext: + runAsNonRoot: true + runAsUser: 1000 + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + seccompProfile: + type: RuntimeDefault diff --git a/applications/profiles/pss/namespace-labels.yaml b/applications/profiles/pss/namespace-labels.yaml index 08f6690272..7d1fc7d114 100644 --- a/applications/profiles/pss/namespace-labels.yaml +++ b/applications/profiles/pss/namespace-labels.yaml @@ -20,4 +20,4 @@ katib.kubeflow.org/metrics-collector-injection: "enabled" serving.kubeflow.org/inferenceservice: "enabled" pipelines.kubeflow.org/enabled: "true" app.kubernetes.io/part-of: "kubeflow-profile" -pod-security.kubernetes.io/enforce: "baseline" +pod-security.kubernetes.io/enforce: "restricted" diff --git a/applications/trainer/upstream/base/runtimes/torch_distributed.yaml b/applications/trainer/upstream/base/runtimes/torch_distributed.yaml index 86aa6b6a73..a19c257fbf 100644 --- a/applications/trainer/upstream/base/runtimes/torch_distributed.yaml +++ b/applications/trainer/upstream/base/runtimes/torch_distributed.yaml @@ -17,8 +17,22 @@ spec: labels: trainer.kubeflow.org/trainjob-ancestor-step: trainer spec: + securityContext: + runAsNonRoot: true + runAsUser: 1001 + seccompProfile: + type: RuntimeDefault template: spec: containers: - name: node image: pytorch/pytorch:2.10.0-cuda12.8-cudnn9-runtime + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsNonRoot: true + runAsUser: 1001 + seccompProfile: + type: RuntimeDefault diff --git a/applications/trainer/upstream/base/runtimes/xgboost_distributed.yaml b/applications/trainer/upstream/base/runtimes/xgboost_distributed.yaml index 9c1aa87087..706bf16f7c 100644 --- a/applications/trainer/upstream/base/runtimes/xgboost_distributed.yaml +++ b/applications/trainer/upstream/base/runtimes/xgboost_distributed.yaml @@ -17,8 +17,22 @@ spec: labels: trainer.kubeflow.org/trainjob-ancestor-step: trainer spec: + securityContext: + runAsNonRoot: true + runAsUser: 1001 + seccompProfile: + type: RuntimeDefault template: spec: containers: - name: node image: ghcr.io/kubeflow/trainer/xgboost-runtime:latest + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsNonRoot: true + runAsUser: 1001 + seccompProfile: + type: RuntimeDefault diff --git a/applications/trainer/upstream/overlays/kubeflow-platform/kustomization.yaml b/applications/trainer/upstream/overlays/kubeflow-platform/kustomization.yaml index 5f0da72126..b06a98c2f4 100644 --- a/applications/trainer/upstream/overlays/kubeflow-platform/kustomization.yaml +++ b/applications/trainer/upstream/overlays/kubeflow-platform/kustomization.yaml @@ -1,6 +1,6 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization -namespace: kubeflow +namespace: kubeflow-system resources: - ../../overlays/manager - ../../overlays/runtimes diff --git a/applications/trainer/upstream/overlays/manager/kustomization.yaml b/applications/trainer/upstream/overlays/manager/kustomization.yaml index f5bd56ad32..f4ae643089 100644 --- a/applications/trainer/upstream/overlays/manager/kustomization.yaml +++ b/applications/trainer/upstream/overlays/manager/kustomization.yaml @@ -20,7 +20,6 @@ images: # Secret for the Kubeflow Training webhook. secretGenerator: - name: kubeflow-trainer-webhook-cert - namespace: kubeflow-system options: disableNameSuffixHash: true diff --git a/tests/PSS_enable.sh b/tests/PSS_enable.sh index e0fa461de5..164ef1adb2 100755 --- a/tests/PSS_enable.sh +++ b/tests/PSS_enable.sh @@ -9,7 +9,9 @@ PSS_LEVEL="${1:-restricted}" } NAMESPACES=("istio-system" "auth" "cert-manager" "oauth2-proxy" "kubeflow" "knative-serving" "kubeflow-system") -[[ "$PSS_LEVEL" == "baseline" ]] && NAMESPACES+=("kubeflow-user-example-com") +if [[ "$PSS_LEVEL" == "restricted" ]]; then + NAMESPACES+=("kubeflow-user-example-com") +fi echo "Applying PSS $PSS_LEVEL to: ${NAMESPACES[*]}" diff --git a/tests/katib_test.sh b/tests/katib_test.sh index cfc11fc901..f50d199271 100755 --- a/tests/katib_test.sh +++ b/tests/katib_test.sh @@ -2,11 +2,36 @@ set -euxo pipefail KF_PROFILE=${1:-kubeflow-user-example-com} +KIND_CLUSTER=${2:-kubeflow} + +function debug_on_failure { + echo "=== Test failed! Collecting debug info ===" + kubectl describe experiment -n "$KF_PROFILE" || true + kubectl describe trials -n "$KF_PROFILE" || true + kubectl get pods -n "$KF_PROFILE" || true + kubectl logs -n kubeflow -l katib.kubeflow.org/component=controller --tail=200 || true +} +trap debug_on_failure ERR + +# Pre-pull image to avoid CI wait time being consumed by image pulls +echo "Pre-pulling training image..." +if command -v docker &>/dev/null; then + docker pull ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.19.0 || true + kind load docker-image ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.19.0 \ + --name "$KIND_CLUSTER" || true +else + echo "Docker not available, skipping pre-pull" +fi kubectl apply -f tests/katib_test.yaml -kubectl wait --for=condition=Running experiments.kubeflow.org -n $KF_PROFILE --all --timeout=60s -echo "Waiting for all Trials to be Completed..." -kubectl wait --for=condition=Created trials.kubeflow.org -n $KF_PROFILE --all --timeout=60s -kubectl get trials.kubeflow.org -n $KF_PROFILE -kubectl wait --for=condition=Succeeded trials.kubeflow.org -n $KF_PROFILE --all --timeout 600s -kubectl get trials.kubeflow.org -n $KF_PROFILE + +echo "Waiting for experiment to reach Running state..." +kubectl wait --for=condition=Running experiments.kubeflow.org \ + -n "$KF_PROFILE" --all --timeout=300s + +echo "Waiting for trials to be Succeeded..." +kubectl wait --for=condition=Succeeded trials.kubeflow.org \ + -n "$KF_PROFILE" --all --timeout=600s + +kubectl get trials.kubeflow.org -n "$KF_PROFILE" +echo "Katib test passed!" diff --git a/tests/katib_test.yaml b/tests/katib_test.yaml index e369c5f559..cc7f4b4f3d 100644 --- a/tests/katib_test.yaml +++ b/tests/katib_test.yaml @@ -44,14 +44,35 @@ spec: annotations: sidecar.istio.io/inject: "false" spec: + securityContext: + runAsNonRoot: true + runAsUser: 1000 + runAsGroup: 1000 + fsGroup: 1000 + seccompProfile: + type: RuntimeDefault containers: - name: training-container image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.19.0 + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsNonRoot: true + runAsUser: 1000 command: - "python3" - "/opt/pytorch-mnist/mnist.py" - "--epochs=1" + - "--no-cuda" - "--batch-size=16" - "--lr=${trialParameters.learningRate}" - "--momentum=${trialParameters.momentum}" + volumeMounts: + - mountPath: /opt/pytorch-mnist/data + name: data-dir restartPolicy: Never + volumes: + - name: data-dir + emptyDir: {} diff --git a/tests/kubeflow_profile_install.sh b/tests/kubeflow_profile_install.sh index 4176836970..c807430633 100755 --- a/tests/kubeflow_profile_install.sh +++ b/tests/kubeflow_profile_install.sh @@ -6,4 +6,20 @@ PROFILE_CONTROLLER_POD=$(kubectl get pods -n kubeflow -o json | jq -r '.items[] kubectl logs -n kubeflow "$PROFILE_CONTROLLER_POD" KF_PROFILE=kubeflow-user-example-com kubectl -n $KF_PROFILE get pods,configmaps,secrets -kubectl label namespace $KF_PROFILE pod-security.kubernetes.io/enforce=baseline --overwrite + +echo "Verifying PSS Restricted enforcement on namespace $KF_PROFILE..." +# Profiles controller should automatically add the label via the 'pss' overlay +MAX_RETRIES=10 +for i in $(seq 1 $MAX_RETRIES); do + PSS_LABEL=$(kubectl get namespace "$KF_PROFILE" -o jsonpath='{.metadata.labels.pod-security\.kubernetes\.io/enforce}') + if [[ "$PSS_LABEL" == "restricted" ]]; then + echo "✅ Namespace $KF_PROFILE is correctly labeled as restricted." + exit 0 + fi + echo "Wait for Profiles controller to label the namespace (attempt $i/$MAX_RETRIES)..." + sleep 5 +done + +echo "❌ ERROR: Namespace $KF_PROFILE is NOT labeled as restricted." +kubectl get namespace "$KF_PROFILE" -o yaml +exit 1 diff --git a/tests/notebook.test.kubeflow-user-example.com.yaml b/tests/notebook.test.kubeflow-user-example.com.yaml index efc6aeec33..720eef7ac1 100644 --- a/tests/notebook.test.kubeflow-user-example.com.yaml +++ b/tests/notebook.test.kubeflow-user-example.com.yaml @@ -12,11 +12,29 @@ metadata: namespace: kubeflow-user-example-com spec: template: + metadata: + annotations: + sidecar.istio.io/inject: "false" + labels: + sidecar.istio.io/inject: "false" spec: + securityContext: + runAsNonRoot: true + runAsGroup: 1000 + fsGroup: 100 + seccompProfile: + type: RuntimeDefault containers: - name: test image: ghcr.io/kubeflow/kubeflow/notebook-servers/jupyter-scipy:v1.10.0 imagePullPolicy: IfNotPresent + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsNonRoot: true + runAsUser: 1000 resources: limits: cpu: "0.6" diff --git a/tests/training_operator_job.yaml b/tests/training_operator_job.yaml index 341db77599..34d8dec664 100644 --- a/tests/training_operator_job.yaml +++ b/tests/training_operator_job.yaml @@ -11,12 +11,28 @@ spec: restartPolicy: OnFailure template: metadata: + annotations: + sidecar.istio.io/inject: "false" labels: sidecar.istio.io/inject: "false" spec: + securityContext: + runAsNonRoot: true + runAsUser: 1000 + runAsGroup: 1000 + fsGroup: 1000 + seccompProfile: + type: RuntimeDefault containers: - name: pytorch - image: docker.io/kubeflowkatib/pytorch-mnist:v1beta1-45c5727 + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsNonRoot: true + runAsUser: 1000 + image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.19.0 imagePullPolicy: Always command: - "python3" @@ -45,17 +61,39 @@ spec: limits: memory: "1Gi" cpu: "4000m" + volumeMounts: + - mountPath: /opt/pytorch-mnist/data + name: data-dir + volumes: + - name: data-dir + emptyDir: {} Worker: replicas: 1 restartPolicy: OnFailure template: metadata: + annotations: + sidecar.istio.io/inject: "false" labels: sidecar.istio.io/inject: "false" spec: + securityContext: + runAsNonRoot: true + runAsUser: 1000 + runAsGroup: 1000 + fsGroup: 1000 + seccompProfile: + type: RuntimeDefault containers: - name: pytorch - image: docker.io/kubeflowkatib/pytorch-mnist:v1beta1-45c5727 + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + runAsNonRoot: true + runAsUser: 1000 + image: ghcr.io/kubeflow/katib/pytorch-mnist-cpu:v0.19.0 imagePullPolicy: Always command: - "python3" @@ -84,3 +122,9 @@ spec: limits: memory: "1Gi" cpu: "4000m" + volumeMounts: + - mountPath: /opt/pytorch-mnist/data + name: data-dir + volumes: + - name: data-dir + emptyDir: {} diff --git a/tests/training_operator_test.sh b/tests/training_operator_test.sh index f338b1632b..667330856b 100755 --- a/tests/training_operator_test.sh +++ b/tests/training_operator_test.sh @@ -6,8 +6,28 @@ cat tests/training_operator_job.yaml | \ sed 's/name: pytorch-simple/name: pytorch-simple\n namespace: '"$KF_PROFILE"'/g' > /tmp/pytorch-job.yaml kubectl apply -f /tmp/pytorch-job.yaml -kubectl wait --for=jsonpath='{.status.conditions[0].type}=Created' pytorchjob.kubeflow.org/pytorch-simple -n $KF_PROFILE --timeout=60s - +# Wait for the PyTorchJob status conditions to be populated by the operator. +echo "Waiting for PyTorchJob status to be populated..." +pytorch_job_status_timeout_seconds=120 +pytorch_job_status_poll_interval_seconds=2 +pytorch_job_status_is_populated=false +for ((elapsed_seconds=0; elapsed_seconds/dev/null || true) + if [[ -n "$pytorch_job_condition_type" ]]; then + pytorch_job_status_is_populated=true + break + fi + sleep "$pytorch_job_status_poll_interval_seconds" +done +if [[ "$pytorch_job_status_is_populated" != "true" ]]; then + echo "ERROR: Timeout waiting for PyTorchJob status. Collecting diagnostics..." + kubectl describe pytorchjob pytorch-simple -n "$KF_PROFILE" + kubectl get pods -n "$KF_PROFILE" -l training.kubeflow.org/job-name=pytorch-simple + kubectl get events -n "$KF_PROFILE" --sort-by=.metadata.creationTimestamp + exit 1 +fi + +echo "PyTorchJob created successfully. Waiting for pods..." kubectl get pods -n $KF_PROFILE --show-labels kubectl wait --for=condition=Ready pod -l training.kubeflow.org/replica-type=master -n $KF_PROFILE --timeout=240s diff --git a/tests/trainjob_test.yaml b/tests/trainjob_test.yaml new file mode 100644 index 0000000000..bb8e0002d7 --- /dev/null +++ b/tests/trainjob_test.yaml @@ -0,0 +1,21 @@ +apiVersion: trainer.kubeflow.org/v1alpha1 +kind: TrainJob +metadata: + name: torch-simple +spec: + runtimeRef: + name: torch-distributed + trainer: + image: pytorch/pytorch:2.10.0-cuda12.8-cudnn9-runtime + command: + - "python3" + - "-c" + - "import torch; print(f\"CUDA available: {torch.cuda.is_available()}\")" + + resourcesPerNode: + requests: + cpu: "100m" + memory: "128Mi" + limits: + cpu: "1" + memory: "1Gi"