diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_gpu_rollout.sh b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_gpu_rollout.sh new file mode 100755 index 0000000000..2c73ccd330 --- /dev/null +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_gpu_rollout.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +# This file is part of Checkbox. +# +# Copyright 2022 Canonical Ltd. +# +# Authors: +# Abdullah (@motjuste) +# +# Checkbox is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 3, +# as published by the Free Software Foundation. +# +# Checkbox is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Checkbox. If not, see <http://www.gnu.org/licenses/>. 
+set -eou pipefail + +check_nvidia_gpu_rollout() { + NAMESPACE="gpu-operator-resources" + sleep 10 + kubectl -n "$NAMESPACE" rollout status ds/gpu-operator-feature-discovery + sleep 10 + kubectl -n "$NAMESPACE" rollout status ds/nvidia-device-plugin-daemonset + sleep 10 + kubectl -n "$NAMESPACE" rollout status ds/nvidia-operator-validator +} + +check_intel_gpu_rollout() { + sleep 10 + kubectl -n node-feature-discovery rollout status ds/nfd-worker + sleep 10 + kubectl -n default rollout status ds/intel-gpu-plugin +} + +help_function() { + echo "This script is used for checking rollout of GPU-related daemonsets" + echo "Usage: check_gpu_rollout.sh <nvidia|intel>" + exit 2 +} + +main() { + case ${1} in + nvidia) check_nvidia_gpu_rollout ;; + intel) check_intel_gpu_rollout ;; + *) help_function ;; + esac +} + +main "$@" diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_intel.sh b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_intel.sh deleted file mode 100755 index d0754acfea..0000000000 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_intel.sh +++ /dev/null @@ -1,138 +0,0 @@ -#!/usr/bin/env bash - -set -euxo pipefail - -# IMPORTANT NOTE: this is the sharedDevNum we pass into the gpu_plugin.yaml during installation -SLOTS_PER_GPU=10 - -check_intel_gpu_plugin_can_be_installed() { - if microk8s.kubectl get daemonset.apps | grep -q "intel-gpu-plugin"; then - echo "Test success: 'intel-gpu-plugin' daemonset is already deployed!" 
- exit 0 - fi - - # NOTE: Using kubectl directly due to this bug: https://github.com/canonical/microk8s/issues/4453 - - # TODO: make version a param - VERSION=v0.30.0 - # hack as redirecting stdout anywhere but /dev/null throws a permission denied error - # see: https://forum.snapcraft.io/t/eksctl-cannot-write-to-stdout/17254/4 - kubectl kustomize https://github.com/intel/intel-device-plugins-for-kubernetes/deployments/nfd?ref=${VERSION} | tee /tmp/node_feature_discovery.yaml >/dev/null - kubectl kustomize https://github.com/intel/intel-device-plugins-for-kubernetes/deployments/nfd/overlays/node-feature-rules?ref=${VERSION} | tee /tmp/node_feature_rules.yaml >/dev/null - kubectl kustomize https://github.com/intel/intel-device-plugins-for-kubernetes/deployments/gpu_plugin/overlays/nfd_labeled_nodes?ref=${VERSION} | tee /tmp/gpu_plugin.yaml >/dev/null - sed -i "s/enable-monitoring/enable-monitoring\n - -shared-dev-num=${SLOTS_PER_GPU}/" /tmp/gpu_plugin.yaml - kubectl apply -f /tmp/node_feature_discovery.yaml - kubectl apply -f /tmp/node_feature_rules.yaml - kubectl apply -f /tmp/gpu_plugin.yaml - SLEEP_SECS=15 - echo "[INFO]: sleeping for ${SLEEP_SECS} seconds before checking rollout status." - sleep ${SLEEP_SECS} - kubectl -n node-feature-discovery rollout status ds/nfd-worker - kubectl -n default rollout status ds/intel-gpu-plugin - echo "[INFO]: sleeping for ${SLEEP_SECS} seconds to allow pod status to update for subsequent tests." - sleep ${SLEEP_SECS} - echo "Test success: Intel K8s GPU Device Plugin deployed." -} - -check_intel_gpu_plugin_daemonset_is_deployed() { - result=$(microk8s.kubectl get daemonset.apps -o jsonpath='{.items[0].metadata.name}') - if [ "${result}" = "intel-gpu-plugin" ]; then - echo "Test success: 'intel-gpu-plugin' daemonset is deployed!" 
- else - >&2 echo "Test failure: expected daemonset name 'intel-gpu-plugin' but got ${result}" - exit 1 - fi -} - -check_one_intel_gpu_plugin_daemonset_is_available() { - result=$(microk8s.kubectl get daemonset.apps -o jsonpath='{.items[0].status.numberAvailable}') - if [ "${result}" = "1" ]; then - echo "Test success: 1 daemonset in numberAvailable status." - else - >&2 echo "Test failure: expected numberAvailable to be 1 but got ${result}" - exit 1 - fi -} - -check_one_intel_gpu_plugin_daemonset_is_ready() { - result=$(microk8s.kubectl get daemonset.apps -o jsonpath='{.items[0].status.numberReady}') - if [ "${result}" = "1" ]; then - echo "Test success: 1 daemonset in numberReady status." - else - >&2 echo "Test failure: expected numberReady to be 1 but got ${result}" - exit 1 - fi -} - -check_intel_gpu_node_label_is_attached() { - result=$(microk8s.kubectl get node -o jsonpath='{.items[0].metadata.labels.intel\.feature\.node\.kubernetes\.io/gpu}') - if [ "${result}" = "true" ]; then - echo "Test success: found expected label: 'intel.feature.node.kubernetes.io/gpu': 'true'" - else - >&2 echo "Test failure: expected 'true' but got ${result}" - exit 1 - fi -} - -check_at_least_one_intel_gpu_is_available() { - # IMPORTANT NOTE: this test also counts NVIDIA GPUs once their plugin is enabled. - # The inaccuracy in gpu.intel.com label's value and not controlled by us - result=$(microk8s.kubectl get node -o json | jq '.items[0].metadata.labels | with_entries(select(.key|match("gpu.intel.com/device-id.*.count";"i")))[] | tonumber' | awk '{cnt+=$1} END{print cnt}') - if [ "${result}" -ge 1 ]; then - echo "Test success: Found ${result} GPUs on system." 
- else - >&2 echo "Test failure: expected at least 1 GPU but got ${result}" - exit 1 - fi -} - -check_capacity_slots_for_intel_gpus_match() { - result=$(microk8s.kubectl get node -o jsonpath='{.items[0].status.capacity.gpu\.intel\.com/i915}') - if [ "${result}" -ge "${SLOTS_PER_GPU}" ]; then - echo "Test success: Found ${result} GPU capacity slots on k8s node." - else - >&2 echo "Test failure: expected more than ${SLOTS_PER_GPU} GPU capacity slots but got ${result}" - exit 1 - fi -} - -check_allocatable_slots_for_intel_gpus_match() { - result=$(microk8s.kubectl get node -o jsonpath='{.items[0].status.allocatable.gpu\.intel\.com/i915}') - if [ "${result}" -ge "${SLOTS_PER_GPU}" ]; then - echo "Test success: Found ${result} GPU allocatable slots on k8s node." - else - >&2 echo "Test failure: expected ${SLOTS_PER_GPU} GPU allocatable slots but got ${result}" - exit 1 - fi -} - -help_function() { - echo "This script is used for tests related to Intel GPUs" - echo "Usage: check.sh " - echo - echo "Test cases currently implemented:" - echo -e "\t: check_intel_gpu_plugin_can_be_installed" - echo -e "\t: check_intel_gpu_plugin_daemonset_is_deployed" - echo -e "\t: check_one_intel_gpu_plugin_daemonset_is_available" - echo -e "\t: check_one_intel_gpu_plugin_daemonset_is_ready" - echo -e "\t: check_intel_gpu_node_label_is_attached" - echo -e "\t: check_at_least_one_intel_gpu_is_available" - echo -e "\t: check_capacity_slots_for_intel_gpus_match" - echo -e "\t: check_allocatable_slots_for_intel_gpus_match" -} - -main() { - case ${1} in - gpu_plugin_can_be_installed) check_intel_gpu_plugin_can_be_installed ;; - gpu_plugin_daemonset_is_deployed) check_intel_gpu_plugin_daemonset_is_deployed ;; - one_daemonset_is_available) check_one_intel_gpu_plugin_daemonset_is_available ;; - one_daemonset_is_ready) check_one_intel_gpu_plugin_daemonset_is_ready ;; - gpu_node_label_is_attached) check_intel_gpu_node_label_is_attached ;; - at_least_one_gpu_is_available) 
check_at_least_one_intel_gpu_is_available ;; - capacity_slots_for_gpus_match) check_capacity_slots_for_intel_gpus_match ;; - allocatable_slots_for_gpus_match) check_allocatable_slots_for_intel_gpus_match ;; - *) help_function ;; - esac -} - -main "$@" diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_nvidia_gpu_rollout.sh b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_nvidia_gpu_rollout.sh deleted file mode 100755 index 80f00d8a38..0000000000 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/check_nvidia_gpu_rollout.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env bash -# This file is part of Checkbox. -# -# Copyright 2022 Canonical Ltd. -# -# Authors: -# Abdullah (@motjuste) -# -# Checkbox is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 3, -# as published by the Free Software Foundation. -# -# Checkbox is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Checkbox. If not, see . 
-set -eou pipefail - -NAMESPACE="${1:-"gpu-operator-resources"}" -sleep 10 -kubectl -n "$NAMESPACE" rollout status ds/gpu-operator-node-feature-discovery-worker -sleep 10 -kubectl -n "$NAMESPACE" rollout status ds/nvidia-device-plugin-daemonset -sleep 10 -kubectl -n "$NAMESPACE" rollout status ds/nvidia-operator-validator diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/enable_intel_gpu_plugin.sh b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/enable_intel_gpu_plugin.sh new file mode 100755 index 0000000000..29e3a7a8b3 --- /dev/null +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/bin/enable_intel_gpu_plugin.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +# +# This file is part of Checkbox. +# +# Copyright 2022 Canonical Ltd. +# +# Authors: +# Abdullah (@motjuste) +# +# Checkbox is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 3, +# as published by the Free Software Foundation. +# +# Checkbox is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Checkbox. If not, see <http://www.gnu.org/licenses/>. 
+# +set -eo pipefail + +# This follows the how-to from the DSS team, see: +# + +VERSION="${1:-"v0.30.0"}" + +# IMPORTANT: this is the sharedDevNum we pass into the gpu_plugin.yaml during installation +SLOTS_PER_GPU="${2:-"10"}" + +# hack with tee as redirecting stdout anywhere but /dev/null throws a permission denied error +# see: https://forum.snapcraft.io/t/eksctl-cannot-write-to-stdout/17254/4 +kubectl kustomize "https://github.com/intel/intel-device-plugins-for-kubernetes/deployments/nfd?ref=${VERSION}" | tee /tmp/node_feature_discovery.yaml >/dev/null +kubectl kustomize "https://github.com/intel/intel-device-plugins-for-kubernetes/deployments/nfd/overlays/node-feature-rules?ref=${VERSION}" | tee /tmp/node_feature_rules.yaml >/dev/null +kubectl kustomize "https://github.com/intel/intel-device-plugins-for-kubernetes/deployments/gpu_plugin/overlays/nfd_labeled_nodes?ref=${VERSION}" | tee /tmp/gpu_plugin.yaml >/dev/null + +sed -i "s/enable-monitoring/enable-monitoring\n - -shared-dev-num=${SLOTS_PER_GPU}/" /tmp/gpu_plugin.yaml + +kubectl apply -f /tmp/node_feature_discovery.yaml +kubectl apply -f /tmp/node_feature_rules.yaml +kubectl apply -f /tmp/gpu_plugin.yaml diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu index 6825afecd1..517c7c4e57 100644 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/jobs.pxu @@ -108,84 +108,30 @@ requires: depends: dss/initialize _summary: Install Intel K8s GPU Device Plugin estimated_duration: 2m -command: check_intel.sh gpu_plugin_can_be_installed - -id: intel_gpu_plugin/daemonset_name -category_id: dss-regress -flags: simple -imports: from com.canonical.certification import executable -requires: executable.name == 'microk8s' -depends: intel_gpu_plugin/install -_summary: Check that Intel GPU plugin daemonset is 
deployed -estimated_duration: 5s -command: check_intel.sh gpu_plugin_daemonset_is_deployed - -id: intel_gpu_plugin/daemonset_number_available -category_id: dss-regress -flags: simple -imports: from com.canonical.certification import executable -requires: executable.name == 'microk8s' -depends: intel_gpu_plugin/install -_summary: Check that at least one Intel GPU daemonset is available -estimated_duration: 5s -command: check_intel.sh one_daemonset_is_available - -id: intel_gpu_plugin/daemonset_number_ready -category_id: dss-regress -flags: simple -imports: from com.canonical.certification import executable -requires: executable.name == 'microk8s' -depends: intel_gpu_plugin/daemonset_number_available -_summary: Check that at least one Intel GPU daemonset is ready -estimated_duration: 5s -command: check_intel.sh one_daemonset_is_ready +command: + set -eou pipefail + enable_intel_gpu_plugin.sh "v0.30.0" "10" + check_gpu_rollout.sh intel id: intel_gpu_plugin/labels category_id: dss-regress flags: simple imports: from com.canonical.certification import executable -requires: executable.name == 'microk8s' -depends: intel_gpu_plugin/daemonset_number_ready -_summary: Check that Kubernetes has label intel.feature.node.kubernetes.io/gpu -estimated_duration: 5s -command: check_intel.sh gpu_node_label_is_attached - -id: intel_gpu_plugin/gpu_count -category_id: dss-regress -flags: simple -imports: from com.canonical.certification import executable -requires: executable.name == 'microk8s' -depends: intel_gpu_plugin/labels -_summary: Check that at least one Intel GPU is available on k8s node -estimated_duration: 5s -command: check_intel.sh at_least_one_gpu_is_available - -id: intel_gpu_plugin/node_gpu_capacity -category_id: dss-regress -flags: simple -imports: from com.canonical.certification import executable -requires: executable.name == 'microk8s' -depends: intel_gpu_plugin/gpu_count -_summary: Check that at least expected capacity slots for Intel GPU are availabled 
-estimated_duration: 5s -command: check_intel.sh capacity_slots_for_gpus_match - -id: intel_gpu_plugin/node_gpu_allocatable -category_id: dss-regress -flags: simple -imports: from com.canonical.certification import executable -requires: executable.name == 'microk8s' -depends: intel_gpu_plugin/node_gpu_capacity -_summary: Check that at least expected allocatable slots for Intel GPU are available +requires: executable.name == 'kubectl' +depends: intel_gpu_plugin/install +_summary: Check that at least one k8s node has label Intel GPU label attached estimated_duration: 5s -command: check_intel.sh allocatable_slots_for_gpus_match +command: + set -eou pipefail + result="$(kubectl get nodes -o jsonpath='{.items[*].metadata.labels.intel\.feature\.node\.kubernetes\.io/gpu}')" + echo "$result" | grep "true" id: dss/status_intel_gpu category_id: dss-regress flags: simple imports: from com.canonical.certification import executable requires: executable.name == 'dss' -depends: intel_gpu_plugin/node_gpu_allocatable +depends: intel_gpu_plugin/labels _summary: Check that DSS status reports Intel GPU acceleration is enabled estimated_duration: 5s command: @@ -271,7 +217,7 @@ command: set -eou pipefail OPERATOR_VERSION="24.6.2" microk8s enable gpu --driver=operator --version="${OPERATOR_VERSION}" - check_cuda_rollout.sh + check_gpu_rollout.sh nvidia id: nvidia_gpu_addon/validations_succeed category_id: dss-regress diff --git a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu index 4ea8166349..7f71ba1e32 100644 --- a/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu +++ b/contrib/checkbox-dss-validation/checkbox-provider-dss/units/test-plan.pxu @@ -12,13 +12,7 @@ include: cpu/tensorflow_can_use_cpu dss/remove_tensorflow_cpu_notebook intel_gpu_plugin/install - intel_gpu_plugin/daemonset_name - intel_gpu_plugin/daemonset_number_available - 
intel_gpu_plugin/daemonset_number_ready intel_gpu_plugin/labels - intel_gpu_plugin/gpu_count - intel_gpu_plugin/node_gpu_capacity - intel_gpu_plugin/node_gpu_allocatable dss/status_intel_gpu dss/create_tensorflow_intel_notebook xpu/tensorflow_can_use_xpu