canonical · fernando79513 · Mar 19, 2025 · Mar 11, 2025 · Mar 11, 2025 · Mar 11, 2025
@@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+# This file is part of Checkbox.
+#
+# Copyright 2022 Canonical Ltd.
+#
+# Authors:
+#     Abdullah (@motjuste) <abdullah.abdullah@canonical.com>
+#
+# Checkbox is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 3,
+# as published by the Free Software Foundation.
+#
+# Checkbox is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Checkbox.  If not, see <http://www.gnu.org/licenses/>.
+set -eou pipefail
+
+check_nvidia_gpu_rollout() {
+    NAMESPACE="gpu-operator-resources"
+    sleep 10
+    kubectl -n "$NAMESPACE" rollout status ds/gpu-operator-feature-discovery
+    sleep 10
+    kubectl -n "$NAMESPACE" rollout status ds/nvidia-device-plugin-daemonset
+    sleep 10
+    kubectl -n "$NAMESPACE" rollout status ds/nvidia-operator-validator
+}
+
+check_intel_gpu_rollout() {
+    sleep 10
+    kubectl -n node-feature-discovery rollout status ds/nfd-worker
+    sleep 10
+    kubectl -n default rollout status ds/intel-gpu-plugin
+}
+
+# Print a one-line description and the usage of this script to stdout,
+# then terminate the script with exit status 2.
+help_function() {
+    printf '%s\n' \
+        "This script is used for checking rollout of GPU-related daemonsets" \
+        "Usage: check_gpu_rollout.sh <nvidia | intel>"
+    exit 2
+}
+
+main() {
+    case ${1} in
+    nvidia) check_nvidia_gpu_rollout ;;
+    intel) check_intel_gpu_rollout ;;
+    *) help_function ;;
+    esac
+}
+
+main "$@"
@@ -0,0 +1,42 @@
+#!/usr/bin/env bash
+#
+# This file is part of Checkbox.
+#
+# Copyright 2022 Canonical Ltd.
+#
+# Authors:
+#     Abdullah (@motjuste) <abdullah.abdullah@canonical.com>
+#
+# Checkbox is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 3,
+# as published by the Free Software Foundation.
+#
+# Checkbox is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Checkbox.  If not, see <http://www.gnu.org/licenses/>.
+#
+set -euo pipefail
+
+# Render the Intel device plugin manifests with kustomize, inject the
+# shared-dev-num argument into the GPU plugin manifest and apply everything.
+#
+# Usage: enable_intel_gpu_plugin.sh [VERSION] [SLOTS_PER_GPU]
+#
+# This follows the how-to from the DSS team, see:
+# <https://github.com/canonical/data-science-stack/blob/e495bacef97a1b6bf8bdb63dca29912be317be8c/docs/how-to/enable-gpus/enable-intel-gpu.rst>
+
+VERSION="${1:-"v0.30.0"}"
+
+# IMPORTANT: this is the sharedDevNum we pass into the gpu_plugin.yaml during installation
+SLOTS_PER_GPU="${2:-"10"}"
+
+# SLOTS_PER_GPU is spliced into a sed expression below, so accept digits only
+if ! [[ "${SLOTS_PER_GPU}" =~ ^[0-9]+$ ]]; then
+    echo "SLOTS_PER_GPU must be an integer, got: '${SLOTS_PER_GPU}'" >&2
+    exit 2
+fi
+
+REPO_URL="https://github.com/intel/intel-device-plugins-for-kubernetes"
+
+# Render into a private, unpredictable directory instead of fixed names in
+# /tmp, and remove it again on every exit path.
+WORKDIR="$(mktemp -d)"
+trap 'rm -rf -- "${WORKDIR}"' EXIT
+
+# hack with tee as redirecting stdout anywhere but /dev/null throws a permission denied error
+# see: https://forum.snapcraft.io/t/eksctl-cannot-write-to-stdout/17254/4
+kubectl kustomize "${REPO_URL}/deployments/nfd?ref=${VERSION}" | tee "${WORKDIR}/node_feature_discovery.yaml" >/dev/null
+kubectl kustomize "${REPO_URL}/deployments/nfd/overlays/node-feature-rules?ref=${VERSION}" | tee "${WORKDIR}/node_feature_rules.yaml" >/dev/null
+kubectl kustomize "${REPO_URL}/deployments/gpu_plugin/overlays/nfd_labeled_nodes?ref=${VERSION}" | tee "${WORKDIR}/gpu_plugin.yaml" >/dev/null
+
+# Append the -shared-dev-num argument right after the enable-monitoring one
+sed -i "s/enable-monitoring/enable-monitoring\n        - -shared-dev-num=${SLOTS_PER_GPU}/" "${WORKDIR}/gpu_plugin.yaml"
+
+# sed succeeds even when nothing matched; fail loudly rather than silently
+# deploying the plugin without the requested number of slots
+if ! grep -qF -- "-shared-dev-num=${SLOTS_PER_GPU}" "${WORKDIR}/gpu_plugin.yaml"; then
+    echo "Could not inject -shared-dev-num into the GPU plugin manifest (version ${VERSION})" >&2
+    exit 1
+fi
+
+kubectl apply -f "${WORKDIR}/node_feature_discovery.yaml"
+kubectl apply -f "${WORKDIR}/node_feature_rules.yaml"
+kubectl apply -f "${WORKDIR}/gpu_plugin.yaml"
@@ -108,84 +108,30 @@ requires:
 depends: dss/initialize
 _summary: Install Intel K8s GPU Device Plugin
 estimated_duration: 2m
-command: check_intel.sh gpu_plugin_can_be_installed
-
-id: intel_gpu_plugin/daemonset_name
-category_id: dss-regress
-flags: simple
-imports: from com.canonical.certification import executable
-requires: executable.name == 'microk8s'
-depends: intel_gpu_plugin/install
-_summary: Check that Intel GPU plugin daemonset is deployed
-estimated_duration: 5s
-command: check_intel.sh gpu_plugin_daemonset_is_deployed
-
-id: intel_gpu_plugin/daemonset_number_available
-category_id: dss-regress
-flags: simple
-imports: from com.canonical.certification import executable
-requires: executable.name == 'microk8s'
-depends: intel_gpu_plugin/install
-_summary: Check that at least one Intel GPU daemonset is available
-estimated_duration: 5s
-command: check_intel.sh one_daemonset_is_available
-
-id: intel_gpu_plugin/daemonset_number_ready
-category_id: dss-regress
-flags: simple
-imports: from com.canonical.certification import executable
-requires: executable.name == 'microk8s'
-depends: intel_gpu_plugin/daemonset_number_available
-_summary: Check that at least one Intel GPU daemonset is ready
-estimated_duration: 5s
-command: check_intel.sh one_daemonset_is_ready
+command:
+  set -eou pipefail
+  enable_intel_gpu_plugin.sh "v0.30.0" "10"
+  check_gpu_rollout.sh intel
 
 id: intel_gpu_plugin/labels
 category_id: dss-regress
 flags: simple
 imports: from com.canonical.certification import executable
-requires: executable.name == 'microk8s'
-depends: intel_gpu_plugin/daemonset_number_ready
-_summary: Check that Kubernetes has label intel.feature.node.kubernetes.io/gpu
-estimated_duration: 5s
-command: check_intel.sh gpu_node_label_is_attached
-
-id: intel_gpu_plugin/gpu_count
-category_id: dss-regress
-flags: simple
-imports: from com.canonical.certification import executable
-requires: executable.name == 'microk8s'
-depends: intel_gpu_plugin/labels
-_summary: Check that at least one Intel GPU is available on k8s node
-estimated_duration: 5s
-command: check_intel.sh at_least_one_gpu_is_available
-
-id: intel_gpu_plugin/node_gpu_capacity
-category_id: dss-regress
-flags: simple
-imports: from com.canonical.certification import executable
-requires: executable.name == 'microk8s'
-depends: intel_gpu_plugin/gpu_count
-_summary: Check that at least expected capacity slots for Intel GPU are availabled
-estimated_duration: 5s
-command: check_intel.sh capacity_slots_for_gpus_match
-
-id: intel_gpu_plugin/node_gpu_allocatable
-category_id: dss-regress
-flags: simple
-imports: from com.canonical.certification import executable
-requires: executable.name == 'microk8s'
-depends: intel_gpu_plugin/node_gpu_capacity
-_summary: Check that at least expected allocatable slots for Intel GPU are available
+requires: executable.name == 'kubectl'
+depends: intel_gpu_plugin/install
+_summary: Check that at least one k8s node has the Intel GPU label attached
 estimated_duration: 5s
-command: check_intel.sh allocatable_slots_for_gpus_match
+command:
+  set -eou pipefail
+  result="$(kubectl get nodes -o jsonpath='{.items[*].metadata.labels.intel\.feature\.node\.kubernetes\.io/gpu}')"
+  echo "$result" | grep "true"
 
 id: dss/status_intel_gpu
 category_id: dss-regress
 flags: simple
 imports: from com.canonical.certification import executable
 requires: executable.name == 'dss'
-depends: intel_gpu_plugin/node_gpu_allocatable
+depends: intel_gpu_plugin/labels
 _summary: Check that DSS status reports Intel GPU acceleration is enabled
 estimated_duration: 5s
 command:
@@ -271,7 +217,7 @@ command:
   set -eou pipefail
   OPERATOR_VERSION="24.6.2"
   microk8s enable gpu --driver=operator --version="${OPERATOR_VERSION}"
-  check_cuda_rollout.sh
+  check_gpu_rollout.sh nvidia
 
 id: nvidia_gpu_addon/validations_succeed
 category_id: dss-regress

@@ -12,13 +12,7 @@ include:
     cpu/tensorflow_can_use_cpu
     dss/remove_tensorflow_cpu_notebook
     intel_gpu_plugin/install
-    intel_gpu_plugin/daemonset_name
-    intel_gpu_plugin/daemonset_number_available
-    intel_gpu_plugin/daemonset_number_ready
     intel_gpu_plugin/labels
-    intel_gpu_plugin/gpu_count
-    intel_gpu_plugin/node_gpu_capacity
-    intel_gpu_plugin/node_gpu_allocatable
     dss/status_intel_gpu
     dss/create_tensorflow_intel_notebook
     xpu/tensorflow_can_use_xpu