Skip to content
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#!/usr/bin/env bash
# This file is part of Checkbox.
#
# Copyright 2022 Canonical Ltd.
#
# Authors:
# Abdullah (@motjuste) <abdullah.abdullah@canonical.com>
#
# Checkbox is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 3,
# as published by the Free Software Foundation.
#
# Checkbox is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Checkbox. If not, see <http://www.gnu.org/licenses/>.
# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

#######################################
# Check the rollout status of the NVIDIA GPU daemonsets.
#
# Pauses 10 seconds before each check, then runs `kubectl rollout status`
# for each daemonset in the "gpu-operator-resources" namespace, in order.
# Globals:   none
# Arguments: none
# Outputs:   whatever `kubectl rollout status` prints
# Returns:   status of the last rollout check
#######################################
check_nvidia_gpu_rollout() {
  # Function-scoped so the namespace does not leak into the global scope.
  local namespace="gpu-operator-resources"
  local daemonset
  for daemonset in \
    gpu-operator-feature-discovery \
    nvidia-device-plugin-daemonset \
    nvidia-operator-validator; do
    sleep 10
    kubectl -n "${namespace}" rollout status "ds/${daemonset}"
  done
}

#######################################
# Check the rollout status of the Intel GPU daemonsets.
#
# Pauses 10 seconds before each check, then runs `kubectl rollout status`
# for ds/nfd-worker (namespace node-feature-discovery) followed by
# ds/intel-gpu-plugin (namespace default).
# Arguments: none
# Outputs:   whatever `kubectl rollout status` prints
# Returns:   status of the last rollout check
#######################################
check_intel_gpu_rollout() {
  local target
  # Each entry is "<namespace> <daemonset resource>".
  for target in \
    "node-feature-discovery ds/nfd-worker" \
    "default ds/intel-gpu-plugin"; do
    sleep 10
    kubectl -n "${target%% *}" rollout status "${target#* }"
  done
}

#######################################
# Print the script's purpose and usage line, then terminate.
# Arguments: none
# Outputs:   two lines of help text on stdout
# Returns:   does not return; exits the shell with status 2
#######################################
help_function() {
  printf '%s\n' \
    "This script is used for checking rollout of GPU-related daemonsets" \
    "Usage: check_gpu_rollout.sh <nvidia | intel>"
  exit 2
}

#######################################
# Dispatch to the rollout check for the requested GPU vendor.
# Arguments: $1 - "nvidia" or "intel"; anything else (including a missing
#                 argument) shows the help text
# Returns:   status of the selected check; help_function exits with 2
#######################################
main() {
  # "${1:-}" keeps a missing argument from aborting with "unbound variable"
  # under `set -u`, so it falls through to the help text instead.
  case "${1:-}" in
    nvidia) check_nvidia_gpu_rollout ;;
    intel) check_intel_gpu_rollout ;;
    *) help_function ;;
  esac
}

# Script entry point: forward all command-line arguments to main unchanged.
main "$@"

This file was deleted.

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/usr/bin/env bash
#
# This file is part of Checkbox.
#
# Copyright 2022 Canonical Ltd.
#
# Authors:
# Abdullah (@motjuste) <abdullah.abdullah@canonical.com>
#
# Checkbox is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 3,
# as published by the Free Software Foundation.
#
# Checkbox is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Checkbox. If not, see <http://www.gnu.org/licenses/>.
#
# Render and apply the Intel device-plugin manifests (Node Feature Discovery,
# node feature rules, GPU plugin) with kubectl.
#
# Arguments:
#   $1 - git ref of intel-device-plugins-for-kubernetes (default: v0.30.0)
#   $2 - value for the GPU plugin's -shared-dev-num argument (default: 10)
# Exits non-zero if any step fails, if $2 is not a positive integer, or if
# the -shared-dev-num argument could not be injected into the manifest.
set -euo pipefail

# This follows the how-to from the DSS team, see:
# <https://github.com/canonical/data-science-stack/blob/e495bacef97a1b6bf8bdb63dca29912be317be8c/docs/how-to/enable-gpus/enable-intel-gpu.rst>

VERSION="${1:-"v0.30.0"}"

# IMPORTANT: this is the sharedDevNum we pass into the gpu_plugin.yaml during installation
SLOTS_PER_GPU="${2:-"10"}"

# The value is spliced into a sed expression below, so accept only a plain
# positive integer; anything else could break the expression or the manifest.
if ! [[ "${SLOTS_PER_GPU}" =~ ^[1-9][0-9]*$ ]]; then
  printf 'SLOTS_PER_GPU must be a positive integer, got: %s\n' "${SLOTS_PER_GPU}" >&2
  exit 2
fi

# hack with tee as redirecting stdout anywhere but /dev/null throws a permission denied error
# see: https://forum.snapcraft.io/t/eksctl-cannot-write-to-stdout/17254/4
kubectl kustomize "https://github.com/intel/intel-device-plugins-for-kubernetes/deployments/nfd?ref=${VERSION}" | tee /tmp/node_feature_discovery.yaml >/dev/null
kubectl kustomize "https://github.com/intel/intel-device-plugins-for-kubernetes/deployments/nfd/overlays/node-feature-rules?ref=${VERSION}" | tee /tmp/node_feature_rules.yaml >/dev/null
kubectl kustomize "https://github.com/intel/intel-device-plugins-for-kubernetes/deployments/gpu_plugin/overlays/nfd_labeled_nodes?ref=${VERSION}" | tee /tmp/gpu_plugin.yaml >/dev/null

# Append a "-shared-dev-num=N" list item right after the "enable-monitoring" one.
# NOTE(review): the whitespace after "\n" must match the indentation of the
# surrounding YAML list items in the rendered manifest - confirm.
sed -i "s/enable-monitoring/enable-monitoring\n - -shared-dev-num=${SLOTS_PER_GPU}/" /tmp/gpu_plugin.yaml

# sed exits 0 even when the pattern never matched; fail loudly rather than
# deploying the plugin without the shared-dev-num setting.
if ! grep -qF -- "-shared-dev-num=${SLOTS_PER_GPU}" /tmp/gpu_plugin.yaml; then
  printf 'failed to inject -shared-dev-num=%s into /tmp/gpu_plugin.yaml\n' "${SLOTS_PER_GPU}" >&2
  exit 1
fi

kubectl apply -f /tmp/node_feature_discovery.yaml
kubectl apply -f /tmp/node_feature_rules.yaml
kubectl apply -f /tmp/gpu_plugin.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -108,84 +108,30 @@ requires:
depends: dss/initialize
_summary: Install Intel K8s GPU Device Plugin
estimated_duration: 2m
command: check_intel.sh gpu_plugin_can_be_installed

id: intel_gpu_plugin/daemonset_name
category_id: dss-regress
flags: simple
imports: from com.canonical.certification import executable
requires: executable.name == 'microk8s'
depends: intel_gpu_plugin/install
_summary: Check that Intel GPU plugin daemonset is deployed
estimated_duration: 5s
command: check_intel.sh gpu_plugin_daemonset_is_deployed

id: intel_gpu_plugin/daemonset_number_available
category_id: dss-regress
flags: simple
imports: from com.canonical.certification import executable
requires: executable.name == 'microk8s'
depends: intel_gpu_plugin/install
_summary: Check that at least one Intel GPU daemonset is available
estimated_duration: 5s
command: check_intel.sh one_daemonset_is_available

id: intel_gpu_plugin/daemonset_number_ready
category_id: dss-regress
flags: simple
imports: from com.canonical.certification import executable
requires: executable.name == 'microk8s'
depends: intel_gpu_plugin/daemonset_number_available
_summary: Check that at least one Intel GPU daemonset is ready
estimated_duration: 5s
command: check_intel.sh one_daemonset_is_ready
command:
set -eou pipefail
enable_intel_gpu_plugin.sh "v0.30.0" "10"
check_gpu_rollout.sh intel

id: intel_gpu_plugin/labels
category_id: dss-regress
flags: simple
imports: from com.canonical.certification import executable
requires: executable.name == 'microk8s'
depends: intel_gpu_plugin/daemonset_number_ready
_summary: Check that Kubernetes has label intel.feature.node.kubernetes.io/gpu
estimated_duration: 5s
command: check_intel.sh gpu_node_label_is_attached

id: intel_gpu_plugin/gpu_count
category_id: dss-regress
flags: simple
imports: from com.canonical.certification import executable
requires: executable.name == 'microk8s'
depends: intel_gpu_plugin/labels
_summary: Check that at least one Intel GPU is available on k8s node
estimated_duration: 5s
command: check_intel.sh at_least_one_gpu_is_available

id: intel_gpu_plugin/node_gpu_capacity
category_id: dss-regress
flags: simple
imports: from com.canonical.certification import executable
requires: executable.name == 'microk8s'
depends: intel_gpu_plugin/gpu_count
_summary: Check that at least expected capacity slots for Intel GPU are available
estimated_duration: 5s
command: check_intel.sh capacity_slots_for_gpus_match

id: intel_gpu_plugin/node_gpu_allocatable
category_id: dss-regress
flags: simple
imports: from com.canonical.certification import executable
requires: executable.name == 'microk8s'
depends: intel_gpu_plugin/node_gpu_capacity
_summary: Check that at least expected allocatable slots for Intel GPU are available
requires: executable.name == 'kubectl'
depends: intel_gpu_plugin/install
_summary: Check that at least one k8s node has the Intel GPU label attached
estimated_duration: 5s
command: check_intel.sh allocatable_slots_for_gpus_match
command:
set -eou pipefail
result="$(kubectl get nodes -o jsonpath='{.items[*].metadata.labels.intel\.feature\.node\.kubernetes\.io/gpu}')"
echo "$result" | grep "true"

id: dss/status_intel_gpu
category_id: dss-regress
flags: simple
imports: from com.canonical.certification import executable
requires: executable.name == 'dss'
depends: intel_gpu_plugin/node_gpu_allocatable
depends: intel_gpu_plugin/labels
_summary: Check that DSS status reports Intel GPU acceleration is enabled
estimated_duration: 5s
command:
Expand Down Expand Up @@ -271,7 +217,7 @@ command:
set -eou pipefail
OPERATOR_VERSION="24.6.2"
microk8s enable gpu --driver=operator --version="${OPERATOR_VERSION}"
check_cuda_rollout.sh
check_gpu_rollout.sh nvidia

id: nvidia_gpu_addon/validations_succeed
category_id: dss-regress
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,7 @@ include:
cpu/tensorflow_can_use_cpu
dss/remove_tensorflow_cpu_notebook
intel_gpu_plugin/install
intel_gpu_plugin/daemonset_name
intel_gpu_plugin/daemonset_number_available
intel_gpu_plugin/daemonset_number_ready
intel_gpu_plugin/labels
intel_gpu_plugin/gpu_count
intel_gpu_plugin/node_gpu_capacity
intel_gpu_plugin/node_gpu_allocatable
dss/status_intel_gpu
dss/create_tensorflow_intel_notebook
xpu/tensorflow_can_use_xpu
Expand Down