Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
c500cea
add jobs to DSS validation for setup and test on NVIDIA GPUs
motjuste Nov 18, 2024
14979c9
fix cuda test for tensorflow and give more time for things to settle
motjuste Nov 18, 2024
c8a4afb
fix dependency of nvidia_gpu_addon/enable job
motjuste Nov 18, 2024
6786061
fix wrong dependency for cuda jobs and make validation more reliable
motjuste Nov 19, 2024
50601d1
fix shebang to use control instead of remote in launcher script
motjuste Nov 19, 2024
45d427c
fix flaky gpu addon rollout checking in better order and more sleep
motjuste Nov 19, 2024
9da8078
make the GPU checking into resources to control GPU tests are run
motjuste Nov 19, 2024
37a9b64
remove flaky mlflow deployed test
motjuste Nov 19, 2024
e180c9a
update other dss test-plans to use the GPU as resources
motjuste Nov 19, 2024
ac91262
reduce max_attempts for retry to 2
motjuste Nov 19, 2024
6d046e0
add cpu-only tests for dss
motjuste Nov 19, 2024
d722754
rename validate script to not contain intel and bump snap's version
motjuste Nov 19, 2024
a741596
refactor testflinger job file builder to unify into one re-usable one
motjuste Nov 20, 2024
18b591e
add nvidia dgx as target machine for DSS testflinger jobs
motjuste Nov 20, 2024
2d43581
allow other workflow jobs in matrix to continue running if one fails
motjuste Nov 20, 2024
8d290ae
add notebook removal tests and rename cases to be consistent
motjuste Nov 25, 2024
eb4c09e
skip installing intel gpu plugin if it is already there
motjuste Nov 25, 2024
b6202e1
remove unused itex- and ipex-only test plans
motjuste Nov 25, 2024
98359c5
rename check_dss.sh to check_dss for pseudo-fluent usage
motjuste Nov 25, 2024
2971237
refactor remove notebook test to accept multiple arguments
motjuste Nov 25, 2024
b73a281
extract out notebook creation to reused function
motjuste Nov 25, 2024
d13de27
disable intel gpu capacity tests temporarily
motjuste Nov 26, 2024
99bb957
rename test case for dss to be more fluid
motjuste Nov 25, 2024
adfe2cd
refactor checking dss status into reusable function
motjuste Nov 25, 2024
89bfdca
add missing usage string for dss create notebook function
motjuste Nov 25, 2024
12fca77
use pushd popd instead of cd-ing to HOME in check dss
motjuste Nov 25, 2024
6e051f3
rename check_cuda.sh to check_cuda to have a pseudo-fluent usage
motjuste Nov 26, 2024
8e1f358
refactor cuda notebook tests to reusable script
motjuste Nov 27, 2024
ae0178c
refactor out the notebook tests for cpu
motjuste Nov 27, 2024
70c8673
refactor out itex tests to common notebook script
motjuste Nov 27, 2024
b8b3551
refactor out ipex tests to common notebook script
motjuste Nov 27, 2024
87d1526
reformat long requires clauses to multi-line ones
motjuste Nov 27, 2024
42ab519
drop .sh extension from check_intel script
motjuste Nov 27, 2024
ec75b21
fix failing intel gpu verification tests
motjuste Nov 27, 2024
e468f73
reduce sleep time in steps while enabling nvidia gpu addon
motjuste Nov 27, 2024
2cf48d0
fix help string for check_notebook
motjuste Nov 27, 2024
44a504a
refactor install-deps script allowing customization of microk8s and k…
motjuste Nov 27, 2024
5962863
add customized microk8s channels to github workflow for dss
motjuste Nov 27, 2024
896d18d
fix default dss_snap_channel to latest/stable instead of non-existent…
motjuste Nov 28, 2024
c10b121
add .sh extension back to the test runner scripts
motjuste Dec 2, 2024
3068584
use graphics_card resource for checking GPU instead of own
motjuste Dec 2, 2024
4481c28
change to detecting GPU based on vendor
motjuste Dec 2, 2024
8d7703c
fix mention of default channel for DSS in the README
motjuste Dec 3, 2024
1dc30bd
remove unnecessary dss integration tests script (coming later)
motjuste Dec 3, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 14 additions & 19 deletions .github/workflows/testflinger-contrib-dss-regression.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,36 +21,31 @@ jobs:
run:
working-directory: contrib/checkbox-dss-validation
strategy:
fail-fast: false
matrix:
dss_channel:
- latest/stable
- latest/edge
microk8s_channel:
- 1.28/stable
- 1.31/stable
queue:
- dell-precision-3470-c30322 #ADL iGPU + NVIDIA GPU
- dell-precision-5680-c31665 #RPL iGPU + Arc Pro A60M dGPU
- name: dell-precision-3470-c30322 #ADL iGPU + NVIDIA GPU
provision_data: "distro: jammy"
- name: dell-precision-5680-c31665 #RPL iGPU + Arc Pro A60M dGPU
provision_data: "url: http://10.102.196.9/somerville/Platforms/jellyfish-muk/X96_A00/dell-bto-jammy-jellyfish-muk-X96-20230419-19_A00.iso"
- name: nvidia-dgx-station-c25989 # NO iGPU + NVIDIA GPU
provision_data: "distro: jammy"
steps:
- name: Check out code
uses: actions/checkout@v4
- name: Build job file from template with maas2 provisioning
if: ${{ matrix.queue == 'dell-precision-3470-c30322' }}
env:
PROVISION_DATA: "distro: jammy"
- name: Build job file from template
run: |
sed -e "s|REPLACE_BRANCH|${BRANCH}|" \
-e "s|REPLACE_QUEUE|${{ matrix.queue }}|" \
-e "s|REPLACE_PROVISION_DATA|${PROVISION_DATA}|" \
-e "s|REPLACE_DSS_CHANNEL|${{ matrix.dss_channel }}|" \
${GITHUB_WORKSPACE}/contrib/checkbox-dss-validation/testflinger/job-def.yaml > \
${GITHUB_WORKSPACE}/job.yaml
- name: Build job file from template with oemscript provisioning
if: ${{ matrix.queue == 'dell-precision-5680-c31665' }}
env:
PROVISION_DATA: "url: http://10.102.196.9/somerville/Platforms/jellyfish-muk/X96_A00/dell-bto-jammy-jellyfish-muk-X96-20230419-19_A00.iso"
run: |
sed -e "s|REPLACE_BRANCH|${BRANCH}|" \
-e "s|REPLACE_QUEUE|${{ matrix.queue }}|" \
-e "s|REPLACE_PROVISION_DATA|${PROVISION_DATA}|" \
-e "s|REPLACE_QUEUE|${{ matrix.queue.name }}|" \
-e "s|REPLACE_PROVISION_DATA|${{ matrix.queue.provision_data }}|" \
-e "s|REPLACE_DSS_CHANNEL|${{ matrix.dss_channel }}|" \
-e "s|REPLACE_MICROK8S_CHANNEL|${{ matrix.microk8s_channel }}|" \
${GITHUB_WORKSPACE}/contrib/checkbox-dss-validation/testflinger/job-def.yaml > \
${GITHUB_WORKSPACE}/job.yaml
- name: Submit testflinger job
Expand Down
22 changes: 18 additions & 4 deletions contrib/checkbox-dss-validation/README.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
# Welcome to the Checkbox DSS project!

This repository contains the Checkbox DSS Provider (test cases and test plans for validating Intel GPU support in the [Data Science Stack](https://documentation.ubuntu.com/data-science-stack/en/latest/)) as well as everything that is required to build the `checkbox-dss` snap.
This repository contains the Checkbox DSS Provider (test cases and test plans for validating Intel and NVIDIA GPU support in the [Data Science Stack](https://documentation.ubuntu.com/data-science-stack/en/latest/)) as well as everything that is required to build the `checkbox-dss` snap.

# Requirements

- Ubuntu Jammy (22.04)
- Supported hardware platforms:
- No GPUs
- Intel platforms with recent GPU (>= Broadwell)
- Recent NVIDIA GPU

# Installation

Expand All @@ -19,7 +21,7 @@ lxd init --auto
git clone https://github.com/canonical/checkbox
cd checkbox/contrib/checkbox-dss-validation
snapcraft
sudo snap install --dangerous --classic ./checkbox-dss_2.0_amd64.snap
sudo snap install --dangerous --classic ./checkbox-dss_3.0_amd64.snap
```

Make sure that the provider service is running and active:
Expand All @@ -40,15 +42,27 @@ By default this will install the `data-science-stack` snap from the `latest/stab
channel. To instead install from `latest/edge` use:

```shell
checkbox-dss.install-deps --dss-snap-channel=latest/edge
checkbox-dss.install-deps --dss-snap-channel latest/edge
```

Furthermore, the default `microk8s` snap channel is `1.28/stable` in classic mode,
but this can be customized as
(please note that this snap must be installed with `--classic` to enable GPU support):

```shell
checkbox-dss.install-deps --microk8s-snap-channel 1.31/stable
```

These validations also need the `kubectl` snap installed, and the default channel
used for that is `1.29/stable`, but can be customized as shown previously by passing
the appropriate channel name for `--kubectl-snap-channel`.

# Automated Run

To run the test plans:

```shell
checkbox-dss.validate-intel-gpu
checkbox-dss.validate-with-gpu
```

# Cleanup
Expand Down
139 changes: 86 additions & 53 deletions contrib/checkbox-dss-validation/bin/install-deps
Original file line number Diff line number Diff line change
@@ -1,56 +1,89 @@
#!/bin/bash
set -e

# Flat install sequence, executed top to bottom: microk8s snap, microk8s
# addons, kubectl snap, host-level tools, then the data-science-stack snap.
echo -e "\nStep 1/5: Installing microk8s snap"
sudo snap install microk8s --channel 1.28/stable --classic

# Overwrite USER and HOME for the rest of the script with the user name
# resolved from SNAP_UID and with the directory in SNAP_REAL_HOME.
USER=$(id -nu ${SNAP_UID})
HOME=${SNAP_REAL_HOME}

# microk8s commands run from tests are run without sudo
sudo usermod -a -G microk8s $USER
# Directory needed for sharing microk8s config with kubectl snap
mkdir -p $HOME/.kube

echo -e "\nStep 2/5: Configuring microk8s addons"
sudo microk8s status --wait-ready
# Give microk8s another minute to stabilize
# to avoid intermittent failures when
# enabling hostpath-storage
echo "Giving microk8s a minute to stabilize..."
sleep 60
sudo microk8s enable hostpath-storage
sudo microk8s enable dns
sudo microk8s enable rbac

# Block (up to 1800s) until the coredns and hostpath-provisioner deployments
# report available, then wait for the calico-node daemonset rollout.
echo "Waiting for microk8s addons to become ready..."
sudo microk8s.kubectl wait \
--for=condition=available \
--timeout 1800s \
-n kube-system \
deployment/coredns \
deployment/hostpath-provisioner
sudo microk8s.kubectl -n kube-system rollout status ds/calico-node

# This is needed to overcome the following bug within microk8s:
# https://github.com/canonical/microk8s/issues/4453
echo -e "\nStep 3/5: Installing kubectl snap"
sudo snap install kubectl --classic --channel=1.29/stable
# hack as redirecting stdout anywhere but /dev/null throws a permission denied error
# see: https://forum.snapcraft.io/t/eksctl-cannot-write-to-stdout/17254/4
# NOTE(review): without pipefail the pipeline status is tee's, so a failing
# `config view` would not stop the script here.
sudo microk8s.kubectl config view --raw | tee $HOME/.kube/config > /dev/null

# intel_gpu_top command used for host-level GPU check
# jq used for cases where jsonpath is insufficient for parsing json results
echo -e "\nStep 4/5: Installing intel-gpu-tools"
DEBIAN_FRONTEND=noninteractive sudo apt install -y intel-gpu-tools jq

echo -e "\nStep 5/5: Installing data-science-stack snap"
# Only the exact first argument "--dss-snap-channel=latest/edge" selects the
# edge channel; any other value (or no argument) installs from latest/stable.
optional_arg=$1
if [ "${optional_arg}" = "--dss-snap-channel=latest/edge" ]; then
echo "Installing from edge"
sudo snap install data-science-stack --channel latest/edge
else
echo "Installing from stable"
sudo snap install data-science-stack --channel latest/stable
fi
# Default snap channels. main() replaces each value when the matching
# --dss-snap-channel / --microk8s-snap-channel / --kubectl-snap-channel option
# is given, and help_function() prints the current values in its usage line.
dss_snap_channel="latest/stable"
microk8s_snap_channel="1.28/stable"
kubectl_snap_channel="1.29/stable"

#######################################
# Install the microk8s snap and bring it to a state the tests can use:
# add the snap user to the microk8s group, create the .kube directory,
# enable the hostpath-storage, dns and rbac addons and wait for them.
# Globals:
#   SNAP_UID       (read) uid whose user name is added to the microk8s group
#   SNAP_REAL_HOME (read) directory in which .kube is created
# Arguments:
#   $1 - snap channel to install microk8s from; required
# Outputs:
#   progress messages on stdout
# Returns:
#   non-zero if the user name cannot be resolved; every other failure is
#   left to the script-wide `set -e` to abort on.
#######################################
setup_microk8s_snap() {
    local channel="${1:?setup_microk8s_snap: snap channel argument is required}"
    local snap_user
    local addon

    echo -e "\nInstalling microk8s snap from channel ${channel}"
    sudo snap install microk8s --channel "${channel}" --classic

    # Assigned separately from its `local` declaration so that a failing
    # `id` is not hidden behind the exit status of `local`.
    snap_user=$(id -nu "${SNAP_UID}") || return 1

    # microk8s commands run from tests are run without sudo
    sudo usermod -a -G microk8s "${snap_user}"
    # Directory needed for sharing microk8s config with kubectl snap
    mkdir -p "${SNAP_REAL_HOME}/.kube"

    echo -e "\nConfiguring microk8s addons"
    sudo microk8s status --wait-ready
    # Give microk8s another minute to stabilize
    # to avoid intermittent failures when
    # enabling hostpath-storage
    echo "Giving microk8s a minute to stabilize..."
    sleep 60
    # Addons are enabled one at a time, in this order.
    for addon in hostpath-storage dns rbac; do
        sudo microk8s enable "${addon}"
    done

    echo "Waiting for microk8s addons to become ready..."
    sudo microk8s.kubectl wait \
        --for=condition=available \
        --timeout 1800s \
        -n kube-system \
        deployment/coredns \
        deployment/hostpath-provisioner
    sudo microk8s.kubectl -n kube-system rollout status ds/calico-node
}

#######################################
# Install the kubectl snap and export the microk8s kubeconfig to
# ${SNAP_REAL_HOME}/.kube/config so that kubectl can reach the cluster.
# Globals:
#   SNAP_REAL_HOME (read) directory holding the .kube/config to write
# Arguments:
#   $1 - snap channel to install kubectl from; required
# Outputs:
#   progress message on stdout, error message on stderr
# Returns:
#   1 if the microk8s config cannot be read (the existing config file is
#   then left untouched); otherwise the status of writing the file.
#######################################
setup_kubectl_snap() {
    local channel="${1:?setup_kubectl_snap: snap channel argument is required}"
    local kubeconfig

    # This is needed to overcome the following bug within microk8s:
    # https://github.com/canonical/microk8s/issues/4453
    echo -e "\nInstalling kubectl snap from channel ${channel}"
    sudo snap install kubectl --classic --channel="${channel}"

    # Read the config before touching the target file: in a plain
    # `config view | tee` pipeline without pipefail the exit status is tee's,
    # so a failing `config view` would silently leave an empty kubeconfig.
    if ! kubeconfig=$(sudo microk8s.kubectl config view --raw); then
        echo "Failed to read the microk8s kubectl config" >&2
        return 1
    fi

    # hack as redirecting stdout anywhere but /dev/null throws a permission denied error
    # see: https://forum.snapcraft.io/t/eksctl-cannot-write-to-stdout/17254/4
    printf '%s\n' "${kubeconfig}" | tee "${SNAP_REAL_HOME}/.kube/config" >/dev/null
}

# Print a one-line description followed by the usage line. The usage line
# shows the current values of the three channel variables as the defaults.
help_function() {
    printf '%s\n' \
        "This script is used install all dependencies for checkbox-dss to run; defaults for optional arguments are shown in usage" \
        "Usage: checkbox-dss.install-deps [--dss-snap-channel ${dss_snap_channel}] [--microk8s-snap-channel ${microk8s_snap_channel}] [--kubectl-snap-channel ${kubectl_snap_channel}]"
}

#######################################
# Parse the command line, then install every dependency in order:
# microk8s, kubectl, host tools (intel-gpu-tools, jq), data-science-stack.
# Globals:
#   dss_snap_channel, microk8s_snap_channel, kubectl_snap_channel (written
#   when the matching option is given, read when installing)
# Arguments:
#   [--dss-snap-channel CHANNEL] [--microk8s-snap-channel CHANNEL]
#   [--kubectl-snap-channel CHANNEL] [-h|--help]
#   Each channel option also accepts the --option=CHANNEL spelling.
# Outputs:
#   progress on stdout; usage on stdout for --help, on stderr for errors
# Returns:
#   exits 0 after --help, exits 1 on an unknown argument or a missing value
#######################################
main() {
    while [ $# -ne 0 ]; do
        case "$1" in
            --*=*)
                # Normalize --option=value into "--option value" and parse
                # again, so both spellings share the arms below.
                set -- "${1%%=*}" "${1#*=}" "${@:2}"
                continue
                ;;
            --dss-snap-channel | --microk8s-snap-channel | --kubectl-snap-channel)
                # Without this check `shift 2` fails silently under `set -e`.
                if [ $# -lt 2 ] || [ -z "$2" ]; then
                    echo "Missing value for option $1" >&2
                    help_function >&2
                    exit 1
                fi
                case "$1" in
                    --dss-snap-channel) dss_snap_channel="$2" ;;
                    --microk8s-snap-channel) microk8s_snap_channel="$2" ;;
                    --kubectl-snap-channel) kubectl_snap_channel="$2" ;;
                esac
                shift 2
                ;;
            -h | --help)
                help_function
                exit 0
                ;;
            *)
                # Must leave the loop: nothing is shifted here, so merely
                # printing the help would spin on the same argument forever.
                echo "Unknown argument: $1" >&2
                help_function >&2
                exit 1
                ;;
        esac
    done

    echo -e "\n Step 1/4: Setting up microk8s"
    setup_microk8s_snap "$microk8s_snap_channel"

    echo -e "\n Step 2/4: Setting up kubectl"
    setup_kubectl_snap "$kubectl_snap_channel"

    # intel_gpu_top command used for host-level GPU check
    # jq used for cases where jsonpath is insufficient for parsing json results
    echo -e "\nStep 3/4: Installing intel-gpu-tools"
    DEBIAN_FRONTEND=noninteractive sudo apt install -y intel-gpu-tools jq

    echo -e "\nStep 4/4: Installing data-science-stack snap from channel $dss_snap_channel"
    sudo snap install data-science-stack --channel "$dss_snap_channel"
}

main "$@"
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env -S checkbox-cli-wrapper remote 127.0.0.1
#!/usr/bin/env -S checkbox-cli-wrapper control 127.0.0.1
[launcher]
app_id = com.canonical.contrib.dss-validation:checkbox
launcher_version = 1
Expand All @@ -14,5 +14,5 @@ forced = yes
[ui]
type = silent
auto_retry = yes
max_attempts = 10
max_attempts = 2
delay_before_retry = 10
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/usr/bin/env bash

set -euxo pipefail

# Enable the microk8s NVIDIA GPU addon (operator driver, pinned operator
# version) and wait, with a fixed pause before each check, for three
# daemonsets in the gpu-operator-resources namespace to finish rolling out:
# node feature discovery worker, device plugin, operator validator.
check_nvidia_gpu_addon_can_be_enabled() {
    local -a rollout_status=(microk8s.kubectl -n gpu-operator-resources rollout status)

    # TODO: enable changing GPU_OPERATOR_VERSION
    GPU_OPERATOR_VERSION=24.6.2
    SLEEP_SECS=10

    printf '%s\n' "[INFO]: enabling the NVIDIA GPU addon"
    sudo microk8s enable gpu --driver=operator --version="${GPU_OPERATOR_VERSION}"

    printf '%s\n' "[INFO]: sleeping for ${SLEEP_SECS} seconds before checking GPU feature discovery has rolled out."
    sleep "${SLEEP_SECS}"
    "${rollout_status[@]}" ds/gpu-operator-node-feature-discovery-worker

    printf '%s\n' "[INFO]: sleeping for ${SLEEP_SECS} seconds before checking if daemonsets have rolled out."
    sleep "${SLEEP_SECS}"
    "${rollout_status[@]}" ds/nvidia-device-plugin-daemonset

    printf '%s\n' "[INFO]: sleeping for ${SLEEP_SECS} seconds before checking GPU validations have rolled out."
    sleep "${SLEEP_SECS}"
    printf '%s\n' "[INFO]: Waiting for the GPU validations to rollout"
    "${rollout_status[@]}" ds/nvidia-operator-validator

    printf '%s\n' "Test success: NVIDIA GPU addon enabled."
}

# After a short pause, read the nvidia-operator-validator container logs
# (pods labelled app=nvidia-operator-validator in gpu-operator-resources)
# and pass only when they are exactly the validator's success message.
# Exits the script with status 1, reporting the logs on stderr, otherwise.
check_nvidia_gpu_validations_succeed() {
    local wanted="all validations are successful"

    SLEEP_SECS=5
    printf '%s\n' "[INFO]: sleeping for ${SLEEP_SECS} seconds before checking if GPU validations were successful."
    sleep "${SLEEP_SECS}"

    result=$(microk8s.kubectl logs -n gpu-operator-resources -lapp=nvidia-operator-validator -c nvidia-operator-validator)

    # Guard clause: anything other than the exact success line is a failure.
    if [ "${result}" != "${wanted}" ]; then
        printf '%s\n' "Test failure: NVIDIA GPU validations were not successful, got ${result}" >&2
        exit 1
    fi

    printf '%s\n' "Test success: NVIDIA GPU validations were successful!"
}

# Print what this script is for, how to invoke it, and the test cases that
# main() accepts together with the function each one runs.
help_function() {
    echo "This script is used for tests related to CUDA"
    # Derive the name from $0: the previous hard-coded "check_dss.sh" named a
    # different script than the one these NVIDIA/CUDA test cases live in.
    echo "Usage: ${0##*/} <test_case>"
    echo
    echo "Test cases currently implemented:"
    echo -e "\t<gpu_addon_can_be_enabled>: check_nvidia_gpu_addon_can_be_enabled"
    echo -e "\t<gpu_validations_succeed>: check_nvidia_gpu_validations_succeed"
}

#######################################
# Dispatch the requested test case to its check function.
# Arguments:
#   $1 - test case name (gpu_addon_can_be_enabled | gpu_validations_succeed)
#        or -h/--help
# Outputs:
#   usage on stdout for -h/--help, on stderr for a missing/unknown test case
# Returns:
#   the status of the selected check; exits 1 when the test case is missing
#   or unknown, so a mistyped name cannot be reported as a passing test.
#######################################
main() {
    # ${1:-} keeps `set -u` from aborting with "unbound variable" when the
    # script is called without arguments; that case falls through to usage.
    case "${1:-}" in
        gpu_addon_can_be_enabled) check_nvidia_gpu_addon_can_be_enabled ;;
        gpu_validations_succeed) check_nvidia_gpu_validations_succeed ;;
        -h | --help) help_function ;;
        *)
            help_function >&2
            exit 1
            ;;
    esac
}

main "$@"
Loading