[autotest-client-tests][PATCH 1/1] UBUNTU: SAUCE: ubuntu_nvidia_server_driver: create nvidia-fs module test
Taihsiang Ho (tai271828)
taihsiang.ho at canonical.com
Mon Jan 24 21:24:29 UTC 2022
The goal of this test is to confirm that the nvidia-fs module continues to
build and work properly with new kernel updates.
The environment in which this test runs requires several pieces of
third-party software, including third-party kernel modules that require
a reboot after installation. To avoid having to handle reboots of the
test client, we instead run the test inside a virtual machine that the
test client can spin up and reboot on its own. The actual nvidia-fs test
runs in a docker container inside that virtual machine.
The test is kicked off by running 01-run-test.sh, which runs each of the
other scripts in turn to set up the virtual machine and the test docker
container within it.
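As a rough sketch, the test can also be exercised by hand on a suitably
equipped host (an NVIDIA GPU plus an unused whole NVMe device; paths are
illustrative):

    cd ubuntu_nvidia_server_driver/nvidia-fs
    ./01-run-test.sh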
Signed-off-by: Taihsiang Ho (tai271828) <taihsiang.ho at canonical.com>
---
ubuntu_nvidia_server_driver/control | 1 +
ubuntu_nvidia_server_driver/nvidia-fs/00-vars | 11 ++
.../nvidia-fs/01-run-test.sh | 156 ++++++++++++++++++
.../nvidia-fs/02-inside-vm-update-kernel.sh | 52 ++++++
.../nvidia-fs/03-inside-vm-install-drivers.sh | 39 +++++
.../04-inside-vm-setup-docker-and-run-test.sh | 41 +++++
.../nvidia-fs/05-inside-docker-run-test.sh | 38 +++++
ubuntu_nvidia_server_driver/nvidia-fs/README | 17 ++
.../nvidia-fs/a-c-t-entry.sh | 10 ++
.../ubuntu_nvidia_server_driver.py | 10 ++
10 files changed, 375 insertions(+)
create mode 100644 ubuntu_nvidia_server_driver/nvidia-fs/00-vars
create mode 100755 ubuntu_nvidia_server_driver/nvidia-fs/01-run-test.sh
create mode 100755 ubuntu_nvidia_server_driver/nvidia-fs/02-inside-vm-update-kernel.sh
create mode 100755 ubuntu_nvidia_server_driver/nvidia-fs/03-inside-vm-install-drivers.sh
create mode 100755 ubuntu_nvidia_server_driver/nvidia-fs/04-inside-vm-setup-docker-and-run-test.sh
create mode 100755 ubuntu_nvidia_server_driver/nvidia-fs/05-inside-docker-run-test.sh
create mode 100644 ubuntu_nvidia_server_driver/nvidia-fs/README
create mode 100755 ubuntu_nvidia_server_driver/nvidia-fs/a-c-t-entry.sh
diff --git a/ubuntu_nvidia_server_driver/control b/ubuntu_nvidia_server_driver/control
index 2c3f2510..3f6f2323 100644
--- a/ubuntu_nvidia_server_driver/control
+++ b/ubuntu_nvidia_server_driver/control
@@ -10,3 +10,4 @@ Perform testing of Nvidia server drivers
"""
job.run_test_detail('ubuntu_nvidia_server_driver', test_name='load', tag='load', timeout=600)
+job.run_test_detail('ubuntu_nvidia_server_driver', test_name='nvidia-fs', tag='nvidia-fs', timeout=1500)
diff --git a/ubuntu_nvidia_server_driver/nvidia-fs/00-vars b/ubuntu_nvidia_server_driver/nvidia-fs/00-vars
new file mode 100644
index 00000000..ad86f46e
--- /dev/null
+++ b/ubuntu_nvidia_server_driver/nvidia-fs/00-vars
@@ -0,0 +1,11 @@
+# shellcheck shell=bash
+# shellcheck disable=SC2034
+KERNEL_FLAVOR="generic"
+CUDA_CONTAINER_NAME="nvcr.io/nvidia/cuda"
+NVIDIA_BRANCH="470-server"
+LXD_INSTANCE="nvidia-fs-test"
+MLNX_REPO="https://linux.mellanox.com/public/repo/mlnx_ofed"
+MLNX_OFED_VER="5.4-1.0.3.0"
+if [ -f 00-vars.gen ]; then
+ source ./00-vars.gen
+fi
diff --git a/ubuntu_nvidia_server_driver/nvidia-fs/01-run-test.sh b/ubuntu_nvidia_server_driver/nvidia-fs/01-run-test.sh
new file mode 100755
index 00000000..1db631af
--- /dev/null
+++ b/ubuntu_nvidia_server_driver/nvidia-fs/01-run-test.sh
@@ -0,0 +1,156 @@
+#!/usr/bin/env bash
+
+set -e
+set -x
+
+shopt -s nullglob
+
+rm -f 00-vars.gen # avoid stale configs from previous runs
+source 00-vars
+source ../nvidia-module-lib
+
+sudo apt install -y jq xmlstarlet
+
+driver_recommended_cuda_version() {
+ local xmlout
+ xmlout="$(mktemp)"
+
+ sudo nvidia-smi -q -u -x --dtd | tee "$xmlout" > /dev/null
+ xmlstarlet sel -t -v "/nvidia_smi_log/cuda_version" < "$xmlout"
+ rm -f "$xmlout"
+}
+
+find_latest_cuda_container_tag_by_branch() {
+ local branch="$1" # e.g. 11.4
+ source ./00-vars.gen # pick up LXD_OS_VER
+
+ # List all of the available nvidia cuda image tags, filter for
+    # devel/ubuntu images that match our cuda x.y, and version-sort
+    # to find the newest minor (x.y.z) version.
+ #
+ # Output is paginated by default. To get all the items in one go,
+ # set a page_size greater than the likely number of items (1024)
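+    #
+    # Example (tag names assumed): for branch "11.4" on a 20.04 host this
+    # prints the highest matching tag, e.g. "11.4.2-devel-ubuntu20.04".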
+ curl -L -s \
+ 'https://registry.hub.docker.com/v2/repositories/nvidia/cuda/tags?page_size=1024' | \
+ jq '."results"[]["name"]' | \
+ tr -d \" | \
+ grep -E "^${branch}(\.[0-9]+)*-devel-ubuntu${LXD_OS_VER}$" | \
+    sort -V | tail -1
+}
+
+gen_vars() {
+ local cuda_branch
+ local container_tag
+
+ # Match the host OS
+ echo "LXD_OS_CODENAME=$(lsb_release -cs)" > 00-vars.gen
+ echo "LXD_OS_VER=$(lsb_release -rs)" >> 00-vars.gen
+ cuda_branch="$(driver_recommended_cuda_version)"
+ container_tag="$(find_latest_cuda_container_tag_by_branch "$cuda_branch")"
+ echo "CUDA_BRANCH=${cuda_branch}" >> 00-vars.gen
+ echo "CUDA_CONTAINER_TAG=${container_tag}" >> 00-vars.gen
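+    # Resulting 00-vars.gen might look like (values assumed):
+    #   LXD_OS_CODENAME=focal
+    #   LXD_OS_VER=20.04
+    #   CUDA_BRANCH=11.4
+    #   CUDA_CONTAINER_TAG=11.4.2-devel-ubuntu20.04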
+}
+
+lxd_wait() {
+ local instance="$1"
+
+    for _ in $(seq 300); do
+        if lxc exec "${instance}" -- /bin/true; then
+            return 0
+        fi
+        sleep 1
+    done
+    echo "ERROR: timed out waiting for ${instance}" 1>&2
+    return 1
+}
+
+is_whole_nvme_dev() {
+ local dev
+ dev="$(basename "$1")"
+ echo "$dev" | grep -Eq '^nvme[0-9]+n[0-9]+$'
+}
+
+find_free_nvme() {
+ local dev
+ local children
+ command -v jq > /dev/null || sudo apt install -y jq 1>&2
+ for dev in /dev/nvme*; do
+ is_whole_nvme_dev "$dev" || continue
+ # Is this device used by another kernel device (RAID/LVM/etc)?
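+        # e.g. (assumed) an NVMe disk used as an LVM PV or RAID member
+        # reports a "children" array here; a bare disk reports null.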
+ children=$(lsblk -J "$dev" | jq '.["blockdevices"][0]."children"')
+ if [ "$children" = "null" ]; then
+ echo "$dev"
+ return 0
+ fi
+ done
+ return 1
+}
+
+nvme_dev_to_bdf() {
+ local dev="$1"
+ local bdf=""
+
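+    # Walk the sysfs path of the block device; the last component that
+    # looks like a PCI address (assumed example: "0000:05:00.0") is the
+    # NVMe controller's bus:device.function.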
+ while read -r comp; do
+ if echo "$comp" | grep -q -E '^[0-9a-f]{4}:[0-9a-f]{2}:[0-9a-f]{2}\.[0-9a-f]$'; then
+ bdf="$comp"
+ fi
+ done <<<"$(readlink /sys/block/"$(basename "$dev")" | tr / '\n')"
+ if [ -z "$bdf" ]; then
+        echo "ERROR: nvme_dev_to_bdf: No PCI address found for $dev" 1>&2
+ return 1
+ fi
+ echo "$bdf"
+ return 0
+}
+
+gen_vars
+source ./00-vars.gen
+
+# 20.04 installs currently get LXD 4.0.7 by default, but we need at least
+# 4.11 for PCI passthrough support for VMs. latest/stable is new enough.
+sudo snap refresh lxd --channel=latest/stable
+sudo lxd init --auto
+lxc delete --force "$LXD_INSTANCE" || :
+
+# FIXME: Should probably dynamically adapt cpu/memory based on host system
+lxc launch --vm "ubuntu:${LXD_OS_CODENAME}" "$LXD_INSTANCE" \
+ -t c48-m16 \
+ -c security.secureboot=false # so we can load untrusted modules
+
+# Ran out of space pulling the docker image w/ the default 10GB. Double it.
+lxc config device override "${LXD_INSTANCE}" root size=20GB
+lxd_wait "${LXD_INSTANCE}"
+
+for file in 00-vars 00-vars.gen 02-inside-vm-update-kernel.sh \
+        03-inside-vm-install-drivers.sh \
+        04-inside-vm-setup-docker-and-run-test.sh \
+        05-inside-docker-run-test.sh; do
+    lxc file push "${file}" "${LXD_INSTANCE}/root/${file}"
+done
+lxc exec "${LXD_INSTANCE}" -- /root/02-inside-vm-update-kernel.sh
+
+# Reboot to switch to updated kernel, so new drivers will build for it
+lxc stop "${LXD_INSTANCE}"
+
+# Release GPU devices so we can assign them to a VM
+sudo service nvidia-fabricmanager stop || :
+recursive_remove_module nvidia
+
+## Pass in devices. Note: devices can be assigned only while VM is stopped
+
+# Any Nvidia GPU will do, just grab the first one we find
+gpuaddr="$(lspci | grep '3D controller: NVIDIA Corporation' | cut -d' ' -f1 | head -1)"
+lxc config device add "${LXD_INSTANCE}" gpu pci "address=${gpuaddr}"
+
+# Find an unused NVMe device to pass in
+nvmedev=$(find_free_nvme) || \
+    { echo "ERROR: No unused nvme device found" 1>&2; exit 1; }
+nvmeaddr="$(nvme_dev_to_bdf "$nvmedev")" || \
+    { echo "ERROR: No PCI device found for $nvmedev" 1>&2; exit 1; }
+lxc config device add "${LXD_INSTANCE}" nvme pci "address=${nvmeaddr}"
+
+lxc start "${LXD_INSTANCE}"
+lxd_wait "${LXD_INSTANCE}"
+lxc exec "${LXD_INSTANCE}" -- /root/03-inside-vm-install-drivers.sh
+
+# Reboot to switch to new overridden drivers
+lxc stop "${LXD_INSTANCE}"
+lxc start "${LXD_INSTANCE}"
+
+lxd_wait "${LXD_INSTANCE}"
+lxc exec "${LXD_INSTANCE}" -- /root/04-inside-vm-setup-docker-and-run-test.sh
diff --git a/ubuntu_nvidia_server_driver/nvidia-fs/02-inside-vm-update-kernel.sh b/ubuntu_nvidia_server_driver/nvidia-fs/02-inside-vm-update-kernel.sh
new file mode 100755
index 00000000..914cf795
--- /dev/null
+++ b/ubuntu_nvidia_server_driver/nvidia-fs/02-inside-vm-update-kernel.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+
+set -e
+set -x
+
+source ./00-vars
+
+export DEBIAN_FRONTEND="noninteractive"
+export DEBIAN_PRIORITY="critical"
+
+enable_proposed() {
+ local arch
+ local release
+ local mirror
+    local components
+ arch="$(dpkg --print-architecture)"
+ release="$(lsb_release -cs)"
+    components="main restricted universe multiverse"
+
+ case $arch in
+ i386|amd64)
+ mirror="http://archive.ubuntu.com/ubuntu"
+ ;;
+ *)
+ mirror="http://ports.ubuntu.com/ubuntu-ports"
+ ;;
+ esac
+
+    echo "deb $mirror ${release}-proposed $components" | \
+        sudo tee "/etc/apt/sources.list.d/${release}-proposed.list" > /dev/null
+    echo "deb-src $mirror ${release}-proposed $components" | \
+        sudo tee -a "/etc/apt/sources.list.d/${release}-proposed.list" > /dev/null
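+    # Resulting file on a focal/amd64 host (values assumed):
+    #   deb http://archive.ubuntu.com/ubuntu focal-proposed main restricted universe multiverse
+    #   deb-src http://archive.ubuntu.com/ubuntu focal-proposed main restricted universe multiverse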
+}
+
+enable_proposed
+apt update
+apt install -y linux-"${KERNEL_FLAVOR}" \
+ linux-modules-nvidia-"${NVIDIA_BRANCH}"-"${KERNEL_FLAVOR}" \
+ nvidia-kernel-source-"${NVIDIA_BRANCH}" \
+ nvidia-utils-"${NVIDIA_BRANCH}"
+
+# Find the latest kernel version that matches our flavor and create "-test"
+# symlinks to it since they will sort highest, making it the default
+kver=$(linux-version list | grep -- "-${KERNEL_FLAVOR}$" | \
+ linux-version sort --reverse | head -1)
+ln -s "vmlinuz-${kver}" /boot/vmlinuz-test
+ln -s "initrd.img-${kver}" /boot/initrd.img-test
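+# e.g. (version assumed) /boot/vmlinuz-test -> vmlinuz-5.15.0-24-generic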
+
+# Workaround LP: #1849563
+echo "GRUB_CMDLINE_LINUX_DEFAULT=\"\$GRUB_CMDLINE_LINUX_DEFAULT pci=nocrs pci=realloc\"" > /etc/default/grub.d/99-nvidia-fs-test.cfg
+
+update-grub
diff --git a/ubuntu_nvidia_server_driver/nvidia-fs/03-inside-vm-install-drivers.sh b/ubuntu_nvidia_server_driver/nvidia-fs/03-inside-vm-install-drivers.sh
new file mode 100755
index 00000000..9d12ddc6
--- /dev/null
+++ b/ubuntu_nvidia_server_driver/nvidia-fs/03-inside-vm-install-drivers.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+
+set -e
+set -x
+
+source ./00-vars
+
+export DEBIAN_FRONTEND="noninteractive"
+export DEBIAN_PRIORITY="critical"
+
+# Remove headers for all kernels except the one running so DKMS does not
+# try to build modules against them. Other kernels may not be compatible
+# with our modules, and we don't want the install to fail because of that.
+# We need to do this twice because apt will avoid removing a metapackage
+# (e.g. linux-kvm) if it can instead upgrade it, which may pull in a new
+# headers package. If that happens, the second pass removes that updated
+# headers package as well as the metapackage(s) that brought it in.
+for _ in 1 2; do
+ for file in /lib/modules/*/build; do
+ if [ "$file" = "/lib/modules/$(uname -r)/build" ]; then
+ continue
+ fi
+        dpkg -S "$file" | cut -d":" -f1 | tr -d ',' | \
+            xargs apt remove --purge -y
+ done
+done
+
+# Install MOFED stack
+wget -qO - https://www.mellanox.com/downloads/ofed/RPM-GPG-KEY-Mellanox | \
+ apt-key add -
+wget -qO - "${MLNX_REPO}/${MLNX_OFED_VER}/ubuntu${LXD_OS_VER}/mellanox_mlnx_ofed.list" | tee /etc/apt/sources.list.d/mellanox_mlnx_ofed.list
+apt update
+apt install -y mlnx-ofed-all mlnx-nvme-dkms mlnx-nfsrdma-dkms
+
+# Install nvidia-fs module
+cuda_os="ubuntu$(echo "$LXD_OS_VER" | tr -d .)"
+apt-key adv --fetch-keys "https://developer.download.nvidia.com/compute/cuda/repos/${cuda_os}/x86_64/7fa2af80.pub"
+add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/${cuda_os}/x86_64/ /"
+apt install -y nvidia-fs-dkms
+add-apt-repository -r "deb https://developer.download.nvidia.com/compute/cuda/repos/${cuda_os}/x86_64/ /"
diff --git a/ubuntu_nvidia_server_driver/nvidia-fs/04-inside-vm-setup-docker-and-run-test.sh b/ubuntu_nvidia_server_driver/nvidia-fs/04-inside-vm-setup-docker-and-run-test.sh
new file mode 100755
index 00000000..17cb5ddb
--- /dev/null
+++ b/ubuntu_nvidia_server_driver/nvidia-fs/04-inside-vm-setup-docker-and-run-test.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+
+set -e
+set -x
+
+source ./00-vars
+
+install_nvidia_docker() {
+ local distribution
+ distribution="$(. /etc/os-release;echo "$ID$VERSION_ID")"
+ curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
+ curl -s -L "https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list" | \
+ sudo tee /etc/apt/sources.list.d/nvidia-docker.list > /dev/null
+ sudo apt update
+    sudo apt install -y nvidia-docker2
+ sudo systemctl restart docker
+}
+
+umount /mnt/nvme || true
+parted -s /dev/nvme0n1 -- mklabel gpt
+parted -s /dev/nvme0n1 -- mkpart primary ext4 0 100%
+udevadm settle
+mkfs.ext4 -F "/dev/nvme0n1p1"
+mkdir -p /mnt/nvme
+mount "/dev/nvme0n1p1" /mnt/nvme -o data=ordered
+
+modprobe nvidia-fs
+
+install_nvidia_docker
+
+container="${CUDA_CONTAINER_NAME}:${CUDA_CONTAINER_TAG}"
+
+docker pull "${container}"
+docker run --rm --ipc host --name test_gds --gpus device=all \
+ --volume /run/udev:/run/udev:ro \
+ --volume /sys/kernel/config:/sys/kernel/config/ \
+ --volume /dev:/dev:ro \
+ --volume /mnt/nvme:/data/:rw \
+ --volume /root:/root/:ro \
+ --privileged "${container}" \
+ bash -c 'cd /root && ./05-inside-docker-run-test.sh'
diff --git a/ubuntu_nvidia_server_driver/nvidia-fs/05-inside-docker-run-test.sh b/ubuntu_nvidia_server_driver/nvidia-fs/05-inside-docker-run-test.sh
new file mode 100755
index 00000000..652bb558
--- /dev/null
+++ b/ubuntu_nvidia_server_driver/nvidia-fs/05-inside-docker-run-test.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+
+set -e
+set -x
+
+source ./00-vars
+
+# We want e.g. gds-tools-11-4 if using CUDA 11.4
+gds_tools="gds-tools-$(echo "$CUDA_BRANCH" | tr "." "-")"
+
+apt update
+apt install "$gds_tools" libssl-dev -y
+cd /usr/local/cuda/gds/samples
+make -j "$(nproc)"
+dd status=none if=/dev/urandom of=/data/file1 iflag=fullblock bs=1M count=1024
+dd status=none if=/dev/urandom of=/data/file2 iflag=fullblock bs=1M count=1024
+
+# Edit cufile.json and set the "allow_compat_mode" property to false.
+sed -i 's/"allow_compat_mode": true,/"allow_compat_mode": false,/' /etc/cufile.json
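+# With compat mode disabled, the samples must perform real GDS I/O via
+# nvidia-fs instead of silently falling back to POSIX read/write.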
+
+echo "sample 1"
+./cufile_sample_001 /data/file1 0
+echo "sample 2"
+./cufile_sample_002 /data/file1 0
+echo "sample 3"
+./cufile_sample_003 /data/file1 /data/file2 0
+echo "sample 4"
+./cufile_sample_004 /data/file1 /data/file2 0
+echo "sample 5"
+./cufile_sample_005 /data/file1 /data/file2 0
+echo "sample 6"
+./cufile_sample_006 /data/file1 /data/file2 0
+echo "sample 7"
+./cufile_sample_007 0
+echo "sample 8"
+./cufile_sample_008 0
+echo "sample 14"
+./cufile_sample_014 /data/file1 /data/file2 0
diff --git a/ubuntu_nvidia_server_driver/nvidia-fs/README b/ubuntu_nvidia_server_driver/nvidia-fs/README
new file mode 100644
index 00000000..fb68ce75
--- /dev/null
+++ b/ubuntu_nvidia_server_driver/nvidia-fs/README
@@ -0,0 +1,17 @@
+= nvidia-fs testing =
+The goal of this test is to confirm that the nvidia-fs module continues to
+build and work properly with new kernel updates.
+
+The environment in which this test runs requires several pieces of
+third-party software, including third-party kernel modules that require
+a reboot after installation. To avoid having to handle reboots of the
+test client, we instead run the test inside a virtual machine that the
+test client can spin up and reboot on its own. The actual nvidia-fs test
+runs in a docker container inside that virtual machine.
+
+The test is kicked off by running 01-run-test.sh, which runs each of the
+other scripts in turn to set up the virtual machine and the test docker
+container within it.
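+
+To run the test by hand for debugging (assumptions: an NVIDIA GPU, an
+unused whole NVMe device, and LXD available on the host):
+
+    cd ubuntu_nvidia_server_driver/nvidia-fs
+    ./01-run-test.sh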
diff --git a/ubuntu_nvidia_server_driver/nvidia-fs/a-c-t-entry.sh b/ubuntu_nvidia_server_driver/nvidia-fs/a-c-t-entry.sh
new file mode 100755
index 00000000..9c535270
--- /dev/null
+++ b/ubuntu_nvidia_server_driver/nvidia-fs/a-c-t-entry.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+
+# Without -e, a failing 01-run-test.sh would be masked by popd's exit
+# status and the test would falsely pass.
+set -e
+
+run_test() {
+ exe_dir=$(dirname "${BASH_SOURCE[0]}")
+ pushd "${exe_dir}"
+ ./01-run-test.sh
+ popd
+}
+
+run_test
diff --git a/ubuntu_nvidia_server_driver/ubuntu_nvidia_server_driver.py b/ubuntu_nvidia_server_driver/ubuntu_nvidia_server_driver.py
index d0c667ae..6a6f4c53 100644
--- a/ubuntu_nvidia_server_driver/ubuntu_nvidia_server_driver.py
+++ b/ubuntu_nvidia_server_driver/ubuntu_nvidia_server_driver.py
@@ -19,6 +19,10 @@ class ubuntu_nvidia_server_driver(test.test):
cmd = "{} test".format(sh_executable)
utils.system(cmd)
+ def run_nvidia_fs_in_lxc(self):
+        cmd = os.path.join(p_dir, "nvidia-fs", "a-c-t-entry.sh")
+ utils.system(cmd)
+
def run_once(self, test_name):
if test_name == "load":
self.compare_kernel_modules()
@@ -26,6 +30,12 @@ class ubuntu_nvidia_server_driver(test.test):
print("")
print("{} has run.".format(test_name))
+ elif test_name == "nvidia-fs":
+ self.run_nvidia_fs_in_lxc()
+
+ print("")
+ print("{} has run.".format(test_name))
+
print("")
def postprocess_iteration(self):
--
2.34.1