From 9674787e7ebaa1f44afd78f2fa36928dee259d4a Mon Sep 17 00:00:00 2001
From: Carlos Eduardo Arango Gutierrez
Date: Thu, 29 May 2025 20:47:49 +0200
Subject: [PATCH] [no-relnote] Add E2E for libnvidia-container

Signed-off-by: Carlos Eduardo Arango Gutierrez
---
 tests/e2e/Makefile                     |   8 +-
 tests/e2e/installer.go                 |  17 +-
 tests/e2e/nvidia-container-cli_test.go | 221 +++++++++++++++++++++++++
 3 files changed, 241 insertions(+), 5 deletions(-)
 create mode 100644 tests/e2e/nvidia-container-cli_test.go

diff --git a/tests/e2e/Makefile b/tests/e2e/Makefile
index 2f14fd8a..0ba369d7 100644
--- a/tests/e2e/Makefile
+++ b/tests/e2e/Makefile
@@ -20,8 +20,14 @@ LOG_ARTIFACTS_DIR ?= $(CURDIR)/e2e_logs
 
 GINKGO_BIN := $(CURDIR)/bin/ginkgo
 
+# If GINKGO_FOCUS is not set, run all tests
+# current available tests:
+# - nvidia-container-cli
+# - docker
+GINKGO_FOCUS ?=
+
 test: $(GINKGO_BIN)
-	$(GINKGO_BIN) $(GINKGO_ARGS) -v --json-report ginkgo.json ./tests/e2e/...
+	$(GINKGO_BIN) $(GINKGO_ARGS) -v --json-report ginkgo.json --focus="$(GINKGO_FOCUS)" ./tests/e2e/...
 
 $(GINKGO_BIN):
 	mkdir -p $(CURDIR)/bin
diff --git a/tests/e2e/installer.go b/tests/e2e/installer.go
index 4bd2552b..da065a12 100644
--- a/tests/e2e/installer.go
+++ b/tests/e2e/installer.go
@@ -28,11 +28,20 @@ var dockerInstallTemplate = `
 #! /usr/bin/env bash
 set -xe
 
-: ${IMAGE:={{.Image}}}
+# if the TEMP_DIR is already set, use it
+if [ -f /tmp/ctk_e2e_temp_dir.txt ]; then
+	TEMP_DIR=$(cat /tmp/ctk_e2e_temp_dir.txt)
+else
+	TEMP_DIR="/tmp/ctk_e2e.$(date +%s)_$RANDOM"
+	echo "$TEMP_DIR" > /tmp/ctk_e2e_temp_dir.txt
+fi
 
-# Create a temporary directory
-TEMP_DIR="/tmp/ctk_e2e.$(date +%s)_$RANDOM"
-mkdir -p "$TEMP_DIR"
+# if TEMP_DIR does not exist, create it
+if [ ! -d "$TEMP_DIR" ]; then
+	mkdir -p "$TEMP_DIR"
+fi
+
+: ${IMAGE:={{.Image}}}
 
 # Given that docker has an init function that checks for the existence of the
 # nvidia-container-toolkit, we need to create a symlink to the nvidia-container-runtime-hook
diff --git a/tests/e2e/nvidia-container-cli_test.go b/tests/e2e/nvidia-container-cli_test.go
new file mode 100644
index 00000000..ce182433
--- /dev/null
+++ b/tests/e2e/nvidia-container-cli_test.go
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package e2e
+
+import (
+	"context"
+	"fmt"
+	"strings"
+	"text/template"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+const (
+	// dockerDindTemplate starts a privileged Docker-in-Docker daemon that listens on
+	// {{.DockerSocket}} and bind-mounts subdirectories of {{.SharedDir}} from the host.
+	dockerDindTemplate = `docker run -d --rm --privileged \
+	-v {{.SharedDir}}/etc/docker:/etc/docker \
+	-v {{.SharedDir}}/run/nvidia:/run/nvidia \
+	-v {{.SharedDir}}/usr/local/nvidia:/usr/local/nvidia \
+	--name {{.ContainerName}} \
+	docker:dind -H unix://{{.DockerSocket}}`
+
+	// dockerToolkitTemplate runs the mounted test script from the toolkit image, sharing
+	// the volumes and the PID namespace of the Docker-in-Docker container.
+	dockerToolkitTemplate = `docker run -d --rm --privileged \
+	--volumes-from {{.DindContainerName}} \
+	--pid "container:{{.DindContainerName}}" \
+	-e RUNTIME_ARGS="--socket {{.DockerSocket}}" \
+	-v {{.TestScriptPath}}:/usr/local/bin/libnvidia-container-cli.sh \
+	--name {{.ContainerName}} \
+	{{.ToolkitImage}} /usr/local/bin/libnvidia-container-cli.sh`
+
+	// dockerDefaultConfigTemplate is the daemon.json written for the Docker-in-Docker daemon.
+	dockerDefaultConfigTemplate = `
+{
+	"registry-mirrors": ["https://mirror.gcr.io"]
+}`
+
+	// libnvidiaContainerCliTestTemplate unpacks an Ubuntu base rootfs inside new mount/PID
+	// namespaces, configures it with nvidia-container-cli and runs nvidia-smi -L in it.
+	libnvidiaContainerCliTestTemplate = `#!/usr/bin/env bash
+set -euo pipefail
+
+apt-get update -y && apt-get install -y curl gnupg2
+
+WORKDIR="$(mktemp -d)"
+ROOTFS="${WORKDIR}/rootfs"
+mkdir -p "${ROOTFS}"
+
+export WORKDIR ROOTFS # make them visible in the child shell
+
+unshare --mount --pid --fork --propagation private -- bash -eux <<'IN_NS'
+  : "${ROOTFS:?}" "${WORKDIR:?}" # abort if either is empty
+
+  # 1 Populate minimal Ubuntu base
+  curl -L http://cdimage.ubuntu.com/ubuntu-base/releases/22.04/release/ubuntu-base-22.04-base-amd64.tar.gz \
+  | tar -C "$ROOTFS" -xz
+
+  # 2 Add non-root user
+  useradd -R "$ROOTFS" -U -u 1000 -s /bin/bash nvidia
+
+  # 3 Bind-mount new root and unshare mounts
+  mount --bind "$ROOTFS" "$ROOTFS"
+  mount --make-private "$ROOTFS"
+  cd "$ROOTFS"
+
+  # 4 Minimal virtual filesystems
+  mount -t proc proc proc
+  mount -t sysfs sys sys
+  mount -t tmpfs tmp tmp
+  mount -t tmpfs run run
+
+  # 5 GPU setup
+  nvidia-container-cli --load-kmods --debug=container-cli.log \
+    configure --ldconfig=@/sbin/ldconfig.real \
+    --no-cgroups --utility --device=0 "$(pwd)"
+
+  # 6 Switch root
+  mkdir -p mnt
+  pivot_root . mnt
+  umount -l /mnt
+
+  exec nvidia-smi -L
+IN_NS
+`
+)
+
+// E2E tests for nvidia-container-cli: the test script is run from the toolkit
+// image next to a Docker-in-Docker daemon started on the test host.
+var _ = Describe("nvidia-container-cli", Ordered, ContinueOnFailure, func() {
+	var runner Runner
+	var sharedDir string
+	var dindContainerName string
+	var toolkitContainerName string
+	var dockerSocket string
+	var hostOutput string
+
+	// Create the runner, record the host GPU listing and start the Docker-in-Docker daemon
+	BeforeAll(func(ctx context.Context) {
+		runner = NewRunner(
+			WithHost(sshHost),
+			WithPort(sshPort),
+			WithSshKey(sshKey),
+			WithSshUser(sshUser),
+		)
+
+		// Setup shared directory and container names
+		sharedDir = "/tmp/nvidia-container-toolkit-test"
+		dindContainerName = "nvidia-container-toolkit-dind"
+		toolkitContainerName = "nvidia-container-toolkit-test"
+		dockerSocket = "/run/nvidia/docker.sock"
+
+		// Get host nvidia-smi output
+		var err error
+		hostOutput, _, err = runner.Run("nvidia-smi -L")
+		Expect(err).ToNot(HaveOccurred())
+
+		// Pull ubuntu image
+		_, _, err = runner.Run("docker pull ubuntu")
+		Expect(err).ToNot(HaveOccurred())
+
+		// Create shared directory structure (paths spelled out: brace expansion is not POSIX sh)
+		_, _, err = runner.Run(fmt.Sprintf("mkdir -p %[1]s/etc/docker %[1]s/run/nvidia %[1]s/usr/local/nvidia", sharedDir))
+		Expect(err).ToNot(HaveOccurred())
+
+		// Write the default Docker daemon config into the shared directory
+		createDockerConfigCmd := fmt.Sprintf("cat > %s/etc/docker/daemon.json <<'EOF'\n%s\nEOF",
+			sharedDir, dockerDefaultConfigTemplate)
+		_, _, err = runner.Run(createDockerConfigCmd)
+		Expect(err).ToNot(HaveOccurred())
+
+		// Start Docker-in-Docker container
+		tmpl, err := template.New("dockerDind").Parse(dockerDindTemplate)
+		Expect(err).ToNot(HaveOccurred())
+
+		var dindCmdBuilder strings.Builder
+		err = tmpl.Execute(&dindCmdBuilder, map[string]string{
+			"SharedDir":     sharedDir,
+			"ContainerName": dindContainerName,
+			"DockerSocket":  dockerSocket,
+		})
+		Expect(err).ToNot(HaveOccurred())
+
+		_, _, err = runner.Run(dindCmdBuilder.String())
+		Expect(err).ToNot(HaveOccurred())
+	})
+
+	AfterAll(func(ctx context.Context) {
+		// Best-effort container cleanup: both containers are started with --rm and
+		// may already be gone, so the results are deliberately discarded.
+		_, _, _ = runner.Run(fmt.Sprintf("docker rm -f %s", toolkitContainerName))
+		_, _, _ = runner.Run(fmt.Sprintf("docker rm -f %s", dindContainerName))
+
+		// Cleanup shared directory
+		_, _, err := runner.Run(fmt.Sprintf("rm -rf %s", sharedDir))
+		Expect(err).ToNot(HaveOccurred())
+	})
+
+	When("running nvidia-smi -L", Ordered, func() {
+		// NOTE(review): the description names NVIDIA_VISIBLE_DEVICES/NVIDIA_DRIVER_CAPABILITIES, but
+		// this spec sets neither; it compares nvidia-smi -L in the rootfs with the host. Confirm intent.
+		It("should support NVIDIA_VISIBLE_DEVICES and NVIDIA_DRIVER_CAPABILITIES", func(ctx context.Context) {
+			// 1. Create the test script
+			testScriptPath := fmt.Sprintf("%s/libnvidia-container-cli.sh", sharedDir)
+			createScriptCmd := fmt.Sprintf("cat > %s <<'EOF'\n%s\nEOF\nchmod +x %s",
+				testScriptPath, libnvidiaContainerCliTestTemplate, testScriptPath)
+			_, _, err := runner.Run(createScriptCmd)
+			Expect(err).ToNot(HaveOccurred())
+
+			// 2. Start the toolkit container
+			tmpl, err := template.New("dockerToolkit").Parse(dockerToolkitTemplate)
+			Expect(err).ToNot(HaveOccurred())
+
+			var toolkitCmdBuilder strings.Builder
+			err = tmpl.Execute(&toolkitCmdBuilder, map[string]string{
+				"DindContainerName": dindContainerName,
+				"ContainerName":     toolkitContainerName,
+				"DockerSocket":      dockerSocket,
+				"TestScriptPath":    testScriptPath,
+				"ToolkitImage":      imageName + ":" + imageTag,
+			})
+			Expect(err).ToNot(HaveOccurred())
+
+			_, _, err = runner.Run(toolkitCmdBuilder.String())
+			Expect(err).ToNot(HaveOccurred())
+
+			// 3. Wait for and verify the output. The rootfs is configured with
+			// --device=0 and only the last log line is inspected, so compare against
+			// the first line of the host listing rather than the whole of it.
+			expected, _, _ := strings.Cut(strings.TrimSpace(strings.ReplaceAll(hostOutput, "\r", "")), "\n")
+			Eventually(func() string {
+				logs, _, err := runner.Run(fmt.Sprintf("docker logs %s | tail -n 20", toolkitContainerName))
+				if err != nil {
+					return ""
+				}
+
+				// strings.Split never returns an empty slice here, so the last element
+				// always exists; it is "" while the container has produced no output.
+				logLines := strings.Split(strings.TrimSpace(logs), "\n")
+				lastLine := logLines[len(logLines)-1]
+				return strings.TrimSpace(strings.ReplaceAll(lastLine, "\r", ""))
+			}, "5m", "5s").Should(Equal(expected))
+		})
+	})
+})