This commit is contained in:
Carlos Eduardo Arango Gutierrez 2025-04-30 17:42:44 +02:00 committed by GitHub
commit b378f08975
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 349 additions and 150 deletions

View File

@ -70,8 +70,8 @@ jobs:
- name: Run e2e tests
env:
IMAGE_NAME: ghcr.io/nvidia/container-toolkit
VERSION: ${{ inputs.version }}
E2E_IMAGE_REPO: ghcr.io/nvidia/container-toolkit
E2E_IMAGE_TAG: ${{ inputs.version }}-ubuntu20.04
SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
E2E_SSH_USER: ${{ secrets.E2E_SSH_USER }}
E2E_SSH_HOST: ${{ steps.holodeck_public_dns_name.outputs.result }}
@ -82,7 +82,14 @@ jobs:
chmod 600 "$e2e_ssh_key"
export E2E_SSH_KEY="$e2e_ssh_key"
make -f tests/e2e/Makefile test
make -f tests/e2e/Makefile test-e2e
- name: Archive Ginkgo logs
uses: actions/upload-artifact@v4
with:
name: ginkgo-logs
path: ginkgo.json
retention-days: 15
- name: Send Slack alert notification
if: ${{ failure() }}

View File

@ -1,4 +1,5 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -12,34 +13,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
GO_CMD ?= go
.PHONY: test-e2e ginkgo
include $(CURDIR)/versions.mk
GINKGO_ARGS ?=
LOG_ARTIFACTS_DIR ?= $(CURDIR)/e2e_logs
E2E_RUNTIME ?= docker
ginkgo:
mkdir -p $(CURDIR)/bin
GOBIN=$(CURDIR)/bin go install github.com/onsi/ginkgo/v2/ginkgo@latest
E2E_INSTALL_CTK ?= false
ifeq ($($(DIST)),)
DIST ?= ubuntu20.04
endif
IMAGE_TAG ?= $(VERSION)-$(DIST)
IMAGE = $(IMAGE_NAME):$(IMAGE_TAG)
E2E_SSH_KEY ?=
E2E_SSH_USER ?=
E2E_SSH_HOST ?=
E2E_SSH_PORT ?= 22
.PHONY: test
test:
cd $(CURDIR)/tests/e2e && $(GO_CMD) test -v . -args \
-ginkgo.focus="$(E2E_RUNTIME)" \
-test.timeout=1h \
-ginkgo.v \
-install-ctk=$(E2E_INSTALL_CTK) \
-toolkit-image=$(IMAGE) \
-ssh-key=$(E2E_SSH_KEY) \
-ssh-user=$(E2E_SSH_USER) \
-remote-host=$(E2E_SSH_HOST) \
-remote-port=$(E2E_SSH_PORT)
test-e2e: ginkgo
$(CURDIR)/bin/ginkgo $(GINKGO_ARGS) -v --json-report ginkgo.json ./tests/e2e/...

141
tests/e2e/README.md Normal file
View File

@ -0,0 +1,141 @@
<!--
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# NVIDIA Container Toolkit End-to-End (E2E) Test Suite
---
## 1 Scope & Goals
This repository contains a **Ginkgo v2 / Gomega** test harness that exercises an
NVIDIA Container Toolkit (CTK) installation on a **remote GPU-enabled host** via
SSH. The suite validates that:
1. CTK can be installed (or upgraded) headless (`E2E_INSTALL_CTK=true`).
2. The specified **container image** runs successfully under `nvidia-container-runtime`.
3. Errors and diagnostics are captured for post-mortem analysis.
The tests are intended for continuous-integration pipelines, nightly
compatibility runs, and pre-release validation of new CTK builds.
---
## 2 Execution model
* The framework **does not** spin up a Kubernetes cluster; it drives a single
host reachable over SSH.
* All commands run in a Ginkgo-managed context (`ctx`) so they abort cleanly on
timeout or Ctrl-C.
* Environment discovery happens once in `TestMain` → `getTestEnv()`; parameters
are therefore immutable for the duration of the run.
---
## 3 Prerequisites
| Item | Version / requirement |
|------|-----------------------|
| **Go toolchain** | ≥ 1.22 (for building Ginkgo helper binaries) |
| **GPU-enabled Linux host** | Running a supported NVIDIA driver; reachable via SSH |
| **SSH connectivity** | Public-key authentication *without* passphrase for unattended CI |
| **Local OS** | Linux/macOS; POSIX shell required by the Makefile |
---
## 4 Environment variables
| Variable | Required | Example | Description |
|----------|----------|---------|-------------|
| `E2E_INSTALL_CTK` | ✖ | `true` | When `true` the test installs CTK on the remote host before running the image. When `false` (the default) it assumes CTK is already present. |
| `E2E_IMAGE_REPO` | ✔ | `ghcr.io/nvidia/container-toolkit` | Container Toolkit Image |
| `E2E_IMAGE_TAG` | ✔ | `latest` | Image tag |
| `E2E_SSH_KEY` | ✔ | `/home/ci/.ssh/id_rsa` | Private key used for authentication. |
| `E2E_SSH_USER` | ✔ | `ubuntu` | Username on the remote host. |
| `E2E_SSH_HOST` | ✔ | `gpu-runner01.corp.local` | Hostname or IP address of the target node. |
| `E2E_SSH_PORT` | ✖ | `22` | SSH port of the target node. Defaults to `22` when unset or not a valid integer. |
> Required variables are validated at startup; the suite aborts early with a clear
> message if any are missing. Ill-formed values of optional variables fall back
> to their defaults.
---
## 5 Build helper binaries
Install the latest Ginkgo CLI locally so that the Makefile can invoke it:
```bash
make -f tests/e2e/Makefile ginkgo # run from the repository root; installs ./bin/ginkgo
```
The Makefile entry mirrors the pattern used in other NVIDIA E2E suites:
```make
ginkgo:
	mkdir -p $(CURDIR)/bin
	GOBIN=$(CURDIR)/bin go install github.com/onsi/ginkgo/v2/ginkgo@latest
```
---
## 6 Running the suite
### 6.1 Basic invocation
```bash
E2E_INSTALL_CTK=true \
E2E_IMAGE_REPO=ghcr.io/nvidia/container-toolkit \
E2E_IMAGE_TAG=latest \
E2E_SSH_KEY=$HOME/.ssh/id_rsa \
E2E_SSH_USER=ubuntu \
E2E_SSH_HOST=10.0.0.15 \
E2E_SSH_PORT=22 \
make -f tests/e2e/Makefile test-e2e
```
This downloads the image on the remote host, installs CTK (if requested), and
executes a minimal CUDA-based workload.
---
## 7 Internal test flow
| Phase | Key function(s) | Notes |
|-------|-----------------|-------|
| **Init** | `TestMain` → `getTestEnv` | Collects env vars, initializes `ctx`. |
| **Connection check** | `BeforeSuite` (not shown) | Verifies SSH reachability using `ssh -o BatchMode=yes`. |
| **Optional CTK install** | `installCTK == true` path | Runs the distro-specific install script on the remote host. |
| **Runtime validation** | Leaf `It` blocks | Pulls `TOOLKIT_IMAGE`, runs `nvidia-smi` inside the container, asserts exit code `0`. |
| **Failure diagnostics** | `AfterEach` | Copies `/var/log/nvidia-container-runtime.log` & dmesg to `${LOG_ARTIFACTS_DIR}` via `scp`. |
---
## 8 Extending the suite
1. Create a new `_test.go` file under `tests/e2e`.
2. Use the Ginkgo DSL (`Describe`, `When`, `It` …). Each leaf node receives a
`context.Context` so you can run remote commands with deadline control.
3. Helper utilities such as `runSSH`, `withSudo`, and `collectLogs` are already
available from the shared test harness (see `ssh_helpers.go`).
4. Keep tests **idempotent** and clean any artefacts you create on the host.
---
## 9 Common issues & fixes
| Symptom | Likely cause | Fix |
|---------|--------------|-----|
| `Permission denied (publickey)` | Wrong `E2E_SSH_KEY` or `E2E_SSH_USER` | Check variables; ensure key is readable by the CI user. |
| `docker: Error response from daemon: could not select device driver` | CTK not installed or wrong runtime class | Verify `E2E_INSTALL_CTK=true` or confirm CTK installation on the host. |
| Test hangs at image pull | No outbound internet on remote host | Preload the image or use a local registry mirror. |
## 10 License
Distributed under the terms of the **Apache License 2.0** (see header).

View File

@ -1,24 +1,28 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package e2e
import (
"context"
"flag"
"os"
"path/filepath"
"runtime"
"strconv"
"testing"
. "github.com/onsi/ginkgo/v2"
@ -31,27 +35,27 @@ var (
installCTK bool
image string
ImageRepo string
ImageTag string
sshKey string
sshUser string
host string
sshPort string
sshKey string
sshUser string
host string
sshPort string
cwd string
packagePath string
runner Runner
)
func init() {
flag.BoolVar(&installCTK, "install-ctk", false, "Install the NVIDIA Container Toolkit")
flag.StringVar(&image, "toolkit-image", "", "Repository of the image to test")
flag.StringVar(&sshKey, "ssh-key", "", "SSH key to use for remote login")
flag.StringVar(&sshUser, "ssh-user", "", "SSH user to use for remote login")
flag.StringVar(&host, "remote-host", "", "Hostname of the remote machine")
flag.StringVar(&sshPort, "remote-port", "22", "SSH port to use for remote login")
}
func TestMain(t *testing.T) {
suiteName := "NVIDIA Container Toolkit E2E"
suiteName := "E2E NVIDIA Container Toolkit"
RegisterFailHandler(Fail)
ctx = context.Background()
getTestEnv()
RunSpecs(t,
suiteName,
)
@ -59,5 +63,89 @@ func TestMain(t *testing.T) {
// BeforeSuite runs once before the test suite. It builds the shared
// package-level runner, optionally installs the NVIDIA Container Toolkit
// through that runner, and pre-pulls the container images so that individual
// specs do not each have to pull them.
var _ = BeforeSuite(func() {
ctx = context.Background()
// Configure the runner with the host, port, key and user held in the
// package-level variables (populated by getTestEnv).
runner = NewRunner(
WithHost(host),
WithPort(sshPort),
WithSshKey(sshKey),
WithSshUser(sshUser),
)
// Install the toolkit only when requested (installCTK is read from
// E2E_INSTALL_CTK); the image reference is assembled as "<repo>:<tag>".
if installCTK {
installer, err := NewToolkitInstaller(
WithRunner(runner),
WithImage(ImageRepo+":"+ImageTag),
WithTemplate(dockerInstallTemplate),
)
Expect(err).ToNot(HaveOccurred())
err = installer.Install()
Expect(err).ToNot(HaveOccurred())
}
// Pre-pull the images via the runner; any pull failure fails the suite setup.
_, _, err := runner.Run("docker pull ubuntu")
Expect(err).ToNot(HaveOccurred())
_, _, err = runner.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
Expect(err).ToNot(HaveOccurred())
_, _, err = runner.Run("docker pull nvcr.io/nvidia/cuda:12.8.0-base-ubi8")
Expect(err).ToNot(HaveOccurred())
})
// getTestEnv populates the package-level test configuration from the E2E_*
// environment variables and asserts that every required variable is set.
//
// Required: E2E_IMAGE_REPO, E2E_IMAGE_TAG, E2E_SSH_KEY, E2E_SSH_USER,
// E2E_SSH_HOST.
// Optional: E2E_INSTALL_CTK (defaults to false), E2E_SSH_PORT (defaults to 22).
//
// It also records the directory of this source file in packagePath and the
// current working directory in cwd.
func getTestEnv() {
	defer GinkgoRecover()
	var err error

	// runtime.Caller(0) reports this file's path; keep its directory.
	_, thisFile, _, _ := runtime.Caller(0)
	packagePath = filepath.Dir(thisFile)

	installCTK = getBoolEnvVar("E2E_INSTALL_CTK", false)

	ImageRepo = os.Getenv("E2E_IMAGE_REPO")
	Expect(ImageRepo).NotTo(BeEmpty(), "E2E_IMAGE_REPO environment variable must be set")

	ImageTag = os.Getenv("E2E_IMAGE_TAG")
	Expect(ImageTag).NotTo(BeEmpty(), "E2E_IMAGE_TAG environment variable must be set")

	sshKey = os.Getenv("E2E_SSH_KEY")
	Expect(sshKey).NotTo(BeEmpty(), "E2E_SSH_KEY environment variable must be set")

	// The failure messages name the variable that is actually read, so a
	// failed run points at the right setting.
	sshUser = os.Getenv("E2E_SSH_USER")
	Expect(sshUser).NotTo(BeEmpty(), "E2E_SSH_USER environment variable must be set")

	host = os.Getenv("E2E_SSH_HOST")
	Expect(host).NotTo(BeEmpty(), "E2E_SSH_HOST environment variable must be set")

	sshPort = getIntEnvVar("E2E_SSH_PORT", 22)

	// Get current working directory
	cwd, err = os.Getwd()
	Expect(err).NotTo(HaveOccurred())
}
// getBoolEnvVar returns the boolean value of the environment variable or the default value if not set.
func getBoolEnvVar(key string, defaultValue bool) bool {
value := os.Getenv(key)
if value == "" {
return defaultValue
}
boolValue, err := strconv.ParseBool(value)
if err != nil {
return defaultValue
}
return boolValue
}
// getIntEnvVar returns the integer value of the environment variable or the default value if not set.
func getIntEnvVar(key string, defaultValue int) string {
value := os.Getenv(key)
if value == "" {
return strconv.Itoa(defaultValue)
}
intValue, err := strconv.Atoi(value)
if err != nil {
return strconv.Itoa(defaultValue)
}
return strconv.Itoa(intValue)
}

View File

@ -1,19 +1,19 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package e2e
import (

View File

@ -1,5 +1,6 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -27,69 +28,50 @@ import (
// Integration tests for Docker runtime
var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
var r Runner
// Install the NVIDIA Container Toolkit
BeforeAll(func(ctx context.Context) {
r = NewRunner(
WithHost(host),
WithPort(sshPort),
WithSshKey(sshKey),
WithSshUser(sshUser),
)
if installCTK {
installer, err := NewToolkitInstaller(
WithRunner(r),
WithImage(image),
WithTemplate(dockerInstallTemplate),
)
Expect(err).ToNot(HaveOccurred())
err = installer.Install()
Expect(err).ToNot(HaveOccurred())
}
})
// GPUs are accessible in a container: Running nvidia-smi -L inside the
// container shows the same output inside the container as outside the
// container. This means that the following commands must all produce
// the same output
When("running nvidia-smi -L", Ordered, func() {
var hostOutput string
var err error
BeforeAll(func(ctx context.Context) {
_, _, err := r.Run("docker pull ubuntu")
Expect(err).ToNot(HaveOccurred())
hostOutput, _, err = r.Run("nvidia-smi -L")
hostOutput, _, err = runner.Run("nvidia-smi -L")
Expect(err).ToNot(HaveOccurred())
})
It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) {
containerOutput, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all ubuntu nvidia-smi -L")
By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all")
containerOutput, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all ubuntu nvidia-smi -L")
Expect(err).ToNot(HaveOccurred())
Expect(containerOutput).To(Equal(hostOutput))
})
It("should support automatic CDI spec generation", func(ctx context.Context) {
containerOutput, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all ubuntu nvidia-smi -L")
By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all")
containerOutput, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all ubuntu nvidia-smi -L")
Expect(err).ToNot(HaveOccurred())
Expect(containerOutput).To(Equal(hostOutput))
})
It("should support automatic CDI spec generation with the --gpus flag", func(ctx context.Context) {
containerOutput, _, err := r.Run("docker run --rm -i --gpus=all --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all ubuntu nvidia-smi -L")
By("Running docker run with --gpus=all --runtime=nvidia --gpus all")
containerOutput, _, err := runner.Run("docker run --rm -i --gpus=all --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all ubuntu nvidia-smi -L")
Expect(err).ToNot(HaveOccurred())
Expect(containerOutput).To(Equal(hostOutput))
})
It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) {
containerOutput, _, err := r.Run("docker run --rm -i --runtime=nvidia --gpus all ubuntu nvidia-smi -L")
By("Running docker run with --runtime=nvidia --gpus all")
containerOutput, _, err := runner.Run("docker run --rm -i --runtime=nvidia --gpus all ubuntu nvidia-smi -L")
Expect(err).ToNot(HaveOccurred())
Expect(containerOutput).To(Equal(hostOutput))
})
It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) {
containerOutput, _, err := r.Run("docker run --rm -i --gpus all ubuntu nvidia-smi -L")
By("Running docker run with --gpus all")
containerOutput, _, err := runner.Run("docker run --rm -i --gpus all ubuntu nvidia-smi -L")
Expect(err).ToNot(HaveOccurred())
Expect(containerOutput).To(Equal(hostOutput))
})
@ -98,35 +80,34 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
// A vectorAdd sample runs in a container with access to all GPUs.
// The following should all produce the same result.
When("Running the cuda-vectorAdd sample", Ordered, func() {
BeforeAll(func(ctx context.Context) {
_, _, err := r.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
Expect(err).ToNot(HaveOccurred())
})
var referenceOutput string
It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) {
By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all")
var err error
referenceOutput, _, err = r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
referenceOutput, _, err = runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
Expect(err).ToNot(HaveOccurred())
Expect(referenceOutput).To(ContainSubstring("Test PASSED"))
})
It("should support automatic CDI spec generation", func(ctx context.Context) {
out2, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all")
out2, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
Expect(err).ToNot(HaveOccurred())
Expect(referenceOutput).To(Equal(out2))
})
It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) {
out3, _, err := r.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
By("Running docker run with --runtime=nvidia --gpus all")
out3, _, err := runner.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
Expect(err).ToNot(HaveOccurred())
Expect(referenceOutput).To(Equal(out3))
})
It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) {
out4, _, err := r.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
By("Running docker run with --gpus all")
out4, _, err := runner.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
Expect(err).ToNot(HaveOccurred())
Expect(referenceOutput).To(Equal(out4))
})
@ -136,53 +117,52 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
// The following should all produce the same result.
When("Running the cuda-deviceQuery sample", Ordered, func() {
BeforeAll(func(ctx context.Context) {
_, _, err := r.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
_, _, err := runner.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
Expect(err).ToNot(HaveOccurred())
})
var referenceOutput string
It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) {
By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all")
var err error
referenceOutput, _, err = r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
referenceOutput, _, err = runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
Expect(err).ToNot(HaveOccurred())
Expect(referenceOutput).To(ContainSubstring("Result = PASS"))
})
It("should support automatic CDI spec generation", func(ctx context.Context) {
out2, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all")
out2, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
Expect(err).ToNot(HaveOccurred())
Expect(referenceOutput).To(Equal(out2))
})
It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) {
out3, _, err := r.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
By("Running docker run with --runtime=nvidia --gpus all")
out3, _, err := runner.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
Expect(err).ToNot(HaveOccurred())
Expect(referenceOutput).To(Equal(out3))
})
It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) {
out4, _, err := r.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
By("Running docker run with --gpus all")
out4, _, err := runner.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
Expect(err).ToNot(HaveOccurred())
Expect(referenceOutput).To(Equal(out4))
})
})
Describe("CUDA Forward compatibility", Ordered, func() {
When("Testing CUDA Forward compatibility", Ordered, func() {
BeforeAll(func(ctx context.Context) {
_, _, err := r.Run("docker pull nvcr.io/nvidia/cuda:12.8.0-base-ubi8")
Expect(err).ToNot(HaveOccurred())
})
BeforeAll(func(ctx context.Context) {
compatOutput, _, err := r.Run("docker run --rm -i -e NVIDIA_VISIBLE_DEVICES=void nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ls /usr/local/cuda/compat/libcuda.*.*\"")
compatOutput, _, err := runner.Run("docker run --rm -i -e NVIDIA_VISIBLE_DEVICES=void nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ls /usr/local/cuda/compat/libcuda.*.*\"")
Expect(err).ToNot(HaveOccurred())
Expect(compatOutput).ToNot(BeEmpty())
compatDriverVersion := strings.TrimPrefix(filepath.Base(compatOutput), "libcuda.so.")
compatMajor := strings.SplitN(compatDriverVersion, ".", 2)[0]
driverOutput, _, err := r.Run("nvidia-smi -q | grep \"Driver Version\"")
driverOutput, _, err := runner.Run("nvidia-smi -q | grep \"Driver Version\"")
Expect(err).ToNot(HaveOccurred())
parts := strings.SplitN(driverOutput, ":", 2)
Expect(parts).To(HaveLen(2))
@ -198,19 +178,22 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
})
It("should work with the nvidia runtime in legacy mode", func(ctx context.Context) {
ldconfigOut, _, err := r.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
By("Running docker run with -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all")
ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
Expect(err).ToNot(HaveOccurred())
Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat"))
})
It("should work with the nvidia runtime in CDI mode", func(ctx context.Context) {
ldconfigOut, _, err := r.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
By("Running docker run with -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all")
ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
Expect(err).ToNot(HaveOccurred())
Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat"))
})
It("should NOT work with nvidia-container-runtime-hook", func(ctx context.Context) {
ldconfigOut, _, err := r.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=runc --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
By("Running docker run with -e NVIDIA_DISABLE_REQUIRE=true --gpus all")
ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=runc --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
Expect(err).ToNot(HaveOccurred())
Expect(ldconfigOut).To(ContainSubstring("/usr/lib64"))
})

View File

@ -1,17 +1,18 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package e2e

View File

@ -1,8 +1,6 @@
module github.com/NVIDIA/nvidia-container-toolkit/tests
go 1.23.2
toolchain go1.24.1
go 1.24.1
require (
github.com/onsi/ginkgo/v2 v2.23.4