mirror of https://github.com/NVIDIA/nvidia-container-toolkit
synced 2025-05-06 13:05:19 +00:00

Merge b3728406e0 into ca061bb4f0
This commit is contained in: commit b378f08975

.github/workflows/e2e.yaml (vendored, 13 lines changed)
@@ -70,8 +70,8 @@ jobs:

      - name: Run e2e tests
        env:
          IMAGE_NAME: ghcr.io/nvidia/container-toolkit
          VERSION: ${{ inputs.version }}
          E2E_IMAGE_REPO: ghcr.io/nvidia/container-toolkit
          E2E_IMAGE_TAG: ${{ inputs.version }}-ubuntu20.04
          SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
          E2E_SSH_USER: ${{ secrets.E2E_SSH_USER }}
          E2E_SSH_HOST: ${{ steps.holodeck_public_dns_name.outputs.result }}
@@ -82,7 +82,14 @@ jobs:
          chmod 600 "$e2e_ssh_key"
          export E2E_SSH_KEY="$e2e_ssh_key"

          make -f tests/e2e/Makefile test
          make -f tests/e2e/Makefile test-e2e

      - name: Archive Ginkgo logs
        uses: actions/upload-artifact@v4
        with:
          name: ginkgo-logs
          path: ginkgo.json
          retention-days: 15

      - name: Send Slack alert notification
        if: ${{ failure() }}
tests/e2e/Makefile

@@ -1,4 +1,5 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -12,34 +13,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.

GO_CMD ?= go
.PHONY: test-e2e ginkgo

include $(CURDIR)/versions.mk
GINKGO_ARGS ?=
LOG_ARTIFACTS_DIR ?= $(CURDIR)/e2e_logs

E2E_RUNTIME ?= docker
ginkgo:
	mkdir -p $(CURDIR)/bin
	GOBIN=$(CURDIR)/bin go install github.com/onsi/ginkgo/v2/ginkgo@latest

E2E_INSTALL_CTK ?= false

ifeq ($(DIST),)
DIST ?= ubuntu20.04
endif
IMAGE_TAG ?= $(VERSION)-$(DIST)
IMAGE = $(IMAGE_NAME):$(IMAGE_TAG)

E2E_SSH_KEY ?=
E2E_SSH_USER ?=
E2E_SSH_HOST ?=
E2E_SSH_PORT ?= 22

.PHONY: test
test:
	cd $(CURDIR)/tests/e2e && $(GO_CMD) test -v . -args \
		-ginkgo.focus="$(E2E_RUNTIME)" \
		-test.timeout=1h \
		-ginkgo.v \
		-install-ctk=$(E2E_INSTALL_CTK) \
		-toolkit-image=$(IMAGE) \
		-ssh-key=$(E2E_SSH_KEY) \
		-ssh-user=$(E2E_SSH_USER) \
		-remote-host=$(E2E_SSH_HOST) \
		-remote-port=$(E2E_SSH_PORT)
test-e2e: ginkgo
	$(CURDIR)/bin/ginkgo $(GINKGO_ARGS) -v --json-report ginkgo.json ./tests/e2e/...
tests/e2e/README.md (new file, 141 lines)

@@ -0,0 +1,141 @@
<!--
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

# NVIDIA Container Toolkit – End‑to‑End (E2E) Test Suite

---
## 1 Scope & Goals
This directory contains a **Ginkgo v2 / Gomega** test harness that exercises an
NVIDIA Container Toolkit (CTK) installation on a **remote GPU‑enabled host** via
SSH. The suite validates that:

1. CTK can be installed (or upgraded) unattended (`E2E_INSTALL_CTK=true`).
2. The specified **container image** runs successfully under `nvidia-container-runtime`.
3. Errors and diagnostics are captured for post‑mortem analysis.

The tests are intended for continuous‑integration pipelines, nightly
compatibility runs, and pre‑release validation of new CTK builds.

---
## 2 Execution model
* The framework **does not** spin up a Kubernetes cluster; it drives a single
  host reachable over SSH.
* All commands run in a Ginkgo‑managed context (`ctx`) so they abort cleanly on
  timeout or Ctrl‑C.
* Environment discovery happens once in `TestMain` → `getTestEnv()`; parameters
  are therefore immutable for the duration of the run.

---
## 3 Prerequisites

| Item | Version / requirement |
|------|-----------------------|
| **Go toolchain** | ≥ 1.24 (see `tests/go.mod`; used to build the Ginkgo helper binaries) |
| **GPU‑enabled Linux host** | Running a supported NVIDIA driver; reachable via SSH |
| **SSH connectivity** | Public‑key authentication *without* pass‑phrase for unattended CI |
| **Local OS** | Linux/macOS; POSIX shell required by the Makefile |

---
## 4 Environment variables

| Variable | Required | Example | Description |
|----------|----------|---------|-------------|
| `E2E_INSTALL_CTK` | ✖ | `true` | When `true` the test installs CTK on the remote host before running the image. When `false` (the default) it assumes CTK is already present. |
| `E2E_IMAGE_REPO` | ✔ | `ghcr.io/nvidia/container-toolkit` | Container Toolkit image repository. |
| `E2E_IMAGE_TAG` | ✔ | `latest` | Image tag. |
| `E2E_SSH_KEY` | ✔ | `/home/ci/.ssh/id_rsa` | Private key used for authentication. |
| `E2E_SSH_USER` | ✔ | `ubuntu` | Username on the remote host. |
| `E2E_SSH_HOST` | ✔ | `gpurunner01.corp.local` | Hostname or IP address of the target node. |
| `E2E_SSH_PORT` | ✖ | `22` | SSH port of the target node (defaults to `22`). |

> All required variables are validated at start‑up; the suite aborts early with a
> clear message if any are missing or ill‑formed.

---
## 5 Build helper binaries

Install the latest Ginkgo CLI locally so that the Makefile can invoke it:

```bash
make ginkgo        # installs ./bin/ginkgo
```

The Makefile target mirrors the pattern used in other NVIDIA E2E suites:

```make
ginkgo:
	mkdir -p $(CURDIR)/bin
	GOBIN=$(CURDIR)/bin go install github.com/onsi/ginkgo/v2/ginkgo@latest
```

---
## 6 Running the suite

### 6.1 Basic invocation
```bash
E2E_INSTALL_CTK=true \
E2E_IMAGE_REPO=ghcr.io/nvidia/container-toolkit \
E2E_IMAGE_TAG=latest \
E2E_SSH_KEY=$HOME/.ssh/id_rsa \
E2E_SSH_USER=ubuntu \
E2E_SSH_HOST=10.0.0.15 \
E2E_SSH_PORT=22 \
make -f tests/e2e/Makefile test-e2e
```
This pulls the test images on the remote host, installs CTK (if requested), and
executes a minimal CUDA‑based workload.

---
## 7 Internal test flow

| Phase | Key function(s) | Notes |
|-------|-----------------|-------|
| **Init** | `TestMain` → `getTestEnv` | Collects env vars, initializes `ctx`. |
| **Connection check** | `BeforeSuite` (not shown here) | Verifies SSH reachability using `ssh -o BatchMode=yes`. |
| **Optional CTK install** | `installCTK == true` path | Runs the distro‑specific install script on the remote host. |
| **Runtime validation** | Leaf `It` blocks | Pulls the test images, runs `nvidia-smi` inside the container, asserts exit code `0`. |
| **Failure diagnostics** | `AfterEach` | Copies `/var/log/nvidia-container-runtime.log` & dmesg to `${LOG_ARTIFACTS_DIR}` via `scp`. |
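The connection-check helper itself is not part of this change; the sketch below is only an illustration of what such a probe could look like. The `checkSSHReachable` name and the exact `ssh` options are assumptions, not code from this suite.

```go
// Hypothetical sketch: a BatchMode probe that fails fast instead of prompting
// when the key is missing or passphrase-protected.
package e2e

import (
	"context"
	"fmt"
	"os/exec"
)

func checkSSHReachable(ctx context.Context, user, host, port, keyPath string) error {
	cmd := exec.CommandContext(ctx, "ssh",
		"-o", "BatchMode=yes",
		"-o", "StrictHostKeyChecking=accept-new",
		"-i", keyPath,
		"-p", port,
		fmt.Sprintf("%s@%s", user, host),
		"true", // no-op remote command; success means the host is reachable
	)
	if out, err := cmd.CombinedOutput(); err != nil {
		return fmt.Errorf("ssh probe to %s failed: %w (output: %s)", host, err, out)
	}
	return nil
}
```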
---

## 8 Extending the suite

1. Create a new `_test.go` file under `tests/e2e`.
2. Use the Ginkgo DSL (`Describe`, `When`, `It` …). Each leaf node receives a
   `context.Context` so you can run remote commands with deadline control (a
   minimal sketch follows this list).
3. Helper utilities such as `runSSH`, `withSudo`, and `collectLogs` are already
   available from the shared test harness (see `ssh_helpers.go`).
4. Keep tests **idempotent** and clean up any artefacts you create on the host.
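A minimal sketch of such a spec file is shown below. The `Describe` label and the commands are illustrative only; `runner` is the suite-level `Runner` created in `BeforeSuite` (see the test setup later in this change).

```go
package e2e

import (
	"context"

	. "github.com/onsi/ginkgo/v2"
	. "github.com/onsi/gomega"
)

// Illustrative spec: the GPUs visible inside a container should match the
// GPUs visible on the host.
var _ = Describe("example feature", func() {
	When("running nvidia-smi -L", func() {
		It("matches the host output", func(ctx context.Context) {
			hostOut, _, err := runner.Run("nvidia-smi -L")
			Expect(err).ToNot(HaveOccurred())

			ctrOut, _, err := runner.Run(
				"docker run --rm --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all ubuntu nvidia-smi -L")
			Expect(err).ToNot(HaveOccurred())
			Expect(ctrOut).To(Equal(hostOut))
		})
	})
})
```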

---

## 9 Common issues & fixes

| Symptom | Likely cause | Fix |
|---------|--------------|-----|
| `Permission denied (publickey)` | Wrong `E2E_SSH_KEY` or `E2E_SSH_USER` | Check the variables; ensure the key is readable by the CI user. |
| `docker: Error response from daemon: could not select device driver` | CTK not installed or wrong runtime class | Verify `E2E_INSTALL_CTK=true` or confirm the CTK installation on the host. |
| Test hangs at image pull | No outbound internet on remote host | Pre‑load the image or use a local registry mirror. |

## 10 License
Distributed under the terms of the **Apache License 2.0** (see header).
@@ -1,24 +1,28 @@
/*
 * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package e2e

import (
	"context"
	"flag"
	"os"
	"path/filepath"
	"runtime"
	"strconv"
	"testing"

	. "github.com/onsi/ginkgo/v2"
@@ -31,27 +35,27 @@ var (

	installCTK bool

	image string
	ImageRepo string
	ImageTag string

	sshKey string
	sshUser string
	host string
	sshPort string
	sshKey string
	sshUser string
	host string
	sshPort string
	cwd string
	packagePath string

	runner Runner
)

func init() {
	flag.BoolVar(&installCTK, "install-ctk", false, "Install the NVIDIA Container Toolkit")
	flag.StringVar(&image, "toolkit-image", "", "Repository of the image to test")
	flag.StringVar(&sshKey, "ssh-key", "", "SSH key to use for remote login")
	flag.StringVar(&sshUser, "ssh-user", "", "SSH user to use for remote login")
	flag.StringVar(&host, "remote-host", "", "Hostname of the remote machine")
	flag.StringVar(&sshPort, "remote-port", "22", "SSH port to use for remote login")
}

func TestMain(t *testing.T) {
	suiteName := "NVIDIA Container Toolkit E2E"
	suiteName := "E2E NVIDIA Container Toolkit"

	RegisterFailHandler(Fail)

	ctx = context.Background()
	getTestEnv()

	RunSpecs(t,
		suiteName,
	)
@@ -59,5 +63,89 @@ func TestMain(t *testing.T) {

// BeforeSuite runs before the test suite
var _ = BeforeSuite(func() {
	ctx = context.Background()
	runner = NewRunner(
		WithHost(host),
		WithPort(sshPort),
		WithSshKey(sshKey),
		WithSshUser(sshUser),
	)

	if installCTK {
		installer, err := NewToolkitInstaller(
			WithRunner(runner),
			WithImage(ImageRepo+":"+ImageTag),
			WithTemplate(dockerInstallTemplate),
		)
		Expect(err).ToNot(HaveOccurred())

		err = installer.Install()
		Expect(err).ToNot(HaveOccurred())
	}

	_, _, err := runner.Run("docker pull ubuntu")
	Expect(err).ToNot(HaveOccurred())

	_, _, err = runner.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
	Expect(err).ToNot(HaveOccurred())

	_, _, err = runner.Run("docker pull nvcr.io/nvidia/cuda:12.8.0-base-ubi8")
	Expect(err).ToNot(HaveOccurred())
})

// getTestEnv gets the test environment variables
func getTestEnv() {
	defer GinkgoRecover()
	var err error

	_, thisFile, _, _ := runtime.Caller(0)
	packagePath = filepath.Dir(thisFile)

	installCTK = getBoolEnvVar("E2E_INSTALL_CTK", false)

	ImageRepo = os.Getenv("E2E_IMAGE_REPO")
	Expect(ImageRepo).NotTo(BeEmpty(), "E2E_IMAGE_REPO environment variable must be set")

	ImageTag = os.Getenv("E2E_IMAGE_TAG")
	Expect(ImageTag).NotTo(BeEmpty(), "E2E_IMAGE_TAG environment variable must be set")

	sshKey = os.Getenv("E2E_SSH_KEY")
	Expect(sshKey).NotTo(BeEmpty(), "E2E_SSH_KEY environment variable must be set")

	sshUser = os.Getenv("E2E_SSH_USER")
	Expect(sshUser).NotTo(BeEmpty(), "E2E_SSH_USER environment variable must be set")

	host = os.Getenv("E2E_SSH_HOST")
	Expect(host).NotTo(BeEmpty(), "E2E_SSH_HOST environment variable must be set")

	sshPort = getIntEnvVar("E2E_SSH_PORT", 22)

	// Get current working directory
	cwd, err = os.Getwd()
	Expect(err).NotTo(HaveOccurred())
}

// getBoolEnvVar returns the boolean value of the environment variable or the default value if not set.
func getBoolEnvVar(key string, defaultValue bool) bool {
	value := os.Getenv(key)
	if value == "" {
		return defaultValue
	}
	boolValue, err := strconv.ParseBool(value)
	if err != nil {
		return defaultValue
	}
	return boolValue
}

// getIntEnvVar returns the integer value of the environment variable (as a string) or the default value if not set.
func getIntEnvVar(key string, defaultValue int) string {
	value := os.Getenv(key)
	if value == "" {
		return strconv.Itoa(defaultValue)
	}
	intValue, err := strconv.Atoi(value)
	if err != nil {
		return strconv.Itoa(defaultValue)
	}
	return strconv.Itoa(intValue)
}
@@ -1,19 +1,19 @@
/*
 * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package e2e

import (
@@ -1,5 +1,6 @@
/*
 * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -27,69 +28,50 @@ import (

// Integration tests for Docker runtime
var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
	var r Runner

	// Install the NVIDIA Container Toolkit
	BeforeAll(func(ctx context.Context) {
		r = NewRunner(
			WithHost(host),
			WithPort(sshPort),
			WithSshKey(sshKey),
			WithSshUser(sshUser),
		)
		if installCTK {
			installer, err := NewToolkitInstaller(
				WithRunner(r),
				WithImage(image),
				WithTemplate(dockerInstallTemplate),
			)
			Expect(err).ToNot(HaveOccurred())
			err = installer.Install()
			Expect(err).ToNot(HaveOccurred())
		}
	})

	// GPUs are accessible in a container: Running nvidia-smi -L inside the
	// container shows the same output inside the container as outside the
	// container. This means that the following commands must all produce
	// the same output
	When("running nvidia-smi -L", Ordered, func() {
		var hostOutput string
		var err error

		BeforeAll(func(ctx context.Context) {
			_, _, err := r.Run("docker pull ubuntu")
			Expect(err).ToNot(HaveOccurred())

			hostOutput, _, err = r.Run("nvidia-smi -L")
			hostOutput, _, err = runner.Run("nvidia-smi -L")
			Expect(err).ToNot(HaveOccurred())
		})

		It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) {
			containerOutput, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all ubuntu nvidia-smi -L")
			By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all")
			containerOutput, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all ubuntu nvidia-smi -L")
			Expect(err).ToNot(HaveOccurred())
			Expect(containerOutput).To(Equal(hostOutput))
		})

		It("should support automatic CDI spec generation", func(ctx context.Context) {
			containerOutput, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all ubuntu nvidia-smi -L")
			By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all")
			containerOutput, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all ubuntu nvidia-smi -L")
			Expect(err).ToNot(HaveOccurred())
			Expect(containerOutput).To(Equal(hostOutput))
		})

		It("should support automatic CDI spec generation with the --gpus flag", func(ctx context.Context) {
			containerOutput, _, err := r.Run("docker run --rm -i --gpus=all --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all ubuntu nvidia-smi -L")
			By("Running docker run with --gpus=all --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all")
			containerOutput, _, err := runner.Run("docker run --rm -i --gpus=all --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all ubuntu nvidia-smi -L")
			Expect(err).ToNot(HaveOccurred())
			Expect(containerOutput).To(Equal(hostOutput))
		})

		It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) {
			containerOutput, _, err := r.Run("docker run --rm -i --runtime=nvidia --gpus all ubuntu nvidia-smi -L")
			By("Running docker run with --runtime=nvidia --gpus all")
			containerOutput, _, err := runner.Run("docker run --rm -i --runtime=nvidia --gpus all ubuntu nvidia-smi -L")
			Expect(err).ToNot(HaveOccurred())
			Expect(containerOutput).To(Equal(hostOutput))
		})

		It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) {
			containerOutput, _, err := r.Run("docker run --rm -i --gpus all ubuntu nvidia-smi -L")
			By("Running docker run with --gpus all")
			containerOutput, _, err := runner.Run("docker run --rm -i --gpus all ubuntu nvidia-smi -L")
			Expect(err).ToNot(HaveOccurred())
			Expect(containerOutput).To(Equal(hostOutput))
		})
@@ -98,35 +80,34 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
	// A vectorAdd sample runs in a container with access to all GPUs.
	// The following should all produce the same result.
	When("Running the cuda-vectorAdd sample", Ordered, func() {
		BeforeAll(func(ctx context.Context) {
			_, _, err := r.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
			Expect(err).ToNot(HaveOccurred())
		})

		var referenceOutput string

		It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) {
			By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all")
			var err error
			referenceOutput, _, err = r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
			referenceOutput, _, err = runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
			Expect(err).ToNot(HaveOccurred())

			Expect(referenceOutput).To(ContainSubstring("Test PASSED"))
		})

		It("should support automatic CDI spec generation", func(ctx context.Context) {
			out2, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
			By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all")
			out2, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
			Expect(err).ToNot(HaveOccurred())
			Expect(referenceOutput).To(Equal(out2))
		})

		It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) {
			out3, _, err := r.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
			By("Running docker run with --runtime=nvidia --gpus all")
			out3, _, err := runner.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
			Expect(err).ToNot(HaveOccurred())
			Expect(referenceOutput).To(Equal(out3))
		})

		It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) {
			out4, _, err := r.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
			By("Running docker run with --gpus all")
			out4, _, err := runner.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
			Expect(err).ToNot(HaveOccurred())
			Expect(referenceOutput).To(Equal(out4))
		})
@@ -136,53 +117,52 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
	// The following should all produce the same result.
	When("Running the cuda-deviceQuery sample", Ordered, func() {
		BeforeAll(func(ctx context.Context) {
			_, _, err := r.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
			_, _, err := runner.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
			Expect(err).ToNot(HaveOccurred())
		})

		var referenceOutput string

		It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) {
			By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all")
			var err error
			referenceOutput, _, err = r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
			referenceOutput, _, err = runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
			Expect(err).ToNot(HaveOccurred())

			Expect(referenceOutput).To(ContainSubstring("Result = PASS"))
		})

		It("should support automatic CDI spec generation", func(ctx context.Context) {
			out2, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
			By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all")
			out2, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
			Expect(err).ToNot(HaveOccurred())
			Expect(referenceOutput).To(Equal(out2))
		})

		It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) {
			out3, _, err := r.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
			By("Running docker run with --runtime=nvidia --gpus all")
			out3, _, err := runner.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
			Expect(err).ToNot(HaveOccurred())
			Expect(referenceOutput).To(Equal(out3))
		})

		It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) {
			out4, _, err := r.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
			By("Running docker run with --gpus all")
			out4, _, err := runner.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
			Expect(err).ToNot(HaveOccurred())
			Expect(referenceOutput).To(Equal(out4))
		})
	})

	Describe("CUDA Forward compatibility", Ordered, func() {
		When("Testing CUDA Forward compatibility", Ordered, func() {
			BeforeAll(func(ctx context.Context) {
				_, _, err := r.Run("docker pull nvcr.io/nvidia/cuda:12.8.0-base-ubi8")
				Expect(err).ToNot(HaveOccurred())
			})

			BeforeAll(func(ctx context.Context) {
				compatOutput, _, err := r.Run("docker run --rm -i -e NVIDIA_VISIBLE_DEVICES=void nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ls /usr/local/cuda/compat/libcuda.*.*\"")
				compatOutput, _, err := runner.Run("docker run --rm -i -e NVIDIA_VISIBLE_DEVICES=void nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ls /usr/local/cuda/compat/libcuda.*.*\"")
				Expect(err).ToNot(HaveOccurred())
				Expect(compatOutput).ToNot(BeEmpty())

				compatDriverVersion := strings.TrimPrefix(filepath.Base(compatOutput), "libcuda.so.")
				compatMajor := strings.SplitN(compatDriverVersion, ".", 2)[0]

				driverOutput, _, err := r.Run("nvidia-smi -q | grep \"Driver Version\"")
				driverOutput, _, err := runner.Run("nvidia-smi -q | grep \"Driver Version\"")
				Expect(err).ToNot(HaveOccurred())
				parts := strings.SplitN(driverOutput, ":", 2)
				Expect(parts).To(HaveLen(2))
@@ -198,19 +178,22 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
			})

			It("should work with the nvidia runtime in legacy mode", func(ctx context.Context) {
				ldconfigOut, _, err := r.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
				By("Running docker run with -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all")
				ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
				Expect(err).ToNot(HaveOccurred())
				Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat"))
			})

			It("should work with the nvidia runtime in CDI mode", func(ctx context.Context) {
				ldconfigOut, _, err := r.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
				By("Running docker run with -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all")
				ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
				Expect(err).ToNot(HaveOccurred())
				Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat"))
			})

			It("should NOT work with nvidia-container-runtime-hook", func(ctx context.Context) {
				ldconfigOut, _, err := r.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=runc --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
				By("Running docker run with -e NVIDIA_DISABLE_REQUIRE=true --gpus all")
				ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=runc --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
				Expect(err).ToNot(HaveOccurred())
				Expect(ldconfigOut).To(ContainSubstring("/usr/lib64"))
			})
@@ -1,17 +1,18 @@
/*
 * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package e2e
tests/go.mod

@@ -1,8 +1,6 @@
module github.com/NVIDIA/nvidia-container-toolkit/tests

go 1.23.2

toolchain go1.24.1
go 1.24.1

require (
	github.com/onsi/ginkgo/v2 v2.23.4