[no-relnote] Update Github Actions E2E
Some checks failed
CI Pipeline / code-scanning (push) Has been cancelled
CI Pipeline / variables (push) Has been cancelled
CI Pipeline / golang (push) Has been cancelled
CI Pipeline / image (push) Has been cancelled
CI Pipeline / e2e-test (push) Has been cancelled

Signed-off-by: Carlos Eduardo Arango Gutierrez <eduardoa@nvidia.com>
This commit is contained in:
Carlos Eduardo Arango Gutierrez 2025-04-24 20:50:18 +02:00
parent 6df26cc7a5
commit d239850a30
No known key found for this signature in database
GPG Key ID: 42D9CB42F300A852
9 changed files with 98 additions and 101 deletions

View File

@ -70,8 +70,8 @@ jobs:
- name: Run e2e tests - name: Run e2e tests
env: env:
IMAGE_NAME: ghcr.io/nvidia/container-toolkit E2E_IMAGE_REPO: ghcr.io/nvidia/container-toolkit
VERSION: ${{ inputs.version }} E2E_IMAGE_TAG: ${{ inputs.version }}-ubuntu20.04
SSH_KEY: ${{ secrets.AWS_SSH_KEY }} SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
E2E_SSH_USER: ${{ secrets.E2E_SSH_USER }} E2E_SSH_USER: ${{ secrets.E2E_SSH_USER }}
E2E_SSH_HOST: ${{ steps.holodeck_public_dns_name.outputs.result }} E2E_SSH_HOST: ${{ steps.holodeck_public_dns_name.outputs.result }}
@ -84,6 +84,13 @@ jobs:
make -f tests/e2e/Makefile test make -f tests/e2e/Makefile test
- name: Archive Ginkgo logs
uses: actions/upload-artifact@v4
with:
name: ginkgo-logs
path: ginkgo.json
retention-days: 15
- name: Send Slack alert notification - name: Send Slack alert notification
if: ${{ failure() }} if: ${{ failure() }}
uses: slackapi/slack-github-action@v2.0.0 uses: slackapi/slack-github-action@v2.0.0

1
.gitignore vendored
View File

@ -11,3 +11,4 @@
/nvidia-ctk /nvidia-ctk
/shared-* /shared-*
/release-* /release-*
/bin

View File

@ -13,14 +13,16 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
.PHONY: test-e2e ginkgo .PHONY: test $(GINKGO_BIN)
GINKGO_ARGS ?= GINKGO_ARGS ?=
LOG_ARTIFACTS_DIR ?= $(CURDIR)/e2e_logs LOG_ARTIFACTS_DIR ?= $(CURDIR)/e2e_logs
ginkgo: GINKGO_BIN := $(CURDIR)/bin/ginkgo
test: $(GINKGO_BIN)
$(GINKGO_BIN) $(GINKGO_ARGS) -v --json-report ginkgo.json ./tests/e2e/...
$(GINKGO_BIN):
mkdir -p $(CURDIR)/bin mkdir -p $(CURDIR)/bin
GOBIN=$(CURDIR)/bin go install github.com/onsi/ginkgo/v2/ginkgo@latest GOBIN=$(CURDIR)/bin go install github.com/onsi/ginkgo/v2/ginkgo@latest
test-e2e: ginkgo
$(CURDIR)/bin/ginkgo $(GINKGO_ARGS) -v --json-report ginkgo.json ./tests/e2e/...

View File

@ -20,7 +20,7 @@ limitations under the License.
--- ---
## 1 Scope & Goals ## 1 Scope & Goals
This repository contains a **Ginkgo v2 / Gomega** test harness that exercises an This folder contains a **Ginkgo v2 / Gomega** test harness that exercises an
NVIDIA Container Toolkit (CTK) installation on a **remote GPU-enabled host** via NVIDIA Container Toolkit (CTK) installation on a **remote GPU-enabled host** via
SSH. The suite validates that: SSH. The suite validates that:
@ -58,12 +58,13 @@ compatibility runs, and prerelease validation of new CTK builds.
| Variable | Required | Example | Description | | Variable | Required | Example | Description |
|----------|----------|---------|-------------| |----------|----------|---------|-------------|
| `INSTALL_CTK` | ✖ | `true` | When `true` the test installs CTK on the remote host before running the image. When `false` it assumes CTK is already present. | | `E2E_INSTALL_CTK` | ✖ | `true` | When `true` the test installs CTK on the remote host before running the image. When `false` it assumes CTK is already present. |
| `TOOLKIT_IMAGE` | ✔ | `nvcr.io/nvidia/cuda:12.4.0-runtime-ubi9` | Image that will be pulled & executed. | | `E2E_IMAGE_REPO` | ✔ | `ghcr.io/nvidia/container-toolkit` | Container Toolkit Image |
| `SSH_KEY` | ✔ | `/home/ci/.ssh/id_rsa` | Private key used for authentication. | | `E2E_IMAGE_TAG` | ✔ | `latest` | Image tag |
| `SSH_USER` | ✔ | `ubuntu` | Username on the remote host. | | `E2E_SSH_KEY` | ✔ | `/home/ci/.ssh/id_rsa` | Private key used for authentication. |
| `REMOTE_HOST` | ✔ | `gpurunner01.corp.local` | Hostname or IP address of the target node. | | `E2E_SSH_USER` | ✔ | `ubuntu` | Username on the remote host. |
| `REMOTE_PORT` | ✔ | `22` | SSH port of the target node. | | `E2E_SSH_HOST` | ✔ | `10.0.0.0` | Hostname or IP address of the target node. |
| `E2E_SSH_PORT` | ✔ | `22` | SSH port of the target node. |
> All variables are validated at startup; the suite aborts early with a clear > All variables are validated at startup; the suite aborts early with a clear
> message if any are missing or ill-formed. > message if any are missing or ill-formed.
@ -87,24 +88,7 @@ bin/ginkgo:
--- ---
## 6 Running the suite ## 6 Internal test flow
### 6.1 Basic invocation
```bash
INSTALL_CTK=true \
TOOLKIT_IMAGE=nvcr.io/nvidia/cuda:12.4.0-runtime-ubi9 \
SSH_KEY=$HOME/.ssh/id_rsa \
SSH_USER=ubuntu \
REMOTE_HOST=10.0.0.15 \
REMOTE_PORT=22 \
make test-e2e
```
This downloads the image on the remote host, installs CTK (if requested), and
executes a minimal CUDA-based workload.
---
## 7 Internal test flow
| Phase | Key function(s) | Notes | | Phase | Key function(s) | Notes |
|-------|-----------------|-------| |-------|-----------------|-------|
@ -116,7 +100,7 @@ executes a minimal CUDAbased workload.
--- ---
## 8 Extending the suite ## 7 Extending the suite
1. Create a new `_test.go` file under `tests/e2e`. 1. Create a new `_test.go` file under `tests/e2e`.
2. Use the Ginkgo DSL (`Describe`, `When`, `It` …). Each leaf node receives a 2. Use the Ginkgo DSL (`Describe`, `When`, `It` …). Each leaf node receives a
@ -127,7 +111,7 @@ executes a minimal CUDAbased workload.
--- ---
## 9 Common issues & fixes ## 8 Common issues & fixes
| Symptom | Likely cause | Fix | | Symptom | Likely cause | Fix |
|---------|--------------|-----| |---------|--------------|-----|
@ -135,6 +119,6 @@ executes a minimal CUDAbased workload.
| `docker: Error response from daemon: could not select device driver` | CTK not installed or wrong runtime class | Verify `INSTALL_CTK=true` or confirm CTK installation on the host. | | `docker: Error response from daemon: could not select device driver` | CTK not installed or wrong runtime class | Verify `INSTALL_CTK=true` or confirm CTK installation on the host. |
| Test hangs at image pull | No outbound internet on remote host | Preload the image or use a local registry mirror. | | Test hangs at image pull | No outbound internet on remote host | Preload the image or use a local registry mirror. |
## 10 License ## 9 License
Distributed under the terms of the **Apache License 2.0** (see header). Distributed under the terms of the **Apache License 2.0** (see header).

View File

@ -1,6 +1,5 @@
/* /*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
@ -19,9 +18,8 @@ package e2e
import ( import (
"context" "context"
"errors"
"os" "os"
"path/filepath"
"runtime"
"strconv" "strconv"
"testing" "testing"
@ -38,12 +36,11 @@ var (
ImageRepo string ImageRepo string
ImageTag string ImageTag string
sshKey string sshKey string
sshUser string sshUser string
host string sshHost string
sshPort string sshPort string
cwd string cwd string
packagePath string
runner Runner runner Runner
) )
@ -64,7 +61,7 @@ func TestMain(t *testing.T) {
// BeforeSuite runs before the test suite // BeforeSuite runs before the test suite
var _ = BeforeSuite(func() { var _ = BeforeSuite(func() {
runner = NewRunner( runner = NewRunner(
WithHost(host), WithHost(sshHost),
WithPort(sshPort), WithPort(sshPort),
WithSshKey(sshKey), WithSshKey(sshKey),
WithSshUser(sshUser), WithSshUser(sshUser),
@ -81,15 +78,6 @@ var _ = BeforeSuite(func() {
err = installer.Install() err = installer.Install()
Expect(err).ToNot(HaveOccurred()) Expect(err).ToNot(HaveOccurred())
} }
_, _, err := runner.Run("docker pull ubuntu")
Expect(err).ToNot(HaveOccurred())
_, _, err = runner.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
Expect(err).ToNot(HaveOccurred())
_, _, err = runner.Run("docker pull nvcr.io/nvidia/cuda:12.8.0-base-ubi8")
Expect(err).ToNot(HaveOccurred())
}) })
// getTestEnv gets the test environment variables // getTestEnv gets the test environment variables
@ -97,43 +85,63 @@ func getTestEnv() {
defer GinkgoRecover() defer GinkgoRecover()
var err error var err error
_, thisFile, _, _ := runtime.Caller(0) installCTK = getEnvVarOrDefault("E2E_INSTALL_CTK", true)
packagePath = filepath.Dir(thisFile)
installCTK = getBoolEnvVar("INSTALL_CTK", false) if installCTK {
ImageRepo = os.Getenv("E2E_IMAGE_REPO")
Expect(ImageRepo).NotTo(BeEmpty(), "E2E_IMAGE_REPO environment variable must be set")
ImageRepo = os.Getenv("E2E_IMAGE_REPO") ImageTag = os.Getenv("E2E_IMAGE_TAG")
Expect(ImageRepo).NotTo(BeEmpty(), "E2E_IMAGE_REPO environment variable must be set") Expect(ImageTag).NotTo(BeEmpty(), "E2E_IMAGE_TAG environment variable must be set")
}
ImageTag = os.Getenv("E2E_IMAGE_TAG") sshKey = os.Getenv("E2E_SSH_KEY")
Expect(ImageTag).NotTo(BeEmpty(), "E2E_IMAGE_TAG environment variable must be set") Expect(sshKey).NotTo(BeEmpty(), "E2E_SSH_KEY environment variable must be set")
sshKey = os.Getenv("SSH_KEY") sshUser = os.Getenv("E2E_SSH_USER")
Expect(sshKey).NotTo(BeEmpty(), "SSH_KEY environment variable must be set") Expect(sshUser).NotTo(BeEmpty(), "E2E_SSH_USER environment variable must be set")
sshUser = os.Getenv("SSH_USER") sshHost = os.Getenv("E2E_SSH_HOST")
Expect(sshUser).NotTo(BeEmpty(), "SSH_USER environment variable must be set") Expect(sshHost).NotTo(BeEmpty(), "E2E_SSH_HOST environment variable must be set")
host = os.Getenv("REMOTE_HOST") sshPort = getEnvVarOrDefault("E2E_SSH_PORT", "22")
Expect(host).NotTo(BeEmpty(), "REMOTE_HOST environment variable must be set")
sshPort = os.Getenv("REMOTE_PORT")
Expect(sshPort).NotTo(BeEmpty(), "REMOTE_PORT environment variable must be set")
// Get current working directory // Get current working directory
cwd, err = os.Getwd() cwd, err = os.Getwd()
Expect(err).NotTo(HaveOccurred()) Expect(err).NotTo(HaveOccurred())
} }
// getBoolEnvVar returns the boolean value of the environment variable or the default value if not set. func getEnvVarAs[T any](key string) (T, error) {
func getBoolEnvVar(key string, defaultValue bool) bool { var zero T
value := os.Getenv(key) value := os.Getenv(key)
if value == "" { if value == "" {
return defaultValue return zero, errors.New("env var not set")
} }
boolValue, err := strconv.ParseBool(value)
switch any(zero).(type) {
case bool:
v, err := strconv.ParseBool(value)
if err != nil {
return zero, err
}
return any(v).(T), nil
case int:
v, err := strconv.Atoi(value)
if err != nil {
return zero, err
}
return any(v).(T), nil
case string:
return any(value).(T), nil
default:
return zero, errors.New("unsupported type")
}
}
func getEnvVarOrDefault[T any](key string, defaultValue T) T {
val, err := getEnvVarAs[T](key)
if err != nil { if err != nil {
return defaultValue return defaultValue
} }
return boolValue return val
} }

View File

@ -1,6 +1,5 @@
/* /*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
@ -14,6 +13,7 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package e2e package e2e
import ( import (

View File

@ -1,6 +1,5 @@
/* /*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
@ -39,38 +38,36 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
BeforeAll(func(ctx context.Context) { BeforeAll(func(ctx context.Context) {
hostOutput, _, err = runner.Run("nvidia-smi -L") hostOutput, _, err = runner.Run("nvidia-smi -L")
Expect(err).ToNot(HaveOccurred()) Expect(err).ToNot(HaveOccurred())
_, _, err := runner.Run("docker pull ubuntu")
Expect(err).ToNot(HaveOccurred())
}) })
It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) { It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) {
By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all")
containerOutput, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all ubuntu nvidia-smi -L") containerOutput, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all ubuntu nvidia-smi -L")
Expect(err).ToNot(HaveOccurred()) Expect(err).ToNot(HaveOccurred())
Expect(containerOutput).To(Equal(hostOutput)) Expect(containerOutput).To(Equal(hostOutput))
}) })
It("should support automatic CDI spec generation", func(ctx context.Context) { It("should support automatic CDI spec generation", func(ctx context.Context) {
By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all")
containerOutput, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all ubuntu nvidia-smi -L") containerOutput, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all ubuntu nvidia-smi -L")
Expect(err).ToNot(HaveOccurred()) Expect(err).ToNot(HaveOccurred())
Expect(containerOutput).To(Equal(hostOutput)) Expect(containerOutput).To(Equal(hostOutput))
}) })
It("should support automatic CDI spec generation with the --gpus flag", func(ctx context.Context) { It("should support automatic CDI spec generation with the --gpus flag", func(ctx context.Context) {
By("Running docker run with --gpus=all --runtime=nvidia --gpus all")
containerOutput, _, err := runner.Run("docker run --rm -i --gpus=all --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all ubuntu nvidia-smi -L") containerOutput, _, err := runner.Run("docker run --rm -i --gpus=all --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all ubuntu nvidia-smi -L")
Expect(err).ToNot(HaveOccurred()) Expect(err).ToNot(HaveOccurred())
Expect(containerOutput).To(Equal(hostOutput)) Expect(containerOutput).To(Equal(hostOutput))
}) })
It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) { It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) {
By("Running docker run with --runtime=nvidia --gpus all")
containerOutput, _, err := runner.Run("docker run --rm -i --runtime=nvidia --gpus all ubuntu nvidia-smi -L") containerOutput, _, err := runner.Run("docker run --rm -i --runtime=nvidia --gpus all ubuntu nvidia-smi -L")
Expect(err).ToNot(HaveOccurred()) Expect(err).ToNot(HaveOccurred())
Expect(containerOutput).To(Equal(hostOutput)) Expect(containerOutput).To(Equal(hostOutput))
}) })
It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) { It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) {
By("Running docker run with --gpus all")
containerOutput, _, err := runner.Run("docker run --rm -i --gpus all ubuntu nvidia-smi -L") containerOutput, _, err := runner.Run("docker run --rm -i --gpus all ubuntu nvidia-smi -L")
Expect(err).ToNot(HaveOccurred()) Expect(err).ToNot(HaveOccurred())
Expect(containerOutput).To(Equal(hostOutput)) Expect(containerOutput).To(Equal(hostOutput))
@ -82,8 +79,12 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
When("Running the cuda-vectorAdd sample", Ordered, func() { When("Running the cuda-vectorAdd sample", Ordered, func() {
var referenceOutput string var referenceOutput string
BeforeAll(func(ctx context.Context) {
_, _, err := runner.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
Expect(err).ToNot(HaveOccurred())
})
It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) { It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) {
By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all")
var err error var err error
referenceOutput, _, err = runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0") referenceOutput, _, err = runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
Expect(err).ToNot(HaveOccurred()) Expect(err).ToNot(HaveOccurred())
@ -92,21 +93,18 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
}) })
It("should support automatic CDI spec generation", func(ctx context.Context) { It("should support automatic CDI spec generation", func(ctx context.Context) {
By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all")
out2, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0") out2, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
Expect(err).ToNot(HaveOccurred()) Expect(err).ToNot(HaveOccurred())
Expect(referenceOutput).To(Equal(out2)) Expect(referenceOutput).To(Equal(out2))
}) })
It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) { It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) {
By("Running docker run with --runtime=nvidia --gpus all")
out3, _, err := runner.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0") out3, _, err := runner.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
Expect(err).ToNot(HaveOccurred()) Expect(err).ToNot(HaveOccurred())
Expect(referenceOutput).To(Equal(out3)) Expect(referenceOutput).To(Equal(out3))
}) })
It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) { It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) {
By("Running docker run with --gpus all")
out4, _, err := runner.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0") out4, _, err := runner.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
Expect(err).ToNot(HaveOccurred()) Expect(err).ToNot(HaveOccurred())
Expect(referenceOutput).To(Equal(out4)) Expect(referenceOutput).To(Equal(out4))
@ -116,15 +114,14 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
// A deviceQuery sample runs in a container with access to all GPUs // A deviceQuery sample runs in a container with access to all GPUs
// The following should all produce the same result. // The following should all produce the same result.
When("Running the cuda-deviceQuery sample", Ordered, func() { When("Running the cuda-deviceQuery sample", Ordered, func() {
var referenceOutput string
BeforeAll(func(ctx context.Context) { BeforeAll(func(ctx context.Context) {
_, _, err := runner.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0") _, _, err := runner.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
Expect(err).ToNot(HaveOccurred()) Expect(err).ToNot(HaveOccurred())
}) })
var referenceOutput string
It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) { It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) {
By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all")
var err error var err error
referenceOutput, _, err = runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0") referenceOutput, _, err = runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
Expect(err).ToNot(HaveOccurred()) Expect(err).ToNot(HaveOccurred())
@ -132,21 +129,18 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
}) })
It("should support automatic CDI spec generation", func(ctx context.Context) { It("should support automatic CDI spec generation", func(ctx context.Context) {
By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all")
out2, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0") out2, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
Expect(err).ToNot(HaveOccurred()) Expect(err).ToNot(HaveOccurred())
Expect(referenceOutput).To(Equal(out2)) Expect(referenceOutput).To(Equal(out2))
}) })
It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) { It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) {
By("Running docker run with --runtime=nvidia --gpus all")
out3, _, err := runner.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0") out3, _, err := runner.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
Expect(err).ToNot(HaveOccurred()) Expect(err).ToNot(HaveOccurred())
Expect(referenceOutput).To(Equal(out3)) Expect(referenceOutput).To(Equal(out3))
}) })
It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) { It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) {
By("Running docker run with --gpus all")
out4, _, err := runner.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0") out4, _, err := runner.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
Expect(err).ToNot(HaveOccurred()) Expect(err).ToNot(HaveOccurred())
Expect(referenceOutput).To(Equal(out4)) Expect(referenceOutput).To(Equal(out4))
@ -155,6 +149,9 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
When("Testing CUDA Forward compatibility", Ordered, func() { When("Testing CUDA Forward compatibility", Ordered, func() {
BeforeAll(func(ctx context.Context) { BeforeAll(func(ctx context.Context) {
_, _, err := runner.Run("docker pull nvcr.io/nvidia/cuda:12.8.0-base-ubi8")
Expect(err).ToNot(HaveOccurred())
compatOutput, _, err := runner.Run("docker run --rm -i -e NVIDIA_VISIBLE_DEVICES=void nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ls /usr/local/cuda/compat/libcuda.*.*\"") compatOutput, _, err := runner.Run("docker run --rm -i -e NVIDIA_VISIBLE_DEVICES=void nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ls /usr/local/cuda/compat/libcuda.*.*\"")
Expect(err).ToNot(HaveOccurred()) Expect(err).ToNot(HaveOccurred())
Expect(compatOutput).ToNot(BeEmpty()) Expect(compatOutput).ToNot(BeEmpty())
@ -178,21 +175,18 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
}) })
It("should work with the nvidia runtime in legacy mode", func(ctx context.Context) { It("should work with the nvidia runtime in legacy mode", func(ctx context.Context) {
By("Running docker run with -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all")
ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"") ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
Expect(err).ToNot(HaveOccurred()) Expect(err).ToNot(HaveOccurred())
Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat")) Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat"))
}) })
It("should work with the nvidia runtime in CDI mode", func(ctx context.Context) { It("should work with the nvidia runtime in CDI mode", func(ctx context.Context) {
By("Running docker run with -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all")
ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"") ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
Expect(err).ToNot(HaveOccurred()) Expect(err).ToNot(HaveOccurred())
Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat")) Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat"))
}) })
It("should NOT work with nvidia-container-runtime-hook", func(ctx context.Context) { It("should NOT work with nvidia-container-runtime-hook", func(ctx context.Context) {
By("Running docker run with -e NVIDIA_DISABLE_REQUIRE=true --gpus all")
ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=runc --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"") ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=runc --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
Expect(err).ToNot(HaveOccurred()) Expect(err).ToNot(HaveOccurred())
Expect(ldconfigOut).To(ContainSubstring("/usr/lib64")) Expect(ldconfigOut).To(ContainSubstring("/usr/lib64"))

View File

@ -1,6 +1,5 @@
/* /*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.

View File

@ -1,6 +1,8 @@
module github.com/NVIDIA/nvidia-container-toolkit/tests module github.com/NVIDIA/nvidia-container-toolkit/tests
go 1.24.1 go 1.23.2
toolchain go1.24.1
require ( require (
github.com/onsi/ginkgo/v2 v2.23.4 github.com/onsi/ginkgo/v2 v2.23.4