Move test to tests

Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
Evan Lezar
2025-02-28 17:21:23 +02:00
parent 91a983a341
commit 069926e4b6
807 changed files with 444269 additions and 78 deletions

45
tests/e2e/Makefile Normal file
View File

@@ -0,0 +1,45 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
GO_CMD ?= go
include $(CURDIR)/versions.mk
E2E_RUNTIME ?= docker
E2E_INSTALL_CTK ?= false
ifeq ($($(DIST)),)
DIST ?= ubuntu20.04
endif
IMAGE_TAG ?= $(VERSION)-$(DIST)
IMAGE = $(IMAGE_NAME):$(IMAGE_TAG)
E2E_SSH_KEY ?=
E2E_SSH_USER ?=
E2E_SSH_HOST ?=
E2E_SSH_PORT ?= 22
.PHONY: test
test:
cd $(CURDIR)/tests/e2e && $(GO_CMD) test -v . -args \
-ginkgo.focus="$(E2E_RUNTIME)" \
-test.timeout=1h \
-ginkgo.v \
-install-ctk=$(E2E_INSTALL_CTK) \
-toolkit-image=$(IMAGE) \
-ssh-key=$(E2E_SSH_KEY) \
-ssh-user=$(E2E_SSH_USER) \
-remote-host=$(E2E_SSH_HOST) \
-remote-port=$(E2E_SSH_PORT)

63
tests/e2e/e2e_test.go Normal file
View File

@@ -0,0 +1,63 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package e2e
import (
"context"
"flag"
"testing"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// Test context
var (
ctx context.Context
installCTK bool
image string
sshKey string
sshUser string
host string
sshPort string
)
func init() {
flag.BoolVar(&installCTK, "install-ctk", false, "Install the NVIDIA Container Toolkit")
flag.StringVar(&image, "toolkit-image", "", "Repository of the image to test")
flag.StringVar(&sshKey, "ssh-key", "", "SSH key to use for remote login")
flag.StringVar(&sshUser, "ssh-user", "", "SSH user to use for remote login")
flag.StringVar(&host, "remote-host", "", "Hostname of the remote machine")
flag.StringVar(&sshPort, "remote-port", "22", "SSH port to use for remote login")
}
func TestMain(t *testing.T) {
suiteName := "NVIDIA Container Toolkit E2E"
RegisterFailHandler(Fail)
RunSpecs(t,
suiteName,
)
}
// BeforeSuite runs before the test suite
var _ = BeforeSuite(func() {
ctx = context.Background()
})

30
tests/e2e/infra/aws.yaml Normal file
View File

@@ -0,0 +1,30 @@
apiVersion: holodeck.nvidia.com/v1alpha1
kind: Environment
metadata:
name: HOLODECK_NAME
description: "end-to-end test infrastructure"
spec:
provider: aws
auth:
keyName: cnt-ci
privateKey: HOLODECK_PRIVATE_KEY
instance:
type: g4dn.xlarge
region: us-west-1
ingressIpRanges:
- 18.190.12.32/32
- 3.143.46.93/32
- 44.230.241.223/32
- 44.235.4.62/32
- 52.15.119.136/32
- 52.24.205.48/32
image:
architecture: amd64
imageId: ami-0ce2cb35386fc22e9
containerRuntime:
install: true
name: docker
nvidiaContainerToolkit:
install: false
nvidiaDriver:
install: true

118
tests/e2e/installer.go Normal file
View File

@@ -0,0 +1,118 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package e2e
import (
"bytes"
"fmt"
"text/template"
)
// dockerInstallTemplate is a template for installing the NVIDIA Container Toolkit
// on a host using Docker.
var dockerInstallTemplate = `
#! /usr/bin/env bash
set -xe
: ${IMAGE:={{.Image}}}
# Create a temporary directory
TEMP_DIR="/tmp/ctk_e2e.$(date +%s)_$RANDOM"
mkdir -p "$TEMP_DIR"
# Given that docker has an init function that checks for the existence of the
# nvidia-container-toolkit, we need to create a symlink to the nvidia-container-runtime-hook
# in the /usr/bin directory.
# See https://github.com/moby/moby/blob/20a05dabf44934447d1a66cdd616cc803b81d4e2/daemon/nvidia_linux.go#L32-L46
sudo rm -f /usr/bin/nvidia-container-runtime-hook
sudo ln -s "$TEMP_DIR/toolkit/nvidia-container-runtime-hook" /usr/bin/nvidia-container-runtime-hook
docker run --pid=host --rm -i --privileged \
-v /:/host \
-v /var/run/docker.sock:/var/run/docker.sock \
-v "$TEMP_DIR:$TEMP_DIR" \
-v /etc/docker:/config-root \
${IMAGE} \
--root "$TEMP_DIR" \
--runtime=docker \
--config=/config-root/daemon.json \
--driver-root=/ \
--no-daemon \
--restart-mode=systemd
`
type ToolkitInstaller struct {
runner Runner
template string
Image string
}
type installerOption func(*ToolkitInstaller)
func WithRunner(r Runner) installerOption {
return func(i *ToolkitInstaller) {
i.runner = r
}
}
func WithImage(image string) installerOption {
return func(i *ToolkitInstaller) {
i.Image = image
}
}
func WithTemplate(template string) installerOption {
return func(i *ToolkitInstaller) {
i.template = template
}
}
func NewToolkitInstaller(opts ...installerOption) (*ToolkitInstaller, error) {
i := &ToolkitInstaller{
runner: localRunner{},
template: dockerInstallTemplate,
}
for _, opt := range opts {
opt(i)
}
if i.Image == "" {
return nil, fmt.Errorf("image is required")
}
return i, nil
}
func (i *ToolkitInstaller) Install() error {
// Parse the combined template
tmpl, err := template.New("installScript").Parse(i.template)
if err != nil {
return fmt.Errorf("error parsing template: %w", err)
}
// Execute the template
var renderedScript bytes.Buffer
err = tmpl.Execute(&renderedScript, i)
if err != nil {
return fmt.Errorf("error executing template: %w", err)
}
_, _, err = i.runner.Run(renderedScript.String())
return err
}

View File

@@ -0,0 +1,218 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package e2e
import (
"context"
"path/filepath"
"strings"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// Integration tests for Docker runtime
var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
var r Runner
// Install the NVIDIA Container Toolkit
BeforeAll(func(ctx context.Context) {
r = NewRunner(
WithHost(host),
WithPort(sshPort),
WithSshKey(sshKey),
WithSshUser(sshUser),
)
if installCTK {
installer, err := NewToolkitInstaller(
WithRunner(r),
WithImage(image),
WithTemplate(dockerInstallTemplate),
)
Expect(err).ToNot(HaveOccurred())
err = installer.Install()
Expect(err).ToNot(HaveOccurred())
}
})
// GPUs are accessible in a container: Running nvidia-smi -L inside the
// container shows the same output inside the container as outside the
// container. This means that the following commands must all produce
// the same output
When("running nvidia-smi -L", Ordered, func() {
var hostOutput string
BeforeAll(func(ctx context.Context) {
_, _, err := r.Run("docker pull ubuntu")
Expect(err).ToNot(HaveOccurred())
hostOutput, _, err = r.Run("nvidia-smi -L")
Expect(err).ToNot(HaveOccurred())
})
It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) {
containerOutput, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all ubuntu nvidia-smi -L")
Expect(err).ToNot(HaveOccurred())
Expect(containerOutput).To(Equal(hostOutput))
})
It("should support automatic CDI spec generation", func(ctx context.Context) {
containerOutput, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all ubuntu nvidia-smi -L")
Expect(err).ToNot(HaveOccurred())
Expect(containerOutput).To(Equal(hostOutput))
})
It("should support automatic CDI spec generation with the --gpus flag", func(ctx context.Context) {
containerOutput, _, err := r.Run("docker run --rm -i --gpus=all --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all ubuntu nvidia-smi -L")
Expect(err).ToNot(HaveOccurred())
Expect(containerOutput).To(Equal(hostOutput))
})
It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) {
containerOutput, _, err := r.Run("docker run --rm -i --runtime=nvidia --gpus all ubuntu nvidia-smi -L")
Expect(err).ToNot(HaveOccurred())
Expect(containerOutput).To(Equal(hostOutput))
})
It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) {
containerOutput, _, err := r.Run("docker run --rm -i --gpus all ubuntu nvidia-smi -L")
Expect(err).ToNot(HaveOccurred())
Expect(containerOutput).To(Equal(hostOutput))
})
})
// A vectorAdd sample runs in a container with access to all GPUs.
// The following should all produce the same result.
When("Running the cuda-vectorAdd sample", Ordered, func() {
BeforeAll(func(ctx context.Context) {
_, _, err := r.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
Expect(err).ToNot(HaveOccurred())
})
var referenceOutput string
It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) {
var err error
referenceOutput, _, err = r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
Expect(err).ToNot(HaveOccurred())
Expect(referenceOutput).To(ContainSubstring("Test PASSED"))
})
It("should support automatic CDI spec generation", func(ctx context.Context) {
out2, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
Expect(err).ToNot(HaveOccurred())
Expect(referenceOutput).To(Equal(out2))
})
It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) {
out3, _, err := r.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
Expect(err).ToNot(HaveOccurred())
Expect(referenceOutput).To(Equal(out3))
})
It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) {
out4, _, err := r.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
Expect(err).ToNot(HaveOccurred())
Expect(referenceOutput).To(Equal(out4))
})
})
// A deviceQuery sample runs in a container with access to all GPUs
// The following should all produce the same result.
When("Running the cuda-deviceQuery sample", Ordered, func() {
BeforeAll(func(ctx context.Context) {
_, _, err := r.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
Expect(err).ToNot(HaveOccurred())
})
var referenceOutput string
It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) {
var err error
referenceOutput, _, err = r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
Expect(err).ToNot(HaveOccurred())
Expect(referenceOutput).To(ContainSubstring("Result = PASS"))
})
It("should support automatic CDI spec generation", func(ctx context.Context) {
out2, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
Expect(err).ToNot(HaveOccurred())
Expect(referenceOutput).To(Equal(out2))
})
It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) {
out3, _, err := r.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
Expect(err).ToNot(HaveOccurred())
Expect(referenceOutput).To(Equal(out3))
})
It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) {
out4, _, err := r.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
Expect(err).ToNot(HaveOccurred())
Expect(referenceOutput).To(Equal(out4))
})
})
Describe("CUDA Forward compatibility", Ordered, func() {
BeforeAll(func(ctx context.Context) {
_, _, err := r.Run("docker pull nvcr.io/nvidia/cuda:12.8.0-base-ubi8")
Expect(err).ToNot(HaveOccurred())
})
BeforeAll(func(ctx context.Context) {
compatOutput, _, err := r.Run("docker run --rm -i -e NVIDIA_VISIBLE_DEVICES=void nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ls /usr/local/cuda/compat/libcuda.*.*\"")
Expect(err).ToNot(HaveOccurred())
Expect(compatOutput).ToNot(BeEmpty())
compatDriverVersion := strings.TrimPrefix(filepath.Base(compatOutput), "libcuda.so.")
compatMajor := strings.SplitN(compatDriverVersion, ".", 2)[0]
driverOutput, _, err := r.Run("nvidia-smi -q | grep \"Driver Version\"")
Expect(err).ToNot(HaveOccurred())
parts := strings.SplitN(driverOutput, ":", 2)
Expect(parts).To(HaveLen(2))
hostDriverVersion := strings.TrimSpace(parts[1])
Expect(hostDriverVersion).ToNot(BeEmpty())
driverMajor := strings.SplitN(hostDriverVersion, ".", 2)[0]
if driverMajor >= compatMajor {
GinkgoLogr.Info("CUDA Forward Compatibility tests require an older driver version", "hostDriverVersion", hostDriverVersion, "compatDriverVersion", compatDriverVersion)
Skip("CUDA Forward Compatibility tests require an older driver version")
}
})
It("should work with the nvidia runtime in legacy mode", func(ctx context.Context) {
ldconfigOut, _, err := r.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
Expect(err).ToNot(HaveOccurred())
Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat"))
})
It("should work with the nvidia runtime in CDI mode", func(ctx context.Context) {
ldconfigOut, _, err := r.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
Expect(err).ToNot(HaveOccurred())
Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat"))
})
It("should NOT work with nvidia-container-runtime-hook", func(ctx context.Context) {
ldconfigOut, _, err := r.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=runc --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
Expect(err).ToNot(HaveOccurred())
Expect(ldconfigOut).To(ContainSubstring("/usr/lib64"))
})
})
})

171
tests/e2e/runner.go Normal file
View File

@@ -0,0 +1,171 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package e2e
import (
"bytes"
"fmt"
"os"
"os/exec"
"time"
"golang.org/x/crypto/ssh"
)
type localRunner struct{}
type remoteRunner struct {
sshKey string
sshUser string
host string
port string
}
type runnerOption func(*remoteRunner)
type Runner interface {
Run(script string) (string, string, error)
}
func WithSshKey(key string) runnerOption {
return func(r *remoteRunner) {
r.sshKey = key
}
}
func WithSshUser(user string) runnerOption {
return func(r *remoteRunner) {
r.sshUser = user
}
}
func WithHost(host string) runnerOption {
return func(r *remoteRunner) {
r.host = host
}
}
func WithPort(port string) runnerOption {
return func(r *remoteRunner) {
r.port = port
}
}
func NewRunner(opts ...runnerOption) Runner {
r := &remoteRunner{}
for _, opt := range opts {
opt(r)
}
// If the Host is empty, return a local runner
if r.host == "" {
return localRunner{}
}
// Otherwise, return a remote runner
return r
}
func (l localRunner) Run(script string) (string, string, error) {
// Create a command to run the script using bash
cmd := exec.Command("bash", "-c", script)
// Buffer to capture standard output
var stdout bytes.Buffer
cmd.Stdout = &stdout
// Buffer to capture standard error
var stderr bytes.Buffer
cmd.Stderr = &stderr
// Run the command
err := cmd.Run()
if err != nil {
return "", "", fmt.Errorf("script execution failed: %v\nSTDOUT: %s\nSTDERR: %s", err, stdout.String(), stderr.String())
}
// Return the captured stdout and nil error
return stdout.String(), "", nil
}
func (r remoteRunner) Run(script string) (string, string, error) {
// Create a new SSH connection
client, err := connectOrDie(r.sshKey, r.sshUser, r.host, r.port)
if err != nil {
return "", "", fmt.Errorf("failed to connect to %s: %v", r.host, err)
}
defer client.Close()
// Create a session
session, err := client.NewSession()
if err != nil {
return "", "", fmt.Errorf("failed to create session: %v", err)
}
defer session.Close()
// Capture stdout and stderr
var stdout, stderr bytes.Buffer
session.Stdout = &stdout
session.Stderr = &stderr
// Run the script
err = session.Run(script)
if err != nil {
return "", "", fmt.Errorf("script execution failed: %v\nSTDOUT: %s\nSTDERR: %s", err, stdout.String(), stderr.String())
}
// Return stdout as string if no errors
return stdout.String(), "", nil
}
// createSshClient creates a ssh client, and retries if it fails to connect
func connectOrDie(sshKey, sshUser, host, port string) (*ssh.Client, error) {
var client *ssh.Client
var err error
key, err := os.ReadFile(sshKey)
if err != nil {
return nil, fmt.Errorf("failed to read key file: %v", err)
}
signer, err := ssh.ParsePrivateKey(key)
if err != nil {
return nil, fmt.Errorf("failed to parse private key: %v", err)
}
sshConfig := &ssh.ClientConfig{
User: sshUser,
Auth: []ssh.AuthMethod{
ssh.PublicKeys(signer),
},
HostKeyCallback: ssh.InsecureIgnoreHostKey(),
}
connectionFailed := false
for i := 0; i < 20; i++ {
client, err = ssh.Dial("tcp", host+":"+port, sshConfig)
if err == nil {
return client, nil // Connection succeeded, return the client.
}
connectionFailed = true
// Sleep for a brief moment before retrying.
// You can adjust the duration based on your requirements.
time.Sleep(1 * time.Second)
}
if connectionFailed {
return nil, fmt.Errorf("failed to connect to %s after 10 retries, giving up", host)
}
return client, nil
}