From 9defe37fa2f5fa5b04bc72fd91d3ebfc34de128a Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Thu, 24 Apr 2025 20:50:18 +0200 Subject: [PATCH] [no-relnote] Update Github Actions E2E Signed-off-by: Carlos Eduardo Arango Gutierrez --- .github/workflows/e2e.yaml | 11 +++- .gitignore | 1 + tests/e2e/Makefile | 12 ++-- tests/e2e/README.md | 20 +++--- tests/e2e/e2e_test.go | 72 +++++++++++++--------- tests/e2e/installer.go | 4 +- tests/e2e/nvidia-container-toolkit_test.go | 34 +++++----- tests/e2e/runner.go | 3 +- tests/go.mod | 4 +- 9 files changed, 91 insertions(+), 70 deletions(-) diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index 3a3275de..6b197d04 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -70,8 +70,8 @@ jobs: - name: Run e2e tests env: - IMAGE_NAME: ghcr.io/nvidia/container-toolkit - VERSION: ${{ inputs.version }} + E2E_IMAGE_REPO: ghcr.io/nvidia/container-toolkit + E2E_IMAGE_TAG: ${{ inputs.version }}-ubuntu20.04 SSH_KEY: ${{ secrets.AWS_SSH_KEY }} E2E_SSH_USER: ${{ secrets.E2E_SSH_USER }} E2E_SSH_HOST: ${{ steps.holodeck_public_dns_name.outputs.result }} @@ -84,6 +84,13 @@ jobs: make -f tests/e2e/Makefile test + - name: Archive Ginkgo logs + uses: actions/upload-artifact@v4 + with: + name: ginkgo-logs + path: ginkgo.json + retention-days: 15 + - name: Send Slack alert notification if: ${{ failure() }} uses: slackapi/slack-github-action@v2.0.0 diff --git a/.gitignore b/.gitignore index f3c93fe5..3ef43479 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ /nvidia-ctk /shared-* /release-* +/bin \ No newline at end of file diff --git a/tests/e2e/Makefile b/tests/e2e/Makefile index c67c5574..2f14fd8a 100644 --- a/tests/e2e/Makefile +++ b/tests/e2e/Makefile @@ -13,14 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-.PHONY: test-e2e ginkgo +.PHONY: test $(GINKGO_BIN) GINKGO_ARGS ?= LOG_ARTIFACTS_DIR ?= $(CURDIR)/e2e_logs -ginkgo: +GINKGO_BIN := $(CURDIR)/bin/ginkgo + +test: $(GINKGO_BIN) + $(GINKGO_BIN) $(GINKGO_ARGS) -v --json-report ginkgo.json ./tests/e2e/... + +$(GINKGO_BIN): mkdir -p $(CURDIR)/bin GOBIN=$(CURDIR)/bin go install github.com/onsi/ginkgo/v2/ginkgo@latest - -test-e2e: ginkgo - $(CURDIR)/bin/ginkgo $(GINKGO_ARGS) -v --json-report ginkgo.json ./tests/e2e/... diff --git a/tests/e2e/README.md b/tests/e2e/README.md index 43d840c3..2caebce2 100644 --- a/tests/e2e/README.md +++ b/tests/e2e/README.md @@ -20,7 +20,7 @@ limitations under the License. --- ## 1 Scope & Goals -This repository contains a **Ginkgo v2 / Gomega** test harness that exercises an +This folder contains a **Ginkgo v2 / Gomega** test harness that exercises an NVIDIA Container Toolkit (CTK) installation on a **remote GPU‑enabled host** via SSH. The suite validates that: @@ -58,12 +58,13 @@ compatibility runs, and pre‑release validation of new CTK builds. | Variable | Required | Example | Description | |----------|----------|---------|-------------| -| `INSTALL_CTK` | ✖ | `true` | When `true` the test installs CTK on the remote host before running the image. When `false` it assumes CTK is already present. | -| `TOOLKIT_IMAGE` | ✔ | `nvcr.io/nvidia/cuda:12.4.0-runtime-ubi9` | Image that will be pulled & executed. | -| `SSH_KEY` | ✔ | `/home/ci/.ssh/id_rsa` | Private key used for authentication. | -| `SSH_USER` | ✔ | `ubuntu` | Username on the remote host. | -| `REMOTE_HOST` | ✔ | `gpurunner01.corp.local` | Hostname or IP address of the target node. | -| `REMOTE_PORT` | ✔ | `22` | SSH port of the target node. | +| `E2E_INSTALL_CTK` | ✖ | `true` | When `true` the test installs CTK on the remote host before running the image. When `false` it assumes CTK is already present. 
| +| `E2E_IMAGE_REPO` | ✔ | `ghcr.io/nvidia/container-toolkit` | Container Toolkit Image | +| `E2E_IMAGE_TAG` | ✔ | `latest` | Image tag | +| `E2E_SSH_KEY` | ✔ | `/home/ci/.ssh/id_rsa` | Private key used for authentication. | +| `E2E_SSH_USER` | ✔ | `ubuntu` | Username on the remote host. | +| `E2E_SSH_HOST` | ✔ | `10.0.0.0` | Hostname or IP address of the target node. | +| `E2E_SSH_PORT` | ✖ | `22` | SSH port of the target node. Defaults to `22` when unset. | > All variables are validated at start‑up; the suite aborts early with a clear > message if any are missing or ill‑formed. @@ -92,12 +93,13 @@ bin/ginkgo: ### 6.1 Basic invocation ```bash INSTALL_CTK=true \ -TOOLKIT_IMAGE=nvcr.io/nvidia/cuda:12.4.0-runtime-ubi9 \ +E2E_IMAGE_REPO=ghcr.io/nvidia/container-toolkit \ +E2E_IMAGE_TAG=latest \ SSH_KEY=$HOME/.ssh/id_rsa \ SSH_USER=ubuntu \ REMOTE_HOST=10.0.0.15 \ REMOTE_PORT=22 \ -make test-e2e +make test ``` This downloads the image on the remote host, installs CTK (if requested), and executes a minimal CUDA‑based workload. diff --git a/tests/e2e/e2e_test.go b/tests/e2e/e2e_test.go index 3d16bf26..41da6521 100644 --- a/tests/e2e/e2e_test.go +++ b/tests/e2e/e2e_test.go @@ -1,6 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,6 +18,7 @@ package e2e import ( "context" + "errors" "os" "path/filepath" "runtime" @@ -81,15 +81,6 @@ var _ = BeforeSuite(func() { err = installer.Install() Expect(err).ToNot(HaveOccurred()) } - - _, _, err := runner.Run("docker pull ubuntu") - Expect(err).ToNot(HaveOccurred()) - - _, _, err = runner.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0") - Expect(err).ToNot(HaveOccurred()) - - _, _, err = runner.Run("docker pull nvcr.io/nvidia/cuda:12.8.0-base-ubi8") - Expect(err).ToNot(HaveOccurred()) }) // getTestEnv gets the test environment variables @@ -100,40 +91,63 @@ func getTestEnv() { _, thisFile, _, _ := runtime.Caller(0) packagePath = filepath.Dir(thisFile) - installCTK = getBoolEnvVar("INSTALL_CTK", false) + installCTK = getEnvVarOrDefault("E2E_INSTALL_CTK", true) - ImageRepo = os.Getenv("E2E_IMAGE_REPO") - Expect(ImageRepo).NotTo(BeEmpty(), "E2E_IMAGE_REPO environment variable must be set") + if installCTK { + ImageRepo = os.Getenv("E2E_IMAGE_REPO") + Expect(ImageRepo).NotTo(BeEmpty(), "E2E_IMAGE_REPO environment variable must be set") - ImageTag = os.Getenv("E2E_IMAGE_TAG") - Expect(ImageTag).NotTo(BeEmpty(), "E2E_IMAGE_TAG environment variable must be set") + ImageTag = os.Getenv("E2E_IMAGE_TAG") + Expect(ImageTag).NotTo(BeEmpty(), "E2E_IMAGE_TAG environment variable must be set") + } - sshKey = os.Getenv("SSH_KEY") - Expect(sshKey).NotTo(BeEmpty(), "SSH_KEY environment variable must be set") + sshKey = os.Getenv("E2E_SSH_KEY") + Expect(sshKey).NotTo(BeEmpty(), "E2E_SSH_KEY environment variable must be set") - sshUser = os.Getenv("SSH_USER") - Expect(sshUser).NotTo(BeEmpty(), "SSH_USER environment variable must be set") + sshUser = os.Getenv("E2E_SSH_USER") + Expect(sshUser).NotTo(BeEmpty(), "E2E_SSH_USER environment variable must be set") - host = os.Getenv("REMOTE_HOST") - Expect(host).NotTo(BeEmpty(), "REMOTE_HOST environment variable must be set") + host = os.Getenv("E2E_SSH_HOST") + Expect(host).NotTo(BeEmpty(), 
"E2E_SSH_HOST environment variable must be set") - sshPort = os.Getenv("REMOTE_PORT") - Expect(sshPort).NotTo(BeEmpty(), "REMOTE_PORT environment variable must be set") + sshPort = getEnvVarOrDefault("E2E_SSH_PORT", "22") // Get current working directory cwd, err = os.Getwd() Expect(err).NotTo(HaveOccurred()) } -// getBoolEnvVar returns the boolean value of the environment variable or the default value if not set. -func getBoolEnvVar(key string, defaultValue bool) bool { +func getEnvVarAs[T any](key string) (T, error) { + var zero T value := os.Getenv(key) if value == "" { - return defaultValue + return zero, errors.New("env var not set") } - boolValue, err := strconv.ParseBool(value) + + switch any(zero).(type) { + case bool: + v, err := strconv.ParseBool(value) + if err != nil { + return zero, err + } + return any(v).(T), nil + case int: + v, err := strconv.Atoi(value) + if err != nil { + return zero, err + } + return any(v).(T), nil + case string: + return any(value).(T), nil + default: + return zero, errors.New("unsupported type") + } +} + +func getEnvVarOrDefault[T any](key string, defaultValue T) T { + val, err := getEnvVarAs[T](key) if err != nil { return defaultValue } - return boolValue + return val } diff --git a/tests/e2e/installer.go b/tests/e2e/installer.go index 9d8afd54..1b08af9f 100644 --- a/tests/e2e/installer.go +++ b/tests/e2e/installer.go @@ -1,6 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package e2e import ( diff --git a/tests/e2e/nvidia-container-toolkit_test.go b/tests/e2e/nvidia-container-toolkit_test.go index 310bbade..f60673ff 100644 --- a/tests/e2e/nvidia-container-toolkit_test.go +++ b/tests/e2e/nvidia-container-toolkit_test.go @@ -1,6 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -39,38 +38,36 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() { BeforeAll(func(ctx context.Context) { hostOutput, _, err = runner.Run("nvidia-smi -L") Expect(err).ToNot(HaveOccurred()) + + _, _, err := runner.Run("docker pull ubuntu") + Expect(err).ToNot(HaveOccurred()) }) It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) { - By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all") containerOutput, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all ubuntu nvidia-smi -L") Expect(err).ToNot(HaveOccurred()) Expect(containerOutput).To(Equal(hostOutput)) }) It("should support automatic CDI spec generation", func(ctx context.Context) { - By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all") containerOutput, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all ubuntu nvidia-smi -L") Expect(err).ToNot(HaveOccurred()) Expect(containerOutput).To(Equal(hostOutput)) }) It("should support automatic CDI spec generation with the --gpus flag", func(ctx context.Context) { - By("Running docker run with --gpus=all --runtime=nvidia --gpus all") containerOutput, _, err := runner.Run("docker run --rm -i --gpus=all --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all 
ubuntu nvidia-smi -L") Expect(err).ToNot(HaveOccurred()) Expect(containerOutput).To(Equal(hostOutput)) }) It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) { - By("Running docker run with --runtime=nvidia --gpus all") containerOutput, _, err := runner.Run("docker run --rm -i --runtime=nvidia --gpus all ubuntu nvidia-smi -L") Expect(err).ToNot(HaveOccurred()) Expect(containerOutput).To(Equal(hostOutput)) }) It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) { - By("Running docker run with --gpus all") containerOutput, _, err := runner.Run("docker run --rm -i --gpus all ubuntu nvidia-smi -L") Expect(err).ToNot(HaveOccurred()) Expect(containerOutput).To(Equal(hostOutput)) @@ -82,8 +79,12 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() { When("Running the cuda-vectorAdd sample", Ordered, func() { var referenceOutput string + BeforeAll(func(ctx context.Context) { + _, _, err := runner.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0") + Expect(err).ToNot(HaveOccurred()) + }) + It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) { - By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all") var err error referenceOutput, _, err = runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0") Expect(err).ToNot(HaveOccurred()) @@ -92,21 +93,18 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() { }) It("should support automatic CDI spec generation", func(ctx context.Context) { - By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all") out2, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0") Expect(err).ToNot(HaveOccurred()) Expect(referenceOutput).To(Equal(out2)) }) 
It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) { - By("Running docker run with --runtime=nvidia --gpus all") out3, _, err := runner.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0") Expect(err).ToNot(HaveOccurred()) Expect(referenceOutput).To(Equal(out3)) }) It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) { - By("Running docker run with --gpus all") out4, _, err := runner.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0") Expect(err).ToNot(HaveOccurred()) Expect(referenceOutput).To(Equal(out4)) @@ -116,15 +114,14 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() { // A deviceQuery sample runs in a container with access to all GPUs // The following should all produce the same result. When("Running the cuda-deviceQuery sample", Ordered, func() { + var referenceOutput string + BeforeAll(func(ctx context.Context) { _, _, err := runner.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0") Expect(err).ToNot(HaveOccurred()) }) - var referenceOutput string - It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) { - By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all") var err error referenceOutput, _, err = runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0") Expect(err).ToNot(HaveOccurred()) @@ -132,21 +129,18 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() { }) It("should support automatic CDI spec generation", func(ctx context.Context) { - By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all") out2, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all 
nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0") Expect(err).ToNot(HaveOccurred()) Expect(referenceOutput).To(Equal(out2)) }) It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) { - By("Running docker run with --runtime=nvidia --gpus all") out3, _, err := runner.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0") Expect(err).ToNot(HaveOccurred()) Expect(referenceOutput).To(Equal(out3)) }) It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) { - By("Running docker run with --gpus all") out4, _, err := runner.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0") Expect(err).ToNot(HaveOccurred()) Expect(referenceOutput).To(Equal(out4)) @@ -155,6 +149,9 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() { When("Testing CUDA Forward compatibility", Ordered, func() { BeforeAll(func(ctx context.Context) { + _, _, err := runner.Run("docker pull nvcr.io/nvidia/cuda:12.8.0-base-ubi8") + Expect(err).ToNot(HaveOccurred()) + compatOutput, _, err := runner.Run("docker run --rm -i -e NVIDIA_VISIBLE_DEVICES=void nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ls /usr/local/cuda/compat/libcuda.*.*\"") Expect(err).ToNot(HaveOccurred()) Expect(compatOutput).ToNot(BeEmpty()) @@ -178,21 +175,18 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() { }) It("should work with the nvidia runtime in legacy mode", func(ctx context.Context) { - By("Running docker run with -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all") ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"") Expect(err).ToNot(HaveOccurred()) Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat")) }) It("should work with the nvidia 
runtime in CDI mode", func(ctx context.Context) { - By("Running docker run with -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all") ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"") Expect(err).ToNot(HaveOccurred()) Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat")) }) It("should NOT work with nvidia-container-runtime-hook", func(ctx context.Context) { - By("Running docker run with -e NVIDIA_DISABLE_REQUIRE=true --gpus all") ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=runc --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"") Expect(err).ToNot(HaveOccurred()) Expect(ldconfigOut).To(ContainSubstring("/usr/lib64")) diff --git a/tests/e2e/runner.go b/tests/e2e/runner.go index 1afb4226..54a74265 100644 --- a/tests/e2e/runner.go +++ b/tests/e2e/runner.go @@ -1,6 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/tests/go.mod b/tests/go.mod index eafdaab0..3c28ebc9 100644 --- a/tests/go.mod +++ b/tests/go.mod @@ -1,6 +1,8 @@ module github.com/NVIDIA/nvidia-container-toolkit/tests -go 1.24.1 +go 1.23.2 + +toolchain go1.24.1 require ( github.com/onsi/ginkgo/v2 v2.23.4