diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index 5c6ace97..5772da3f 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -70,20 +70,25 @@ jobs: - name: Run e2e tests env: - IMAGE_NAME: ghcr.io/nvidia/container-toolkit - VERSION: ${{ inputs.version }} - SSH_KEY: ${{ secrets.AWS_SSH_KEY }} + E2E_INSTALL_CTK: "true" + E2E_IMAGE_NAME: ghcr.io/nvidia/container-toolkit + E2E_IMAGE_TAG: ${{ inputs.version }}-ubuntu20.04 E2E_SSH_USER: ${{ secrets.E2E_SSH_USER }} E2E_SSH_HOST: ${{ steps.holodeck_public_dns_name.outputs.result }} - E2E_INSTALL_CTK: "true" run: | e2e_ssh_key=$(mktemp) - echo "$SSH_KEY" > "$e2e_ssh_key" + echo "${{ secrets.AWS_SSH_KEY }}" > "$e2e_ssh_key" chmod 600 "$e2e_ssh_key" export E2E_SSH_KEY="$e2e_ssh_key" make -f tests/e2e/Makefile test + - name: Archive Ginkgo logs + uses: actions/upload-artifact@v4 + with: + name: ginkgo-logs + path: ginkgo.json + retention-days: 15 - name: Send Slack alert notification if: ${{ failure() }} uses: slackapi/slack-github-action@v2.1.0 @@ -94,5 +99,5 @@ jobs: channel: ${{ secrets.SLACK_CHANNEL_ID }} text: | :x: On repository ${{ github.repository }}, the Workflow *${{ github.workflow }}* has failed. - + Details: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }} diff --git a/.gitignore b/.gitignore index f3c93fe5..7fe5237b 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ /nvidia-ctk /shared-* /release-* +/bin diff --git a/tests/e2e/Makefile b/tests/e2e/Makefile index cc11366e..2f14fd8a 100644 --- a/tests/e2e/Makefile +++ b/tests/e2e/Makefile @@ -1,4 +1,5 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -12,34 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -GO_CMD ?= go +.PHONY: test $(GINKGO_BIN) -include $(CURDIR)/versions.mk +GINKGO_ARGS ?= +LOG_ARTIFACTS_DIR ?= $(CURDIR)/e2e_logs -E2E_RUNTIME ?= docker +GINKGO_BIN := $(CURDIR)/bin/ginkgo -E2E_INSTALL_CTK ?= false +test: $(GINKGO_BIN) + $(GINKGO_BIN) $(GINKGO_ARGS) -v --json-report ginkgo.json ./tests/e2e/... -ifeq ($($(DIST)),) -DIST ?= ubuntu20.04 -endif -IMAGE_TAG ?= $(VERSION)-$(DIST) -IMAGE = $(IMAGE_NAME):$(IMAGE_TAG) - -E2E_SSH_KEY ?= -E2E_SSH_USER ?= -E2E_SSH_HOST ?= -E2E_SSH_PORT ?= 22 - -.PHONY: test -test: - cd $(CURDIR)/tests/e2e && $(GO_CMD) test -v . -args \ - -ginkgo.focus="$(E2E_RUNTIME)" \ - -test.timeout=1h \ - -ginkgo.v \ - -install-ctk=$(E2E_INSTALL_CTK) \ - -toolkit-image=$(IMAGE) \ - -ssh-key=$(E2E_SSH_KEY) \ - -ssh-user=$(E2E_SSH_USER) \ - -remote-host=$(E2E_SSH_HOST) \ - -remote-port=$(E2E_SSH_PORT) +$(GINKGO_BIN): + mkdir -p $(CURDIR)/bin + GOBIN=$(CURDIR)/bin go install github.com/onsi/ginkgo/v2/ginkgo@latest diff --git a/tests/e2e/e2e_test.go b/tests/e2e/e2e_test.go index e31a88f0..d6b80dd5 100644 --- a/tests/e2e/e2e_test.go +++ b/tests/e2e/e2e_test.go @@ -1,24 +1,27 @@ -/* -* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. - */ +/** +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ package e2e import ( "context" - "flag" + "errors" + "os" + "strconv" "testing" . "github.com/onsi/ginkgo/v2" @@ -31,33 +34,86 @@ var ( installCTK bool - image string + imageName string + imageTag string sshKey string sshUser string - host string + sshHost string sshPort string ) -func init() { - flag.BoolVar(&installCTK, "install-ctk", false, "Install the NVIDIA Container Toolkit") - flag.StringVar(&image, "toolkit-image", "", "Repository of the image to test") - flag.StringVar(&sshKey, "ssh-key", "", "SSH key to use for remote login") - flag.StringVar(&sshUser, "ssh-user", "", "SSH user to use for remote login") - flag.StringVar(&host, "remote-host", "", "Hostname of the remote machine") - flag.StringVar(&sshPort, "remote-port", "22", "SSH port to use for remote login") -} - func TestMain(t *testing.T) { - suiteName := "NVIDIA Container Toolkit E2E" + suiteName := "E2E NVIDIA Container Toolkit" RegisterFailHandler(Fail) + + ctx = context.Background() + getTestEnv() + RunSpecs(t, suiteName, ) } -// BeforeSuite runs before the test suite -var _ = BeforeSuite(func() { - ctx = context.Background() -}) +// getTestEnv gets the test environment variables +func getTestEnv() { + defer GinkgoRecover() + + installCTK = getEnvVarOrDefault("E2E_INSTALL_CTK", false) + + if installCTK { + imageName = getRequiredEnvvar[string]("E2E_IMAGE_NAME") + + imageTag = 
getRequiredEnvvar[string]("E2E_IMAGE_TAG")
+
+	}
+
+	sshKey = getRequiredEnvvar[string]("E2E_SSH_KEY")
+	sshUser = getRequiredEnvvar[string]("E2E_SSH_USER")
+	sshHost = getRequiredEnvvar[string]("E2E_SSH_HOST")
+
+	sshPort = getEnvVarOrDefault("E2E_SSH_PORT", "22")
+}
+
+// getRequiredEnvvar returns the specified envvar if set or raises an error.
+func getRequiredEnvvar[T any](key string) T {
+	v, err := getEnvVarAs[T](key)
+	Expect(err).To(BeNil(), "required environment variable not set", key)
+	return v
+}
+
+func getEnvVarAs[T any](key string) (T, error) {
+	var zero T
+	value := os.Getenv(key)
+	if value == "" {
+		return zero, errors.New("env var not set")
+	}
+
+	switch any(zero).(type) {
+	case bool:
+		v, err := strconv.ParseBool(value)
+		if err != nil {
+			return zero, err
+		}
+		return any(v).(T), nil
+	case int:
+		v, err := strconv.Atoi(value)
+		if err != nil {
+			return zero, err
+		}
+		return any(v).(T), nil
+	case string:
+		return any(value).(T), nil
+	default:
+		return zero, errors.New("unsupported type")
+	}
+}
+
+func getEnvVarOrDefault[T any](key string, defaultValue T) T {
+	val, err := getEnvVarAs[T](key)
+	if err != nil {
+		return defaultValue
+	}
+	return val
+}
diff --git a/tests/e2e/nvidia-container-toolkit_test.go b/tests/e2e/nvidia-container-toolkit_test.go
index 5948014b..2b1ef289 100644
--- a/tests/e2e/nvidia-container-toolkit_test.go
+++ b/tests/e2e/nvidia-container-toolkit_test.go
@@ -27,23 +27,25 @@ import (
 
 // Integration tests for Docker runtime
 var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
-	var r Runner
+	var runner Runner
 
 	// Install the NVIDIA Container Toolkit
 	BeforeAll(func(ctx context.Context) {
-		r = NewRunner(
-			WithHost(host),
+		runner = NewRunner(
+			WithHost(sshHost),
 			WithPort(sshPort),
 			WithSshKey(sshKey),
 			WithSshUser(sshUser),
 		)
+
 		if installCTK {
 			installer, err := NewToolkitInstaller(
-				WithRunner(r),
-				WithImage(image),
+				WithRunner(runner),
+				WithImage(imageName+":"+imageTag),
WithTemplate(dockerInstallTemplate), ) Expect(err).ToNot(HaveOccurred()) + err = installer.Install() Expect(err).ToNot(HaveOccurred()) } @@ -55,41 +57,42 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() { // the same output When("running nvidia-smi -L", Ordered, func() { var hostOutput string + var err error BeforeAll(func(ctx context.Context) { - _, _, err := r.Run("docker pull ubuntu") + hostOutput, _, err = runner.Run("nvidia-smi -L") Expect(err).ToNot(HaveOccurred()) - hostOutput, _, err = r.Run("nvidia-smi -L") + _, _, err := runner.Run("docker pull ubuntu") Expect(err).ToNot(HaveOccurred()) }) It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) { - containerOutput, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all ubuntu nvidia-smi -L") + containerOutput, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all ubuntu nvidia-smi -L") Expect(err).ToNot(HaveOccurred()) Expect(containerOutput).To(Equal(hostOutput)) }) It("should support automatic CDI spec generation", func(ctx context.Context) { - containerOutput, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all ubuntu nvidia-smi -L") + containerOutput, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all ubuntu nvidia-smi -L") Expect(err).ToNot(HaveOccurred()) Expect(containerOutput).To(Equal(hostOutput)) }) It("should support automatic CDI spec generation with the --gpus flag", func(ctx context.Context) { - containerOutput, _, err := r.Run("docker run --rm -i --gpus=all --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all ubuntu nvidia-smi -L") + containerOutput, _, err := runner.Run("docker run --rm -i --gpus=all --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all ubuntu nvidia-smi -L") Expect(err).ToNot(HaveOccurred()) Expect(containerOutput).To(Equal(hostOutput)) 
}) It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) { - containerOutput, _, err := r.Run("docker run --rm -i --runtime=nvidia --gpus all ubuntu nvidia-smi -L") + containerOutput, _, err := runner.Run("docker run --rm -i --runtime=nvidia --gpus all ubuntu nvidia-smi -L") Expect(err).ToNot(HaveOccurred()) Expect(containerOutput).To(Equal(hostOutput)) }) It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) { - containerOutput, _, err := r.Run("docker run --rm -i --gpus all ubuntu nvidia-smi -L") + containerOutput, _, err := runner.Run("docker run --rm -i --gpus all ubuntu nvidia-smi -L") Expect(err).ToNot(HaveOccurred()) Expect(containerOutput).To(Equal(hostOutput)) }) @@ -98,35 +101,35 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() { // A vectorAdd sample runs in a container with access to all GPUs. // The following should all produce the same result. When("Running the cuda-vectorAdd sample", Ordered, func() { + var referenceOutput string + BeforeAll(func(ctx context.Context) { - _, _, err := r.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0") + _, _, err := runner.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0") Expect(err).ToNot(HaveOccurred()) }) - var referenceOutput string - It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) { var err error - referenceOutput, _, err = r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0") + referenceOutput, _, err = runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0") Expect(err).ToNot(HaveOccurred()) Expect(referenceOutput).To(ContainSubstring("Test PASSED")) }) It("should support automatic CDI spec generation", func(ctx context.Context) { - out2, _, err := r.Run("docker run --rm -i --runtime=nvidia -e 
NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0") + out2, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0") Expect(err).ToNot(HaveOccurred()) Expect(referenceOutput).To(Equal(out2)) }) It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) { - out3, _, err := r.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0") + out3, _, err := runner.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0") Expect(err).ToNot(HaveOccurred()) Expect(referenceOutput).To(Equal(out3)) }) It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) { - out4, _, err := r.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0") + out4, _, err := runner.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0") Expect(err).ToNot(HaveOccurred()) Expect(referenceOutput).To(Equal(out4)) }) @@ -135,54 +138,52 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() { // A deviceQuery sample runs in a container with access to all GPUs // The following should all produce the same result. 
When("Running the cuda-deviceQuery sample", Ordered, func() { + var referenceOutput string + BeforeAll(func(ctx context.Context) { - _, _, err := r.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0") + _, _, err := runner.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0") Expect(err).ToNot(HaveOccurred()) }) - var referenceOutput string - It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) { var err error - referenceOutput, _, err = r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0") + referenceOutput, _, err = runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0") Expect(err).ToNot(HaveOccurred()) - Expect(referenceOutput).To(ContainSubstring("Result = PASS")) }) It("should support automatic CDI spec generation", func(ctx context.Context) { - out2, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0") + out2, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0") Expect(err).ToNot(HaveOccurred()) Expect(referenceOutput).To(Equal(out2)) }) It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) { - out3, _, err := r.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0") + out3, _, err := runner.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0") Expect(err).ToNot(HaveOccurred()) Expect(referenceOutput).To(Equal(out3)) }) It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) { - out4, _, err := r.Run("docker run --rm -i --gpus all 
nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0") + out4, _, err := runner.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0") Expect(err).ToNot(HaveOccurred()) Expect(referenceOutput).To(Equal(out4)) }) }) - Describe("CUDA Forward compatibility", Ordered, func() { + When("Testing CUDA Forward compatibility", Ordered, func() { BeforeAll(func(ctx context.Context) { - _, _, err := r.Run("docker pull nvcr.io/nvidia/cuda:12.8.0-base-ubi8") + _, _, err := runner.Run("docker pull nvcr.io/nvidia/cuda:12.8.0-base-ubi8") Expect(err).ToNot(HaveOccurred()) - }) - BeforeAll(func(ctx context.Context) { - compatOutput, _, err := r.Run("docker run --rm -i -e NVIDIA_VISIBLE_DEVICES=void nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ls /usr/local/cuda/compat/libcuda.*.*\"") + compatOutput, _, err := runner.Run("docker run --rm -i -e NVIDIA_VISIBLE_DEVICES=void nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ls /usr/local/cuda/compat/libcuda.*.*\"") Expect(err).ToNot(HaveOccurred()) Expect(compatOutput).ToNot(BeEmpty()) + compatDriverVersion := strings.TrimPrefix(filepath.Base(compatOutput), "libcuda.so.") compatMajor := strings.SplitN(compatDriverVersion, ".", 2)[0] - driverOutput, _, err := r.Run("nvidia-smi -q | grep \"Driver Version\"") + driverOutput, _, err := runner.Run("nvidia-smi -q | grep \"Driver Version\"") Expect(err).ToNot(HaveOccurred()) parts := strings.SplitN(driverOutput, ":", 2) Expect(parts).To(HaveLen(2)) @@ -198,19 +199,19 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() { }) It("should work with the nvidia runtime in legacy mode", func(ctx context.Context) { - ldconfigOut, _, err := r.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"") + ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 
bash -c \"ldconfig -p | grep libcuda.so.1\"") Expect(err).ToNot(HaveOccurred()) Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat")) }) It("should work with the nvidia runtime in CDI mode", func(ctx context.Context) { - ldconfigOut, _, err := r.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"") + ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"") Expect(err).ToNot(HaveOccurred()) Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat")) }) It("should NOT work with nvidia-container-runtime-hook", func(ctx context.Context) { - ldconfigOut, _, err := r.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=runc --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"") + ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=runc --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"") Expect(err).ToNot(HaveOccurred()) Expect(ldconfigOut).To(ContainSubstring("/usr/lib64")) })