From de1e004c43c7e2d89bb949c8a92dd98a9b3fc39d Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Thu, 24 Apr 2025 20:50:18 +0200 Subject: [PATCH] [no-relnote] Update Github Actions E2E Signed-off-by: Carlos Eduardo Arango Gutierrez --- .github/workflows/e2e.yaml | 13 ++++++-- tests/e2e/README.md | 10 ++++--- tests/e2e/e2e_test.go | 35 +++++++++++++++++----- tests/e2e/nvidia-container-toolkit_test.go | 30 ++++++++----------- 4 files changed, 57 insertions(+), 31 deletions(-) diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index 3a3275de..e1937aaa 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -70,8 +70,8 @@ jobs: - name: Run e2e tests env: - IMAGE_NAME: ghcr.io/nvidia/container-toolkit - VERSION: ${{ inputs.version }} + E2E_IMAGE_REPO: ghcr.io/nvidia/container-toolkit + E2E_IMAGE_TAG: ${{ inputs.version }}-ubuntu20.04 SSH_KEY: ${{ secrets.AWS_SSH_KEY }} E2E_SSH_USER: ${{ secrets.E2E_SSH_USER }} E2E_SSH_HOST: ${{ steps.holodeck_public_dns_name.outputs.result }} @@ -82,8 +82,15 @@ jobs: chmod 600 "$e2e_ssh_key" export E2E_SSH_KEY="$e2e_ssh_key" - make -f tests/e2e/Makefile test + make -f tests/e2e/Makefile test-e2e + - name: Archive Ginkgo logs + uses: actions/upload-artifact@v4 + with: + name: ginkgo-logs + path: ginkgo.json + retention-days: 15 + - name: Send Slack alert notification if: ${{ failure() }} uses: slackapi/slack-github-action@v2.0.0 diff --git a/tests/e2e/README.md b/tests/e2e/README.md index 43d840c3..2beb2c96 100644 --- a/tests/e2e/README.md +++ b/tests/e2e/README.md @@ -59,9 +59,10 @@ compatibility runs, and pre‑release validation of new CTK builds. | Variable | Required | Example | Description | |----------|----------|---------|-------------| | `INSTALL_CTK` | ✖ | `true` | When `true` the test installs CTK on the remote host before running the image. When `false` it assumes CTK is already present. | -| `TOOLKIT_IMAGE` | ✔ | `nvcr.io/nvidia/cuda:12.4.0-runtime-ubi9` | Image that will be pulled & executed. | -| `SSH_KEY` | ✔ | `/home/ci/.ssh/id_rsa` | Private key used for authentication. | -| `SSH_USER` | ✔ | `ubuntu` | Username on the remote host. | +| `E2E_IMAGE_REPO` | ✔ | `ghcr.io/nvidia/container-toolkit` | Container Toolkit Image | +| `E2E_IMAGE_TAG` | ✔ | `latest` | Image tag | +| `E2E_SSH_KEY` | ✔ | `/home/ci/.ssh/id_rsa` | Private key used for authentication. | +| `E2E_SSH_USER` | ✔ | `ubuntu` | Username on the remote host. | | `REMOTE_HOST` | ✔ | `gpurunner01.corp.local` | Hostname or IP address of the target node. | | `REMOTE_PORT` | ✔ | `22` | SSH port of the target node. | @@ -92,7 +93,8 @@ bin/ginkgo: ### 6.1 Basic invocation ```bash INSTALL_CTK=true \ -TOOLKIT_IMAGE=nvcr.io/nvidia/cuda:12.4.0-runtime-ubi9 \ +E2E_IMAGE_REPO=ghcr.io/nvidia/container-toolkit \ +E2E_IMAGE_TAG=latest \ SSH_KEY=$HOME/.ssh/id_rsa \ SSH_USER=ubuntu \ REMOTE_HOST=10.0.0.15 \ diff --git a/tests/e2e/e2e_test.go b/tests/e2e/e2e_test.go index 3d16bf26..4b8e4a34 100644 --- a/tests/e2e/e2e_test.go +++ b/tests/e2e/e2e_test.go @@ -29,6 +29,12 @@ import ( . "github.com/onsi/gomega" ) +const ( + vectorAddImage = "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0" + deviceQueryImage = "nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0" + cudaImage = "nvcr.io/nvidia/cuda:12.8.0-base-ubi8" +) + // Test context var ( ctx context.Context @@ -88,6 +94,8 @@ var _ = BeforeSuite(func() { _, _, err = runner.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0") Expect(err).ToNot(HaveOccurred()) + _, _, err = runner.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0") + Expect(err).ToNot(HaveOccurred()) _, _, err = runner.Run("docker pull nvcr.io/nvidia/cuda:12.8.0-base-ubi8") Expect(err).ToNot(HaveOccurred()) }) @@ -100,7 +108,7 @@ func getTestEnv() { _, thisFile, _, _ := runtime.Caller(0) packagePath = filepath.Dir(thisFile) - installCTK = getBoolEnvVar("INSTALL_CTK", false) + installCTK = getBoolEnvVar("E2E_INSTALL_CTK", false) ImageRepo = os.Getenv("E2E_IMAGE_REPO") Expect(ImageRepo).NotTo(BeEmpty(), "E2E_IMAGE_REPO environment variable must be set") @@ -108,17 +116,16 @@ func getTestEnv() { ImageTag = os.Getenv("E2E_IMAGE_TAG") Expect(ImageTag).NotTo(BeEmpty(), "E2E_IMAGE_TAG environment variable must be set") - sshKey = os.Getenv("SSH_KEY") - Expect(sshKey).NotTo(BeEmpty(), "SSH_KEY environment variable must be set") + sshKey = os.Getenv("E2E_SSH_KEY") + Expect(sshKey).NotTo(BeEmpty(), "E2E_SSH_KEY environment variable must be set") - sshUser = os.Getenv("SSH_USER") + sshUser = os.Getenv("E2E_SSH_USER") Expect(sshUser).NotTo(BeEmpty(), "SSH_USER environment variable must be set") - host = os.Getenv("REMOTE_HOST") + host = os.Getenv("E2E_SSH_HOST") Expect(host).NotTo(BeEmpty(), "REMOTE_HOST environment variable must be set") - sshPort = os.Getenv("REMOTE_PORT") - Expect(sshPort).NotTo(BeEmpty(), "REMOTE_PORT environment variable must be set") + sshPort = getIntEnvVar("E2E_SSH_PORT", 22) // Get current working directory cwd, err = os.Getwd() @@ -137,3 +144,17 @@ func getBoolEnvVar(key string, defaultValue bool) bool { } return boolValue } + +// getIntEnvVar returns the integer value of the environment variable or the default value if not set. +func getIntEnvVar(key string, defaultValue int) string { + value := os.Getenv(key) + if value == "" { + return strconv.Itoa(defaultValue) + } + intValue, err := strconv.Atoi(value) + if err != nil { + return strconv.Itoa(defaultValue) + } + + return strconv.Itoa(intValue) +} diff --git a/tests/e2e/nvidia-container-toolkit_test.go b/tests/e2e/nvidia-container-toolkit_test.go index 310bbade..7bac6a38 100644 --- a/tests/e2e/nvidia-container-toolkit_test.go +++ b/tests/e2e/nvidia-container-toolkit_test.go @@ -19,6 +19,7 @@ package e2e import ( "context" + "fmt" "path/filepath" "strings" @@ -85,7 +86,7 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() { It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) { By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all") var err error - referenceOutput, _, err = runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0") + referenceOutput, _, err = runner.Run(fmt.Sprintf("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all %s", vectorAddImage)) Expect(err).ToNot(HaveOccurred()) Expect(referenceOutput).To(ContainSubstring("Test PASSED")) @@ -93,21 +94,21 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() { It("should support automatic CDI spec generation", func(ctx context.Context) { By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all") - out2, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0") + out2, _, err := runner.Run(fmt.Sprintf("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all %s", vectorAddImage)) Expect(err).ToNot(HaveOccurred()) Expect(referenceOutput).To(Equal(out2)) }) It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) { By("Running docker run with --runtime=nvidia --gpus all") - out3, _, err := runner.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0") + out3, _, err := runner.Run(fmt.Sprintf("docker run --rm -i --runtime=nvidia --gpus all %s", vectorAddImage)) Expect(err).ToNot(HaveOccurred()) Expect(referenceOutput).To(Equal(out3)) }) It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) { By("Running docker run with --gpus all") - out4, _, err := runner.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0") + out4, _, err := runner.Run(fmt.Sprintf("docker run --rm -i --gpus all %s", vectorAddImage)) Expect(err).ToNot(HaveOccurred()) Expect(referenceOutput).To(Equal(out4)) }) @@ -116,38 +117,33 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() { // A deviceQuery sample runs in a container with access to all GPUs // The following should all produce the same result. When("Running the cuda-deviceQuery sample", Ordered, func() { - BeforeAll(func(ctx context.Context) { - _, _, err := runner.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0") - Expect(err).ToNot(HaveOccurred()) - }) - var referenceOutput string It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) { By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all") var err error - referenceOutput, _, err = runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0") + referenceOutput, _, err = runner.Run(fmt.Sprintf("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all %s", deviceQueryImage)) Expect(err).ToNot(HaveOccurred()) Expect(referenceOutput).To(ContainSubstring("Result = PASS")) }) It("should support automatic CDI spec generation", func(ctx context.Context) { By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all") - out2, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0") + out2, _, err := runner.Run(fmt.Sprintf("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all %s", deviceQueryImage)) Expect(err).ToNot(HaveOccurred()) Expect(referenceOutput).To(Equal(out2)) }) It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) { By("Running docker run with --runtime=nvidia --gpus all") - out3, _, err := runner.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0") + out3, _, err := runner.Run(fmt.Sprintf("docker run --rm -i --runtime=nvidia --gpus all %s", deviceQueryImage)) Expect(err).ToNot(HaveOccurred()) Expect(referenceOutput).To(Equal(out3)) }) It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) { By("Running docker run with --gpus all") - out4, _, err := runner.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0") + out4, _, err := runner.Run(fmt.Sprintf("docker run --rm -i --gpus all %s", deviceQueryImage)) Expect(err).ToNot(HaveOccurred()) Expect(referenceOutput).To(Equal(out4)) }) @@ -155,7 +151,7 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() { When("Testing CUDA Forward compatibility", Ordered, func() { BeforeAll(func(ctx context.Context) { - compatOutput, _, err := runner.Run("docker run --rm -i -e NVIDIA_VISIBLE_DEVICES=void nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ls /usr/local/cuda/compat/libcuda.*.*\"") + compatOutput, _, err := runner.Run(fmt.Sprintf("docker run --rm -i -e NVIDIA_VISIBLE_DEVICES=void %s bash -c \"ls /usr/local/cuda/compat/libcuda.*.*\"", cudaImage)) Expect(err).ToNot(HaveOccurred()) Expect(compatOutput).ToNot(BeEmpty()) @@ -179,21 +175,21 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() { It("should work with the nvidia runtime in legacy mode", func(ctx context.Context) { By("Running docker run with -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all") - ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"") + ldconfigOut, _, err := runner.Run(fmt.Sprintf("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia --gpus all %s bash -c \"ldconfig -p | grep libcuda.so.1\"", cudaImage)) Expect(err).ToNot(HaveOccurred()) Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat")) }) It("should work with the nvidia runtime in CDI mode", func(ctx context.Context) { By("Running docker run with -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all") - ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"") + ldconfigOut, _, err := runner.Run(fmt.Sprintf("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all %s bash -c \"ldconfig -p | grep libcuda.so.1\"", cudaImage)) Expect(err).ToNot(HaveOccurred()) Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat")) }) It("should NOT work with nvidia-container-runtime-hook", func(ctx context.Context) { By("Running docker run with -e NVIDIA_DISABLE_REQUIRE=true --gpus all") - ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=runc --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"") + ldconfigOut, _, err := runner.Run(fmt.Sprintf("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=runc --gpus all %s bash -c \"ldconfig -p | grep libcuda.so.1\"", cudaImage)) Expect(err).ToNot(HaveOccurred()) Expect(ldconfigOut).To(ContainSubstring("/usr/lib64")) })