[no-relnote] Update Github Actions E2E
Some checks are pending
CI Pipeline / code-scanning (push) Waiting to run
CI Pipeline / variables (push) Waiting to run
CI Pipeline / golang (push) Waiting to run
CI Pipeline / image (push) Blocked by required conditions
CI Pipeline / e2e-test (push) Blocked by required conditions

Signed-off-by: Carlos Eduardo Arango Gutierrez <eduardoa@nvidia.com>
This commit is contained in:
Carlos Eduardo Arango Gutierrez 2025-04-24 20:50:18 +02:00
parent 6df26cc7a5
commit b58d33b170
No known key found for this signature in database
GPG Key ID: 42D9CB42F300A852
6 changed files with 77 additions and 42 deletions

View File

@ -70,8 +70,8 @@ jobs:
- name: Run e2e tests
env:
IMAGE_NAME: ghcr.io/nvidia/container-toolkit
VERSION: ${{ inputs.version }}
E2E_IMAGE_REPO: ghcr.io/nvidia/container-toolkit
E2E_IMAGE_TAG: ${{ inputs.version }}
SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
E2E_SSH_USER: ${{ secrets.E2E_SSH_USER }}
E2E_SSH_HOST: ${{ steps.holodeck_public_dns_name.outputs.result }}
@ -84,6 +84,13 @@ jobs:
make -f tests/e2e/Makefile test
- name: Archive Ginkgo logs
uses: actions/upload-artifact@v4
with:
name: ginkgo-logs
path: ginkgo.json
retention-days: 15
- name: Send Slack alert notification
if: ${{ failure() }}
uses: slackapi/slack-github-action@v2.0.0

1
.gitignore vendored
View File

@ -11,3 +11,4 @@
/nvidia-ctk
/shared-*
/release-*
/bin

View File

@ -13,14 +13,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.
.PHONY: test-e2e ginkgo
.PHONY: test $(GINKGO_BIN)
GINKGO_ARGS ?=
LOG_ARTIFACTS_DIR ?= $(CURDIR)/e2e_logs
ginkgo:
GINKGO_BIN := $(CURDIR)/bin/ginkgo
test: $(GINKGO_BIN)
$(GINKGO_BIN) $(GINKGO_ARGS) -v --json-report ginkgo.json ./tests/e2e/...
$(GINKGO_BIN):
mkdir -p $(CURDIR)/bin
GOBIN=$(CURDIR)/bin go install github.com/onsi/ginkgo/v2/ginkgo@latest
test-e2e: ginkgo
$(CURDIR)/bin/ginkgo $(GINKGO_ARGS) -v --json-report ginkgo.json ./tests/e2e/...

View File

@ -20,7 +20,7 @@ limitations under the License.
---
## 1 Scope & Goals
This repository contains a **Ginkgo v2 / Gomega** test harness that exercises an
This folder contains a **Ginkgo v2 / Gomega** test harness that exercises an
NVIDIA Container Toolkit (CTK) installation on a **remote GPU-enabled host** via
SSH. The suite validates that:
@ -59,10 +59,11 @@ compatibility runs, and prerelease validation of new CTK builds.
| Variable | Required | Example | Description |
|----------|----------|---------|-------------|
| `E2E_INSTALL_CTK` | ✖ | `true` | When `true` the test installs CTK on the remote host before running the image. When `false` (the default) it assumes CTK is already present. |
| `TOOLKIT_IMAGE` | ✔ | `nvcr.io/nvidia/cuda:12.4.0-runtime-ubi9` | Image that will be pulled & executed. |
| `SSH_KEY` | ✔ | `/home/ci/.ssh/id_rsa` | Private key used for authentication. |
| `SSH_USER` | ✔ | `ubuntu` | Username on the remote host. |
| `REMOTE_HOST` | ✔ | `gpurunner01.corp.local` | Hostname or IP address of the target node. |
| `E2E_IMAGE_REPO` | ✔ | `ghcr.io/nvidia/container-toolkit` | Container Toolkit Image |
| `E2E_IMAGE_TAG` | ✔ | `latest` | Image tag |
| `E2E_SSH_KEY` | ✔ | `/home/ci/.ssh/id_rsa` | Private key used for authentication. |
| `E2E_SSH_USER` | ✔ | `ubuntu` | Username on the remote host. |
| `E2E_SSH_HOST` | ✔ | `10.0.0.0` | Hostname or IP address of the target node. |
| `E2E_SSH_PORT` | ✖ | `22` | SSH port of the target node. Defaults to `22` when unset or not a valid integer. |
> All variables are validated at startup; the suite aborts early with a clear
@ -92,12 +93,13 @@ bin/ginkgo:
### 6.1 Basic invocation
```bash
E2E_INSTALL_CTK=true \
TOOLKIT_IMAGE=nvcr.io/nvidia/cuda:12.4.0-runtime-ubi9 \
E2E_IMAGE_REPO=ghcr.io/nvidia/container-toolkit \
E2E_IMAGE_TAG=<image-tag> \
E2E_SSH_KEY=$HOME/.ssh/id_rsa \
E2E_SSH_USER=ubuntu \
E2E_SSH_HOST=10.0.0.15 \
E2E_SSH_PORT=22 \
make test-e2e
make test
```
This downloads the image on the remote host, installs CTK (if requested), and
executes a minimal CUDA-based workload.

View File

@ -19,6 +19,7 @@ package e2e
import (
"context"
"fmt"
"os"
"path/filepath"
"runtime"
@ -29,6 +30,12 @@ import (
. "github.com/onsi/gomega"
)
// Container image references shared by the e2e tests. Keeping them in one
// place ensures the `docker pull` and `docker run` commands use the same tags.
const (
// vectorAddImage is the cuda-sample image tagged vectoradd-cuda12.5.0.
vectorAddImage = "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0"
// deviceQueryImage is the cuda-sample image tagged devicequery-cuda12.5.0.
deviceQueryImage = "nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0"
// cudaImage is the CUDA 12.8.0 base image (ubi8 variant).
cudaImage = "nvcr.io/nvidia/cuda:12.8.0-base-ubi8"
)
// Test context
var (
ctx context.Context
@ -85,10 +92,12 @@ var _ = BeforeSuite(func() {
_, _, err := runner.Run("docker pull ubuntu")
Expect(err).ToNot(HaveOccurred())
_, _, err = runner.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
_, _, err = runner.Run(fmt.Sprintf("docker pull %s", vectorAddImage))
Expect(err).ToNot(HaveOccurred())
_, _, err = runner.Run("docker pull nvcr.io/nvidia/cuda:12.8.0-base-ubi8")
_, _, err = runner.Run(fmt.Sprintf("docker pull %s", deviceQueryImage))
Expect(err).ToNot(HaveOccurred())
_, _, err = runner.Run(fmt.Sprintf("docker pull %s", cudaImage))
Expect(err).ToNot(HaveOccurred())
})
@ -100,7 +109,7 @@ func getTestEnv() {
_, thisFile, _, _ := runtime.Caller(0)
packagePath = filepath.Dir(thisFile)
installCTK = getBoolEnvVar("INSTALL_CTK", false)
installCTK = getBoolEnvVar("E2E_INSTALL_CTK", false)
ImageRepo = os.Getenv("E2E_IMAGE_REPO")
Expect(ImageRepo).NotTo(BeEmpty(), "E2E_IMAGE_REPO environment variable must be set")
@ -108,17 +117,21 @@ func getTestEnv() {
ImageTag = os.Getenv("E2E_IMAGE_TAG")
Expect(ImageTag).NotTo(BeEmpty(), "E2E_IMAGE_TAG environment variable must be set")
sshKey = os.Getenv("SSH_KEY")
Expect(sshKey).NotTo(BeEmpty(), "SSH_KEY environment variable must be set")
// TODO (@ArangoGutierrez):
// once https://github.com/NVIDIA/nvidia-container-toolkit/pull/602
// is merged, remove this
ImageTag = fmt.Sprintf("%s-ubuntu20.04", ImageTag)
sshUser = os.Getenv("SSH_USER")
Expect(sshUser).NotTo(BeEmpty(), "SSH_USER environment variable must be set")
sshKey = os.Getenv("E2E_SSH_KEY")
Expect(sshKey).NotTo(BeEmpty(), "E2E_SSH_KEY environment variable must be set")
host = os.Getenv("REMOTE_HOST")
Expect(host).NotTo(BeEmpty(), "REMOTE_HOST environment variable must be set")
sshUser = os.Getenv("E2E_SSH_USER")
Expect(sshUser).NotTo(BeEmpty(), "E2E_SSH_USER environment variable must be set")
sshPort = os.Getenv("REMOTE_PORT")
Expect(sshPort).NotTo(BeEmpty(), "REMOTE_PORT environment variable must be set")
host = os.Getenv("E2E_SSH_HOST")
Expect(host).NotTo(BeEmpty(), "E2E_SSH_HOST environment variable must be set")
sshPort = getIntEnvVar("E2E_SSH_PORT", 22)
// Get current working directory
cwd, err = os.Getwd()
@ -137,3 +150,17 @@ func getBoolEnvVar(key string, defaultValue bool) bool {
}
return boolValue
}
// getIntEnvVar reads the environment variable named key, parses it as a
// base-10 integer, and returns that integer formatted back into a string.
//
// The string form of defaultValue is returned instead when the variable is
// unset, empty, or cannot be parsed as an integer. Because the value makes a
// round trip through strconv, inputs such as "+080" are normalised to "80".
func getIntEnvVar(key string, defaultValue int) string {
	result := defaultValue
	if raw := os.Getenv(key); raw != "" {
		// Only override the default when the value is a well-formed integer.
		if parsed, err := strconv.Atoi(raw); err == nil {
			result = parsed
		}
	}
	return strconv.Itoa(result)
}

View File

@ -19,6 +19,7 @@ package e2e
import (
"context"
"fmt"
"path/filepath"
"strings"
@ -85,7 +86,7 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) {
By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all")
var err error
referenceOutput, _, err = runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
referenceOutput, _, err = runner.Run(fmt.Sprintf("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all %s", vectorAddImage))
Expect(err).ToNot(HaveOccurred())
Expect(referenceOutput).To(ContainSubstring("Test PASSED"))
@ -93,21 +94,21 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
It("should support automatic CDI spec generation", func(ctx context.Context) {
By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all")
out2, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
out2, _, err := runner.Run(fmt.Sprintf("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all %s", vectorAddImage))
Expect(err).ToNot(HaveOccurred())
Expect(referenceOutput).To(Equal(out2))
})
It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) {
By("Running docker run with --runtime=nvidia --gpus all")
out3, _, err := runner.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
out3, _, err := runner.Run(fmt.Sprintf("docker run --rm -i --runtime=nvidia --gpus all %s", vectorAddImage))
Expect(err).ToNot(HaveOccurred())
Expect(referenceOutput).To(Equal(out3))
})
It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) {
By("Running docker run with --gpus all")
out4, _, err := runner.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
out4, _, err := runner.Run(fmt.Sprintf("docker run --rm -i --gpus all %s", vectorAddImage))
Expect(err).ToNot(HaveOccurred())
Expect(referenceOutput).To(Equal(out4))
})
@ -116,38 +117,33 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
// A deviceQuery sample runs in a container with access to all GPUs
// The following should all produce the same result.
When("Running the cuda-deviceQuery sample", Ordered, func() {
BeforeAll(func(ctx context.Context) {
_, _, err := runner.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
Expect(err).ToNot(HaveOccurred())
})
var referenceOutput string
It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) {
By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all")
var err error
referenceOutput, _, err = runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
referenceOutput, _, err = runner.Run(fmt.Sprintf("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all %s", deviceQueryImage))
Expect(err).ToNot(HaveOccurred())
Expect(referenceOutput).To(ContainSubstring("Result = PASS"))
})
It("should support automatic CDI spec generation", func(ctx context.Context) {
By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all")
out2, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
out2, _, err := runner.Run(fmt.Sprintf("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all %s", deviceQueryImage))
Expect(err).ToNot(HaveOccurred())
Expect(referenceOutput).To(Equal(out2))
})
It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) {
By("Running docker run with --runtime=nvidia --gpus all")
out3, _, err := runner.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
out3, _, err := runner.Run(fmt.Sprintf("docker run --rm -i --runtime=nvidia --gpus all %s", deviceQueryImage))
Expect(err).ToNot(HaveOccurred())
Expect(referenceOutput).To(Equal(out3))
})
It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) {
By("Running docker run with --gpus all")
out4, _, err := runner.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
out4, _, err := runner.Run(fmt.Sprintf("docker run --rm -i --gpus all %s", deviceQueryImage))
Expect(err).ToNot(HaveOccurred())
Expect(referenceOutput).To(Equal(out4))
})
@ -155,7 +151,7 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
When("Testing CUDA Forward compatibility", Ordered, func() {
BeforeAll(func(ctx context.Context) {
compatOutput, _, err := runner.Run("docker run --rm -i -e NVIDIA_VISIBLE_DEVICES=void nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ls /usr/local/cuda/compat/libcuda.*.*\"")
compatOutput, _, err := runner.Run(fmt.Sprintf("docker run --rm -i -e NVIDIA_VISIBLE_DEVICES=void %s bash -c \"ls /usr/local/cuda/compat/libcuda.*.*\"", cudaImage))
Expect(err).ToNot(HaveOccurred())
Expect(compatOutput).ToNot(BeEmpty())
@ -179,21 +175,21 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
It("should work with the nvidia runtime in legacy mode", func(ctx context.Context) {
By("Running docker run with -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all")
ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
ldconfigOut, _, err := runner.Run(fmt.Sprintf("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia --gpus all %s bash -c \"ldconfig -p | grep libcuda.so.1\"", cudaImage))
Expect(err).ToNot(HaveOccurred())
Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat"))
})
It("should work with the nvidia runtime in CDI mode", func(ctx context.Context) {
By("Running docker run with -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all")
ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
ldconfigOut, _, err := runner.Run(fmt.Sprintf("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all %s bash -c \"ldconfig -p | grep libcuda.so.1\"", cudaImage))
Expect(err).ToNot(HaveOccurred())
Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat"))
})
It("should NOT work with nvidia-container-runtime-hook", func(ctx context.Context) {
By("Running docker run with -e NVIDIA_DISABLE_REQUIRE=true --gpus all")
ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=runc --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
ldconfigOut, _, err := runner.Run(fmt.Sprintf("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=runc --gpus all %s bash -c \"ldconfig -p | grep libcuda.so.1\"", cudaImage))
Expect(err).ToNot(HaveOccurred())
Expect(ldconfigOut).To(ContainSubstring("/usr/lib64"))
})