mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2025-06-26 18:18:24 +00:00
[no-relnote] Update Github Actions E2E
Some checks are pending
Some checks are pending
Signed-off-by: Carlos Eduardo Arango Gutierrez <eduardoa@nvidia.com>
This commit is contained in:
parent
6df26cc7a5
commit
f9792b737a
1
.github/workflows/ci.yaml
vendored
1
.github/workflows/ci.yaml
vendored
@ -51,3 +51,4 @@ jobs:
|
||||
uses: ./.github/workflows/e2e.yaml
|
||||
with:
|
||||
version: ${{ needs.variables.outputs.version }}
|
||||
distribution: ubuntu20.04
|
||||
|
||||
16
.github/workflows/e2e.yaml
vendored
16
.github/workflows/e2e.yaml
vendored
@ -20,6 +20,9 @@ on:
|
||||
version:
|
||||
required: true
|
||||
type: string
|
||||
distribution:
|
||||
required: true
|
||||
type: string
|
||||
secrets:
|
||||
AWS_ACCESS_KEY_ID:
|
||||
required: true
|
||||
@ -70,8 +73,8 @@ jobs:
|
||||
|
||||
- name: Run e2e tests
|
||||
env:
|
||||
IMAGE_NAME: ghcr.io/nvidia/container-toolkit
|
||||
VERSION: ${{ inputs.version }}
|
||||
E2E_IMAGE_REPO: ghcr.io/nvidia/container-toolkit
|
||||
E2E_IMAGE_TAG: ${{ inputs.version }}-${{ inputs.distribution }}
|
||||
SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
|
||||
E2E_SSH_USER: ${{ secrets.E2E_SSH_USER }}
|
||||
E2E_SSH_HOST: ${{ steps.holodeck_public_dns_name.outputs.result }}
|
||||
@ -82,8 +85,15 @@ jobs:
|
||||
chmod 600 "$e2e_ssh_key"
|
||||
export E2E_SSH_KEY="$e2e_ssh_key"
|
||||
|
||||
make -f tests/e2e/Makefile test
|
||||
make -f tests/e2e/Makefile test-e2e
|
||||
|
||||
- name: Archive Ginkgo logs
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ginkgo-logs
|
||||
path: ginkgo.json
|
||||
retention-days: 15
|
||||
|
||||
- name: Send Slack alert notification
|
||||
if: ${{ failure() }}
|
||||
uses: slackapi/slack-github-action@v2.0.0
|
||||
|
||||
@ -59,9 +59,10 @@ compatibility runs, and pre‑release validation of new CTK builds.
|
||||
| Variable | Required | Example | Description |
|
||||
|----------|----------|---------|-------------|
|
||||
| `E2E_INSTALL_CTK` | ✖ | `true` | When `true` the test installs CTK on the remote host before running the image. When `false` (the default) it assumes CTK is already present. |
|
||||
| `TOOLKIT_IMAGE` | ✔ | `nvcr.io/nvidia/cuda:12.4.0-runtime-ubi9` | Image that will be pulled & executed. |
|
||||
| `SSH_KEY` | ✔ | `/home/ci/.ssh/id_rsa` | Private key used for authentication. |
|
||||
| `SSH_USER` | ✔ | `ubuntu` | Username on the remote host. |
|
||||
| `E2E_IMAGE_REPO` | ✔ | `ghcr.io/nvidia/container-toolkit` | Container Toolkit Image |
|
||||
| `E2E_IMAGE_TAG` | ✔ | `latest` | Image tag |
|
||||
| `E2E_SSH_KEY` | ✔ | `/home/ci/.ssh/id_rsa` | Private key used for authentication. |
|
||||
| `E2E_SSH_USER` | ✔ | `ubuntu` | Username on the remote host. |
|
||||
| `E2E_SSH_HOST` | ✔ | `gpurunner01.corp.local` | Hostname or IP address of the target node. |
|
||||
| `E2E_SSH_PORT` | ✖ | `22` | SSH port of the target node. Defaults to `22` when unset or not a valid integer. |
|
||||
|
||||
@ -92,7 +93,8 @@ bin/ginkgo:
|
||||
### 6.1 Basic invocation
|
||||
```bash
|
||||
E2E_INSTALL_CTK=true \
|
||||
TOOLKIT_IMAGE=nvcr.io/nvidia/cuda:12.4.0-runtime-ubi9 \
|
||||
E2E_IMAGE_REPO=ghcr.io/nvidia/container-toolkit \
|
||||
E2E_IMAGE_TAG=latest \
|
||||
E2E_SSH_KEY=$HOME/.ssh/id_rsa \
|
||||
E2E_SSH_USER=ubuntu \
|
||||
E2E_SSH_HOST=10.0.0.15 \
|
||||
|
||||
@ -29,6 +29,12 @@ import (
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
const (
|
||||
vectorAddImage = "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0"
|
||||
deviceQueryImage = "nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0"
|
||||
cudaImage = "nvcr.io/nvidia/cuda:12.8.0-base-ubi8"
|
||||
)
|
||||
|
||||
// Test context
|
||||
var (
|
||||
ctx context.Context
|
||||
@ -88,6 +94,8 @@ var _ = BeforeSuite(func() {
|
||||
_, _, err = runner.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
_, _, err = runner.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
_, _, err = runner.Run("docker pull nvcr.io/nvidia/cuda:12.8.0-base-ubi8")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
})
|
||||
@ -100,7 +108,7 @@ func getTestEnv() {
|
||||
_, thisFile, _, _ := runtime.Caller(0)
|
||||
packagePath = filepath.Dir(thisFile)
|
||||
|
||||
installCTK = getBoolEnvVar("INSTALL_CTK", false)
|
||||
installCTK = getBoolEnvVar("E2E_INSTALL_CTK", false)
|
||||
|
||||
ImageRepo = os.Getenv("E2E_IMAGE_REPO")
|
||||
Expect(ImageRepo).NotTo(BeEmpty(), "E2E_IMAGE_REPO environment variable must be set")
|
||||
@ -108,17 +116,16 @@ func getTestEnv() {
|
||||
ImageTag = os.Getenv("E2E_IMAGE_TAG")
|
||||
Expect(ImageTag).NotTo(BeEmpty(), "E2E_IMAGE_TAG environment variable must be set")
|
||||
|
||||
sshKey = os.Getenv("SSH_KEY")
|
||||
Expect(sshKey).NotTo(BeEmpty(), "SSH_KEY environment variable must be set")
|
||||
sshKey = os.Getenv("E2E_SSH_KEY")
|
||||
Expect(sshKey).NotTo(BeEmpty(), "E2E_SSH_KEY environment variable must be set")
|
||||
|
||||
sshUser = os.Getenv("SSH_USER")
|
||||
sshUser = os.Getenv("E2E_SSH_USER")
|
||||
Expect(sshUser).NotTo(BeEmpty(), "SSH_USER environment variable must be set")
|
||||
|
||||
host = os.Getenv("REMOTE_HOST")
|
||||
host = os.Getenv("E2E_SSH_HOST")
|
||||
Expect(host).NotTo(BeEmpty(), "REMOTE_HOST environment variable must be set")
|
||||
|
||||
sshPort = os.Getenv("REMOTE_PORT")
|
||||
Expect(sshPort).NotTo(BeEmpty(), "REMOTE_PORT environment variable must be set")
|
||||
sshPort = getIntEnvVar("E2E_SSH_PORT", 22)
|
||||
|
||||
// Get current working directory
|
||||
cwd, err = os.Getwd()
|
||||
@ -137,3 +144,17 @@ func getBoolEnvVar(key string, defaultValue bool) bool {
|
||||
}
|
||||
return boolValue
|
||||
}
|
||||
|
||||
// getIntEnvVar returns the environment variable's integer value formatted as a decimal string, or the default value (also as a string) if the variable is unset or is not a valid integer.
|
||||
func getIntEnvVar(key string, defaultValue int) string {
|
||||
value := os.Getenv(key)
|
||||
if value == "" {
|
||||
return strconv.Itoa(defaultValue)
|
||||
}
|
||||
intValue, err := strconv.Atoi(value)
|
||||
if err != nil {
|
||||
return strconv.Itoa(defaultValue)
|
||||
}
|
||||
|
||||
return strconv.Itoa(intValue)
|
||||
}
|
||||
|
||||
@ -19,6 +19,7 @@ package e2e
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
@ -85,7 +86,7 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
|
||||
It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) {
|
||||
By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all")
|
||||
var err error
|
||||
referenceOutput, _, err = runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
|
||||
referenceOutput, _, err = runner.Run(fmt.Sprintf("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all %s", vectorAddImage))
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
Expect(referenceOutput).To(ContainSubstring("Test PASSED"))
|
||||
@ -93,21 +94,21 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
|
||||
|
||||
It("should support automatic CDI spec generation", func(ctx context.Context) {
|
||||
By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all")
|
||||
out2, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
|
||||
out2, _, err := runner.Run(fmt.Sprintf("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all %s", vectorAddImage))
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(referenceOutput).To(Equal(out2))
|
||||
})
|
||||
|
||||
It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) {
|
||||
By("Running docker run with --runtime=nvidia --gpus all")
|
||||
out3, _, err := runner.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
|
||||
out3, _, err := runner.Run(fmt.Sprintf("docker run --rm -i --runtime=nvidia --gpus all %s", vectorAddImage))
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(referenceOutput).To(Equal(out3))
|
||||
})
|
||||
|
||||
It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) {
|
||||
By("Running docker run with --gpus all")
|
||||
out4, _, err := runner.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
|
||||
out4, _, err := runner.Run(fmt.Sprintf("docker run --rm -i --gpus all %s", vectorAddImage))
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(referenceOutput).To(Equal(out4))
|
||||
})
|
||||
@ -116,38 +117,33 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
|
||||
// A deviceQuery sample runs in a container with access to all GPUs
|
||||
// The following should all produce the same result.
|
||||
When("Running the cuda-deviceQuery sample", Ordered, func() {
|
||||
BeforeAll(func(ctx context.Context) {
|
||||
_, _, err := runner.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
})
|
||||
|
||||
var referenceOutput string
|
||||
|
||||
It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) {
|
||||
By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all")
|
||||
var err error
|
||||
referenceOutput, _, err = runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
|
||||
referenceOutput, _, err = runner.Run(fmt.Sprintf("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all %s", deviceQueryImage))
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(referenceOutput).To(ContainSubstring("Result = PASS"))
|
||||
})
|
||||
|
||||
It("should support automatic CDI spec generation", func(ctx context.Context) {
|
||||
By("Running docker run with --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all")
|
||||
out2, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
|
||||
out2, _, err := runner.Run(fmt.Sprintf("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all %s", deviceQueryImage))
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(referenceOutput).To(Equal(out2))
|
||||
})
|
||||
|
||||
It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) {
|
||||
By("Running docker run with --runtime=nvidia --gpus all")
|
||||
out3, _, err := runner.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
|
||||
out3, _, err := runner.Run(fmt.Sprintf("docker run --rm -i --runtime=nvidia --gpus all %s", deviceQueryImage))
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(referenceOutput).To(Equal(out3))
|
||||
})
|
||||
|
||||
It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) {
|
||||
By("Running docker run with --gpus all")
|
||||
out4, _, err := runner.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
|
||||
out4, _, err := runner.Run(fmt.Sprintf("docker run --rm -i --gpus all %s", deviceQueryImage))
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(referenceOutput).To(Equal(out4))
|
||||
})
|
||||
@ -155,7 +151,7 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
|
||||
|
||||
When("Testing CUDA Forward compatibility", Ordered, func() {
|
||||
BeforeAll(func(ctx context.Context) {
|
||||
compatOutput, _, err := runner.Run("docker run --rm -i -e NVIDIA_VISIBLE_DEVICES=void nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ls /usr/local/cuda/compat/libcuda.*.*\"")
|
||||
compatOutput, _, err := runner.Run(fmt.Sprintf("docker run --rm -i -e NVIDIA_VISIBLE_DEVICES=void %s bash -c \"ls /usr/local/cuda/compat/libcuda.*.*\"", cudaImage))
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(compatOutput).ToNot(BeEmpty())
|
||||
|
||||
@ -179,21 +175,21 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
|
||||
|
||||
It("should work with the nvidia runtime in legacy mode", func(ctx context.Context) {
|
||||
By("Running docker run with -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all")
|
||||
ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
|
||||
ldconfigOut, _, err := runner.Run(fmt.Sprintf("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia --gpus all %s bash -c \"ldconfig -p | grep libcuda.so.1\"", cudaImage))
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat"))
|
||||
})
|
||||
|
||||
It("should work with the nvidia runtime in CDI mode", func(ctx context.Context) {
|
||||
By("Running docker run with -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all")
|
||||
ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
|
||||
ldconfigOut, _, err := runner.Run(fmt.Sprintf("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all %s bash -c \"ldconfig -p | grep libcuda.so.1\"", cudaImage))
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat"))
|
||||
})
|
||||
|
||||
It("should NOT work with nvidia-container-runtime-hook", func(ctx context.Context) {
|
||||
By("Running docker run with -e NVIDIA_DISABLE_REQUIRE=true --gpus all")
|
||||
ldconfigOut, _, err := runner.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=runc --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
|
||||
ldconfigOut, _, err := runner.Run(fmt.Sprintf("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=runc --gpus all %s bash -c \"ldconfig -p | grep libcuda.so.1\"", cudaImage))
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(ldconfigOut).To(ContainSubstring("/usr/lib64"))
|
||||
})
|
||||
|
||||
Loading…
Reference in New Issue
Block a user