mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2025-06-26 18:18:24 +00:00
45
tests/e2e/Makefile
Normal file
45
tests/e2e/Makefile
Normal file
@@ -0,0 +1,45 @@
|
||||
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Go toolchain binary used to build and run the end-to-end tests.
GO_CMD ?= go

# NOTE(review): presumably defines VERSION and IMAGE_NAME, which are used
# below but not set in this file -- confirm against versions.mk.
include $(CURDIR)/versions.mk

# Passed to the suite as the Ginkgo focus expression.
E2E_RUNTIME ?= docker

# Passed to the suite as -install-ctk.
E2E_INSTALL_CTK ?= false

# Distribution suffix of the image tag. A plain conditional assignment is
# enough: it only applies when DIST is not already defined, so no extra
# ifeq guard (and no computed variable name) is needed.
DIST ?= ubuntu20.04
IMAGE_TAG ?= $(VERSION)-$(DIST)
IMAGE = $(IMAGE_NAME):$(IMAGE_TAG)

# SSH settings forwarded to the suite; E2E_SSH_HOST is passed as -remote-host.
E2E_SSH_KEY ?=
E2E_SSH_USER ?=
E2E_SSH_HOST ?=
E2E_SSH_PORT ?= 22

# Runs the Ginkgo suite located in tests/e2e below the current directory.
.PHONY: test
test:
	cd $(CURDIR)/tests/e2e && $(GO_CMD) test -v . -args \
		-ginkgo.focus="$(E2E_RUNTIME)" \
		-test.timeout=1h \
		-ginkgo.v \
		-install-ctk=$(E2E_INSTALL_CTK) \
		-toolkit-image=$(IMAGE) \
		-ssh-key=$(E2E_SSH_KEY) \
		-ssh-user=$(E2E_SSH_USER) \
		-remote-host=$(E2E_SSH_HOST) \
		-remote-port=$(E2E_SSH_PORT)
|
||||
63
tests/e2e/e2e_test.go
Normal file
63
tests/e2e/e2e_test.go
Normal file
@@ -0,0 +1,63 @@
|
||||
/*
|
||||
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package e2e
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"testing"
|
||||
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
// Test context: package-level settings shared by the specs in this package.
var (
	// ctx is assigned a background context in BeforeSuite.
	ctx context.Context

	// installCTK is bound to the -install-ctk flag.
	installCTK bool

	// image is bound to the -toolkit-image flag.
	image string

	// sshKey is bound to the -ssh-key flag.
	sshKey string
	// sshUser is bound to the -ssh-user flag.
	sshUser string
	// host is bound to the -remote-host flag.
	host string
	// sshPort is bound to the -remote-port flag (default "22").
	sshPort string
)
|
||||
|
||||
func init() {
|
||||
flag.BoolVar(&installCTK, "install-ctk", false, "Install the NVIDIA Container Toolkit")
|
||||
flag.StringVar(&image, "toolkit-image", "", "Repository of the image to test")
|
||||
flag.StringVar(&sshKey, "ssh-key", "", "SSH key to use for remote login")
|
||||
flag.StringVar(&sshUser, "ssh-user", "", "SSH user to use for remote login")
|
||||
flag.StringVar(&host, "remote-host", "", "Hostname of the remote machine")
|
||||
flag.StringVar(&sshPort, "remote-port", "22", "SSH port to use for remote login")
|
||||
}
|
||||
|
||||
func TestMain(t *testing.T) {
|
||||
suiteName := "NVIDIA Container Toolkit E2E"
|
||||
|
||||
RegisterFailHandler(Fail)
|
||||
RunSpecs(t,
|
||||
suiteName,
|
||||
)
|
||||
}
|
||||
|
||||
// BeforeSuite runs before the test suite
|
||||
var _ = BeforeSuite(func() {
|
||||
ctx = context.Background()
|
||||
})
|
||||
30
tests/e2e/infra/aws.yaml
Normal file
30
tests/e2e/infra/aws.yaml
Normal file
@@ -0,0 +1,30 @@
|
||||
apiVersion: holodeck.nvidia.com/v1alpha1
|
||||
kind: Environment
|
||||
metadata:
|
||||
name: HOLODECK_NAME
|
||||
description: "end-to-end test infrastructure"
|
||||
spec:
|
||||
provider: aws
|
||||
auth:
|
||||
keyName: cnt-ci
|
||||
privateKey: HOLODECK_PRIVATE_KEY
|
||||
instance:
|
||||
type: g4dn.xlarge
|
||||
region: us-west-1
|
||||
ingressIpRanges:
|
||||
- 18.190.12.32/32
|
||||
- 3.143.46.93/32
|
||||
- 44.230.241.223/32
|
||||
- 44.235.4.62/32
|
||||
- 52.15.119.136/32
|
||||
- 52.24.205.48/32
|
||||
image:
|
||||
architecture: amd64
|
||||
imageId: ami-0ce2cb35386fc22e9
|
||||
containerRuntime:
|
||||
install: true
|
||||
name: docker
|
||||
nvidiaContainerToolkit:
|
||||
install: false
|
||||
nvidiaDriver:
|
||||
install: true
|
||||
118
tests/e2e/installer.go
Normal file
118
tests/e2e/installer.go
Normal file
@@ -0,0 +1,118 @@
|
||||
/*
|
||||
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package e2e
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"text/template"
|
||||
)
|
||||
|
||||
// dockerInstallTemplate is a text/template source for a bash script that
// installs the NVIDIA Container Toolkit on a host using Docker.
//
// The only template action is {{.Image}}, which supplies the default value of
// the script's IMAGE variable. The rendered script creates a temporary
// directory under /tmp, points /usr/bin/nvidia-container-runtime-hook at the
// hook inside that directory, and then runs the image as a privileged
// container with that directory passed as --root.
var dockerInstallTemplate = `
#! /usr/bin/env bash
set -xe

: ${IMAGE:={{.Image}}}

# Create a temporary directory
TEMP_DIR="/tmp/ctk_e2e.$(date +%s)_$RANDOM"
mkdir -p "$TEMP_DIR"

# Given that docker has an init function that checks for the existence of the
# nvidia-container-toolkit, we need to create a symlink to the nvidia-container-runtime-hook
# in the /usr/bin directory.
# See https://github.com/moby/moby/blob/20a05dabf44934447d1a66cdd616cc803b81d4e2/daemon/nvidia_linux.go#L32-L46
sudo rm -f /usr/bin/nvidia-container-runtime-hook
sudo ln -s "$TEMP_DIR/toolkit/nvidia-container-runtime-hook" /usr/bin/nvidia-container-runtime-hook

docker run --pid=host --rm -i --privileged \
	-v /:/host \
	-v /var/run/docker.sock:/var/run/docker.sock \
	-v "$TEMP_DIR:$TEMP_DIR" \
	-v /etc/docker:/config-root \
	${IMAGE} \
	--root "$TEMP_DIR" \
	--runtime=docker \
	--config=/config-root/daemon.json \
	--driver-root=/ \
	--no-daemon \
	--restart-mode=systemd
`
|
||||
|
||||
// ToolkitInstaller renders an installation script from a template and hands
// the result to a Runner for execution.
type ToolkitInstaller struct {
	// runner executes the rendered script.
	runner Runner
	// template is the text/template source of the installation script.
	template string

	// Image is exported so that the template can reference it as {{.Image}};
	// NewToolkitInstaller rejects an empty value.
	Image string
}

// installerOption mutates a ToolkitInstaller while it is being constructed.
type installerOption func(*ToolkitInstaller)
|
||||
|
||||
func WithRunner(r Runner) installerOption {
|
||||
return func(i *ToolkitInstaller) {
|
||||
i.runner = r
|
||||
}
|
||||
}
|
||||
|
||||
func WithImage(image string) installerOption {
|
||||
return func(i *ToolkitInstaller) {
|
||||
i.Image = image
|
||||
}
|
||||
}
|
||||
|
||||
func WithTemplate(template string) installerOption {
|
||||
return func(i *ToolkitInstaller) {
|
||||
i.template = template
|
||||
}
|
||||
}
|
||||
|
||||
func NewToolkitInstaller(opts ...installerOption) (*ToolkitInstaller, error) {
|
||||
i := &ToolkitInstaller{
|
||||
runner: localRunner{},
|
||||
template: dockerInstallTemplate,
|
||||
}
|
||||
|
||||
for _, opt := range opts {
|
||||
opt(i)
|
||||
}
|
||||
|
||||
if i.Image == "" {
|
||||
return nil, fmt.Errorf("image is required")
|
||||
}
|
||||
|
||||
return i, nil
|
||||
}
|
||||
|
||||
func (i *ToolkitInstaller) Install() error {
|
||||
// Parse the combined template
|
||||
tmpl, err := template.New("installScript").Parse(i.template)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error parsing template: %w", err)
|
||||
}
|
||||
|
||||
// Execute the template
|
||||
var renderedScript bytes.Buffer
|
||||
err = tmpl.Execute(&renderedScript, i)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error executing template: %w", err)
|
||||
}
|
||||
|
||||
_, _, err = i.runner.Run(renderedScript.String())
|
||||
return err
|
||||
}
|
||||
218
tests/e2e/nvidia-container-toolkit_test.go
Normal file
218
tests/e2e/nvidia-container-toolkit_test.go
Normal file
@@ -0,0 +1,218 @@
|
||||
/*
|
||||
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package e2e
|
||||
|
||||
import (
	"context"
	"path/filepath"
	"strconv"
	"strings"

	. "github.com/onsi/ginkgo/v2"
	. "github.com/onsi/gomega"
)
|
||||
|
||||
// Integration tests for Docker runtime
|
||||
var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
|
||||
var r Runner
|
||||
|
||||
// Install the NVIDIA Container Toolkit
|
||||
BeforeAll(func(ctx context.Context) {
|
||||
r = NewRunner(
|
||||
WithHost(host),
|
||||
WithPort(sshPort),
|
||||
WithSshKey(sshKey),
|
||||
WithSshUser(sshUser),
|
||||
)
|
||||
if installCTK {
|
||||
installer, err := NewToolkitInstaller(
|
||||
WithRunner(r),
|
||||
WithImage(image),
|
||||
WithTemplate(dockerInstallTemplate),
|
||||
)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
err = installer.Install()
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
}
|
||||
})
|
||||
|
||||
// GPUs are accessible in a container: Running nvidia-smi -L inside the
|
||||
// container shows the same output inside the container as outside the
|
||||
// container. This means that the following commands must all produce
|
||||
// the same output
|
||||
When("running nvidia-smi -L", Ordered, func() {
|
||||
var hostOutput string
|
||||
|
||||
BeforeAll(func(ctx context.Context) {
|
||||
_, _, err := r.Run("docker pull ubuntu")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
hostOutput, _, err = r.Run("nvidia-smi -L")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
})
|
||||
|
||||
It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) {
|
||||
containerOutput, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all ubuntu nvidia-smi -L")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(containerOutput).To(Equal(hostOutput))
|
||||
})
|
||||
|
||||
It("should support automatic CDI spec generation", func(ctx context.Context) {
|
||||
containerOutput, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all ubuntu nvidia-smi -L")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(containerOutput).To(Equal(hostOutput))
|
||||
})
|
||||
|
||||
It("should support automatic CDI spec generation with the --gpus flag", func(ctx context.Context) {
|
||||
containerOutput, _, err := r.Run("docker run --rm -i --gpus=all --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all ubuntu nvidia-smi -L")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(containerOutput).To(Equal(hostOutput))
|
||||
})
|
||||
|
||||
It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) {
|
||||
containerOutput, _, err := r.Run("docker run --rm -i --runtime=nvidia --gpus all ubuntu nvidia-smi -L")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(containerOutput).To(Equal(hostOutput))
|
||||
})
|
||||
|
||||
It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) {
|
||||
containerOutput, _, err := r.Run("docker run --rm -i --gpus all ubuntu nvidia-smi -L")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(containerOutput).To(Equal(hostOutput))
|
||||
})
|
||||
})
|
||||
|
||||
// A vectorAdd sample runs in a container with access to all GPUs.
|
||||
// The following should all produce the same result.
|
||||
When("Running the cuda-vectorAdd sample", Ordered, func() {
|
||||
BeforeAll(func(ctx context.Context) {
|
||||
_, _, err := r.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
})
|
||||
|
||||
var referenceOutput string
|
||||
|
||||
It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) {
|
||||
var err error
|
||||
referenceOutput, _, err = r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
Expect(referenceOutput).To(ContainSubstring("Test PASSED"))
|
||||
})
|
||||
|
||||
It("should support automatic CDI spec generation", func(ctx context.Context) {
|
||||
out2, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(referenceOutput).To(Equal(out2))
|
||||
})
|
||||
|
||||
It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) {
|
||||
out3, _, err := r.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(referenceOutput).To(Equal(out3))
|
||||
})
|
||||
|
||||
It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) {
|
||||
out4, _, err := r.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(referenceOutput).To(Equal(out4))
|
||||
})
|
||||
})
|
||||
|
||||
// A deviceQuery sample runs in a container with access to all GPUs
|
||||
// The following should all produce the same result.
|
||||
When("Running the cuda-deviceQuery sample", Ordered, func() {
|
||||
BeforeAll(func(ctx context.Context) {
|
||||
_, _, err := r.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
})
|
||||
|
||||
var referenceOutput string
|
||||
|
||||
It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) {
|
||||
var err error
|
||||
referenceOutput, _, err = r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
Expect(referenceOutput).To(ContainSubstring("Result = PASS"))
|
||||
})
|
||||
|
||||
It("should support automatic CDI spec generation", func(ctx context.Context) {
|
||||
out2, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(referenceOutput).To(Equal(out2))
|
||||
})
|
||||
|
||||
It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) {
|
||||
out3, _, err := r.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(referenceOutput).To(Equal(out3))
|
||||
})
|
||||
|
||||
It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) {
|
||||
out4, _, err := r.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(referenceOutput).To(Equal(out4))
|
||||
})
|
||||
})
|
||||
|
||||
Describe("CUDA Forward compatibility", Ordered, func() {
|
||||
BeforeAll(func(ctx context.Context) {
|
||||
_, _, err := r.Run("docker pull nvcr.io/nvidia/cuda:12.8.0-base-ubi8")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
})
|
||||
|
||||
BeforeAll(func(ctx context.Context) {
|
||||
compatOutput, _, err := r.Run("docker run --rm -i -e NVIDIA_VISIBLE_DEVICES=void nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ls /usr/local/cuda/compat/libcuda.*.*\"")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(compatOutput).ToNot(BeEmpty())
|
||||
compatDriverVersion := strings.TrimPrefix(filepath.Base(compatOutput), "libcuda.so.")
|
||||
compatMajor := strings.SplitN(compatDriverVersion, ".", 2)[0]
|
||||
|
||||
driverOutput, _, err := r.Run("nvidia-smi -q | grep \"Driver Version\"")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
parts := strings.SplitN(driverOutput, ":", 2)
|
||||
Expect(parts).To(HaveLen(2))
|
||||
|
||||
hostDriverVersion := strings.TrimSpace(parts[1])
|
||||
Expect(hostDriverVersion).ToNot(BeEmpty())
|
||||
driverMajor := strings.SplitN(hostDriverVersion, ".", 2)[0]
|
||||
|
||||
if driverMajor >= compatMajor {
|
||||
GinkgoLogr.Info("CUDA Forward Compatibility tests require an older driver version", "hostDriverVersion", hostDriverVersion, "compatDriverVersion", compatDriverVersion)
|
||||
Skip("CUDA Forward Compatibility tests require an older driver version")
|
||||
}
|
||||
})
|
||||
|
||||
It("should work with the nvidia runtime in legacy mode", func(ctx context.Context) {
|
||||
ldconfigOut, _, err := r.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat"))
|
||||
})
|
||||
|
||||
It("should work with the nvidia runtime in CDI mode", func(ctx context.Context) {
|
||||
ldconfigOut, _, err := r.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat"))
|
||||
})
|
||||
|
||||
It("should NOT work with nvidia-container-runtime-hook", func(ctx context.Context) {
|
||||
ldconfigOut, _, err := r.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=runc --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(ldconfigOut).To(ContainSubstring("/usr/lib64"))
|
||||
})
|
||||
})
|
||||
})
|
||||
171
tests/e2e/runner.go
Normal file
171
tests/e2e/runner.go
Normal file
@@ -0,0 +1,171 @@
|
||||
/*
|
||||
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package e2e
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"time"
|
||||
|
||||
"golang.org/x/crypto/ssh"
|
||||
)
|
||||
|
||||
// localRunner runs scripts on the local machine.
type localRunner struct{}

// remoteRunner runs scripts on another machine over SSH.
type remoteRunner struct {
	// sshKey is the path of the private key file read when connecting.
	sshKey string
	// sshUser is the user name used for the SSH login.
	sshUser string
	// host is the remote host to connect to.
	host string
	// port is the SSH port on host.
	port string
}

// runnerOption configures the remoteRunner that NewRunner builds.
type runnerOption func(*remoteRunner)

// Runner executes a script and returns two output strings and an error.
type Runner interface {
	// NOTE(review): the two strings are presumably stdout and stderr --
	// confirm against the implementations.
	Run(script string) (string, string, error)
}
|
||||
|
||||
func WithSshKey(key string) runnerOption {
|
||||
return func(r *remoteRunner) {
|
||||
r.sshKey = key
|
||||
}
|
||||
}
|
||||
|
||||
func WithSshUser(user string) runnerOption {
|
||||
return func(r *remoteRunner) {
|
||||
r.sshUser = user
|
||||
}
|
||||
}
|
||||
|
||||
func WithHost(host string) runnerOption {
|
||||
return func(r *remoteRunner) {
|
||||
r.host = host
|
||||
}
|
||||
}
|
||||
|
||||
func WithPort(port string) runnerOption {
|
||||
return func(r *remoteRunner) {
|
||||
r.port = port
|
||||
}
|
||||
}
|
||||
|
||||
func NewRunner(opts ...runnerOption) Runner {
|
||||
r := &remoteRunner{}
|
||||
for _, opt := range opts {
|
||||
opt(r)
|
||||
}
|
||||
|
||||
// If the Host is empty, return a local runner
|
||||
if r.host == "" {
|
||||
return localRunner{}
|
||||
}
|
||||
|
||||
// Otherwise, return a remote runner
|
||||
return r
|
||||
}
|
||||
|
||||
func (l localRunner) Run(script string) (string, string, error) {
|
||||
// Create a command to run the script using bash
|
||||
cmd := exec.Command("bash", "-c", script)
|
||||
|
||||
// Buffer to capture standard output
|
||||
var stdout bytes.Buffer
|
||||
cmd.Stdout = &stdout
|
||||
|
||||
// Buffer to capture standard error
|
||||
var stderr bytes.Buffer
|
||||
cmd.Stderr = &stderr
|
||||
|
||||
// Run the command
|
||||
err := cmd.Run()
|
||||
if err != nil {
|
||||
return "", "", fmt.Errorf("script execution failed: %v\nSTDOUT: %s\nSTDERR: %s", err, stdout.String(), stderr.String())
|
||||
}
|
||||
|
||||
// Return the captured stdout and nil error
|
||||
return stdout.String(), "", nil
|
||||
}
|
||||
|
||||
func (r remoteRunner) Run(script string) (string, string, error) {
|
||||
// Create a new SSH connection
|
||||
client, err := connectOrDie(r.sshKey, r.sshUser, r.host, r.port)
|
||||
if err != nil {
|
||||
return "", "", fmt.Errorf("failed to connect to %s: %v", r.host, err)
|
||||
}
|
||||
defer client.Close()
|
||||
|
||||
// Create a session
|
||||
session, err := client.NewSession()
|
||||
if err != nil {
|
||||
return "", "", fmt.Errorf("failed to create session: %v", err)
|
||||
}
|
||||
defer session.Close()
|
||||
|
||||
// Capture stdout and stderr
|
||||
var stdout, stderr bytes.Buffer
|
||||
session.Stdout = &stdout
|
||||
session.Stderr = &stderr
|
||||
|
||||
// Run the script
|
||||
err = session.Run(script)
|
||||
if err != nil {
|
||||
return "", "", fmt.Errorf("script execution failed: %v\nSTDOUT: %s\nSTDERR: %s", err, stdout.String(), stderr.String())
|
||||
}
|
||||
|
||||
// Return stdout as string if no errors
|
||||
return stdout.String(), "", nil
|
||||
}
|
||||
|
||||
// createSshClient creates a ssh client, and retries if it fails to connect
|
||||
func connectOrDie(sshKey, sshUser, host, port string) (*ssh.Client, error) {
|
||||
var client *ssh.Client
|
||||
var err error
|
||||
key, err := os.ReadFile(sshKey)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read key file: %v", err)
|
||||
}
|
||||
signer, err := ssh.ParsePrivateKey(key)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse private key: %v", err)
|
||||
}
|
||||
sshConfig := &ssh.ClientConfig{
|
||||
User: sshUser,
|
||||
Auth: []ssh.AuthMethod{
|
||||
ssh.PublicKeys(signer),
|
||||
},
|
||||
HostKeyCallback: ssh.InsecureIgnoreHostKey(),
|
||||
}
|
||||
|
||||
connectionFailed := false
|
||||
for i := 0; i < 20; i++ {
|
||||
client, err = ssh.Dial("tcp", host+":"+port, sshConfig)
|
||||
if err == nil {
|
||||
return client, nil // Connection succeeded, return the client.
|
||||
}
|
||||
connectionFailed = true
|
||||
// Sleep for a brief moment before retrying.
|
||||
// You can adjust the duration based on your requirements.
|
||||
time.Sleep(1 * time.Second)
|
||||
}
|
||||
|
||||
if connectionFailed {
|
||||
return nil, fmt.Errorf("failed to connect to %s after 10 retries, giving up", host)
|
||||
}
|
||||
|
||||
return client, nil
|
||||
}
|
||||
Reference in New Issue
Block a user