Move test to tests

Signed-off-by: Evan Lezar <elezar@nvidia.com>
2025-06-26 18:18:24 +00:00 · 2025-02-28 17:21:23 +02:00
parent 91a983a341
commit 069926e4b6
807 changed files with 444269 additions and 78 deletions
--- a/tests/e2e/Makefile
+++ b/tests/e2e/Makefile
@@ -0,0 +1,45 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Go toolchain binary used to run the test suite.
+GO_CMD ?= go
+
+# versions.mk is resolved relative to the directory make is invoked from.
+include $(CURDIR)/versions.mk
+
+# Value passed to -ginkgo.focus by the test target.
+E2E_RUNTIME ?= docker
+
+# Forwarded to the suite's -install-ctk flag.
+E2E_INSTALL_CTK ?= false
+
+# Fall back to a default distribution when DIST is empty. The condition must
+# expand DIST itself: the previous double expansion looked up a variable
+# *named after* DIST's value, which is normally unset, so the check did not
+# actually depend on whether DIST was empty.
+ifeq ($(DIST),)
+DIST ?= ubuntu20.04
+endif
+# VERSION and IMAGE_NAME are not defined in this file (presumably provided by
+# versions.mk or by the caller -- confirm).
+IMAGE_TAG ?= $(VERSION)-$(DIST)
+IMAGE = $(IMAGE_NAME):$(IMAGE_TAG)
+
+# SSH connection settings forwarded to the suite's remote-runner flags.
+E2E_SSH_KEY ?=
+E2E_SSH_USER ?=
+E2E_SSH_HOST ?=
+E2E_SSH_PORT ?= 22
+
+# Run the Ginkgo e2e suite from the tests/e2e directory below the directory
+# make was invoked from, forwarding IMAGE and the E2E_* settings as test flags.
+# Only specs matching E2E_RUNTIME are selected (via -ginkgo.focus), and the
+# whole run is limited to one hour.
+.PHONY: test
+test:
+	cd $(CURDIR)/tests/e2e && $(GO_CMD) test -v . -args \
+		-ginkgo.focus="$(E2E_RUNTIME)" \
+		-test.timeout=1h \
+		-ginkgo.v \
+		-install-ctk=$(E2E_INSTALL_CTK) \
+		-toolkit-image=$(IMAGE) \
+		-ssh-key=$(E2E_SSH_KEY) \
+		-ssh-user=$(E2E_SSH_USER) \
+		-remote-host=$(E2E_SSH_HOST) \
+		-remote-port=$(E2E_SSH_PORT)
--- a/tests/e2e/e2e_test.go
+++ b/tests/e2e/e2e_test.go
@@ -0,0 +1,63 @@
+/*
+* Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+ */
+
+package e2e
+
+import (
+	"context"
+	"flag"
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Test context: package-level state shared by the suite. Apart from ctx, each
+// value is bound to a command-line flag in init.
+var (
+	// ctx is assigned context.Background() in BeforeSuite.
+	ctx context.Context
+
+	// installCTK is set by the -install-ctk flag.
+	installCTK bool
+
+	// image is set by the -toolkit-image flag.
+	image string
+
+	// SSH connection settings, set by the -ssh-key, -ssh-user, -remote-host
+	// and -remote-port flags respectively.
+	sshKey  string
+	sshUser string
+	host    string
+	sshPort string
+)
+
+// init registers the suite's command-line flags on the default flag set so
+// that they can be supplied after "-args" on the go test command line.
+func init() {
+	flag.BoolVar(&installCTK, "install-ctk", false, "Install the NVIDIA Container Toolkit")
+
+	// The remaining options are all plain strings; register them from a table.
+	stringFlags := []struct {
+		target       *string
+		name         string
+		defaultValue string
+		usage        string
+	}{
+		{&image, "toolkit-image", "", "Repository of the image to test"},
+		{&sshKey, "ssh-key", "", "SSH key to use for remote login"},
+		{&sshUser, "ssh-user", "", "SSH user to use for remote login"},
+		{&host, "remote-host", "", "Hostname of the remote machine"},
+		{&sshPort, "remote-port", "22", "SSH port to use for remote login"},
+	}
+	for _, f := range stringFlags {
+		flag.StringVar(f.target, f.name, f.defaultValue, f.usage)
+	}
+}
+
+// TestMain is the go test entry point that hands control to Ginkgo: it wires
+// Gomega failures into Ginkgo's Fail handler and runs every registered spec.
+//
+// NOTE(review): despite its name this takes *testing.T rather than
+// *testing.M, so it is written as an ordinary test function -- confirm the
+// name is intentional.
+func TestMain(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "NVIDIA Container Toolkit E2E")
+}
+
+// BeforeSuite runs before the test suite and initialises the package-level
+// ctx with a background context.
+var _ = BeforeSuite(func() {
+	ctx = context.Background()
+})
--- a/tests/e2e/infra/aws.yaml
+++ b/tests/e2e/infra/aws.yaml
@@ -0,0 +1,30 @@
+# Environment definition for the end-to-end test infrastructure on AWS.
+# NOTE(review): HOLODECK_NAME and HOLODECK_PRIVATE_KEY look like placeholders
+# that are substituted before this file is used -- confirm against the CI
+# workflow that consumes it.
+apiVersion: holodeck.nvidia.com/v1alpha1
+kind: Environment
+metadata:
+  name: HOLODECK_NAME
+  description: "end-to-end test infrastructure"
+spec:
+  provider: aws
+  auth:
+    keyName: cnt-ci
+    privateKey: HOLODECK_PRIVATE_KEY
+  instance:
+    type: g4dn.xlarge
+    region: us-west-1
+    # Source address ranges allowed to reach the instance (each a single /32).
+    ingressIpRanges:
+    - 18.190.12.32/32
+    - 3.143.46.93/32
+    - 44.230.241.223/32
+    - 44.235.4.62/32
+    - 52.15.119.136/32
+    - 52.24.205.48/32
+    image:
+      architecture: amd64
+      imageId: ami-0ce2cb35386fc22e9
+  containerRuntime:
+    install: true
+    name: docker
+  # The toolkit is not pre-installed on the instance; presumably the e2e suite
+  # installs the build under test itself (see its -install-ctk flag) -- confirm.
+  nvidiaContainerToolkit:
+    install: false
+  nvidiaDriver:
+    install: true
--- a/tests/e2e/installer.go
+++ b/tests/e2e/installer.go
@@ -0,0 +1,118 @@
+/*
+* Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+ */
+
+package e2e
+
+import (
+	"bytes"
+	"fmt"
+	"text/template"
+)
+
+// dockerInstallTemplate is a text/template source for a bash script that
+// installs the NVIDIA Container Toolkit on a host using Docker.
+//
+// The only template action is {{.Image}}, which becomes the default value of
+// the script's IMAGE variable. The script creates a fresh directory under
+// /tmp, points /usr/bin/nvidia-container-runtime-hook at the hook inside that
+// directory, and then runs the toolkit image as a privileged container with
+// that directory as its --root.
+var dockerInstallTemplate = `
+#! /usr/bin/env bash
+set -xe
+
+: ${IMAGE:={{.Image}}}
+
+# Create a temporary directory
+TEMP_DIR="/tmp/ctk_e2e.$(date +%s)_$RANDOM"
+mkdir -p "$TEMP_DIR"
+
+# Given that docker has an init function that checks for the existence of the
+# nvidia-container-toolkit, we need to create a symlink to the nvidia-container-runtime-hook
+# in the /usr/bin directory.
+# See https://github.com/moby/moby/blob/20a05dabf44934447d1a66cdd616cc803b81d4e2/daemon/nvidia_linux.go#L32-L46
+sudo rm -f /usr/bin/nvidia-container-runtime-hook
+sudo ln -s "$TEMP_DIR/toolkit/nvidia-container-runtime-hook" /usr/bin/nvidia-container-runtime-hook
+
+docker run --pid=host --rm -i --privileged	\
+	-v /:/host	\
+	-v /var/run/docker.sock:/var/run/docker.sock	\
+	-v "$TEMP_DIR:$TEMP_DIR"	\
+	-v /etc/docker:/config-root	\
+	${IMAGE}	\
+	--root "$TEMP_DIR"	\
+	--runtime=docker	\
+	--config=/config-root/daemon.json	\
+	--driver-root=/	\
+	--no-daemon	\
+	--restart-mode=systemd
+`
+
+// ToolkitInstaller renders an install script from a template and executes it
+// through a Runner.
+type ToolkitInstaller struct {
+	// runner executes the rendered script.
+	runner Runner
+	// template is the text/template source of the install script.
+	template string
+
+	// Image is exported so that the template can reference it as {{.Image}}.
+	Image string
+}
+
+// installerOption is a functional option applied to a ToolkitInstaller while
+// it is being constructed.
+type installerOption func(*ToolkitInstaller)
+
+// WithRunner selects the Runner used to execute the install script.
+func WithRunner(r Runner) installerOption {
+	return func(installer *ToolkitInstaller) { installer.runner = r }
+}
+
+// WithImage sets the image that is substituted into the install script.
+func WithImage(image string) installerOption {
+	return func(installer *ToolkitInstaller) { installer.Image = image }
+}
+
+// WithTemplate replaces the template source of the install script.
+func WithTemplate(template string) installerOption {
+	return func(installer *ToolkitInstaller) { installer.template = template }
+}
+
+// NewToolkitInstaller builds a ToolkitInstaller that defaults to a local
+// runner and the Docker install template, then applies opts in order.
+// It returns an error if no image has been configured once all options have
+// been applied.
+func NewToolkitInstaller(opts ...installerOption) (*ToolkitInstaller, error) {
+	installer := &ToolkitInstaller{template: dockerInstallTemplate}
+	installer.runner = localRunner{}
+
+	for idx := range opts {
+		opts[idx](installer)
+	}
+
+	if installer.Image != "" {
+		return installer, nil
+	}
+	return nil, fmt.Errorf("image is required")
+}
+
+func (i *ToolkitInstaller) Install() error {
+	// Parse the combined template
+	tmpl, err := template.New("installScript").Parse(i.template)
+	if err != nil {
+		return fmt.Errorf("error parsing template: %w", err)
+	}
+
+	// Execute the template
+	var renderedScript bytes.Buffer
+	err = tmpl.Execute(&renderedScript, i)
+	if err != nil {
+		return fmt.Errorf("error executing template: %w", err)
+	}
+
+	_, _, err = i.runner.Run(renderedScript.String())
+	return err
+}
--- a/tests/e2e/nvidia-container-toolkit_test.go
+++ b/tests/e2e/nvidia-container-toolkit_test.go
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package e2e
+
+import (
+	"context"
+	"path/filepath"
+	"strconv"
+	"strings"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Integration tests for Docker runtime
+var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
+	var r Runner
+
+	// Build the runner from the command-line flags and, when requested via
+	// -install-ctk, install the NVIDIA Container Toolkit through it before
+	// any spec runs.
+	BeforeAll(func(ctx context.Context) {
+		r = NewRunner(
+			WithHost(host),
+			WithPort(sshPort),
+			WithSshKey(sshKey),
+			WithSshUser(sshUser),
+		)
+		if !installCTK {
+			return
+		}
+
+		installer, newErr := NewToolkitInstaller(
+			WithRunner(r),
+			WithImage(image),
+			WithTemplate(dockerInstallTemplate),
+		)
+		Expect(newErr).ToNot(HaveOccurred())
+
+		installErr := installer.Install()
+		Expect(installErr).ToNot(HaveOccurred())
+	})
+
+	// GPUs are accessible in a container: running nvidia-smi -L inside the
+	// container must print exactly what it prints on the host. The host
+	// output is captured once in BeforeAll and every spec below compares
+	// its container output against it, so all of the following commands
+	// must produce the same output.
+	When("running nvidia-smi -L", Ordered, func() {
+		// hostOutput holds the stdout of nvidia-smi -L run outside a container.
+		var hostOutput string
+
+		BeforeAll(func(ctx context.Context) {
+			_, _, err := r.Run("docker pull ubuntu")
+			Expect(err).ToNot(HaveOccurred())
+
+			hostOutput, _, err = r.Run("nvidia-smi -L")
+			Expect(err).ToNot(HaveOccurred())
+		})
+
+		It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) {
+			containerOutput, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all ubuntu nvidia-smi -L")
+			Expect(err).ToNot(HaveOccurred())
+			Expect(containerOutput).To(Equal(hostOutput))
+		})
+
+		It("should support automatic CDI spec generation", func(ctx context.Context) {
+			containerOutput, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all ubuntu nvidia-smi -L")
+			Expect(err).ToNot(HaveOccurred())
+			Expect(containerOutput).To(Equal(hostOutput))
+		})
+
+		It("should support automatic CDI spec generation with the --gpus flag", func(ctx context.Context) {
+			containerOutput, _, err := r.Run("docker run --rm -i --gpus=all --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all ubuntu nvidia-smi -L")
+			Expect(err).ToNot(HaveOccurred())
+			Expect(containerOutput).To(Equal(hostOutput))
+		})
+
+		It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) {
+			containerOutput, _, err := r.Run("docker run --rm -i --runtime=nvidia --gpus all ubuntu nvidia-smi -L")
+			Expect(err).ToNot(HaveOccurred())
+			Expect(containerOutput).To(Equal(hostOutput))
+		})
+
+		// No --runtime flag here: only --gpus is passed to docker.
+		It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) {
+			containerOutput, _, err := r.Run("docker run --rm -i --gpus all ubuntu nvidia-smi -L")
+			Expect(err).ToNot(HaveOccurred())
+			Expect(containerOutput).To(Equal(hostOutput))
+		})
+	})
+
+	// A vectorAdd sample runs in a container with access to all GPUs.
+	// The first spec records its output as the reference; every later spec
+	// must reproduce that output exactly.
+	When("Running the cuda-vectorAdd sample", Ordered, func() {
+		const sampleImage = "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0"
+
+		// referenceOutput is written by the first spec and read by the rest.
+		var referenceOutput string
+
+		BeforeAll(func(ctx context.Context) {
+			_, _, pullErr := r.Run("docker pull " + sampleImage)
+			Expect(pullErr).ToNot(HaveOccurred())
+		})
+
+		It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) {
+			output, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all " + sampleImage)
+			referenceOutput = output
+			Expect(err).ToNot(HaveOccurred())
+
+			Expect(referenceOutput).To(ContainSubstring("Test PASSED"))
+		})
+
+		It("should support automatic CDI spec generation", func(ctx context.Context) {
+			output, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all " + sampleImage)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(referenceOutput).To(Equal(output))
+		})
+
+		It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) {
+			output, _, err := r.Run("docker run --rm -i --runtime=nvidia --gpus all " + sampleImage)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(referenceOutput).To(Equal(output))
+		})
+
+		It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) {
+			output, _, err := r.Run("docker run --rm -i --gpus all " + sampleImage)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(referenceOutput).To(Equal(output))
+		})
+	})
+
+	// A deviceQuery sample runs in a container with access to all GPUs.
+	// The first spec records its output as the reference; every later spec
+	// must reproduce that output exactly.
+	When("Running the cuda-deviceQuery sample", Ordered, func() {
+		const sampleImage = "nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0"
+
+		// referenceOutput is written by the first spec and read by the rest.
+		var referenceOutput string
+
+		BeforeAll(func(ctx context.Context) {
+			_, _, pullErr := r.Run("docker pull " + sampleImage)
+			Expect(pullErr).ToNot(HaveOccurred())
+		})
+
+		It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) {
+			output, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all " + sampleImage)
+			referenceOutput = output
+			Expect(err).ToNot(HaveOccurred())
+
+			Expect(referenceOutput).To(ContainSubstring("Result = PASS"))
+		})
+
+		It("should support automatic CDI spec generation", func(ctx context.Context) {
+			output, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all " + sampleImage)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(referenceOutput).To(Equal(output))
+		})
+
+		It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) {
+			output, _, err := r.Run("docker run --rm -i --runtime=nvidia --gpus all " + sampleImage)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(referenceOutput).To(Equal(output))
+		})
+
+		It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) {
+			output, _, err := r.Run("docker run --rm -i --gpus all " + sampleImage)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(referenceOutput).To(Equal(output))
+		})
+	})
+
+	// CUDA Forward compatibility: checks which libcuda.so.1 the dynamic
+	// linker resolves inside a CUDA base image. The specs are skipped unless
+	// the host driver's major version is lower than that of the compat
+	// library shipped in the image.
+	Describe("CUDA Forward compatibility", Ordered, func() {
+		BeforeAll(func(ctx context.Context) {
+			_, _, err := r.Run("docker pull nvcr.io/nvidia/cuda:12.8.0-base-ubi8")
+			Expect(err).ToNot(HaveOccurred())
+		})
+
+		BeforeAll(func(ctx context.Context) {
+			compatOutput, _, err := r.Run("docker run --rm -i -e NVIDIA_VISIBLE_DEVICES=void nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ls /usr/local/cuda/compat/libcuda.*.*\"")
+			Expect(err).ToNot(HaveOccurred())
+			Expect(compatOutput).ToNot(BeEmpty())
+			// Strip the trailing newline printed by ls before taking the
+			// file name, so the version string is clean for parsing and logging.
+			compatDriverVersion := strings.TrimPrefix(filepath.Base(strings.TrimSpace(compatOutput)), "libcuda.so.")
+			// Parse the major version as an integer: comparing the raw
+			// strings would order them lexicographically ("99" >= "100").
+			compatMajor, err := strconv.Atoi(strings.SplitN(compatDriverVersion, ".", 2)[0])
+			Expect(err).ToNot(HaveOccurred())
+
+			driverOutput, _, err := r.Run("nvidia-smi -q | grep \"Driver Version\"")
+			Expect(err).ToNot(HaveOccurred())
+			parts := strings.SplitN(driverOutput, ":", 2)
+			Expect(parts).To(HaveLen(2))
+
+			hostDriverVersion := strings.TrimSpace(parts[1])
+			Expect(hostDriverVersion).ToNot(BeEmpty())
+			driverMajor, err := strconv.Atoi(strings.SplitN(hostDriverVersion, ".", 2)[0])
+			Expect(err).ToNot(HaveOccurred())
+
+			if driverMajor >= compatMajor {
+				GinkgoLogr.Info("CUDA Forward Compatibility tests require an older driver version", "hostDriverVersion", hostDriverVersion, "compatDriverVersion", compatDriverVersion)
+				Skip("CUDA Forward Compatibility tests require an older driver version")
+			}
+		})
+
+		It("should work with the nvidia runtime in legacy mode", func(ctx context.Context) {
+			ldconfigOut, _, err := r.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
+			Expect(err).ToNot(HaveOccurred())
+			Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat"))
+		})
+
+		It("should work with the nvidia runtime in CDI mode", func(ctx context.Context) {
+			ldconfigOut, _, err := r.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true  --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
+			Expect(err).ToNot(HaveOccurred())
+			Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat"))
+		})
+
+		// With runc plus --gpus the library is expected to resolve from
+		// /usr/lib64 rather than from the compat directory.
+		It("should NOT work with nvidia-container-runtime-hook", func(ctx context.Context) {
+			ldconfigOut, _, err := r.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=runc --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
+			Expect(err).ToNot(HaveOccurred())
+			Expect(ldconfigOut).To(ContainSubstring("/usr/lib64"))
+		})
+	})
+})
--- a/tests/e2e/runner.go
+++ b/tests/e2e/runner.go
@@ -0,0 +1,171 @@
+/*
+* Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+ */
+
+package e2e
+
+import (
+	"bytes"
+	"fmt"
+	"os"
+	"os/exec"
+	"time"
+
+	"golang.org/x/crypto/ssh"
+)
+
+// localRunner executes scripts with bash on the machine running the tests.
+type localRunner struct{}
+
+// remoteRunner executes scripts on another machine over SSH.
+type remoteRunner struct {
+	// sshKey is the path of the private key file read when connecting.
+	sshKey string
+	// sshUser is the user name used for the SSH login.
+	sshUser string
+	// host and port together form the address that is dialled.
+	host string
+	port string
+}
+
+// runnerOption is a functional option that configures a remoteRunner.
+type runnerOption func(*remoteRunner)
+
+// Runner executes a script and returns two strings and an error. Both
+// implementations in this file return the captured standard output first.
+type Runner interface {
+	Run(script string) (string, string, error)
+}
+
+// WithSshKey sets the path of the private key file used for the SSH login.
+func WithSshKey(key string) runnerOption {
+	return func(remote *remoteRunner) { remote.sshKey = key }
+}
+
+// WithSshUser sets the user name used for the SSH login.
+func WithSshUser(user string) runnerOption {
+	return func(remote *remoteRunner) { remote.sshUser = user }
+}
+
+// WithHost sets the remote host name. NewRunner returns a local runner when
+// the host is left empty.
+func WithHost(host string) runnerOption {
+	return func(remote *remoteRunner) { remote.host = host }
+}
+
+// WithPort sets the SSH port of the remote host.
+func WithPort(port string) runnerOption {
+	return func(remote *remoteRunner) { remote.port = port }
+}
+
+// NewRunner applies opts to a fresh remote runner configuration and returns
+// it when a host has been configured. Without a host, the scripts are meant
+// to run on the current machine and a local runner is returned instead.
+func NewRunner(opts ...runnerOption) Runner {
+	remote := new(remoteRunner)
+	for _, apply := range opts {
+		apply(remote)
+	}
+
+	if remote.host != "" {
+		return remote
+	}
+	return localRunner{}
+}
+
+// Run executes script with "bash -c" on the local machine.
+//
+// On success it returns the captured standard output and standard error.
+// On failure both strings are empty and the returned error wraps the
+// underlying error, with the captured output of both streams embedded in its
+// message.
+func (l localRunner) Run(script string) (string, string, error) {
+	cmd := exec.Command("bash", "-c", script)
+
+	// Capture both output streams.
+	var stdout, stderr bytes.Buffer
+	cmd.Stdout = &stdout
+	cmd.Stderr = &stderr
+
+	if err := cmd.Run(); err != nil {
+		return "", "", fmt.Errorf("script execution failed: %w\nSTDOUT: %s\nSTDERR: %s", err, stdout.String(), stderr.String())
+	}
+
+	// Return stderr as well: it was previously captured but then dropped.
+	return stdout.String(), stderr.String(), nil
+}
+
+// Run executes script on the remote host over a new SSH connection, which is
+// opened for this call and closed again before returning.
+//
+// On success it returns the captured standard output and standard error.
+// On failure both strings are empty and the returned error wraps the
+// underlying error; for a failed script the captured output of both streams
+// is embedded in the message.
+func (r remoteRunner) Run(script string) (string, string, error) {
+	client, err := connectOrDie(r.sshKey, r.sshUser, r.host, r.port)
+	if err != nil {
+		return "", "", fmt.Errorf("failed to connect to %s: %w", r.host, err)
+	}
+	defer client.Close()
+
+	session, err := client.NewSession()
+	if err != nil {
+		return "", "", fmt.Errorf("failed to create session: %w", err)
+	}
+	defer session.Close()
+
+	// Capture both output streams.
+	var stdout, stderr bytes.Buffer
+	session.Stdout = &stdout
+	session.Stderr = &stderr
+
+	if err := session.Run(script); err != nil {
+		return "", "", fmt.Errorf("script execution failed: %w\nSTDOUT: %s\nSTDERR: %s", err, stdout.String(), stderr.String())
+	}
+
+	// Return stderr as well: it was previously captured but then dropped.
+	return stdout.String(), stderr.String(), nil
+}
+
+// connectOrDie creates an SSH client for host:port, authenticating as sshUser
+// with the private key stored in the file sshKey. A failed dial is retried
+// once per second, up to maxAttempts attempts in total.
+//
+// Despite its name the function does not terminate the process: it returns
+// an error if the key cannot be read or parsed, or if every attempt fails.
+// In the latter case the error wraps the last dial error.
+func connectOrDie(sshKey, sshUser, host, port string) (*ssh.Client, error) {
+	const (
+		maxAttempts = 20
+		retryDelay  = 1 * time.Second
+	)
+
+	key, err := os.ReadFile(sshKey)
+	if err != nil {
+		return nil, fmt.Errorf("failed to read key file: %w", err)
+	}
+	signer, err := ssh.ParsePrivateKey(key)
+	if err != nil {
+		return nil, fmt.Errorf("failed to parse private key: %w", err)
+	}
+	sshConfig := &ssh.ClientConfig{
+		User: sshUser,
+		Auth: []ssh.AuthMethod{
+			ssh.PublicKeys(signer),
+		},
+		// NOTE(review): the host key is not verified. Presumably acceptable
+		// for short-lived test machines -- confirm.
+		HostKeyCallback: ssh.InsecureIgnoreHostKey(),
+	}
+
+	addr := host + ":" + port
+	var lastErr error
+	for attempt := 1; attempt <= maxAttempts; attempt++ {
+		client, dialErr := ssh.Dial("tcp", addr, sshConfig)
+		if dialErr == nil {
+			return client, nil
+		}
+		lastErr = dialErr
+		// No point in waiting once the final attempt has failed.
+		if attempt < maxAttempts {
+			time.Sleep(retryDelay)
+		}
+	}
+
+	return nil, fmt.Errorf("failed to connect to %s after %d attempts, giving up: %w", host, maxAttempts, lastErr)
+}