mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2025-06-26 18:18:24 +00:00
Compare commits
100 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f202b80a9b | ||
|
|
54af66f48c | ||
|
|
d6f610790f | ||
|
|
007faf8491 | ||
|
|
d7f498ade7 | ||
|
|
e34b8cebdb | ||
|
|
76bb848f40 | ||
|
|
02000c07f9 | ||
|
|
b3b6b824cd | ||
|
|
1aed5f4aa2 | ||
|
|
dd40dadbdc | ||
|
|
77326385ea | ||
|
|
fe56514d01 | ||
|
|
bae3e7842e | ||
|
|
e78999b08c | ||
|
|
462ca9f93f | ||
|
|
ac9146832b | ||
|
|
a734438ce2 | ||
|
|
61d94f7856 | ||
|
|
e2ff6830f5 | ||
|
|
ab050837ce | ||
|
|
becddb70e6 | ||
|
|
8069346746 | ||
|
|
34526b19c0 | ||
|
|
f8b0b43a3f | ||
|
|
ce6928ccca | ||
|
|
63e8ecbc8e | ||
|
|
d4739cb17f | ||
|
|
e8ac80146f | ||
|
|
dc0dee1f33 | ||
|
|
21827ad367 | ||
|
|
651e9f541a | ||
|
|
56b80c94b0 | ||
|
|
e096251183 | ||
|
|
cf35409004 | ||
|
|
8012e4f1be | ||
|
|
570e223276 | ||
|
|
e627eb2e21 | ||
|
|
24859f56d2 | ||
|
|
8676b5625a | ||
|
|
6bb4a5c7de | ||
|
|
a8e7ffcc95 | ||
|
|
58f54b937a | ||
|
|
8176ac40ee | ||
|
|
01e55461e8 | ||
|
|
32fe41a3d5 | ||
|
|
3436b5b032 | ||
|
|
c4f46e7354 | ||
|
|
753b5d1595 | ||
|
|
e0b651668d | ||
|
|
6e59255149 | ||
|
|
a152a2fd7e | ||
|
|
b43c8c424e | ||
|
|
f785e908a7 | ||
|
|
ef941f423c | ||
|
|
90d30740a4 | ||
|
|
35e0dea1d3 | ||
|
|
ac9eee956b | ||
|
|
13bccdda73 | ||
|
|
bdca5b83a1 | ||
|
|
997d9a774f | ||
|
|
a26ba7b2a7 | ||
|
|
1c13a9647c | ||
|
|
4e84c0dc50 | ||
|
|
997f23cf11 | ||
|
|
e4f8406139 | ||
|
|
aa0d4af51a | ||
|
|
7b3ec6f42d | ||
|
|
936827d09f | ||
|
|
267fb5987f | ||
|
|
eb48d2d5fd | ||
|
|
b71bb87d91 | ||
|
|
cc88c554ed | ||
|
|
ce7cea3a0d | ||
|
|
1bc9548a2f | ||
|
|
7c758c97b8 | ||
|
|
48d538eef9 | ||
|
|
9848c3e985 | ||
|
|
868f385a01 | ||
|
|
069926e4b6 | ||
|
|
91a983a341 | ||
|
|
f5680dd0cd | ||
|
|
5bdf14b1e7 | ||
|
|
b598826ff2 | ||
|
|
c1bac2873b | ||
|
|
9f611a5a23 | ||
|
|
e330a938fd | ||
|
|
f445d4b614 | ||
|
|
e1ae57eef9 | ||
|
|
76040ff2ad | ||
|
|
fd865bb9e7 | ||
|
|
6b037a0dde | ||
|
|
9eccc1659d | ||
|
|
6da7af8dfa | ||
|
|
b170a35328 | ||
|
|
2b11b7eaf2 | ||
|
|
82090b547e | ||
|
|
f452ef4747 | ||
|
|
c3622abeac | ||
|
|
c599c6cc62 |
128
.github/dependabot.yml
vendored
128
.github/dependabot.yml
vendored
@@ -3,63 +3,43 @@
|
||||
|
||||
version: 2
|
||||
updates:
|
||||
# main branch
|
||||
- package-ecosystem: "gomod"
|
||||
target-branch: main
|
||||
directory: "/"
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
day: "sunday"
|
||||
ignore:
|
||||
- dependency-name: k8s.io/*
|
||||
labels:
|
||||
- dependencies
|
||||
|
||||
- package-ecosystem: "docker"
|
||||
target-branch: main
|
||||
directory: "/deployments/container"
|
||||
schedule:
|
||||
interval: "daily"
|
||||
|
||||
- package-ecosystem: "gomod"
|
||||
# This defines a specific dependabot rule for the latest release-* branch.
|
||||
target-branch: release-1.16
|
||||
directory: "/"
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
day: "sunday"
|
||||
ignore:
|
||||
- dependency-name: k8s.io/*
|
||||
labels:
|
||||
- dependencies
|
||||
- maintenance
|
||||
|
||||
- package-ecosystem: "docker"
|
||||
target-branch: release-1.16
|
||||
directory: "/deployments/container"
|
||||
directories:
|
||||
- "/"
|
||||
- "deployments/devel"
|
||||
- "tests"
|
||||
schedule:
|
||||
interval: "daily"
|
||||
labels:
|
||||
- dependencies
|
||||
- maintenance
|
||||
groups:
|
||||
k8sio:
|
||||
patterns:
|
||||
- k8s.io/*
|
||||
exclude-patterns:
|
||||
- k8s.io/klog/*
|
||||
|
||||
- package-ecosystem: "gomod"
|
||||
target-branch: main
|
||||
directory: "deployments/devel"
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
day: "sunday"
|
||||
|
||||
# A dependabot rule to bump the golang version.
|
||||
- package-ecosystem: "docker"
|
||||
target-branch: main
|
||||
directory: "/deployments/devel"
|
||||
directories:
|
||||
# CUDA image
|
||||
- "/deployments/container"
|
||||
# Golang version
|
||||
- "/deployments/devel"
|
||||
schedule:
|
||||
interval: "daily"
|
||||
labels:
|
||||
- dependencies
|
||||
|
||||
- package-ecosystem: "github-actions"
|
||||
target-branch: main
|
||||
directory: "/"
|
||||
schedule:
|
||||
interval: "daily"
|
||||
labels:
|
||||
- dependencies
|
||||
|
||||
# Allow dependabot to update the libnvidia-container submodule.
|
||||
- package-ecosystem: "gitsubmodule"
|
||||
@@ -72,3 +52,69 @@ updates:
|
||||
labels:
|
||||
- dependencies
|
||||
- libnvidia-container
|
||||
|
||||
# The release branch(es):
|
||||
- package-ecosystem: "gomod"
|
||||
target-branch: release-1.17
|
||||
directories:
|
||||
- "/"
|
||||
# We don't update development or test dependencies on release branches
|
||||
# - "deployments/devel"
|
||||
# - "tests"
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
day: "sunday"
|
||||
labels:
|
||||
- dependencies
|
||||
- maintenance
|
||||
ignore:
|
||||
# For release branches we only consider patch updates.
|
||||
- dependency-name: "*"
|
||||
update-types:
|
||||
- version-update:semver-major
|
||||
- version-update:semver-minor
|
||||
groups:
|
||||
k8sio:
|
||||
patterns:
|
||||
- k8s.io/*
|
||||
exclude-patterns:
|
||||
- k8s.io/klog/*
|
||||
|
||||
- package-ecosystem: "docker"
|
||||
target-branch: release-1.17
|
||||
directories:
|
||||
# CUDA image
|
||||
- "/deployments/container"
|
||||
# Golang version
|
||||
- "/deployments/devel"
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
day: "sunday"
|
||||
ignore:
|
||||
# For release branches we only apply patch updates to the golang version.
|
||||
- dependency-name: "*golang*"
|
||||
update-types:
|
||||
- version-update:semver-major
|
||||
- version-update:semver-minor
|
||||
labels:
|
||||
- dependencies
|
||||
- maintenance
|
||||
|
||||
- package-ecosystem: "github-actions"
|
||||
target-branch: release-1.17
|
||||
directory: "/"
|
||||
schedule:
|
||||
interval: "weekly"
|
||||
day: "sunday"
|
||||
labels:
|
||||
- dependencies
|
||||
- maintenance
|
||||
|
||||
# Github actions need to be gh-pages branches.
|
||||
- package-ecosystem: "github-actions"
|
||||
target-branch: gh-pages
|
||||
directory: "/"
|
||||
schedule:
|
||||
interval: "daily"
|
||||
labels:
|
||||
- dependencies
|
||||
|
||||
53
.github/workflows/ci.yaml
vendored
Normal file
53
.github/workflows/ci.yaml
vendored
Normal file
@@ -0,0 +1,53 @@
|
||||
# Copyright 2025 NVIDIA CORPORATION
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
name: CI Pipeline
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- "pull-request/[0-9]+"
|
||||
- main
|
||||
- release-*
|
||||
|
||||
jobs:
|
||||
code-scanning:
|
||||
uses: ./.github/workflows/code_scanning.yaml
|
||||
|
||||
variables:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
version: ${{ steps.version.outputs.version }}
|
||||
steps:
|
||||
- name: Generate Commit Short SHA
|
||||
id: version
|
||||
run: echo "version=$(echo $GITHUB_SHA | cut -c1-8)" >> "$GITHUB_OUTPUT"
|
||||
|
||||
golang:
|
||||
uses: ./.github/workflows/golang.yaml
|
||||
|
||||
image:
|
||||
uses: ./.github/workflows/image.yaml
|
||||
needs: [variables, golang, code-scanning]
|
||||
secrets: inherit
|
||||
with:
|
||||
version: ${{ needs.variables.outputs.version }}
|
||||
build_multi_arch_images: ${{ github.ref_name == 'main' || startsWith(github.ref_name, 'release-') }}
|
||||
|
||||
e2e-test:
|
||||
needs: [image, variables]
|
||||
secrets: inherit
|
||||
uses: ./.github/workflows/e2e.yaml
|
||||
with:
|
||||
version: ${{ needs.variables.outputs.version }}
|
||||
5
.github/workflows/code_scanning.yaml
vendored
5
.github/workflows/code_scanning.yaml
vendored
@@ -15,6 +15,7 @@
|
||||
name: "CodeQL"
|
||||
|
||||
on:
|
||||
workflow_call: {}
|
||||
pull_request:
|
||||
types:
|
||||
- opened
|
||||
@@ -22,10 +23,6 @@ on:
|
||||
branches:
|
||||
- main
|
||||
- release-*
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
- release-*
|
||||
|
||||
jobs:
|
||||
analyze:
|
||||
|
||||
98
.github/workflows/e2e.yaml
vendored
Normal file
98
.github/workflows/e2e.yaml
vendored
Normal file
@@ -0,0 +1,98 @@
|
||||
# Copyright 2025 NVIDIA CORPORATION
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
name: End-to-end Tests
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
version:
|
||||
required: true
|
||||
type: string
|
||||
secrets:
|
||||
AWS_ACCESS_KEY_ID:
|
||||
required: true
|
||||
AWS_SECRET_ACCESS_KEY:
|
||||
required: true
|
||||
AWS_SSH_KEY:
|
||||
required: true
|
||||
E2E_SSH_USER:
|
||||
required: true
|
||||
SLACK_BOT_TOKEN:
|
||||
required: true
|
||||
SLACK_CHANNEL_ID:
|
||||
required: true
|
||||
|
||||
jobs:
|
||||
e2e-tests:
|
||||
runs-on: linux-amd64-cpu4
|
||||
steps:
|
||||
- name: Check out code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Calculate build vars
|
||||
id: vars
|
||||
run: |
|
||||
echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
|
||||
echo "LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}')" >> $GITHUB_ENV
|
||||
GOLANG_VERSION=$(./hack/golang-version.sh)
|
||||
echo "GOLANG_VERSION=${GOLANG_VERSION##GOLANG_VERSION := }" >> $GITHUB_ENV
|
||||
|
||||
- name: Install Go
|
||||
uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: ${{ env.GOLANG_VERSION }}
|
||||
|
||||
- name: Set up Holodeck
|
||||
uses: NVIDIA/holodeck@v0.2.7
|
||||
with:
|
||||
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
||||
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
||||
aws_ssh_key: ${{ secrets.AWS_SSH_KEY }}
|
||||
holodeck_config: "tests/e2e/infra/aws.yaml"
|
||||
|
||||
- name: Get public dns name
|
||||
id: holodeck_public_dns_name
|
||||
uses: mikefarah/yq@master
|
||||
with:
|
||||
cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml
|
||||
|
||||
- name: Run e2e tests
|
||||
env:
|
||||
IMAGE_NAME: ghcr.io/nvidia/container-toolkit
|
||||
VERSION: ${{ inputs.version }}
|
||||
SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
|
||||
E2E_SSH_USER: ${{ secrets.E2E_SSH_USER }}
|
||||
E2E_SSH_HOST: ${{ steps.holodeck_public_dns_name.outputs.result }}
|
||||
E2E_INSTALL_CTK: "true"
|
||||
run: |
|
||||
e2e_ssh_key=$(mktemp)
|
||||
echo "$SSH_KEY" > "$e2e_ssh_key"
|
||||
chmod 600 "$e2e_ssh_key"
|
||||
export E2E_SSH_KEY="$e2e_ssh_key"
|
||||
|
||||
make -f tests/e2e/Makefile test
|
||||
|
||||
- name: Send Slack alert notification
|
||||
if: ${{ failure() }}
|
||||
uses: slackapi/slack-github-action@v2.1.0
|
||||
with:
|
||||
method: chat.postMessage
|
||||
token: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
payload: |
|
||||
channel: ${{ secrets.SLACK_CHANNEL_ID }}
|
||||
text: |
|
||||
:x: On repository ${{ github.repository }}, the Workflow *${{ github.workflow }}* has failed.
|
||||
|
||||
Details: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
|
||||
5
.github/workflows/golang.yaml
vendored
5
.github/workflows/golang.yaml
vendored
@@ -15,6 +15,7 @@
|
||||
name: Golang
|
||||
|
||||
on:
|
||||
workflow_call: {}
|
||||
pull_request:
|
||||
types:
|
||||
- opened
|
||||
@@ -22,10 +23,6 @@ on:
|
||||
branches:
|
||||
- main
|
||||
- release-*
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
- release-*
|
||||
|
||||
jobs:
|
||||
check:
|
||||
|
||||
68
.github/workflows/image.yaml
vendored
68
.github/workflows/image.yaml
vendored
@@ -16,21 +16,18 @@
|
||||
name: image
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
types:
|
||||
- opened
|
||||
- synchronize
|
||||
branches:
|
||||
- main
|
||||
- release-*
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
- release-*
|
||||
workflow_call:
|
||||
inputs:
|
||||
version:
|
||||
required: true
|
||||
type: string
|
||||
build_multi_arch_images:
|
||||
required: true
|
||||
type: string
|
||||
|
||||
jobs:
|
||||
packages:
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: linux-amd64-cpu4
|
||||
strategy:
|
||||
matrix:
|
||||
target:
|
||||
@@ -41,7 +38,7 @@ jobs:
|
||||
- centos7-x86_64
|
||||
- centos8-ppc64le
|
||||
ispr:
|
||||
- ${{github.event_name == 'pull_request'}}
|
||||
- ${{ github.ref_name != 'main' && !startsWith( github.ref_name, 'release-' ) }}
|
||||
exclude:
|
||||
- ispr: true
|
||||
target: ubuntu18.04-arm64
|
||||
@@ -52,18 +49,25 @@ jobs:
|
||||
- ispr: true
|
||||
target: centos8-ppc64le
|
||||
fail-fast: false
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
name: Check out code
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
with:
|
||||
image: tonistiigi/binfmt:master
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: build ${{ matrix.target }} packages
|
||||
run: |
|
||||
sudo apt-get install -y coreutils build-essential sed git bash make
|
||||
echo "Building packages"
|
||||
./scripts/build-packages.sh ${{ matrix.target }}
|
||||
|
||||
- name: 'Upload Artifacts'
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
@@ -72,7 +76,7 @@ jobs:
|
||||
path: ${{ github.workspace }}/dist/*
|
||||
|
||||
image:
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: linux-amd64-cpu4
|
||||
strategy:
|
||||
matrix:
|
||||
dist:
|
||||
@@ -80,7 +84,7 @@ jobs:
|
||||
- ubi8
|
||||
- packaging
|
||||
ispr:
|
||||
- ${{github.event_name == 'pull_request'}}
|
||||
- ${{ github.ref_name != 'main' && !startsWith( github.ref_name, 'release-' ) }}
|
||||
exclude:
|
||||
- ispr: true
|
||||
dist: ubi8
|
||||
@@ -88,34 +92,15 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
name: Check out code
|
||||
- name: Calculate build vars
|
||||
id: vars
|
||||
run: |
|
||||
echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
|
||||
echo "LOWERCASE_REPO_OWNER=$(echo "${GITHUB_REPOSITORY_OWNER}" | awk '{print tolower($0)}')" >> $GITHUB_ENV
|
||||
REPO_FULL_NAME="${{ github.event.pull_request.head.repo.full_name }}"
|
||||
echo "${REPO_FULL_NAME}"
|
||||
echo "LABEL_IMAGE_SOURCE=https://github.com/${REPO_FULL_NAME}" >> $GITHUB_ENV
|
||||
|
||||
PUSH_ON_BUILD="false"
|
||||
BUILD_MULTI_ARCH_IMAGES="false"
|
||||
if [[ "${{ github.event_name }}" == "pull_request" ]]; then
|
||||
if [[ "${{ github.actor }}" != "dependabot[bot]" && "${{ github.event.pull_request.head.repo.full_name }}" == "${{ github.repository }}" ]]; then
|
||||
# For non-fork PRs that are not created by dependabot we do push images
|
||||
PUSH_ON_BUILD="true"
|
||||
fi
|
||||
elif [[ "${{ github.event_name }}" == "push" ]]; then
|
||||
# On push events we do generate images and enable muilti-arch builds
|
||||
PUSH_ON_BUILD="true"
|
||||
BUILD_MULTI_ARCH_IMAGES="true"
|
||||
fi
|
||||
echo "PUSH_ON_BUILD=${PUSH_ON_BUILD}" >> $GITHUB_ENV
|
||||
echo "BUILD_MULTI_ARCH_IMAGES=${BUILD_MULTI_ARCH_IMAGES}" >> $GITHUB_ENV
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
with:
|
||||
image: tonistiigi/binfmt:master
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Get built packages
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
@@ -129,10 +114,13 @@ jobs:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Build image
|
||||
env:
|
||||
IMAGE_NAME: ghcr.io/${LOWERCASE_REPO_OWNER}/container-toolkit
|
||||
VERSION: ${COMMIT_SHORT_SHA}
|
||||
IMAGE_NAME: ghcr.io/nvidia/container-toolkit
|
||||
VERSION: ${{ inputs.version }}
|
||||
PUSH_ON_BUILD: "true"
|
||||
BUILD_MULTI_ARCH_IMAGES: ${{ inputs.build_multi_arch_images }}
|
||||
run: |
|
||||
echo "${VERSION}"
|
||||
make -f deployments/container/Makefile build-${{ matrix.dist }}
|
||||
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -3,7 +3,7 @@ artifacts
|
||||
*.swp
|
||||
*.swo
|
||||
/coverage.out*
|
||||
/test/output/
|
||||
/tests/output/
|
||||
/nvidia-container-runtime
|
||||
/nvidia-container-runtime.*
|
||||
/nvidia-container-runtime-hook
|
||||
|
||||
56
CHANGELOG.md
56
CHANGELOG.md
@@ -1,5 +1,61 @@
|
||||
# NVIDIA Container Toolkit Changelog
|
||||
|
||||
## v1.17.8
|
||||
|
||||
- Updated the ordering of Mounts in CDI to have a deterministic output. This makes testing more consistent.
|
||||
- Added NVIDIA_CTK_DEBUG envvar to hooks.
|
||||
|
||||
### Changes in libnvidia-container
|
||||
|
||||
- Fixed bug in setting default for `--cuda-compat-mode` flag. This caused failures in use cases invoking the `nvidia-container-cli` directly.
|
||||
- Added additional logging to the `nvidia-container-cli`.
|
||||
- Fixed variable initialisation when updating the ldcache. This caused failures on Arch linux or other platforms where the `nvidia-container-cli` was built from source.
|
||||
|
||||
## v1.17.7
|
||||
|
||||
- Fix mode detection on Thor-based systems. This correctly resolves `auto` mode to `csv`.
|
||||
- Fix resolution of libs in LDCache on ARM. This fixes CDI spec generation on ARM-based systems using NVML.
|
||||
- Run update-ldcache hook in isolated namespaces.
|
||||
|
||||
### Changes in the Toolkit Container
|
||||
|
||||
- Bump CUDA base image version to 12.9.0
|
||||
|
||||
### Changes in libnvidia-container
|
||||
|
||||
- Add `--cuda-compat-mode` flag to the `nvidia-container-cli configure` command.
|
||||
|
||||
## v1.17.6
|
||||
|
||||
### Changes in the Toolkit Container
|
||||
|
||||
- Allow container runtime executable path to be specified when configuring containerd.
|
||||
- Bump CUDA base image version to 12.8.1
|
||||
|
||||
### Changes in libnvidia-container
|
||||
|
||||
- Skip files when user has insufficient permissions. This prevents errors when discovering IPC sockets when the `nvidia-container-cli` is run as a non-root user.
|
||||
- Fix building with Go 1.24
|
||||
- Fix some typos in text.
|
||||
|
||||
## v1.17.5
|
||||
|
||||
- Allow the `enabled-cuda-compat` hook to be skipped when generating CDI specifications. This improves compatibility with older NVIDIA Container Toolkit installations. The hook is explicitly ignored for management CDI specifications.
|
||||
- Add IMEX binaries to CDI discovery. This includes the IMEX Daemon and IMEX Control binaries in containers.
|
||||
- Fix bug that may overwrite docker feature flags when configuring CDI from the `nvidia-ctk runtime configure` command.
|
||||
- Remove the unused `Set()` function from engine config API.
|
||||
- Add an `EnableCDI()` method to engine config API.
|
||||
- Add an `ignore-imex-channel-requests` feature flag. This ensures that the NVIDIA Container Runtime can be configured to ignore IMEX channel requests when these should be managed by another component.
|
||||
- Update the `update-ldcache` hook to run the host `ldconfig` from a MEMFD.
|
||||
- Add support for CUDA Forward Compatibility (removed by default in v1.17.4) using a dedicated `enable-cuda-compat` hook. This can be disabled using a `disable-cuda-compat-lib-hook` feature flag.
|
||||
- Disable nvsandboxutils in the `nvcdi` API. This prevents a segmentation violation with NVIDIA GPU Drivers from the 565 branch.
|
||||
- Fix a bug where `cdi` mode would not work with the `--gpus` flag even if the NVIDIA Container Runtime was used.
|
||||
|
||||
### Changes in the Toolkit Container
|
||||
|
||||
- Enable CDI in container engine (Containerd, Cri-o, Docker) if CDI_ENABLED is set.
|
||||
- Bump CUDA base image version to 12.8.0
|
||||
|
||||
## v1.17.4
|
||||
- Disable mounting of compat libs from container by default
|
||||
- Add allow-cuda-compat-libs-from-container feature flag
|
||||
|
||||
@@ -34,7 +34,7 @@ environment variables.
|
||||
|
||||
## Testing packages locally
|
||||
|
||||
The [test/release](./test/release/) folder contains documentation on how the installation of local or staged packages can be tested.
|
||||
The [tests/release](./tests/release/) folder contains documentation on how the installation of local or staged packages can be tested.
|
||||
|
||||
|
||||
## Releasing
|
||||
|
||||
@@ -21,6 +21,7 @@ import (
|
||||
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/chmod"
|
||||
symlinks "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/create-symlinks"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/cudacompat"
|
||||
ldcache "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/update-ldcache"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
||||
)
|
||||
@@ -32,5 +33,6 @@ func New(logger logger.Interface) []*cli.Command {
|
||||
ldcache.NewCommand(logger),
|
||||
symlinks.NewCommand(logger),
|
||||
chmod.NewCommand(logger),
|
||||
cudacompat.NewCommand(logger),
|
||||
}
|
||||
}
|
||||
|
||||
76
cmd/nvidia-cdi-hook/cudacompat/container-root.go
Normal file
76
cmd/nvidia-cdi-hook/cudacompat/container-root.go
Normal file
@@ -0,0 +1,76 @@
|
||||
/**
|
||||
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
**/
|
||||
|
||||
package cudacompat
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/moby/sys/symlink"
|
||||
)
|
||||
|
||||
// A containerRoot represents the root filesystem of a container.
|
||||
type containerRoot string
|
||||
|
||||
// hasPath checks whether the specified path exists in the root.
|
||||
func (r containerRoot) hasPath(path string) bool {
|
||||
resolved, err := r.resolve(path)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
if _, err := os.Stat(resolved); err != nil && os.IsNotExist(err) {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// globFiles matches the specified pattern in the root.
|
||||
// The files that match must be regular files.
|
||||
func (r containerRoot) globFiles(pattern string) ([]string, error) {
|
||||
patternPath, err := r.resolve(pattern)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
matches, err := filepath.Glob(patternPath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var files []string
|
||||
for _, match := range matches {
|
||||
info, err := os.Lstat(match)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
// Ignore symlinks.
|
||||
if info.Mode()&os.ModeSymlink != 0 {
|
||||
continue
|
||||
}
|
||||
// Ignore directories.
|
||||
if info.IsDir() {
|
||||
continue
|
||||
}
|
||||
files = append(files, match)
|
||||
}
|
||||
return files, nil
|
||||
}
|
||||
|
||||
// resolve returns the absolute path including root path.
|
||||
// Symlinks are resolved, but are guaranteed to resolve in the root.
|
||||
func (r containerRoot) resolve(path string) (string, error) {
|
||||
absolute := filepath.Clean(filepath.Join(string(r), path))
|
||||
return symlink.FollowSymlinkInScope(absolute, string(r))
|
||||
}
|
||||
221
cmd/nvidia-cdi-hook/cudacompat/cudacompat.go
Normal file
221
cmd/nvidia-cdi-hook/cudacompat/cudacompat.go
Normal file
@@ -0,0 +1,221 @@
|
||||
/**
|
||||
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
**/
|
||||
|
||||
package cudacompat
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/urfave/cli/v2"
|
||||
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
|
||||
)
|
||||
|
||||
const (
|
||||
cudaCompatPath = "/usr/local/cuda/compat"
|
||||
// cudaCompatLdsoconfdFilenamePattern specifies the pattern for the filename
|
||||
// in ld.so.conf.d that includes a reference to the CUDA compat path.
|
||||
// The 00-compat prefix is chosen to ensure that these libraries have a
|
||||
// higher precedence than other libraries on the system.
|
||||
cudaCompatLdsoconfdFilenamePattern = "00-compat-*.conf"
|
||||
)
|
||||
|
||||
type command struct {
|
||||
logger logger.Interface
|
||||
}
|
||||
|
||||
type options struct {
|
||||
hostDriverVersion string
|
||||
containerSpec string
|
||||
}
|
||||
|
||||
// NewCommand constructs a cuda-compat command with the specified logger
|
||||
func NewCommand(logger logger.Interface) *cli.Command {
|
||||
c := command{
|
||||
logger: logger,
|
||||
}
|
||||
return c.build()
|
||||
}
|
||||
|
||||
// build the enable-cuda-compat command
|
||||
func (m command) build() *cli.Command {
|
||||
cfg := options{}
|
||||
|
||||
// Create the 'enable-cuda-compat' command
|
||||
c := cli.Command{
|
||||
Name: "enable-cuda-compat",
|
||||
Usage: "This hook ensures that the folder containing the CUDA compat libraries is added to the ldconfig search path if required.",
|
||||
Before: func(c *cli.Context) error {
|
||||
return m.validateFlags(c, &cfg)
|
||||
},
|
||||
Action: func(c *cli.Context) error {
|
||||
return m.run(c, &cfg)
|
||||
},
|
||||
}
|
||||
|
||||
c.Flags = []cli.Flag{
|
||||
&cli.StringFlag{
|
||||
Name: "host-driver-version",
|
||||
Usage: "Specify the host driver version. If the CUDA compat libraries detected in the container do not have a higher MAJOR version, the hook is a no-op.",
|
||||
Destination: &cfg.hostDriverVersion,
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: "container-spec",
|
||||
Hidden: true,
|
||||
Category: "testing-only",
|
||||
Usage: "Specify the path to the OCI container spec. If empty or '-' the spec will be read from STDIN",
|
||||
Destination: &cfg.containerSpec,
|
||||
},
|
||||
}
|
||||
|
||||
return &c
|
||||
}
|
||||
|
||||
func (m command) validateFlags(_ *cli.Context, cfg *options) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m command) run(_ *cli.Context, cfg *options) error {
|
||||
if cfg.hostDriverVersion == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
s, err := oci.LoadContainerState(cfg.containerSpec)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to load container state: %w", err)
|
||||
}
|
||||
|
||||
containerRootDir, err := s.GetContainerRoot()
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to determined container root: %w", err)
|
||||
}
|
||||
|
||||
containerForwardCompatDir, err := m.getContainerForwardCompatDir(containerRoot(containerRootDir), cfg.hostDriverVersion)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to get container forward compat directory: %w", err)
|
||||
}
|
||||
if containerForwardCompatDir == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
return m.createLdsoconfdFile(containerRoot(containerRootDir), cudaCompatLdsoconfdFilenamePattern, containerForwardCompatDir)
|
||||
}
|
||||
|
||||
func (m command) getContainerForwardCompatDir(containerRoot containerRoot, hostDriverVersion string) (string, error) {
|
||||
if hostDriverVersion == "" {
|
||||
m.logger.Debugf("Host driver version not specified")
|
||||
return "", nil
|
||||
}
|
||||
if !containerRoot.hasPath(cudaCompatPath) {
|
||||
m.logger.Debugf("No CUDA forward compatibility libraries directory in container")
|
||||
return "", nil
|
||||
}
|
||||
if !containerRoot.hasPath("/etc/ld.so.cache") {
|
||||
m.logger.Debugf("The container does not have an LDCache")
|
||||
return "", nil
|
||||
}
|
||||
|
||||
libs, err := containerRoot.globFiles(filepath.Join(cudaCompatPath, "libcuda.so.*.*"))
|
||||
if err != nil {
|
||||
m.logger.Warningf("Failed to find CUDA compat library: %w", err)
|
||||
return "", nil
|
||||
}
|
||||
|
||||
if len(libs) == 0 {
|
||||
m.logger.Debugf("No CUDA forward compatibility libraries container")
|
||||
return "", nil
|
||||
}
|
||||
|
||||
if len(libs) != 1 {
|
||||
m.logger.Warningf("Unexpected number of CUDA compat libraries in container: %v", libs)
|
||||
return "", nil
|
||||
}
|
||||
|
||||
compatDriverVersion := strings.TrimPrefix(filepath.Base(libs[0]), "libcuda.so.")
|
||||
compatMajor, err := extractMajorVersion(compatDriverVersion)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to extract major version from %q: %v", compatDriverVersion, err)
|
||||
}
|
||||
|
||||
driverMajor, err := extractMajorVersion(hostDriverVersion)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to extract major version from %q: %v", hostDriverVersion, err)
|
||||
}
|
||||
|
||||
if driverMajor >= compatMajor {
|
||||
m.logger.Debugf("Compat major version is not greater than the host driver major version (%v >= %v)", hostDriverVersion, compatDriverVersion)
|
||||
return "", nil
|
||||
}
|
||||
|
||||
resolvedCompatDir := strings.TrimPrefix(filepath.Dir(libs[0]), string(containerRoot))
|
||||
return resolvedCompatDir, nil
|
||||
}
|
||||
|
||||
// createLdsoconfdFile creates a file at /etc/ld.so.conf.d/ in the specified root.
|
||||
// The file is created at /etc/ld.so.conf.d/{{ .pattern }} using `CreateTemp` and
|
||||
// contains the specified directories on each line.
|
||||
func (m command) createLdsoconfdFile(in containerRoot, pattern string, dirs ...string) error {
|
||||
if len(dirs) == 0 {
|
||||
m.logger.Debugf("No directories to add to /etc/ld.so.conf")
|
||||
return nil
|
||||
}
|
||||
|
||||
ldsoconfdDir, err := in.resolve("/etc/ld.so.conf.d")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := os.MkdirAll(ldsoconfdDir, 0755); err != nil {
|
||||
return fmt.Errorf("failed to create ld.so.conf.d: %w", err)
|
||||
}
|
||||
|
||||
configFile, err := os.CreateTemp(ldsoconfdDir, pattern)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create config file: %w", err)
|
||||
}
|
||||
defer configFile.Close()
|
||||
|
||||
m.logger.Debugf("Adding directories %v to %v", dirs, configFile.Name())
|
||||
|
||||
added := make(map[string]bool)
|
||||
for _, dir := range dirs {
|
||||
if added[dir] {
|
||||
continue
|
||||
}
|
||||
_, err = configFile.WriteString(fmt.Sprintf("%s\n", dir))
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to update config file: %w", err)
|
||||
}
|
||||
added[dir] = true
|
||||
}
|
||||
|
||||
// The created file needs to be world readable for the cases where the container is run as a non-root user.
|
||||
if err := configFile.Chmod(0644); err != nil {
|
||||
return fmt.Errorf("failed to chmod config file: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// extractMajorVersion parses a version string and returns the major version as an int.
|
||||
func extractMajorVersion(version string) (int, error) {
|
||||
majorString := strings.SplitN(version, ".", 2)[0]
|
||||
return strconv.Atoi(majorString)
|
||||
}
|
||||
182
cmd/nvidia-cdi-hook/cudacompat/cudacompat_test.go
Normal file
182
cmd/nvidia-cdi-hook/cudacompat/cudacompat_test.go
Normal file
@@ -0,0 +1,182 @@
|
||||
/*
|
||||
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
*/
|
||||
|
||||
package cudacompat
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
testlog "github.com/sirupsen/logrus/hooks/test"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestCompatLibs(t *testing.T) {
|
||||
logger, _ := testlog.NewNullLogger()
|
||||
|
||||
testCases := []struct {
|
||||
description string
|
||||
contents map[string]string
|
||||
hostDriverVersion string
|
||||
expectedContainerForwardCompatDir string
|
||||
}{
|
||||
{
|
||||
description: "empty root",
|
||||
hostDriverVersion: "222.55.66",
|
||||
},
|
||||
{
|
||||
description: "compat lib is newer; no ldcache",
|
||||
contents: map[string]string{
|
||||
"/usr/local/cuda/compat/libcuda.so.333.88.99": "",
|
||||
},
|
||||
hostDriverVersion: "222.55.66",
|
||||
},
|
||||
{
|
||||
description: "compat lib is newer; ldcache",
|
||||
contents: map[string]string{
|
||||
"/etc/ld.so.cache": "",
|
||||
"/usr/local/cuda/compat/libcuda.so.333.88.99": "",
|
||||
},
|
||||
hostDriverVersion: "222.55.66",
|
||||
expectedContainerForwardCompatDir: "/usr/local/cuda/compat",
|
||||
},
|
||||
{
|
||||
description: "compat lib is older; ldcache",
|
||||
contents: map[string]string{
|
||||
"/etc/ld.so.cache": "",
|
||||
"/usr/local/cuda/compat/libcuda.so.111.88.99": "",
|
||||
},
|
||||
hostDriverVersion: "222.55.66",
|
||||
expectedContainerForwardCompatDir: "",
|
||||
},
|
||||
{
|
||||
description: "compat lib has same major version; ldcache",
|
||||
contents: map[string]string{
|
||||
"/etc/ld.so.cache": "",
|
||||
"/usr/local/cuda/compat/libcuda.so.222.88.99": "",
|
||||
},
|
||||
hostDriverVersion: "222.55.66",
|
||||
expectedContainerForwardCompatDir: "",
|
||||
},
|
||||
{
|
||||
description: "numeric comparison is used; ldcache",
|
||||
contents: map[string]string{
|
||||
"/etc/ld.so.cache": "",
|
||||
"/usr/local/cuda/compat/libcuda.so.222.88.99": "",
|
||||
},
|
||||
hostDriverVersion: "99.55.66",
|
||||
expectedContainerForwardCompatDir: "/usr/local/cuda/compat",
|
||||
},
|
||||
{
|
||||
description: "driver version empty; ldcache",
|
||||
contents: map[string]string{
|
||||
"/etc/ld.so.cache": "",
|
||||
"/usr/local/cuda/compat/libcuda.so.222.88.99": "",
|
||||
},
|
||||
hostDriverVersion: "",
|
||||
},
|
||||
{
|
||||
description: "symlinks are followed",
|
||||
contents: map[string]string{
|
||||
"/etc/ld.so.cache": "",
|
||||
"/etc/alternatives/cuda/compat/libcuda.so.333.88.99": "",
|
||||
"/usr/local/cuda": "symlink=/etc/alternatives/cuda",
|
||||
},
|
||||
hostDriverVersion: "222.55.66",
|
||||
expectedContainerForwardCompatDir: "/etc/alternatives/cuda/compat",
|
||||
},
|
||||
{
|
||||
description: "symlinks stay in container",
|
||||
contents: map[string]string{
|
||||
"/etc/ld.so.cache": "",
|
||||
"/compat/libcuda.so.333.88.99": "",
|
||||
"/usr/local/cuda": "symlink=../../../../../../",
|
||||
},
|
||||
hostDriverVersion: "222.55.66",
|
||||
expectedContainerForwardCompatDir: "/compat",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.description, func(t *testing.T) {
|
||||
containerRootDir := t.TempDir()
|
||||
for name, contents := range tc.contents {
|
||||
target := filepath.Join(containerRootDir, name)
|
||||
require.NoError(t, os.MkdirAll(filepath.Dir(target), 0755))
|
||||
|
||||
if strings.HasPrefix(contents, "symlink=") {
|
||||
require.NoError(t, os.Symlink(strings.TrimPrefix(contents, "symlink="), target))
|
||||
continue
|
||||
}
|
||||
|
||||
require.NoError(t, os.WriteFile(target, []byte(contents), 0600))
|
||||
}
|
||||
|
||||
c := command{
|
||||
logger: logger,
|
||||
}
|
||||
containerForwardCompatDir, err := c.getContainerForwardCompatDir(containerRoot(containerRootDir), tc.hostDriverVersion)
|
||||
require.NoError(t, err)
|
||||
require.EqualValues(t, tc.expectedContainerForwardCompatDir, containerForwardCompatDir)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateLdconfig(t *testing.T) {
|
||||
logger, _ := testlog.NewNullLogger()
|
||||
testCases := []struct {
|
||||
description string
|
||||
folders []string
|
||||
expectedContents string
|
||||
}{
|
||||
{
|
||||
description: "no folders; have no contents",
|
||||
},
|
||||
{
|
||||
description: "single folder is added",
|
||||
folders: []string{"/usr/local/cuda/compat"},
|
||||
expectedContents: "/usr/local/cuda/compat\n",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.description, func(t *testing.T) {
|
||||
containerRootDir := t.TempDir()
|
||||
c := command{
|
||||
logger: logger,
|
||||
}
|
||||
err := c.createLdsoconfdFile(containerRoot(containerRootDir), cudaCompatLdsoconfdFilenamePattern, tc.folders...)
|
||||
require.NoError(t, err)
|
||||
|
||||
matches, err := filepath.Glob(filepath.Join(containerRootDir, "/etc/ld.so.conf.d/00-compat-*.conf"))
|
||||
require.NoError(t, err)
|
||||
|
||||
if tc.expectedContents == "" {
|
||||
require.Empty(t, matches)
|
||||
return
|
||||
}
|
||||
|
||||
require.Len(t, matches, 1)
|
||||
contents, err := os.ReadFile(matches[0])
|
||||
require.NoError(t, err)
|
||||
|
||||
require.EqualValues(t, tc.expectedContents, string(contents))
|
||||
})
|
||||
}
|
||||
|
||||
}
|
||||
@@ -58,13 +58,15 @@ func main() {
|
||||
Aliases: []string{"d"},
|
||||
Usage: "Enable debug-level logging",
|
||||
Destination: &opts.Debug,
|
||||
EnvVars: []string{"NVIDIA_CDI_DEBUG"},
|
||||
// TODO: Support for NVIDIA_CDI_DEBUG is deprecated and NVIDIA_CTK_DEBUG should be used instead.
|
||||
EnvVars: []string{"NVIDIA_CTK_DEBUG", "NVIDIA_CDI_DEBUG"},
|
||||
},
|
||||
&cli.BoolFlag{
|
||||
Name: "quiet",
|
||||
Usage: "Suppress all output except for errors; overrides --debug",
|
||||
Destination: &opts.Quiet,
|
||||
EnvVars: []string{"NVIDIA_CDI_QUIET"},
|
||||
// TODO: Support for NVIDIA_CDI_QUIET is deprecated and NVIDIA_CTK_QUIET should be used instead.
|
||||
EnvVars: []string{"NVDIA_CTK_QUIET", "NVIDIA_CDI_QUIET"},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
46
cmd/nvidia-cdi-hook/update-ldcache/container-root.go
Normal file
46
cmd/nvidia-cdi-hook/update-ldcache/container-root.go
Normal file
@@ -0,0 +1,46 @@
|
||||
/**
|
||||
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
**/
|
||||
|
||||
package ldcache
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/moby/sys/symlink"
|
||||
)
|
||||
|
||||
// A containerRoot represents the root filesystem of a container.
|
||||
type containerRoot string
|
||||
|
||||
// hasPath checks whether the specified path exists in the root.
|
||||
func (r containerRoot) hasPath(path string) bool {
|
||||
resolved, err := r.resolve(path)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
if _, err := os.Stat(resolved); err != nil && os.IsNotExist(err) {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// resolve returns the absolute path including root path.
|
||||
// Symlinks are resolved, but are guaranteed to resolve in the root.
|
||||
func (r containerRoot) resolve(path string) (string, error) {
|
||||
absolute := filepath.Clean(filepath.Join(string(r), path))
|
||||
return symlink.FollowSymlinkInScope(absolute, string(r))
|
||||
}
|
||||
200
cmd/nvidia-cdi-hook/update-ldcache/ldconfig_linux.go
Normal file
200
cmd/nvidia-cdi-hook/update-ldcache/ldconfig_linux.go
Normal file
@@ -0,0 +1,200 @@
|
||||
//go:build linux
|
||||
|
||||
/**
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
**/
|
||||
|
||||
package ldcache
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"syscall"
|
||||
|
||||
securejoin "github.com/cyphar/filepath-securejoin"
|
||||
|
||||
"github.com/moby/sys/reexec"
|
||||
"github.com/opencontainers/runc/libcontainer/utils"
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
// pivotRoot will call pivot_root such that rootfs becomes the new root
|
||||
// filesystem, and everything else is cleaned up.
|
||||
// This is adapted from the implementation here:
|
||||
//
|
||||
// https://github.com/opencontainers/runc/blob/e89a29929c775025419ab0d218a43588b4c12b9a/libcontainer/rootfs_linux.go#L1056-L1113
|
||||
//
|
||||
// With the `mount` and `unmount` calls changed to direct unix.Mount and unix.Unmount calls.
|
||||
func pivotRoot(rootfs string) error {
|
||||
// While the documentation may claim otherwise, pivot_root(".", ".") is
|
||||
// actually valid. What this results in is / being the new root but
|
||||
// /proc/self/cwd being the old root. Since we can play around with the cwd
|
||||
// with pivot_root this allows us to pivot without creating directories in
|
||||
// the rootfs. Shout-outs to the LXC developers for giving us this idea.
|
||||
|
||||
oldroot, err := unix.Open("/", unix.O_DIRECTORY|unix.O_RDONLY, 0)
|
||||
if err != nil {
|
||||
return &os.PathError{Op: "open", Path: "/", Err: err}
|
||||
}
|
||||
defer unix.Close(oldroot) //nolint: errcheck
|
||||
|
||||
newroot, err := unix.Open(rootfs, unix.O_DIRECTORY|unix.O_RDONLY, 0)
|
||||
if err != nil {
|
||||
return &os.PathError{Op: "open", Path: rootfs, Err: err}
|
||||
}
|
||||
defer unix.Close(newroot) //nolint: errcheck
|
||||
|
||||
// Change to the new root so that the pivot_root actually acts on it.
|
||||
if err := unix.Fchdir(newroot); err != nil {
|
||||
return &os.PathError{Op: "fchdir", Path: "fd " + strconv.Itoa(newroot), Err: err}
|
||||
}
|
||||
|
||||
if err := unix.PivotRoot(".", "."); err != nil {
|
||||
return &os.PathError{Op: "pivot_root", Path: ".", Err: err}
|
||||
}
|
||||
|
||||
// Currently our "." is oldroot (according to the current kernel code).
|
||||
// However, purely for safety, we will fchdir(oldroot) since there isn't
|
||||
// really any guarantee from the kernel what /proc/self/cwd will be after a
|
||||
// pivot_root(2).
|
||||
|
||||
if err := unix.Fchdir(oldroot); err != nil {
|
||||
return &os.PathError{Op: "fchdir", Path: "fd " + strconv.Itoa(oldroot), Err: err}
|
||||
}
|
||||
|
||||
// Make oldroot rslave to make sure our unmounts don't propagate to the
|
||||
// host (and thus bork the machine). We don't use rprivate because this is
|
||||
// known to cause issues due to races where we still have a reference to a
|
||||
// mount while a process in the host namespace are trying to operate on
|
||||
// something they think has no mounts (devicemapper in particular).
|
||||
if err := unix.Mount("", ".", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
|
||||
return err
|
||||
}
|
||||
// Perform the unmount. MNT_DETACH allows us to unmount /proc/self/cwd.
|
||||
if err := unix.Unmount(".", unix.MNT_DETACH); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Switch back to our shiny new root.
|
||||
if err := unix.Chdir("/"); err != nil {
|
||||
return &os.PathError{Op: "chdir", Path: "/", Err: err}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// mountLdConfig mounts the host ldconfig to the mount namespace of the hook.
|
||||
// We use WithProcfd to perform the mount operations to ensure that the changes
|
||||
// are persisted across the pivot root.
|
||||
func mountLdConfig(hostLdconfigPath string, containerRootDirPath string) (string, error) {
|
||||
hostLdconfigInfo, err := os.Stat(hostLdconfigPath)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("error reading host ldconfig: %w", err)
|
||||
}
|
||||
|
||||
hookScratchDirPath := "/var/run/nvidia-ctk-hook"
|
||||
ldconfigPath := filepath.Join(hookScratchDirPath, "ldconfig")
|
||||
if err := utils.MkdirAllInRoot(containerRootDirPath, hookScratchDirPath, 0755); err != nil {
|
||||
return "", fmt.Errorf("error creating hook scratch folder: %w", err)
|
||||
}
|
||||
|
||||
err = utils.WithProcfd(containerRootDirPath, hookScratchDirPath, func(hookScratchDirFdPath string) error {
|
||||
return createTmpFs(hookScratchDirFdPath, int(hostLdconfigInfo.Size()))
|
||||
|
||||
})
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("error creating tmpfs: %w", err)
|
||||
}
|
||||
|
||||
if _, err := createFileInRoot(containerRootDirPath, ldconfigPath, hostLdconfigInfo.Mode()); err != nil {
|
||||
return "", fmt.Errorf("error creating ldconfig: %w", err)
|
||||
}
|
||||
|
||||
err = utils.WithProcfd(containerRootDirPath, ldconfigPath, func(ldconfigFdPath string) error {
|
||||
return unix.Mount(hostLdconfigPath, ldconfigFdPath, "", unix.MS_BIND|unix.MS_RDONLY|unix.MS_NODEV|unix.MS_PRIVATE|unix.MS_NOSYMFOLLOW, "")
|
||||
})
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("error bind mounting host ldconfig: %w", err)
|
||||
}
|
||||
|
||||
return ldconfigPath, nil
|
||||
}
|
||||
|
||||
func createFileInRoot(containerRootDirPath string, destinationPath string, mode os.FileMode) (string, error) {
|
||||
dest, err := securejoin.SecureJoin(containerRootDirPath, destinationPath)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
// Make the parent directory.
|
||||
destDir, destBase := filepath.Split(dest)
|
||||
destDirFd, err := utils.MkdirAllInRootOpen(containerRootDirPath, destDir, 0755)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("error creating parent dir: %w", err)
|
||||
}
|
||||
defer destDirFd.Close()
|
||||
// Make the target file. We want to avoid opening any file that is
|
||||
// already there because it could be a "bad" file like an invalid
|
||||
// device or hung tty that might cause a DoS, so we use mknodat.
|
||||
// destBase does not contain any "/" components, and mknodat does
|
||||
// not follow trailing symlinks, so we can safely just call mknodat
|
||||
// here.
|
||||
if err := unix.Mknodat(int(destDirFd.Fd()), destBase, unix.S_IFREG|uint32(mode), 0); err != nil {
|
||||
// If we get EEXIST, there was already an inode there and
|
||||
// we can consider that a success.
|
||||
if !errors.Is(err, unix.EEXIST) {
|
||||
return "", fmt.Errorf("error creating empty file: %w", err)
|
||||
}
|
||||
}
|
||||
return dest, nil
|
||||
}
|
||||
|
||||
// mountProc mounts a clean proc filesystem in the new root.
|
||||
func mountProc(newroot string) error {
|
||||
target := filepath.Join(newroot, "/proc")
|
||||
|
||||
if err := os.MkdirAll(target, 0755); err != nil {
|
||||
return fmt.Errorf("error creating directory: %w", err)
|
||||
}
|
||||
return unix.Mount("proc", target, "proc", 0, "")
|
||||
}
|
||||
|
||||
// createTmpFs creates a tmpfs at the specified location with the specified size.
|
||||
func createTmpFs(target string, size int) error {
|
||||
return unix.Mount("tmpfs", target, "tmpfs", 0, fmt.Sprintf("size=%d", size))
|
||||
}
|
||||
|
||||
// createReexecCommand creates a command that can be used to trigger the reexec
|
||||
// initializer.
|
||||
// On linux this command runs in new namespaces.
|
||||
func createReexecCommand(args []string) *exec.Cmd {
|
||||
cmd := reexec.Command(args...)
|
||||
cmd.Stdin = os.Stdin
|
||||
cmd.Stdout = os.Stdout
|
||||
cmd.Stderr = os.Stderr
|
||||
|
||||
cmd.SysProcAttr = &syscall.SysProcAttr{
|
||||
Cloneflags: syscall.CLONE_NEWNS |
|
||||
syscall.CLONE_NEWUTS |
|
||||
syscall.CLONE_NEWIPC |
|
||||
syscall.CLONE_NEWPID |
|
||||
syscall.CLONE_NEWNET,
|
||||
}
|
||||
|
||||
return cmd
|
||||
}
|
||||
51
cmd/nvidia-cdi-hook/update-ldcache/ldconfig_other.go
Normal file
51
cmd/nvidia-cdi-hook/update-ldcache/ldconfig_other.go
Normal file
@@ -0,0 +1,51 @@
|
||||
//go:build !linux
|
||||
|
||||
/**
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
**/
|
||||
|
||||
package ldcache
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
|
||||
"github.com/moby/sys/reexec"
|
||||
)
|
||||
|
||||
func pivotRoot(newroot string) error {
|
||||
return fmt.Errorf("not supported")
|
||||
}
|
||||
|
||||
func mountLdConfig(hostLdconfigPath string, containerRootDirPath string) (string, error) {
|
||||
return "", fmt.Errorf("not supported")
|
||||
}
|
||||
|
||||
func mountProc(newroot string) error {
|
||||
return fmt.Errorf("not supported")
|
||||
}
|
||||
|
||||
// createReexecCommand creates a command that can be used ot trigger the reexec
|
||||
// initializer.
|
||||
func createReexecCommand(args []string) *exec.Cmd {
|
||||
cmd := reexec.Command(args...)
|
||||
cmd.Stdin = os.Stdin
|
||||
cmd.Stdout = os.Stdout
|
||||
cmd.Stderr = os.Stderr
|
||||
|
||||
return cmd
|
||||
}
|
||||
58
cmd/nvidia-cdi-hook/update-ldcache/safe-exec_linux.go
Normal file
58
cmd/nvidia-cdi-hook/update-ldcache/safe-exec_linux.go
Normal file
@@ -0,0 +1,58 @@
|
||||
//go:build linux
|
||||
|
||||
/**
|
||||
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
**/
|
||||
|
||||
package ldcache
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
"syscall"
|
||||
|
||||
"github.com/opencontainers/runc/libcontainer/dmz"
|
||||
)
|
||||
|
||||
// SafeExec attempts to clone the specified binary (as an memfd, for example) before executing it.
|
||||
func SafeExec(path string, args []string, envv []string) error {
|
||||
safeExe, err := cloneBinary(path)
|
||||
if err != nil {
|
||||
//nolint:gosec // TODO: Can we harden this so that there is less risk of command injection
|
||||
return syscall.Exec(path, args, envv)
|
||||
}
|
||||
defer safeExe.Close()
|
||||
|
||||
exePath := "/proc/self/fd/" + strconv.Itoa(int(safeExe.Fd()))
|
||||
//nolint:gosec // TODO: Can we harden this so that there is less risk of command injection
|
||||
return syscall.Exec(exePath, args, envv)
|
||||
}
|
||||
|
||||
func cloneBinary(path string) (*os.File, error) {
|
||||
exe, err := os.Open(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("opening current binary: %w", err)
|
||||
}
|
||||
defer exe.Close()
|
||||
|
||||
stat, err := exe.Stat()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("checking %v size: %w", path, err)
|
||||
}
|
||||
size := stat.Size()
|
||||
|
||||
return dmz.CloneBinary(exe, size, path, os.TempDir())
|
||||
}
|
||||
28
cmd/nvidia-cdi-hook/update-ldcache/safe-exec_other.go
Normal file
28
cmd/nvidia-cdi-hook/update-ldcache/safe-exec_other.go
Normal file
@@ -0,0 +1,28 @@
|
||||
//go:build !linux
|
||||
|
||||
/**
|
||||
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
**/
|
||||
|
||||
package ldcache
|
||||
|
||||
import "syscall"
|
||||
|
||||
// SafeExec is not implemented on non-linux systems and forwards directly to the
|
||||
// Exec syscall.
|
||||
func SafeExec(path string, args []string, envv []string) error {
|
||||
//nolint:gosec // TODO: Can we harden this so that there is less risk of command injection
|
||||
return syscall.Exec(path, args, envv)
|
||||
}
|
||||
@@ -19,11 +19,11 @@ package ldcache
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"syscall"
|
||||
|
||||
"github.com/moby/sys/reexec"
|
||||
"github.com/urfave/cli/v2"
|
||||
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
|
||||
@@ -31,6 +31,17 @@ import (
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
|
||||
)
|
||||
|
||||
const (
|
||||
// ldsoconfdFilenamePattern specifies the pattern for the filename
|
||||
// in ld.so.conf.d that includes references to the specified directories.
|
||||
// The 00-nvcr prefix is chosen to ensure that these libraries have a
|
||||
// higher precedence than other libraries on the system, but lower than
|
||||
// the 00-cuda-compat that is included in some containers.
|
||||
ldsoconfdFilenamePattern = "00-nvcr-*.conf"
|
||||
|
||||
reexecUpdateLdCacheCommandName = "reexec-update-ldcache"
|
||||
)
|
||||
|
||||
type command struct {
|
||||
logger logger.Interface
|
||||
}
|
||||
@@ -41,6 +52,13 @@ type options struct {
|
||||
containerSpec string
|
||||
}
|
||||
|
||||
func init() {
|
||||
reexec.Register(reexecUpdateLdCacheCommandName, updateLdCacheHandler)
|
||||
if reexec.Init() {
|
||||
os.Exit(0)
|
||||
}
|
||||
}
|
||||
|
||||
// NewCommand constructs an update-ldcache command with the specified logger
|
||||
func NewCommand(logger logger.Interface) *cli.Command {
|
||||
c := command{
|
||||
@@ -100,97 +118,137 @@ func (m command) run(c *cli.Context, cfg *options) error {
|
||||
return fmt.Errorf("failed to load container state: %v", err)
|
||||
}
|
||||
|
||||
containerRoot, err := s.GetContainerRoot()
|
||||
if err != nil {
|
||||
containerRootDir, err := s.GetContainerRoot()
|
||||
if err != nil || containerRootDir == "" || containerRootDir == "/" {
|
||||
return fmt.Errorf("failed to determined container root: %v", err)
|
||||
}
|
||||
|
||||
ldconfigPath := m.resolveLDConfigPath(cfg.ldconfigPath)
|
||||
args := []string{filepath.Base(ldconfigPath)}
|
||||
if containerRoot != "" {
|
||||
args = append(args, "-r", containerRoot)
|
||||
args := []string{
|
||||
reexecUpdateLdCacheCommandName,
|
||||
strings.TrimPrefix(config.NormalizeLDConfigPath("@"+cfg.ldconfigPath), "@"),
|
||||
containerRootDir,
|
||||
}
|
||||
args = append(args, cfg.folders.Value()...)
|
||||
|
||||
cmd := createReexecCommand(args)
|
||||
|
||||
return cmd.Run()
|
||||
}
|
||||
|
||||
// updateLdCacheHandler wraps updateLdCache with error handling.
|
||||
func updateLdCacheHandler() {
|
||||
if err := updateLdCache(os.Args); err != nil {
|
||||
log.Printf("Error updating ldcache: %v", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
// updateLdCache is invoked from a reexec'd handler and provides namespace
|
||||
// isolation for the operations performed by this hook.
|
||||
// At the point where this is invoked, we are in a new mount namespace that is
|
||||
// cloned from the parent.
|
||||
//
|
||||
// args[0] is the reexec initializer function name
|
||||
// args[1] is the path of the ldconfig binary on the host
|
||||
// args[2] is the container root directory
|
||||
// The remaining args are folders that need to be added to the ldcache.
|
||||
func updateLdCache(args []string) error {
|
||||
if len(args) < 3 {
|
||||
return fmt.Errorf("incorrect arguments: %v", args)
|
||||
}
|
||||
hostLdconfigPath := args[1]
|
||||
containerRootDirPath := args[2]
|
||||
|
||||
// To prevent leaking the parent proc filesystem, we create a new proc mount
|
||||
// in the container root.
|
||||
if err := mountProc(containerRootDirPath); err != nil {
|
||||
return fmt.Errorf("error mounting /proc: %w", err)
|
||||
}
|
||||
|
||||
if root(containerRoot).hasPath("/etc/ld.so.cache") {
|
||||
// We mount the host ldconfig before we pivot root since host paths are not
|
||||
// visible after the pivot root operation.
|
||||
ldconfigPath, err := mountLdConfig(hostLdconfigPath, containerRootDirPath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error mounting host ldconfig: %w", err)
|
||||
}
|
||||
|
||||
// We pivot to the container root for the new process, this further limits
|
||||
// access to the host.
|
||||
if err := pivotRoot(containerRootDirPath); err != nil {
|
||||
return fmt.Errorf("error running pivot_root: %w", err)
|
||||
}
|
||||
|
||||
return runLdconfig(ldconfigPath, args[3:]...)
|
||||
}
|
||||
|
||||
// runLdconfig runs the ldconfig binary and ensures that the specified directories
|
||||
// are processed for the ldcache.
|
||||
func runLdconfig(ldconfigPath string, directories ...string) error {
|
||||
args := []string{
|
||||
"ldconfig",
|
||||
// Explicitly specify using /etc/ld.so.conf since the host's ldconfig may
|
||||
// be configured to use a different config file by default.
|
||||
// Note that since we apply the `-r {{ .containerRootDir }}` argument, /etc/ld.so.conf is
|
||||
// in the container.
|
||||
"-f", "/etc/ld.so.conf",
|
||||
}
|
||||
|
||||
containerRoot := containerRoot("/")
|
||||
|
||||
if containerRoot.hasPath("/etc/ld.so.cache") {
|
||||
args = append(args, "-C", "/etc/ld.so.cache")
|
||||
} else {
|
||||
m.logger.Debugf("No ld.so.cache found, skipping update")
|
||||
args = append(args, "-N")
|
||||
}
|
||||
|
||||
folders := cfg.folders.Value()
|
||||
if root(containerRoot).hasPath("/etc/ld.so.conf.d") {
|
||||
err := m.createConfig(containerRoot, folders)
|
||||
if containerRoot.hasPath("/etc/ld.so.conf.d") {
|
||||
err := createLdsoconfdFile(ldsoconfdFilenamePattern, directories...)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to update ld.so.conf.d: %v", err)
|
||||
return fmt.Errorf("failed to update ld.so.conf.d: %w", err)
|
||||
}
|
||||
} else {
|
||||
args = append(args, folders...)
|
||||
args = append(args, directories...)
|
||||
}
|
||||
|
||||
// Explicitly specify using /etc/ld.so.conf since the host's ldconfig may
|
||||
// be configured to use a different config file by default.
|
||||
args = append(args, "-f", "/etc/ld.so.conf")
|
||||
|
||||
//nolint:gosec // TODO: Can we harden this so that there is less risk of command injection
|
||||
return syscall.Exec(ldconfigPath, args, nil)
|
||||
return SafeExec(ldconfigPath, args, nil)
|
||||
}
|
||||
|
||||
type root string
|
||||
|
||||
func (r root) hasPath(path string) bool {
|
||||
_, err := os.Stat(filepath.Join(string(r), path))
|
||||
if err != nil && os.IsNotExist(err) {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// resolveLDConfigPath determines the LDConfig path to use for the system.
|
||||
// On systems such as Ubuntu where `/sbin/ldconfig` is a wrapper around
|
||||
// /sbin/ldconfig.real, the latter is returned.
|
||||
func (m command) resolveLDConfigPath(path string) string {
|
||||
return strings.TrimPrefix(config.NormalizeLDConfigPath("@"+path), "@")
|
||||
}
|
||||
|
||||
// createConfig creates (or updates) /etc/ld.so.conf.d/00-nvcr-<RANDOM_STRING>.conf in the container
|
||||
// to include the required paths.
|
||||
// Note that the 00-nvcr prefix is chosen to ensure that these libraries have
|
||||
// a higher precedence than other libraries on the system but are applied AFTER
|
||||
// 00-cuda-compat.conf.
|
||||
func (m command) createConfig(root string, folders []string) error {
|
||||
if len(folders) == 0 {
|
||||
m.logger.Debugf("No folders to add to /etc/ld.so.conf")
|
||||
// createLdsoconfdFile creates a file at /etc/ld.so.conf.d/.
|
||||
// The file is created at /etc/ld.so.conf.d/{{ .pattern }} using `CreateTemp` and
|
||||
// contains the specified directories on each line.
|
||||
func createLdsoconfdFile(pattern string, dirs ...string) error {
|
||||
if len(dirs) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
if err := os.MkdirAll(filepath.Join(root, "/etc/ld.so.conf.d"), 0755); err != nil {
|
||||
return fmt.Errorf("failed to create ld.so.conf.d: %v", err)
|
||||
ldsoconfdDir := "/etc/ld.so.conf.d"
|
||||
if err := os.MkdirAll(ldsoconfdDir, 0755); err != nil {
|
||||
return fmt.Errorf("failed to create ld.so.conf.d: %w", err)
|
||||
}
|
||||
|
||||
configFile, err := os.CreateTemp(filepath.Join(root, "/etc/ld.so.conf.d"), "00-nvcr-*.conf")
|
||||
configFile, err := os.CreateTemp(ldsoconfdDir, pattern)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create config file: %v", err)
|
||||
return fmt.Errorf("failed to create config file: %w", err)
|
||||
}
|
||||
defer configFile.Close()
|
||||
defer func() {
|
||||
_ = configFile.Close()
|
||||
}()
|
||||
|
||||
m.logger.Debugf("Adding folders %v to %v", folders, configFile.Name())
|
||||
|
||||
configured := make(map[string]bool)
|
||||
for _, folder := range folders {
|
||||
if configured[folder] {
|
||||
added := make(map[string]bool)
|
||||
for _, dir := range dirs {
|
||||
if added[dir] {
|
||||
continue
|
||||
}
|
||||
_, err = configFile.WriteString(fmt.Sprintf("%s\n", folder))
|
||||
_, err = fmt.Fprintf(configFile, "%s\n", dir)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to update ld.so.conf.d: %v", err)
|
||||
return fmt.Errorf("failed to update config file: %w", err)
|
||||
}
|
||||
configured[folder] = true
|
||||
added[dir] = true
|
||||
}
|
||||
|
||||
// The created file needs to be world readable for the cases where the container is run as a non-root user.
|
||||
if err := os.Chmod(configFile.Name(), 0644); err != nil {
|
||||
return fmt.Errorf("failed to chmod config file: %v", err)
|
||||
if err := configFile.Chmod(0644); err != nil {
|
||||
return fmt.Errorf("failed to chmod config file: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
|
||||
@@ -198,6 +198,10 @@ func getMigDevices(image image.CUDA, envvar string) *string {
|
||||
}
|
||||
|
||||
func (hookConfig *hookConfig) getImexChannels(image image.CUDA, privileged bool) []string {
|
||||
if hookConfig.Features.IgnoreImexChannelRequests.IsEnabled() {
|
||||
return nil
|
||||
}
|
||||
|
||||
// If enabled, try and get the device list from volume mounts first
|
||||
if hookConfig.AcceptDeviceListAsVolumeMounts {
|
||||
devices := image.ImexChannelsFromMounts()
|
||||
|
||||
@@ -104,3 +104,26 @@ func (c *hookConfig) getSwarmResourceEnvvars() []string {
|
||||
|
||||
return envvars
|
||||
}
|
||||
|
||||
// nvidiaContainerCliCUDACompatModeFlags returns required --cuda-compat-mode
|
||||
// flag(s) depending on the hook and runtime configurations.
|
||||
func (c *hookConfig) nvidiaContainerCliCUDACompatModeFlags() []string {
|
||||
var flag string
|
||||
switch c.NVIDIAContainerRuntimeConfig.Modes.Legacy.CUDACompatMode {
|
||||
case config.CUDACompatModeLdconfig:
|
||||
flag = "--cuda-compat-mode=ldconfig"
|
||||
case config.CUDACompatModeMount:
|
||||
flag = "--cuda-compat-mode=mount"
|
||||
case config.CUDACompatModeDisabled, config.CUDACompatModeHook:
|
||||
flag = "--cuda-compat-mode=disabled"
|
||||
default:
|
||||
if !c.Features.AllowCUDACompatLibsFromContainer.IsEnabled() {
|
||||
flag = "--cuda-compat-mode=disabled"
|
||||
}
|
||||
}
|
||||
|
||||
if flag == "" {
|
||||
return nil
|
||||
}
|
||||
return []string{flag}
|
||||
}
|
||||
|
||||
@@ -114,9 +114,8 @@ func doPrestart() {
|
||||
}
|
||||
args = append(args, "configure")
|
||||
|
||||
if !hook.Features.AllowCUDACompatLibsFromContainer.IsEnabled() {
|
||||
args = append(args, "--no-cntlibs")
|
||||
}
|
||||
args = append(args, hook.nvidiaContainerCliCUDACompatModeFlags()...)
|
||||
|
||||
if ldconfigPath := cli.NormalizeLDConfigPath(); ldconfigPath != "" {
|
||||
args = append(args, fmt.Sprintf("--ldconfig=%s", ldconfigPath))
|
||||
}
|
||||
|
||||
@@ -22,9 +22,9 @@ import (
|
||||
const (
|
||||
nvidiaRuntime = "nvidia-container-runtime"
|
||||
nvidiaHook = "nvidia-container-runtime-hook"
|
||||
bundlePathSuffix = "test/output/bundle/"
|
||||
bundlePathSuffix = "tests/output/bundle/"
|
||||
specFile = "config.json"
|
||||
unmodifiedSpecFileSuffix = "test/input/test_spec.json"
|
||||
unmodifiedSpecFileSuffix = "tests/input/test_spec.json"
|
||||
)
|
||||
|
||||
const (
|
||||
@@ -46,8 +46,8 @@ func TestMain(m *testing.M) {
|
||||
if err != nil {
|
||||
log.Fatalf("error in test setup: could not get module root: %v", err)
|
||||
}
|
||||
testBinPath := filepath.Join(moduleRoot, "test", "bin")
|
||||
testInputPath := filepath.Join(moduleRoot, "test", "input")
|
||||
testBinPath := filepath.Join(moduleRoot, "tests", "bin")
|
||||
testInputPath := filepath.Join(moduleRoot, "tests", "input")
|
||||
|
||||
// Set the environment variables for the test
|
||||
os.Setenv("PATH", test.PrependToPath(testBinPath, moduleRoot))
|
||||
|
||||
@@ -68,12 +68,11 @@ type config struct {
|
||||
dryRun bool
|
||||
runtime string
|
||||
configFilePath string
|
||||
executablePath string
|
||||
configSource string
|
||||
mode string
|
||||
hookFilePath string
|
||||
|
||||
runtimeConfigOverrideJSON string
|
||||
|
||||
nvidiaRuntime struct {
|
||||
name string
|
||||
path string
|
||||
@@ -120,6 +119,11 @@ func (m command) build() *cli.Command {
|
||||
Usage: "path to the config file for the target runtime",
|
||||
Destination: &config.configFilePath,
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: "executable-path",
|
||||
Usage: "The path to the runtime executable. This is used to extract the current config",
|
||||
Destination: &config.executablePath,
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: "config-mode",
|
||||
Usage: "the config mode for runtimes that support multiple configuration mechanisms",
|
||||
@@ -163,7 +167,7 @@ func (m command) build() *cli.Command {
|
||||
},
|
||||
&cli.BoolFlag{
|
||||
Name: "cdi.enabled",
|
||||
Aliases: []string{"cdi.enable"},
|
||||
Aliases: []string{"cdi.enable", "enable-cdi"},
|
||||
Usage: "Enable CDI in the configured runtime",
|
||||
Destination: &config.cdi.enabled,
|
||||
},
|
||||
@@ -208,9 +212,9 @@ func (m command) validateFlags(c *cli.Context, config *config) error {
|
||||
config.cdi.enabled = false
|
||||
}
|
||||
|
||||
if config.runtimeConfigOverrideJSON != "" && config.runtime != "containerd" {
|
||||
m.logger.Warningf("Ignoring runtime-config-override flag for %v", config.runtime)
|
||||
config.runtimeConfigOverrideJSON = ""
|
||||
if config.executablePath != "" && config.runtime == "docker" {
|
||||
m.logger.Warningf("Ignoring executable-path=%q flag for %v", config.executablePath, config.runtime)
|
||||
config.executablePath = ""
|
||||
}
|
||||
|
||||
switch config.configSource {
|
||||
@@ -292,9 +296,8 @@ func (m command) configureConfigFile(c *cli.Context, config *config) error {
|
||||
return fmt.Errorf("unable to update config: %v", err)
|
||||
}
|
||||
|
||||
err = enableCDI(config, cfg)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to enable CDI in %s: %w", config.runtime, err)
|
||||
if config.cdi.enabled {
|
||||
cfg.EnableCDI()
|
||||
}
|
||||
|
||||
outputPath := config.getOutputConfigPath()
|
||||
@@ -331,9 +334,9 @@ func (c *config) resolveConfigSource() (toml.Loader, error) {
|
||||
func (c *config) getCommandConfigSource() toml.Loader {
|
||||
switch c.runtime {
|
||||
case "containerd":
|
||||
return containerd.CommandLineSource("")
|
||||
return containerd.CommandLineSource("", c.executablePath)
|
||||
case "crio":
|
||||
return crio.CommandLineSource("")
|
||||
return crio.CommandLineSource("", c.executablePath)
|
||||
}
|
||||
return toml.Empty
|
||||
}
|
||||
@@ -354,19 +357,3 @@ func (m *command) configureOCIHook(c *cli.Context, config *config) error {
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// enableCDI enables the use of CDI in the corresponding container engine
|
||||
func enableCDI(config *config, cfg engine.Interface) error {
|
||||
if !config.cdi.enabled {
|
||||
return nil
|
||||
}
|
||||
switch config.runtime {
|
||||
case "containerd":
|
||||
cfg.Set("enable_cdi", true)
|
||||
case "docker":
|
||||
cfg.Set("features", map[string]bool{"cdi": true})
|
||||
default:
|
||||
return fmt.Errorf("enabling CDI in %s is not supported", config.runtime)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
ARG GOLANG_VERSION=x.x.x
|
||||
|
||||
FROM nvidia/cuda:12.6.3-base-ubuntu20.04
|
||||
FROM nvidia/cuda:12.9.0-base-ubuntu20.04
|
||||
|
||||
ARG ARTIFACTS_ROOT
|
||||
COPY ${ARTIFACTS_ROOT} /artifacts/packages/
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
ARG GOLANG_VERSION=x.x.x
|
||||
ARG VERSION="N/A"
|
||||
|
||||
FROM nvidia/cuda:12.6.3-base-ubi8 as build
|
||||
FROM nvidia/cuda:12.9.0-base-ubi8 as build
|
||||
|
||||
RUN yum install -y \
|
||||
wget make git gcc \
|
||||
@@ -48,7 +48,7 @@ COPY . .
|
||||
RUN GOPATH=/artifacts go install -ldflags="-s -w -X 'main.Version=${VERSION}'" ./tools/...
|
||||
|
||||
|
||||
FROM nvidia/cuda:12.6.3-base-ubi8
|
||||
FROM nvidia/cuda:12.9.0-base-ubi8
|
||||
|
||||
ENV NVIDIA_DISABLE_REQUIRE="true"
|
||||
ENV NVIDIA_VISIBLE_DEVICES=void
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
ARG GOLANG_VERSION=x.x.x
|
||||
ARG VERSION="N/A"
|
||||
|
||||
FROM nvidia/cuda:12.6.3-base-ubuntu20.04 as build
|
||||
FROM nvidia/cuda:12.9.0-base-ubuntu20.04 as build
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y wget make git gcc \
|
||||
@@ -47,7 +47,7 @@ COPY . .
|
||||
RUN GOPATH=/artifacts go install -ldflags="-s -w -X 'main.Version=${VERSION}'" ./tools/...
|
||||
|
||||
|
||||
FROM nvcr.io/nvidia/cuda:12.6.3-base-ubuntu20.04
|
||||
FROM nvcr.io/nvidia/cuda:12.9.0-base-ubuntu20.04
|
||||
|
||||
# Remove the CUDA repository configurations to avoid issues with rotated GPG keys
|
||||
RUN rm -f /etc/apt/sources.list.d/cuda.list
|
||||
|
||||
@@ -27,12 +27,6 @@ DIST_DIR ?= $(CURDIR)/dist
|
||||
##### Global variables #####
|
||||
include $(CURDIR)/versions.mk
|
||||
|
||||
ifeq ($(IMAGE_NAME),)
|
||||
REGISTRY ?= nvidia
|
||||
IMAGE_NAME := $(REGISTRY)/container-toolkit
|
||||
endif
|
||||
|
||||
VERSION ?= $(LIB_VERSION)$(if $(LIB_TAG),-$(LIB_TAG))
|
||||
IMAGE_VERSION := $(VERSION)
|
||||
|
||||
IMAGE_TAG ?= $(VERSION)-$(DIST)
|
||||
@@ -49,6 +43,7 @@ DISTRIBUTIONS := ubuntu20.04 ubi8
|
||||
|
||||
META_TARGETS := packaging
|
||||
|
||||
IMAGE_TARGETS := $(patsubst %,image-%,$(DISTRIBUTIONS) $(META_TARGETS))
|
||||
BUILD_TARGETS := $(patsubst %,build-%,$(DISTRIBUTIONS) $(META_TARGETS))
|
||||
PUSH_TARGETS := $(patsubst %,push-%,$(DISTRIBUTIONS) $(META_TARGETS))
|
||||
TEST_TARGETS := $(patsubst %,test-%,$(DISTRIBUTIONS))
|
||||
@@ -89,7 +84,7 @@ build-%: DOCKERFILE = $(CURDIR)/deployments/container/Dockerfile.$(DOCKERFILE_SU
|
||||
ARTIFACTS_ROOT ?= $(shell realpath --relative-to=$(CURDIR) $(DIST_DIR))
|
||||
|
||||
# Use a generic build target to build the relevant images
|
||||
$(BUILD_TARGETS): build-%: $(ARTIFACTS_ROOT)
|
||||
$(IMAGE_TARGETS): image-%: $(ARTIFACTS_ROOT)
|
||||
DOCKER_BUILDKIT=1 \
|
||||
$(DOCKER) $(BUILDX) build --pull \
|
||||
--provenance=false --sbom=false \
|
||||
@@ -108,7 +103,6 @@ $(BUILD_TARGETS): build-%: $(ARTIFACTS_ROOT)
|
||||
-f $(DOCKERFILE) \
|
||||
$(CURDIR)
|
||||
|
||||
|
||||
build-ubuntu%: DOCKERFILE_SUFFIX := ubuntu
|
||||
build-ubuntu%: PACKAGE_DIST = ubuntu18.04
|
||||
|
||||
@@ -122,7 +116,13 @@ build-packaging: PACKAGE_DIST = all
|
||||
# Test targets
|
||||
test-%: DIST = $(*)
|
||||
|
||||
TEST_CASES ?= toolkit docker crio containerd
|
||||
# Handle the default build target.
|
||||
.PHONY: build
|
||||
build: $(DEFAULT_PUSH_TARGET)
|
||||
$(DEFAULT_PUSH_TARGET): build-$(DEFAULT_PUSH_TARGET)
|
||||
$(DEFAULT_PUSH_TARGET): DIST = $(DEFAULT_PUSH_TARGET)
|
||||
|
||||
TEST_CASES ?= docker crio containerd
|
||||
$(TEST_TARGETS): test-%:
|
||||
TEST_CASES="$(TEST_CASES)" bash -x $(CURDIR)/test/container/main.sh run \
|
||||
$(CURDIR)/shared-$(*) \
|
||||
|
||||
@@ -16,8 +16,7 @@ PUSH_ON_BUILD ?= false
|
||||
DOCKER_BUILD_OPTIONS = --output=type=image,push=$(PUSH_ON_BUILD)
|
||||
DOCKER_BUILD_PLATFORM_OPTIONS = --platform=linux/amd64,linux/arm64
|
||||
|
||||
# We only generate amd64 image for ubuntu18.04
|
||||
build-ubuntu18.04: DOCKER_BUILD_PLATFORM_OPTIONS = --platform=linux/amd64
|
||||
$(BUILD_TARGETS): build-%: image-%
|
||||
|
||||
# We only generate a single image for packaging targets
|
||||
build-packaging: DOCKER_BUILD_PLATFORM_OPTIONS = --platform=linux/amd64
|
||||
|
||||
@@ -12,4 +12,22 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
DOCKER_BUILD_PLATFORM_OPTIONS = --platform=linux/amd64
|
||||
PUSH_ON_BUILD ?= false
|
||||
ARCH ?= $(shell uname -m)
|
||||
DOCKER_BUILD_PLATFORM_OPTIONS = --platform=linux/$(ARCH)
|
||||
|
||||
ifeq ($(PUSH_ON_BUILD),true)
|
||||
DOCKER_BUILD_OPTIONS = --output=type=image,push=$(PUSH_ON_BUILD)
|
||||
$(BUILD_TARGETS): build-%: image-%
|
||||
$(DOCKER) push "$(IMAGE)"
|
||||
else
|
||||
$(BUILD_TARGETS): build-%: image-%
|
||||
endif
|
||||
|
||||
# For the default distribution we also retag the image.
|
||||
# Note: This needs to be updated for multi-arch images.
|
||||
ifeq ($(IMAGE_TAG),$(VERSION)-$(DIST))
|
||||
$(DEFAULT_PUSH_TARGET):
|
||||
$(DOCKER) image inspect $(IMAGE) > /dev/null || $(DOCKER) pull $(IMAGE)
|
||||
$(DOCKER) tag $(IMAGE) $(subst :$(IMAGE_TAG),:$(VERSION),$(IMAGE))
|
||||
endif
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
# This Dockerfile is also used to define the golang version used in this project
|
||||
# This allows dependabot to manage this version in addition to other images.
|
||||
FROM golang:1.23.4
|
||||
FROM golang:1.23.9
|
||||
|
||||
WORKDIR /work
|
||||
COPY * .
|
||||
|
||||
17
go.mod
17
go.mod
@@ -1,20 +1,23 @@
|
||||
module github.com/NVIDIA/nvidia-container-toolkit
|
||||
|
||||
go 1.20
|
||||
go 1.22
|
||||
|
||||
require (
|
||||
github.com/NVIDIA/go-nvlib v0.6.1
|
||||
github.com/NVIDIA/go-nvlib v0.7.2
|
||||
github.com/NVIDIA/go-nvml v0.12.4-1
|
||||
github.com/cyphar/filepath-securejoin v0.4.1
|
||||
github.com/fsnotify/fsnotify v1.7.0
|
||||
github.com/moby/sys/reexec v0.1.0
|
||||
github.com/moby/sys/symlink v0.3.0
|
||||
github.com/opencontainers/runtime-spec v1.2.0
|
||||
github.com/opencontainers/runc v1.2.6
|
||||
github.com/opencontainers/runtime-spec v1.2.1
|
||||
github.com/pelletier/go-toml v1.9.5
|
||||
github.com/sirupsen/logrus v1.9.3
|
||||
github.com/stretchr/testify v1.9.0
|
||||
github.com/stretchr/testify v1.10.0
|
||||
github.com/urfave/cli/v2 v2.27.5
|
||||
golang.org/x/mod v0.20.0
|
||||
golang.org/x/sys v0.26.0
|
||||
tags.cncf.io/container-device-interface v0.8.0
|
||||
golang.org/x/sys v0.28.0
|
||||
tags.cncf.io/container-device-interface v0.8.1
|
||||
tags.cncf.io/container-device-interface/specs-go v0.8.0
|
||||
)
|
||||
|
||||
@@ -25,8 +28,8 @@ require (
|
||||
github.com/hashicorp/errwrap v1.1.0 // indirect
|
||||
github.com/kr/pretty v0.3.1 // indirect
|
||||
github.com/opencontainers/runtime-tools v0.9.1-0.20221107090550-2e043c6bd626 // indirect
|
||||
github.com/opencontainers/selinux v1.11.0 // indirect
|
||||
github.com/pmezard/go-difflib v1.0.0 // indirect
|
||||
github.com/rogpeppe/go-internal v1.11.0 // indirect
|
||||
github.com/russross/blackfriday/v2 v2.1.0 // indirect
|
||||
github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 // indirect
|
||||
github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb // indirect
|
||||
|
||||
29
go.sum
29
go.sum
@@ -1,5 +1,5 @@
|
||||
github.com/NVIDIA/go-nvlib v0.6.1 h1:0/5FvaKvDJoJeJ+LFlh+NDQMxMlVw9wOXrOVrGXttfE=
|
||||
github.com/NVIDIA/go-nvlib v0.6.1/go.mod h1:9UrsLGx/q1OrENygXjOuM5Ey5KCtiZhbvBlbUIxtGWY=
|
||||
github.com/NVIDIA/go-nvlib v0.7.2 h1:7sy/NVUa4sM9FLKwH6CjBfHSWrJUmv8emVyxLTzjfOA=
|
||||
github.com/NVIDIA/go-nvlib v0.7.2/go.mod h1:2Kh2kYSP5IJ8EKf0/SYDzHiQKb9EJkwOf2LQzu6pXzY=
|
||||
github.com/NVIDIA/go-nvml v0.12.4-1 h1:WKUvqshhWSNTfm47ETRhv0A0zJyr1ncCuHiXwoTrBEc=
|
||||
github.com/NVIDIA/go-nvml v0.12.4-1/go.mod h1:8Llmj+1Rr+9VGGwZuRer5N/aCjxGuR5nPb/9ebBiIEQ=
|
||||
github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM=
|
||||
@@ -7,6 +7,8 @@ github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2y
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.5 h1:ZtcqGrnekaHpVLArFSe4HK5DoKx1T0rq2DwVB0alcyc=
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.5/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
|
||||
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
|
||||
github.com/cyphar/filepath-securejoin v0.4.1 h1:JyxxyPEaktOD+GAnqIqTf9A8tHyAG22rowi7HkoSU1s=
|
||||
github.com/cyphar/filepath-securejoin v0.4.1/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
@@ -28,12 +30,16 @@ github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
|
||||
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
|
||||
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
|
||||
github.com/mndrix/tap-go v0.0.0-20171203230836-629fa407e90b/go.mod h1:pzzDgJWZ34fGzaAZGFW22KVZDfyrYW+QABMrWnJBnSs=
|
||||
github.com/moby/sys/reexec v0.1.0 h1:RrBi8e0EBTLEgfruBOFcxtElzRGTEUkeIFaVXgU7wok=
|
||||
github.com/moby/sys/reexec v0.1.0/go.mod h1:EqjBg8F3X7iZe5pU6nRZnYCMUTXoxsjiIfHup5wYIN8=
|
||||
github.com/moby/sys/symlink v0.3.0 h1:GZX89mEZ9u53f97npBy4Rc3vJKj7JBDj/PN2I22GrNU=
|
||||
github.com/moby/sys/symlink v0.3.0/go.mod h1:3eNdhduHmYPcgsJtZXW1W4XUJdZGBIkttZ8xKqPUJq0=
|
||||
github.com/mrunalp/fileutils v0.5.0/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ=
|
||||
github.com/opencontainers/runc v1.2.6 h1:P7Hqg40bsMvQGCS4S7DJYhUZOISMLJOB2iGX5COWiPk=
|
||||
github.com/opencontainers/runc v1.2.6/go.mod h1:dOQeFo29xZKBNeRBI0B19mJtfHv68YgCTh1X+YphA+4=
|
||||
github.com/opencontainers/runtime-spec v1.0.3-0.20220825212826-86290f6a00fb/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
|
||||
github.com/opencontainers/runtime-spec v1.2.0 h1:z97+pHb3uELt/yiAWD691HNHQIF07bE7dzrbT927iTk=
|
||||
github.com/opencontainers/runtime-spec v1.2.0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
|
||||
github.com/opencontainers/runtime-spec v1.2.1 h1:S4k4ryNgEpxW1dzyqffOmhI1BHYcjzU8lpJfSlR0xww=
|
||||
github.com/opencontainers/runtime-spec v1.2.1/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
|
||||
github.com/opencontainers/runtime-tools v0.9.1-0.20221107090550-2e043c6bd626 h1:DmNGcqH3WDbV5k8OJ+esPWbqUOX5rMLR2PMvziDMJi0=
|
||||
github.com/opencontainers/runtime-tools v0.9.1-0.20221107090550-2e043c6bd626/go.mod h1:BRHJJd0E+cx42OybVYSgUvZmU0B8P9gZuRXlZUP7TKI=
|
||||
github.com/opencontainers/selinux v1.9.1/go.mod h1:2i0OySw99QjzBBQByd1Gr9gSjvuho1lHsJxIJ3gGbJI=
|
||||
@@ -44,8 +50,9 @@ github.com/pelletier/go-toml v1.9.5/go.mod h1:u1nR/EPcESfeI/szUZKdtJ0xRNbUoANCko
|
||||
github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8=
|
||||
github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
|
||||
github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M=
|
||||
github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA=
|
||||
github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=
|
||||
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
|
||||
github.com/sirupsen/logrus v1.8.1/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0=
|
||||
@@ -55,8 +62,8 @@ github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+
|
||||
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
|
||||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
||||
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
|
||||
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
||||
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
|
||||
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
||||
github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 h1:kdXcSzyDtseVEc4yCz2qF8ZrQvIDBJLl4S1c3GCXmoI=
|
||||
github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww=
|
||||
github.com/urfave/cli v1.19.1/go.mod h1:70zkFmudgCuE/ngEzBv17Jvp/497gISqfk5gWijbERA=
|
||||
@@ -76,8 +83,8 @@ golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
|
||||
golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20191115151921-52ab43148777/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo=
|
||||
golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA=
|
||||
golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
|
||||
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
|
||||
@@ -88,7 +95,7 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo=
|
||||
sigs.k8s.io/yaml v1.3.0/go.mod h1:GeOyir5tyXNByN85N/dRIT9es5UQNerPYEKK56eTBm8=
|
||||
tags.cncf.io/container-device-interface v0.8.0 h1:8bCFo/g9WODjWx3m6EYl3GfUG31eKJbaggyBDxEldRc=
|
||||
tags.cncf.io/container-device-interface v0.8.0/go.mod h1:Apb7N4VdILW0EVdEMRYXIDVRZfNJZ+kmEUss2kRRQ6Y=
|
||||
tags.cncf.io/container-device-interface v0.8.1 h1:c0jN4Mt6781jD67NdPajmZlD1qrqQyov/Xfoab37lj0=
|
||||
tags.cncf.io/container-device-interface v0.8.1/go.mod h1:Apb7N4VdILW0EVdEMRYXIDVRZfNJZ+kmEUss2kRRQ6Y=
|
||||
tags.cncf.io/container-device-interface/specs-go v0.8.0 h1:QYGFzGxvYK/ZLMrjhvY0RjpUavIn4KcmRmVP/JjdBTA=
|
||||
tags.cncf.io/container-device-interface/specs-go v0.8.0/go.mod h1:BhJIkjjPh4qpys+qm4DAYtUyryaTDg9zris+AczXyws=
|
||||
|
||||
@@ -121,6 +121,9 @@ func GetDefault() (*Config, error) {
|
||||
AnnotationPrefixes: []string{cdi.AnnotationPrefix},
|
||||
SpecDirs: cdi.DefaultSpecDirs,
|
||||
},
|
||||
Legacy: legacyModeConfig{
|
||||
CUDACompatMode: defaultCUDACompatMode,
|
||||
},
|
||||
},
|
||||
},
|
||||
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
||||
|
||||
@@ -74,6 +74,9 @@ func TestGetConfig(t *testing.T) {
|
||||
AnnotationPrefixes: []string{"cdi.k8s.io/"},
|
||||
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
|
||||
},
|
||||
Legacy: legacyModeConfig{
|
||||
CUDACompatMode: "ldconfig",
|
||||
},
|
||||
},
|
||||
},
|
||||
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
||||
@@ -93,6 +96,7 @@ func TestGetConfig(t *testing.T) {
|
||||
"nvidia-container-cli.load-kmods = false",
|
||||
"nvidia-container-cli.ldconfig = \"@/foo/bar/ldconfig\"",
|
||||
"nvidia-container-cli.user = \"foo:bar\"",
|
||||
"nvidia-container-cli.cuda-compat-mode = \"mount\"",
|
||||
"nvidia-container-runtime.debug = \"/foo/bar\"",
|
||||
"nvidia-container-runtime.discover-mode = \"not-legacy\"",
|
||||
"nvidia-container-runtime.log-level = \"debug\"",
|
||||
@@ -102,6 +106,7 @@ func TestGetConfig(t *testing.T) {
|
||||
"nvidia-container-runtime.modes.cdi.annotation-prefixes = [\"cdi.k8s.io/\", \"example.vendor.com/\",]",
|
||||
"nvidia-container-runtime.modes.cdi.spec-dirs = [\"/except/etc/cdi\", \"/not/var/run/cdi\",]",
|
||||
"nvidia-container-runtime.modes.csv.mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
|
||||
"nvidia-container-runtime.modes.legacy.cuda-compat-mode = \"mount\"",
|
||||
"nvidia-container-runtime-hook.path = \"/foo/bar/nvidia-container-runtime-hook\"",
|
||||
"nvidia-ctk.path = \"/foo/bar/nvidia-ctk\"",
|
||||
},
|
||||
@@ -134,6 +139,9 @@ func TestGetConfig(t *testing.T) {
|
||||
"/not/var/run/cdi",
|
||||
},
|
||||
},
|
||||
Legacy: legacyModeConfig{
|
||||
CUDACompatMode: "mount",
|
||||
},
|
||||
},
|
||||
},
|
||||
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
||||
@@ -178,6 +186,9 @@ func TestGetConfig(t *testing.T) {
|
||||
"/var/run/cdi",
|
||||
},
|
||||
},
|
||||
Legacy: legacyModeConfig{
|
||||
CUDACompatMode: "ldconfig",
|
||||
},
|
||||
},
|
||||
},
|
||||
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
||||
@@ -200,6 +211,7 @@ func TestGetConfig(t *testing.T) {
|
||||
"root = \"/bar/baz\"",
|
||||
"load-kmods = false",
|
||||
"ldconfig = \"@/foo/bar/ldconfig\"",
|
||||
"cuda-compat-mode = \"mount\"",
|
||||
"user = \"foo:bar\"",
|
||||
"[nvidia-container-runtime]",
|
||||
"debug = \"/foo/bar\"",
|
||||
@@ -213,6 +225,8 @@ func TestGetConfig(t *testing.T) {
|
||||
"spec-dirs = [\"/except/etc/cdi\", \"/not/var/run/cdi\",]",
|
||||
"[nvidia-container-runtime.modes.csv]",
|
||||
"mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
|
||||
"[nvidia-container-runtime.modes.legacy]",
|
||||
"cuda-compat-mode = \"mount\"",
|
||||
"[nvidia-container-runtime-hook]",
|
||||
"path = \"/foo/bar/nvidia-container-runtime-hook\"",
|
||||
"[nvidia-ctk]",
|
||||
@@ -247,6 +261,9 @@ func TestGetConfig(t *testing.T) {
|
||||
"/not/var/run/cdi",
|
||||
},
|
||||
},
|
||||
Legacy: legacyModeConfig{
|
||||
CUDACompatMode: "mount",
|
||||
},
|
||||
},
|
||||
},
|
||||
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
||||
@@ -283,6 +300,9 @@ func TestGetConfig(t *testing.T) {
|
||||
AnnotationPrefixes: []string{"cdi.k8s.io/"},
|
||||
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
|
||||
},
|
||||
Legacy: legacyModeConfig{
|
||||
CUDACompatMode: "ldconfig",
|
||||
},
|
||||
},
|
||||
},
|
||||
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
||||
@@ -322,6 +342,9 @@ func TestGetConfig(t *testing.T) {
|
||||
AnnotationPrefixes: []string{"cdi.k8s.io/"},
|
||||
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
|
||||
},
|
||||
Legacy: legacyModeConfig{
|
||||
CUDACompatMode: "ldconfig",
|
||||
},
|
||||
},
|
||||
},
|
||||
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
||||
|
||||
@@ -25,9 +25,26 @@ type features struct {
|
||||
// If this feature flag is not set to 'true' only host-rooted config paths
|
||||
// (i.e. paths starting with an '@' are considered valid)
|
||||
AllowLDConfigFromContainer *feature `toml:"allow-ldconfig-from-container,omitempty"`
|
||||
// DisableCUDACompatLibHook, when enabled skips the injection of a specific
|
||||
// hook to process CUDA compatibility libraries.
|
||||
//
|
||||
// Note: Since this mechanism replaces the logic in the `nvidia-container-cli`,
|
||||
// toggling this feature has no effect if `allow-cuda-compat-libs-from-container` is enabled.
|
||||
DisableCUDACompatLibHook *feature `toml:"disable-cuda-compat-lib-hook,omitempty"`
|
||||
// DisableImexChannelCreation ensures that the implicit creation of
|
||||
// requested IMEX channels is skipped when invoking the nvidia-container-cli.
|
||||
DisableImexChannelCreation *feature `toml:"disable-imex-channel-creation,omitempty"`
|
||||
// IgnoreImexChannelRequests configures the NVIDIA Container Toolkit to
|
||||
// ignore IMEX channel requests through the NVIDIA_IMEX_CHANNELS envvar or
|
||||
// volume mounts.
|
||||
// This ensures that the NVIDIA Container Toolkit cannot be used to provide
|
||||
// access to an IMEX channel by simply specifying an environment variable,
|
||||
// possibly bypassing other checks by an orchestration system such as
|
||||
// kubernetes.
|
||||
// Note that this is not enabled by default to maintain backward compatibility
|
||||
// with the existing behaviour when the NVIDIA Container Toolkit is used in
|
||||
// non-kubernetes environments.
|
||||
IgnoreImexChannelRequests *feature `toml:"ignore-imex-channel-requests,omitempty"`
|
||||
}
|
||||
|
||||
type feature bool
|
||||
|
||||
@@ -29,8 +29,9 @@ type RuntimeConfig struct {
|
||||
|
||||
// modesConfig defines (optional) per-mode configs
|
||||
type modesConfig struct {
|
||||
CSV csvModeConfig `toml:"csv"`
|
||||
CDI cdiModeConfig `toml:"cdi"`
|
||||
CSV csvModeConfig `toml:"csv"`
|
||||
CDI cdiModeConfig `toml:"cdi"`
|
||||
Legacy legacyModeConfig `toml:"legacy"`
|
||||
}
|
||||
|
||||
type cdiModeConfig struct {
|
||||
@@ -45,3 +46,31 @@ type cdiModeConfig struct {
|
||||
type csvModeConfig struct {
|
||||
MountSpecPath string `toml:"mount-spec-path"`
|
||||
}
|
||||
|
||||
type legacyModeConfig struct {
|
||||
// CUDACompatMode sets the mode to be used to make CUDA Forward Compat
|
||||
// libraries discoverable in the container.
|
||||
CUDACompatMode cudaCompatMode `toml:"cuda-compat-mode,omitempty"`
|
||||
}
|
||||
|
||||
type cudaCompatMode string
|
||||
|
||||
const (
|
||||
defaultCUDACompatMode = CUDACompatModeLdconfig
|
||||
// CUDACompatModeDisabled explicitly disables the handling of CUDA Forward
|
||||
// Compatibility in the NVIDIA Container Runtime and NVIDIA Container
|
||||
// Runtime Hook.
|
||||
CUDACompatModeDisabled = cudaCompatMode("disabled")
|
||||
// CUDACompatModeHook uses a container lifecycle hook to implement CUDA
|
||||
// Forward Compatibility support. This requires the use of the NVIDIA
|
||||
// Container Runtime and is not compatible with use cases where only the
|
||||
// NVIDIA Container Runtime Hook is used (e.g. the Docker --gpus flag).
|
||||
CUDACompatModeHook = cudaCompatMode("hook")
|
||||
// CUDACompatModeLdconfig adds the folders containing CUDA Forward Compat
|
||||
// libraries to the ldconfig command invoked from the NVIDIA Container
|
||||
// Runtime Hook.
|
||||
CUDACompatModeLdconfig = cudaCompatMode("ldconfig")
|
||||
// CUDACompatModeMount mounts CUDA Forward Compat folders from the container
|
||||
// to the container when using the NVIDIA Container Runtime Hook.
|
||||
CUDACompatModeMount = cudaCompatMode("mount")
|
||||
)
|
||||
|
||||
@@ -74,6 +74,9 @@ spec-dirs = ["/etc/cdi", "/var/run/cdi"]
|
||||
[nvidia-container-runtime.modes.csv]
|
||||
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
|
||||
|
||||
[nvidia-container-runtime.modes.legacy]
|
||||
cuda-compat-mode = "ldconfig"
|
||||
|
||||
[nvidia-container-runtime-hook]
|
||||
path = "nvidia-container-runtime-hook"
|
||||
skip-mode-detection = false
|
||||
|
||||
24
internal/discover/compat_libs.go
Normal file
24
internal/discover/compat_libs.go
Normal file
@@ -0,0 +1,24 @@
|
||||
package discover
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root"
|
||||
)
|
||||
|
||||
// NewCUDACompatHookDiscoverer creates a discoverer for a enable-cuda-compat hook.
|
||||
// This hook is responsible for setting up CUDA compatibility in the container and depends on the host driver version.
|
||||
func NewCUDACompatHookDiscoverer(logger logger.Interface, nvidiaCDIHookPath string, driver *root.Driver) Discover {
|
||||
_, cudaVersionPattern := getCUDALibRootAndVersionPattern(logger, driver)
|
||||
var args []string
|
||||
if !strings.Contains(cudaVersionPattern, "*") {
|
||||
args = append(args, "--host-driver-version="+cudaVersionPattern)
|
||||
}
|
||||
|
||||
return CreateNvidiaCDIHook(
|
||||
nvidiaCDIHookPath,
|
||||
"enable-cuda-compat",
|
||||
args...,
|
||||
)
|
||||
}
|
||||
@@ -34,6 +34,7 @@ type Hook struct {
|
||||
Lifecycle string
|
||||
Path string
|
||||
Args []string
|
||||
Env []string
|
||||
}
|
||||
|
||||
// Discover defines an interface for discovering the devices, mounts, and hooks available on a system
|
||||
|
||||
@@ -70,6 +70,7 @@ func TestGraphicsLibrariesDiscoverer(t *testing.T) {
|
||||
Args: []string{"nvidia-cdi-hook", "create-symlinks",
|
||||
"--link", "../libnvidia-allocator.so.1::/usr/lib64/gbm/nvidia-drm_gbm.so",
|
||||
},
|
||||
Env: []string{"NVIDIA_CTK_DEBUG=false"},
|
||||
},
|
||||
},
|
||||
},
|
||||
@@ -97,6 +98,7 @@ func TestGraphicsLibrariesDiscoverer(t *testing.T) {
|
||||
Args: []string{"nvidia-cdi-hook", "create-symlinks",
|
||||
"--link", "libnvidia-vulkan-producer.so.123.45.67::/usr/lib64/libnvidia-vulkan-producer.so",
|
||||
},
|
||||
Env: []string{"NVIDIA_CTK_DEBUG=false"},
|
||||
},
|
||||
},
|
||||
},
|
||||
@@ -128,6 +130,7 @@ func TestGraphicsLibrariesDiscoverer(t *testing.T) {
|
||||
"--link", "../libnvidia-allocator.so.1::/usr/lib64/gbm/nvidia-drm_gbm.so",
|
||||
"--link", "libnvidia-vulkan-producer.so.123.45.67::/usr/lib64/libnvidia-vulkan-producer.so",
|
||||
},
|
||||
Env: []string{"NVIDIA_CTK_DEBUG=false"},
|
||||
},
|
||||
},
|
||||
},
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
package discover
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"path/filepath"
|
||||
|
||||
"tags.cncf.io/container-device-interface/pkg/cdi"
|
||||
@@ -69,6 +70,7 @@ func (c cdiHook) Create(name string, args ...string) Hook {
|
||||
Lifecycle: cdi.CreateContainerHook,
|
||||
Path: string(c),
|
||||
Args: append(c.requiredArgs(name), args...),
|
||||
Env: []string{fmt.Sprintf("NVIDIA_CTK_DEBUG=%v", false)},
|
||||
}
|
||||
}
|
||||
func (c cdiHook) requiredArgs(name string) []string {
|
||||
|
||||
@@ -95,6 +95,7 @@ func TestLDCacheUpdateHook(t *testing.T) {
|
||||
Path: testNvidiaCDIHookPath,
|
||||
Args: tc.expectedArgs,
|
||||
Lifecycle: "createContainer",
|
||||
Env: []string{"NVIDIA_CTK_DEBUG=false"},
|
||||
}
|
||||
|
||||
d, err := NewLDCacheUpdateHook(logger, mountMock, testNvidiaCDIHookPath, tc.ldconfigPath)
|
||||
|
||||
@@ -21,26 +21,28 @@ import "fmt"
|
||||
// list is a discoverer that contains a list of Discoverers. The output of the
|
||||
// Mounts functions is the concatenation of the output for each of the
|
||||
// elements in the list.
|
||||
type list struct {
|
||||
discoverers []Discover
|
||||
}
|
||||
type list []Discover
|
||||
|
||||
var _ Discover = (*list)(nil)
|
||||
|
||||
// Merge creates a discoverer that is the composite of a list of discoverers.
|
||||
func Merge(d ...Discover) Discover {
|
||||
l := list{
|
||||
discoverers: d,
|
||||
func Merge(discoverers ...Discover) Discover {
|
||||
var l list
|
||||
for _, d := range discoverers {
|
||||
if d == nil {
|
||||
continue
|
||||
}
|
||||
l = append(l, d)
|
||||
}
|
||||
|
||||
return &l
|
||||
return l
|
||||
}
|
||||
|
||||
// Devices returns all devices from the included discoverers
|
||||
func (d list) Devices() ([]Device, error) {
|
||||
var allDevices []Device
|
||||
|
||||
for i, di := range d.discoverers {
|
||||
for i, di := range d {
|
||||
devices, err := di.Devices()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error discovering devices for discoverer %v: %v", i, err)
|
||||
@@ -55,7 +57,7 @@ func (d list) Devices() ([]Device, error) {
|
||||
func (d list) Mounts() ([]Mount, error) {
|
||||
var allMounts []Mount
|
||||
|
||||
for i, di := range d.discoverers {
|
||||
for i, di := range d {
|
||||
mounts, err := di.Mounts()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error discovering mounts for discoverer %v: %v", i, err)
|
||||
@@ -70,7 +72,7 @@ func (d list) Mounts() ([]Mount, error) {
|
||||
func (d list) Hooks() ([]Hook, error) {
|
||||
var allHooks []Hook
|
||||
|
||||
for i, di := range d.discoverers {
|
||||
for i, di := range d {
|
||||
hooks, err := di.Hooks()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error discovering hooks for discoverer %v: %v", i, err)
|
||||
|
||||
@@ -69,8 +69,8 @@ func (d *mounts) Mounts() ([]Mount, error) {
|
||||
d.Lock()
|
||||
defer d.Unlock()
|
||||
|
||||
uniqueMounts := make(map[string]Mount)
|
||||
|
||||
var mounts []Mount
|
||||
seen := make(map[string]bool)
|
||||
for _, candidate := range d.required {
|
||||
d.logger.Debugf("Locating %v", candidate)
|
||||
located, err := d.lookup.Locate(candidate)
|
||||
@@ -84,7 +84,7 @@ func (d *mounts) Mounts() ([]Mount, error) {
|
||||
}
|
||||
d.logger.Debugf("Located %v as %v", candidate, located)
|
||||
for _, p := range located {
|
||||
if _, ok := uniqueMounts[p]; ok {
|
||||
if seen[p] {
|
||||
d.logger.Debugf("Skipping duplicate mount %v", p)
|
||||
continue
|
||||
}
|
||||
@@ -95,7 +95,7 @@ func (d *mounts) Mounts() ([]Mount, error) {
|
||||
}
|
||||
|
||||
d.logger.Infof("Selecting %v as %v", p, r)
|
||||
uniqueMounts[p] = Mount{
|
||||
mount := Mount{
|
||||
HostPath: p,
|
||||
Path: r,
|
||||
Options: []string{
|
||||
@@ -105,14 +105,11 @@ func (d *mounts) Mounts() ([]Mount, error) {
|
||||
"bind",
|
||||
},
|
||||
}
|
||||
mounts = append(mounts, mount)
|
||||
seen[p] = true
|
||||
}
|
||||
}
|
||||
|
||||
var mounts []Mount
|
||||
for _, m := range uniqueMounts {
|
||||
mounts = append(mounts, m)
|
||||
}
|
||||
|
||||
d.cache = mounts
|
||||
|
||||
return d.cache, nil
|
||||
|
||||
@@ -44,13 +44,14 @@ func TestMounts(t *testing.T) {
|
||||
"bind",
|
||||
}
|
||||
|
||||
logger, logHook := testlog.NewNullLogger()
|
||||
logger, _ := testlog.NewNullLogger()
|
||||
|
||||
testCases := []struct {
|
||||
description string
|
||||
expectedError error
|
||||
expectedMounts []Mount
|
||||
input *mounts
|
||||
repeat int
|
||||
}{
|
||||
{
|
||||
description: "nill lookup returns error",
|
||||
@@ -159,31 +160,68 @@ func TestMounts(t *testing.T) {
|
||||
{Path: "/located", HostPath: "/some/root/located", Options: mountOptions},
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "multiple mounts ordering",
|
||||
input: &mounts{
|
||||
lookup: &lookup.LocatorMock{
|
||||
LocateFunc: func(s string) ([]string, error) {
|
||||
return []string{
|
||||
"first",
|
||||
"second",
|
||||
"third",
|
||||
"fourth",
|
||||
"second",
|
||||
"second",
|
||||
"second",
|
||||
"fifth",
|
||||
"sixth"}, nil
|
||||
},
|
||||
},
|
||||
required: []string{""},
|
||||
},
|
||||
expectedMounts: []Mount{
|
||||
{Path: "first", HostPath: "first", Options: mountOptions},
|
||||
{Path: "second", HostPath: "second", Options: mountOptions},
|
||||
{Path: "third", HostPath: "third", Options: mountOptions},
|
||||
{Path: "fourth", HostPath: "fourth", Options: mountOptions},
|
||||
{Path: "fifth", HostPath: "fifth", Options: mountOptions},
|
||||
{Path: "sixth", HostPath: "sixth", Options: mountOptions},
|
||||
},
|
||||
repeat: 10,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
logHook.Reset()
|
||||
t.Run(tc.description, func(t *testing.T) {
|
||||
tc.input.logger = logger
|
||||
mounts, err := tc.input.Mounts()
|
||||
|
||||
if tc.expectedError != nil {
|
||||
require.Error(t, err)
|
||||
} else {
|
||||
require.NoError(t, err)
|
||||
for i := 1; ; i++ {
|
||||
test_name := tc.description
|
||||
if tc.repeat > 1 {
|
||||
test_name += fmt.Sprintf("/%d", i)
|
||||
}
|
||||
require.ElementsMatch(t, tc.expectedMounts, mounts)
|
||||
success := t.Run(test_name, func(t *testing.T) {
|
||||
tc.input.logger = logger
|
||||
mounts, err := tc.input.Mounts()
|
||||
|
||||
// We check that the mock is called for each element of required
|
||||
if tc.input.lookup != nil {
|
||||
mock := tc.input.lookup.(*lookup.LocatorMock)
|
||||
require.Len(t, mock.LocateCalls(), len(tc.input.required))
|
||||
var args []string
|
||||
for _, c := range mock.LocateCalls() {
|
||||
args = append(args, c.S)
|
||||
if tc.expectedError != nil {
|
||||
require.Error(t, err)
|
||||
} else {
|
||||
require.NoError(t, err)
|
||||
}
|
||||
require.EqualValues(t, args, tc.input.required)
|
||||
require.EqualValues(t, tc.expectedMounts, mounts)
|
||||
|
||||
// We check that the mock is called for each element of required
|
||||
if i == 1 && tc.input.lookup != nil {
|
||||
mock := tc.input.lookup.(*lookup.LocatorMock)
|
||||
require.Len(t, mock.LocateCalls(), len(tc.input.required))
|
||||
var args []string
|
||||
for _, c := range mock.LocateCalls() {
|
||||
args = append(args, c.S)
|
||||
}
|
||||
require.EqualValues(t, args, tc.input.required)
|
||||
}
|
||||
})
|
||||
if !success || i >= tc.repeat {
|
||||
break
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -115,6 +115,7 @@ func TestWithWithDriverDotSoSymlinks(t *testing.T) {
|
||||
Lifecycle: "createContainer",
|
||||
Path: "/path/to/nvidia-cdi-hook",
|
||||
Args: []string{"nvidia-cdi-hook", "create-symlinks", "--link", "libcuda.so.1::/usr/lib/libcuda.so"},
|
||||
Env: []string{"NVIDIA_CTK_DEBUG=false"},
|
||||
},
|
||||
},
|
||||
},
|
||||
@@ -147,6 +148,7 @@ func TestWithWithDriverDotSoSymlinks(t *testing.T) {
|
||||
Lifecycle: "createContainer",
|
||||
Path: "/path/to/nvidia-cdi-hook",
|
||||
Args: []string{"nvidia-cdi-hook", "create-symlinks", "--link", "libcuda.so.1::/usr/lib/libcuda.so"},
|
||||
Env: []string{"NVIDIA_CTK_DEBUG=false"},
|
||||
},
|
||||
},
|
||||
},
|
||||
@@ -178,6 +180,7 @@ func TestWithWithDriverDotSoSymlinks(t *testing.T) {
|
||||
Lifecycle: "createContainer",
|
||||
Path: "/path/to/nvidia-cdi-hook",
|
||||
Args: []string{"nvidia-cdi-hook", "create-symlinks", "--link", "libcuda.so.1::/usr/lib/libcuda.so"},
|
||||
Env: []string{"NVIDIA_CTK_DEBUG=false"},
|
||||
},
|
||||
},
|
||||
},
|
||||
@@ -247,6 +250,7 @@ func TestWithWithDriverDotSoSymlinks(t *testing.T) {
|
||||
Lifecycle: "createContainer",
|
||||
Path: "/path/to/nvidia-cdi-hook",
|
||||
Args: []string{"nvidia-cdi-hook", "create-symlinks", "--link", "libcuda.so.1::/usr/lib/libcuda.so"},
|
||||
Env: []string{"NVIDIA_CTK_DEBUG=false"},
|
||||
},
|
||||
},
|
||||
},
|
||||
@@ -301,6 +305,7 @@ func TestWithWithDriverDotSoSymlinks(t *testing.T) {
|
||||
"--link", "libGLX_nvidia.so.1.2.3::/usr/lib/libGLX_indirect.so.0",
|
||||
"--link", "libnvidia-opticalflow.so.1::/usr/lib/libnvidia-opticalflow.so",
|
||||
},
|
||||
Env: []string{"NVIDIA_CTK_DEBUG=false"},
|
||||
},
|
||||
},
|
||||
},
|
||||
|
||||
@@ -38,10 +38,15 @@ func (d hook) toEdits() *cdi.ContainerEdits {
|
||||
// toSpec converts a discovered Hook to a CDI Spec Hook. Note
|
||||
// that missing info is filled in when edits are applied by querying the Hook node.
|
||||
func (d hook) toSpec() *specs.Hook {
|
||||
env := d.Env
|
||||
if env == nil {
|
||||
env = []string{"NVIDIA_CTK_DEBUG=false"}
|
||||
}
|
||||
s := specs.Hook{
|
||||
HookName: d.Lifecycle,
|
||||
Path: d.Path,
|
||||
Args: d.Args,
|
||||
Env: env,
|
||||
}
|
||||
|
||||
return &s
|
||||
|
||||
@@ -216,7 +216,7 @@ func TestResolveAutoMode(t *testing.T) {
|
||||
HasTegraFilesFunc: func() (bool, string) {
|
||||
return tc.info["tegra"], "tegra"
|
||||
},
|
||||
UsesOnlyNVGPUModuleFunc: func() (bool, string) {
|
||||
HasOnlyIntegratedGPUsFunc: func() (bool, string) {
|
||||
return tc.info["nvgpu"], "nvgpu"
|
||||
},
|
||||
}
|
||||
|
||||
@@ -47,6 +47,11 @@ const (
|
||||
flagArchX8664 = 0x0300
|
||||
flagArchX32 = 0x0800
|
||||
flagArchPpc64le = 0x0500
|
||||
|
||||
// flagArch_ARM_LIBHF is the flag value for 32-bit ARM libs using hard-float.
|
||||
flagArch_ARM_LIBHF = 0x0900
|
||||
// flagArch_AARCH64_LIB64 is the flag value for 64-bit ARM libs.
|
||||
flagArch_AARCH64_LIB64 = 0x0a00
|
||||
)
|
||||
|
||||
var errInvalidCache = errors.New("invalid ld.so.cache file")
|
||||
@@ -195,10 +200,14 @@ func (c *ldcache) getEntries() []entry {
|
||||
switch e.Flags & flagArchMask {
|
||||
case flagArchX8664:
|
||||
fallthrough
|
||||
case flagArch_AARCH64_LIB64:
|
||||
fallthrough
|
||||
case flagArchPpc64le:
|
||||
bits = 64
|
||||
case flagArchX32:
|
||||
fallthrough
|
||||
case flagArch_ARM_LIBHF:
|
||||
fallthrough
|
||||
case flagArchI386:
|
||||
bits = 32
|
||||
default:
|
||||
|
||||
@@ -68,20 +68,10 @@ func NewCSVModifier(logger logger.Interface, cfg *config.Config, container image
|
||||
return nil, fmt.Errorf("failed to get CDI spec: %v", err)
|
||||
}
|
||||
|
||||
cdiModifier, err := cdi.New(
|
||||
return cdi.New(
|
||||
cdi.WithLogger(logger),
|
||||
cdi.WithSpec(spec.Raw()),
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to construct CDI modifier: %v", err)
|
||||
}
|
||||
|
||||
modifiers := Merge(
|
||||
nvidiaContainerRuntimeHookRemover{logger},
|
||||
cdiModifier,
|
||||
)
|
||||
|
||||
return modifiers, nil
|
||||
}
|
||||
|
||||
func checkRequirements(logger logger.Interface, image image.CUDA) error {
|
||||
|
||||
@@ -19,7 +19,6 @@ package modifier
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/opencontainers/runtime-spec/specs-go"
|
||||
testlog "github.com/sirupsen/logrus/hooks/test"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
@@ -74,66 +73,3 @@ func TestNewCSVModifier(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCSVModifierRemovesHook(t *testing.T) {
|
||||
logger, _ := testlog.NewNullLogger()
|
||||
|
||||
testCases := []struct {
|
||||
description string
|
||||
spec *specs.Spec
|
||||
expectedError error
|
||||
expectedSpec *specs.Spec
|
||||
}{
|
||||
{
|
||||
description: "modification removes existing nvidia-container-runtime-hook",
|
||||
spec: &specs.Spec{
|
||||
Hooks: &specs.Hooks{
|
||||
Prestart: []specs.Hook{
|
||||
{
|
||||
Path: "/path/to/nvidia-container-runtime-hook",
|
||||
Args: []string{"/path/to/nvidia-container-runtime-hook", "prestart"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
expectedSpec: &specs.Spec{
|
||||
Hooks: &specs.Hooks{
|
||||
Prestart: []specs.Hook{},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "modification removes existing nvidia-container-toolkit",
|
||||
spec: &specs.Spec{
|
||||
Hooks: &specs.Hooks{
|
||||
Prestart: []specs.Hook{
|
||||
{
|
||||
Path: "/path/to/nvidia-container-toolkit",
|
||||
Args: []string{"/path/to/nvidia-container-toolkit", "prestart"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
expectedSpec: &specs.Spec{
|
||||
Hooks: &specs.Hooks{
|
||||
Prestart: []specs.Hook{},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.description, func(t *testing.T) {
|
||||
m := nvidiaContainerRuntimeHookRemover{logger: logger}
|
||||
|
||||
err := m.Modify(tc.spec)
|
||||
if tc.expectedError != nil {
|
||||
require.Error(t, err)
|
||||
} else {
|
||||
require.NoError(t, err)
|
||||
}
|
||||
|
||||
require.Empty(t, tc.spec.Hooks.Prestart)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -78,12 +78,14 @@ func TestDiscoverModifier(t *testing.T) {
|
||||
{
|
||||
Path: "/hook/a",
|
||||
Args: []string{"/hook/a", "arga"},
|
||||
Env: []string{"NVIDIA_CTK_DEBUG=false"},
|
||||
},
|
||||
},
|
||||
CreateContainer: []specs.Hook{
|
||||
{
|
||||
Path: "/hook/b",
|
||||
Args: []string{"/hook/b", "argb"},
|
||||
Env: []string{"NVIDIA_CTK_DEBUG=false"},
|
||||
},
|
||||
},
|
||||
},
|
||||
@@ -123,6 +125,7 @@ func TestDiscoverModifier(t *testing.T) {
|
||||
{
|
||||
Path: "/hook/b",
|
||||
Args: []string{"/hook/b", "argb"},
|
||||
Env: []string{"NVIDIA_CTK_DEBUG=false"},
|
||||
},
|
||||
},
|
||||
},
|
||||
|
||||
@@ -23,6 +23,7 @@ import (
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
|
||||
)
|
||||
|
||||
@@ -35,7 +36,7 @@ import (
|
||||
// NVIDIA_GDRCOPY=enabled
|
||||
//
|
||||
// If not devices are selected, no changes are made.
|
||||
func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image image.CUDA) (oci.SpecModifier, error) {
|
||||
func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image image.CUDA, driver *root.Driver) (oci.SpecModifier, error) {
|
||||
if devices := image.VisibleDevicesFromEnvVar(); len(devices) == 0 {
|
||||
logger.Infof("No modification required; no devices requested")
|
||||
return nil, nil
|
||||
@@ -78,5 +79,41 @@ func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image
|
||||
discoverers = append(discoverers, d)
|
||||
}
|
||||
|
||||
// If the feature flag has explicitly been toggled, we don't make any modification.
|
||||
if !cfg.Features.DisableCUDACompatLibHook.IsEnabled() {
|
||||
cudaCompatDiscoverer, err := getCudaCompatModeDiscoverer(logger, cfg, driver)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to construct CUDA Compat discoverer: %w", err)
|
||||
}
|
||||
discoverers = append(discoverers, cudaCompatDiscoverer)
|
||||
}
|
||||
|
||||
return NewModifierFromDiscoverer(logger, discover.Merge(discoverers...))
|
||||
}
|
||||
|
||||
func getCudaCompatModeDiscoverer(logger logger.Interface, cfg *config.Config, driver *root.Driver) (discover.Discover, error) {
|
||||
// For legacy mode, we only include the enable-cuda-compat hook if cuda-compat-mode is set to hook.
|
||||
if cfg.NVIDIAContainerRuntimeConfig.Mode == "legacy" && cfg.NVIDIAContainerRuntimeConfig.Modes.Legacy.CUDACompatMode != config.CUDACompatModeHook {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
compatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, cfg.NVIDIACTKConfig.Path, driver)
|
||||
// For non-legacy modes we return the hook as is. These modes *should* already include the update-ldcache hook.
|
||||
if cfg.NVIDIAContainerRuntimeConfig.Mode != "legacy" {
|
||||
return compatLibHookDiscoverer, nil
|
||||
}
|
||||
|
||||
// For legacy mode, we also need to inject a hook to update the LDCache
|
||||
// after we have modifed the configuration.
|
||||
ldcacheUpdateHookDiscoverer, err := discover.NewLDCacheUpdateHook(
|
||||
logger,
|
||||
discover.None{},
|
||||
cfg.NVIDIACTKConfig.Path,
|
||||
"",
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to construct ldcache update discoverer: %w", err)
|
||||
}
|
||||
|
||||
return discover.Merge(compatLibHookDiscoverer, ldcacheUpdateHookDiscoverer), nil
|
||||
}
|
||||
|
||||
@@ -33,6 +33,13 @@ type nvidiaContainerRuntimeHookRemover struct {
|
||||
|
||||
var _ oci.SpecModifier = (*nvidiaContainerRuntimeHookRemover)(nil)
|
||||
|
||||
// NewNvidiaContainerRuntimeHookRemover creates a modifier that removes any NVIDIA Container Runtime hooks from the provided spec.
|
||||
func NewNvidiaContainerRuntimeHookRemover(logger logger.Interface) oci.SpecModifier {
|
||||
return nvidiaContainerRuntimeHookRemover{
|
||||
logger: logger,
|
||||
}
|
||||
}
|
||||
|
||||
// Modify removes any NVIDIA Container Runtime hooks from the provided spec
|
||||
func (m nvidiaContainerRuntimeHookRemover) Modify(spec *specs.Spec) error {
|
||||
if spec == nil {
|
||||
|
||||
@@ -19,14 +19,14 @@ func TestMaintainSpec(t *testing.T) {
|
||||
}
|
||||
|
||||
for _, f := range files {
|
||||
inputSpecPath := filepath.Join(moduleRoot, "test/input", f)
|
||||
inputSpecPath := filepath.Join(moduleRoot, "tests/input", f)
|
||||
|
||||
spec := NewFileSpec(inputSpecPath).(*fileSpec)
|
||||
|
||||
_, err := spec.Load()
|
||||
require.NoError(t, err)
|
||||
|
||||
outputSpecPath := filepath.Join(moduleRoot, "test/output", f)
|
||||
outputSpecPath := filepath.Join(moduleRoot, "tests/output", f)
|
||||
spec.path = outputSpecPath
|
||||
spec.Flush()
|
||||
|
||||
|
||||
@@ -36,7 +36,7 @@ func TestGetFileList(t *testing.T) {
|
||||
}{
|
||||
{
|
||||
description: "returns list of CSV files",
|
||||
root: "test/input/csv_samples/",
|
||||
root: "tests/input/csv_samples/",
|
||||
files: []string{
|
||||
"jetson.csv",
|
||||
"simple_wrong.csv",
|
||||
@@ -46,15 +46,15 @@ func TestGetFileList(t *testing.T) {
|
||||
},
|
||||
{
|
||||
description: "handles empty folder",
|
||||
root: "test/input/csv_samples/empty",
|
||||
root: "tests/input/csv_samples/empty",
|
||||
},
|
||||
{
|
||||
description: "handles non-existent folder",
|
||||
root: "test/input/csv_samples/NONEXISTENT",
|
||||
root: "tests/input/csv_samples/NONEXISTENT",
|
||||
},
|
||||
{
|
||||
description: "handles non-existent folder root",
|
||||
root: "/NONEXISTENT/test/input/csv_samples/",
|
||||
root: "/NONEXISTENT/tests/input/csv_samples/",
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@@ -97,6 +97,7 @@ func TestDiscovererFromCSVFiles(t *testing.T) {
|
||||
"--link",
|
||||
"/usr/lib/aarch64-linux-gnu/tegra/libv4l2_nvargus.so::/usr/lib/aarch64-linux-gnu/libv4l/plugins/nv/libv4l2_nvargus.so",
|
||||
},
|
||||
Env: []string{"NVIDIA_CTK_DEBUG=false"},
|
||||
},
|
||||
},
|
||||
},
|
||||
@@ -153,6 +154,7 @@ func TestDiscovererFromCSVFiles(t *testing.T) {
|
||||
"--link",
|
||||
"/usr/lib/aarch64-linux-gnu/tegra/libv4l2_nvargus.so::/usr/lib/aarch64-linux-gnu/libv4l/plugins/nv/libv4l2_nvargus.so",
|
||||
},
|
||||
Env: []string{"NVIDIA_CTK_DEBUG=false"},
|
||||
},
|
||||
},
|
||||
},
|
||||
|
||||
@@ -75,6 +75,8 @@ func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Sp
|
||||
}
|
||||
|
||||
mode := info.ResolveAutoMode(logger, cfg.NVIDIAContainerRuntimeConfig.Mode, image)
|
||||
// We update the mode here so that we can continue passing just the config to other functions.
|
||||
cfg.NVIDIAContainerRuntimeConfig.Mode = mode
|
||||
modeModifier, err := newModeModifier(logger, mode, cfg, ociSpec, image)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@@ -85,6 +87,8 @@ func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Sp
|
||||
switch modifierType {
|
||||
case "mode":
|
||||
modifiers = append(modifiers, modeModifier)
|
||||
case "nvidia-hook-remover":
|
||||
modifiers = append(modifiers, modifier.NewNvidiaContainerRuntimeHookRemover(logger))
|
||||
case "graphics":
|
||||
graphicsModifier, err := modifier.NewGraphicsModifier(logger, cfg, image, driver)
|
||||
if err != nil {
|
||||
@@ -92,7 +96,7 @@ func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Sp
|
||||
}
|
||||
modifiers = append(modifiers, graphicsModifier)
|
||||
case "feature-gated":
|
||||
featureGatedModifier, err := modifier.NewFeatureGatedModifier(logger, cfg, image)
|
||||
featureGatedModifier, err := modifier.NewFeatureGatedModifier(logger, cfg, image, driver)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -121,11 +125,11 @@ func supportedModifierTypes(mode string) []string {
|
||||
switch mode {
|
||||
case "cdi":
|
||||
// For CDI mode we make no additional modifications.
|
||||
return []string{"mode"}
|
||||
return []string{"nvidia-hook-remover", "mode"}
|
||||
case "csv":
|
||||
// For CSV mode we support mode and feature-gated modification.
|
||||
return []string{"mode", "feature-gated"}
|
||||
return []string{"nvidia-hook-remover", "feature-gated", "mode"}
|
||||
default:
|
||||
return []string{"mode", "graphics", "feature-gated"}
|
||||
return []string{"feature-gated", "graphics", "mode"}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -30,6 +30,7 @@ import (
|
||||
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/test"
|
||||
)
|
||||
|
||||
@@ -45,7 +46,7 @@ func TestMain(m *testing.M) {
|
||||
if err != nil {
|
||||
log.Fatalf("error in test setup: could not get module root: %v", err)
|
||||
}
|
||||
testBinPath := filepath.Join(moduleRoot, "test", "bin")
|
||||
testBinPath := filepath.Join(moduleRoot, "tests", "bin")
|
||||
|
||||
// Set the environment variables for the test
|
||||
os.Setenv("PATH", test.PrependToPath(testBinPath, moduleRoot))
|
||||
@@ -165,3 +166,181 @@ func TestFactoryMethod(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewSpecModifier(t *testing.T) {
|
||||
logger, _ := testlog.NewNullLogger()
|
||||
driver := root.New(
|
||||
root.WithDriverRoot("/nvidia/driver/root"),
|
||||
)
|
||||
testCases := []struct {
|
||||
description string
|
||||
config *config.Config
|
||||
spec *specs.Spec
|
||||
expectedSpec *specs.Spec
|
||||
}{
|
||||
{
|
||||
description: "csv mode removes nvidia-container-runtime-hook",
|
||||
config: &config.Config{
|
||||
NVIDIAContainerRuntimeConfig: config.RuntimeConfig{
|
||||
Mode: "csv",
|
||||
},
|
||||
},
|
||||
spec: &specs.Spec{
|
||||
Hooks: &specs.Hooks{
|
||||
Prestart: []specs.Hook{
|
||||
{
|
||||
Path: "/path/to/nvidia-container-runtime-hook",
|
||||
Args: []string{"/path/to/nvidia-container-runtime-hook", "prestart"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
expectedSpec: &specs.Spec{
|
||||
Hooks: &specs.Hooks{
|
||||
Prestart: nil,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "csv mode removes nvidia-container-toolkit",
|
||||
config: &config.Config{
|
||||
NVIDIAContainerRuntimeConfig: config.RuntimeConfig{
|
||||
Mode: "csv",
|
||||
},
|
||||
},
|
||||
spec: &specs.Spec{
|
||||
Hooks: &specs.Hooks{
|
||||
Prestart: []specs.Hook{
|
||||
{
|
||||
Path: "/path/to/nvidia-container-toolkit",
|
||||
Args: []string{"/path/to/nvidia-container-toolkit", "prestart"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
expectedSpec: &specs.Spec{
|
||||
Hooks: &specs.Hooks{
|
||||
Prestart: nil,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "cdi mode removes nvidia-container-runtime-hook",
|
||||
config: &config.Config{
|
||||
NVIDIAContainerRuntimeConfig: config.RuntimeConfig{
|
||||
Mode: "cdi",
|
||||
},
|
||||
},
|
||||
spec: &specs.Spec{
|
||||
Hooks: &specs.Hooks{
|
||||
Prestart: []specs.Hook{
|
||||
{
|
||||
Path: "/path/to/nvidia-container-runtime-hook",
|
||||
Args: []string{"/path/to/nvidia-container-runtime-hook", "prestart"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
expectedSpec: &specs.Spec{
|
||||
Hooks: &specs.Hooks{
|
||||
Prestart: nil,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "cdi mode removes nvidia-container-toolkit",
|
||||
config: &config.Config{
|
||||
NVIDIAContainerRuntimeConfig: config.RuntimeConfig{
|
||||
Mode: "cdi",
|
||||
},
|
||||
},
|
||||
spec: &specs.Spec{
|
||||
Hooks: &specs.Hooks{
|
||||
Prestart: []specs.Hook{
|
||||
{
|
||||
Path: "/path/to/nvidia-container-toolkit",
|
||||
Args: []string{"/path/to/nvidia-container-toolkit", "prestart"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
expectedSpec: &specs.Spec{
|
||||
Hooks: &specs.Hooks{
|
||||
Prestart: nil,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "legacy mode keeps nvidia-container-runtime-hook",
|
||||
config: &config.Config{
|
||||
NVIDIAContainerRuntimeConfig: config.RuntimeConfig{
|
||||
Mode: "legacy",
|
||||
},
|
||||
},
|
||||
spec: &specs.Spec{
|
||||
Hooks: &specs.Hooks{
|
||||
Prestart: []specs.Hook{
|
||||
{
|
||||
Path: "/path/to/nvidia-container-runtime-hook",
|
||||
Args: []string{"/path/to/nvidia-container-runtime-hook", "prestart"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
expectedSpec: &specs.Spec{
|
||||
Hooks: &specs.Hooks{
|
||||
Prestart: []specs.Hook{
|
||||
{
|
||||
Path: "/path/to/nvidia-container-runtime-hook",
|
||||
Args: []string{"/path/to/nvidia-container-runtime-hook", "prestart"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "legacy mode keeps nvidia-container-toolkit",
|
||||
config: &config.Config{
|
||||
NVIDIAContainerRuntimeConfig: config.RuntimeConfig{
|
||||
Mode: "legacy",
|
||||
},
|
||||
},
|
||||
spec: &specs.Spec{
|
||||
Hooks: &specs.Hooks{
|
||||
Prestart: []specs.Hook{
|
||||
{
|
||||
Path: "/path/to/nvidia-container-toolkit",
|
||||
Args: []string{"/path/to/nvidia-container-toolkit", "prestart"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
expectedSpec: &specs.Spec{
|
||||
Hooks: &specs.Hooks{
|
||||
Prestart: []specs.Hook{
|
||||
{
|
||||
Path: "/path/to/nvidia-container-toolkit",
|
||||
Args: []string{"/path/to/nvidia-container-toolkit", "prestart"},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.description, func(t *testing.T) {
|
||||
spec := &oci.SpecMock{
|
||||
LoadFunc: func() (*specs.Spec, error) {
|
||||
return tc.spec, nil
|
||||
},
|
||||
}
|
||||
m, err := newSpecModifier(logger, tc.config, spec, driver)
|
||||
require.NoError(t, err)
|
||||
|
||||
err = m.Modify(tc.spec)
|
||||
require.NoError(t, err)
|
||||
require.EqualValues(t, tc.expectedSpec, tc.spec)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,10 +20,10 @@ package engine
|
||||
type Interface interface {
|
||||
AddRuntime(string, string, bool) error
|
||||
DefaultRuntime() string
|
||||
EnableCDI()
|
||||
GetRuntimeConfig(string) (RuntimeConfig, error)
|
||||
RemoveRuntime(string) error
|
||||
Save(string) (int64, error)
|
||||
Set(string, interface{})
|
||||
String() string
|
||||
}
|
||||
|
||||
|
||||
@@ -96,13 +96,6 @@ func (c *Config) getRuntimeAnnotations(path []string) ([]string, error) {
|
||||
return annotations, nil
|
||||
}
|
||||
|
||||
// Set sets the specified containerd option.
|
||||
func (c *Config) Set(key string, value interface{}) {
|
||||
config := *c.Tree
|
||||
config.SetPath([]string{"plugins", c.CRIRuntimePluginName, key}, value)
|
||||
*c.Tree = config
|
||||
}
|
||||
|
||||
// DefaultRuntime returns the default runtime for the cri-o config
|
||||
func (c Config) DefaultRuntime() string {
|
||||
if runtime, ok := c.GetPath([]string{"plugins", c.CRIRuntimePluginName, "containerd", "default_runtime_name"}).(string); ok {
|
||||
@@ -111,6 +104,13 @@ func (c Config) DefaultRuntime() string {
|
||||
return ""
|
||||
}
|
||||
|
||||
// EnableCDI sets the enable_cdi field in the Containerd config to true.
|
||||
func (c *Config) EnableCDI() {
|
||||
config := *c.Tree
|
||||
config.SetPath([]string{"plugins", c.CRIRuntimePluginName, "enable_cdi"}, true)
|
||||
*c.Tree = config
|
||||
}
|
||||
|
||||
// RemoveRuntime removes a runtime from the docker config
|
||||
func (c *Config) RemoveRuntime(name string) error {
|
||||
if c == nil || c.Tree == nil {
|
||||
|
||||
@@ -143,13 +143,6 @@ func (c *ConfigV1) RemoveRuntime(name string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Set sets the specified containerd option.
|
||||
func (c *ConfigV1) Set(key string, value interface{}) {
|
||||
config := *c.Tree
|
||||
config.SetPath([]string{"plugins", "cri", "containerd", key}, value)
|
||||
*c.Tree = config
|
||||
}
|
||||
|
||||
// Save writes the config to a file
|
||||
func (c ConfigV1) Save(path string) (int64, error) {
|
||||
return (Config)(c).Save(path)
|
||||
@@ -165,3 +158,9 @@ func (c *ConfigV1) GetRuntimeConfig(name string) (engine.RuntimeConfig, error) {
|
||||
tree: runtimeData,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (c *ConfigV1) EnableCDI() {
|
||||
config := *c.Tree
|
||||
config.SetPath([]string{"plugins", "cri", "containerd", "enable_cdi"}, true)
|
||||
*c.Tree = config
|
||||
}
|
||||
|
||||
@@ -162,8 +162,11 @@ func (c *Config) GetRuntimeConfig(name string) (engine.RuntimeConfig, error) {
|
||||
}
|
||||
|
||||
// CommandLineSource returns the CLI-based containerd config loader
|
||||
func CommandLineSource(hostRoot string) toml.Loader {
|
||||
return toml.FromCommandLine(chrootIfRequired(hostRoot, "containerd", "config", "dump")...)
|
||||
func CommandLineSource(hostRoot string, executablePath string) toml.Loader {
|
||||
if executablePath == "" {
|
||||
executablePath = "containerd"
|
||||
}
|
||||
return toml.FromCommandLine(chrootIfRequired(hostRoot, executablePath, "config", "dump")...)
|
||||
}
|
||||
|
||||
func chrootIfRequired(hostRoot string, commandLine ...string) []string {
|
||||
|
||||
@@ -153,10 +153,16 @@ func (c *Config) GetRuntimeConfig(name string) (engine.RuntimeConfig, error) {
|
||||
}, nil
|
||||
}
|
||||
|
||||
// EnableCDI is a no-op for CRI-O since it always enabled where supported.
|
||||
func (c *Config) EnableCDI() {}
|
||||
|
||||
// CommandLineSource returns the CLI-based crio config loader
|
||||
func CommandLineSource(hostRoot string) toml.Loader {
|
||||
func CommandLineSource(hostRoot string, executablePath string) toml.Loader {
|
||||
if executablePath == "" {
|
||||
executablePath = "crio"
|
||||
}
|
||||
return toml.LoadFirst(
|
||||
toml.FromCommandLine(chrootIfRequired(hostRoot, "crio", "status", "config")...),
|
||||
toml.FromCommandLine(chrootIfRequired(hostRoot, executablePath, "status", "config")...),
|
||||
toml.FromCommandLine(chrootIfRequired(hostRoot, "crio-status", "config")...),
|
||||
)
|
||||
}
|
||||
|
||||
@@ -103,6 +103,24 @@ func (c Config) DefaultRuntime() string {
|
||||
return r
|
||||
}
|
||||
|
||||
// EnableCDI sets features.cdi to true in the docker config.
|
||||
func (c *Config) EnableCDI() {
|
||||
if c == nil {
|
||||
return
|
||||
}
|
||||
config := *c
|
||||
|
||||
features, ok := config["features"].(map[string]bool)
|
||||
if !ok {
|
||||
features = make(map[string]bool)
|
||||
}
|
||||
features["cdi"] = true
|
||||
|
||||
config["features"] = features
|
||||
|
||||
*c = config
|
||||
}
|
||||
|
||||
// RemoveRuntime removes a runtime from the docker config
|
||||
func (c *Config) RemoveRuntime(name string) error {
|
||||
if c == nil {
|
||||
@@ -132,11 +150,6 @@ func (c *Config) RemoveRuntime(name string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Set sets the specified docker option
|
||||
func (c *Config) Set(key string, value interface{}) {
|
||||
(*c)[key] = value
|
||||
}
|
||||
|
||||
// Save writes the config to the specified path
|
||||
func (c Config) Save(path string) (int64, error) {
|
||||
output, err := json.MarshalIndent(c, "", " ")
|
||||
|
||||
@@ -53,3 +53,13 @@ type Interface interface {
|
||||
GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) ([]specs.Device, error)
|
||||
GetDeviceSpecsByID(...string) ([]specs.Device, error)
|
||||
}
|
||||
|
||||
// A HookName refers to one of the predefined set of CDI hooks that may be
|
||||
// included in the generated CDI specification.
|
||||
type HookName string
|
||||
|
||||
const (
|
||||
// HookEnableCudaCompat refers to the hook used to enable CUDA Forward Compatibility.
|
||||
// This was added with v1.17.5 of the NVIDIA Container Toolkit.
|
||||
HookEnableCudaCompat = HookName("enable-cuda-compat")
|
||||
)
|
||||
|
||||
@@ -41,7 +41,7 @@ func (l *nvmllib) newCommonNVMLDiscoverer() (discover.Discover, error) {
|
||||
l.logger.Warningf("failed to create discoverer for graphics mounts: %v", err)
|
||||
}
|
||||
|
||||
driverFiles, err := NewDriverDiscoverer(l.logger, l.driver, l.nvidiaCDIHookPath, l.ldconfigPath, l.nvmllib)
|
||||
driverFiles, err := l.NewDriverDiscoverer()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create discoverer for driver files: %v", err)
|
||||
}
|
||||
|
||||
@@ -34,41 +34,41 @@ import (
|
||||
|
||||
// NewDriverDiscoverer creates a discoverer for the libraries and binaries associated with a driver installation.
|
||||
// The supplied NVML Library is used to query the expected driver version.
|
||||
func NewDriverDiscoverer(logger logger.Interface, driver *root.Driver, nvidiaCDIHookPath string, ldconfigPath string, nvmllib nvml.Interface) (discover.Discover, error) {
|
||||
if r := nvmllib.Init(); r != nvml.SUCCESS {
|
||||
func (l *nvmllib) NewDriverDiscoverer() (discover.Discover, error) {
|
||||
if r := l.nvmllib.Init(); r != nvml.SUCCESS {
|
||||
return nil, fmt.Errorf("failed to initialize NVML: %v", r)
|
||||
}
|
||||
defer func() {
|
||||
if r := nvmllib.Shutdown(); r != nvml.SUCCESS {
|
||||
logger.Warningf("failed to shutdown NVML: %v", r)
|
||||
if r := l.nvmllib.Shutdown(); r != nvml.SUCCESS {
|
||||
l.logger.Warningf("failed to shutdown NVML: %v", r)
|
||||
}
|
||||
}()
|
||||
|
||||
version, r := nvmllib.SystemGetDriverVersion()
|
||||
version, r := l.nvmllib.SystemGetDriverVersion()
|
||||
if r != nvml.SUCCESS {
|
||||
return nil, fmt.Errorf("failed to determine driver version: %v", r)
|
||||
}
|
||||
|
||||
return newDriverVersionDiscoverer(logger, driver, nvidiaCDIHookPath, ldconfigPath, version)
|
||||
return (*nvcdilib)(l).newDriverVersionDiscoverer(version)
|
||||
}
|
||||
|
||||
func newDriverVersionDiscoverer(logger logger.Interface, driver *root.Driver, nvidiaCDIHookPath, ldconfigPath, version string) (discover.Discover, error) {
|
||||
libraries, err := NewDriverLibraryDiscoverer(logger, driver, nvidiaCDIHookPath, ldconfigPath, version)
|
||||
func (l *nvcdilib) newDriverVersionDiscoverer(version string) (discover.Discover, error) {
|
||||
libraries, err := l.NewDriverLibraryDiscoverer(version)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create discoverer for driver libraries: %v", err)
|
||||
}
|
||||
|
||||
ipcs, err := discover.NewIPCDiscoverer(logger, driver.Root)
|
||||
ipcs, err := discover.NewIPCDiscoverer(l.logger, l.driver.Root)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create discoverer for IPC sockets: %v", err)
|
||||
}
|
||||
|
||||
firmwares, err := NewDriverFirmwareDiscoverer(logger, driver.Root, version)
|
||||
firmwares, err := NewDriverFirmwareDiscoverer(l.logger, l.driver.Root, version)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create discoverer for GSP firmware: %v", err)
|
||||
}
|
||||
|
||||
binaries := NewDriverBinariesDiscoverer(logger, driver.Root)
|
||||
binaries := NewDriverBinariesDiscoverer(l.logger, l.driver.Root)
|
||||
|
||||
d := discover.Merge(
|
||||
libraries,
|
||||
@@ -81,32 +81,41 @@ func newDriverVersionDiscoverer(logger logger.Interface, driver *root.Driver, nv
|
||||
}
|
||||
|
||||
// NewDriverLibraryDiscoverer creates a discoverer for the libraries associated with the specified driver version.
|
||||
func NewDriverLibraryDiscoverer(logger logger.Interface, driver *root.Driver, nvidiaCDIHookPath, ldconfigPath, version string) (discover.Discover, error) {
|
||||
libraryPaths, err := getVersionLibs(logger, driver, version)
|
||||
func (l *nvcdilib) NewDriverLibraryDiscoverer(version string) (discover.Discover, error) {
|
||||
libraryPaths, err := getVersionLibs(l.logger, l.driver, version)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get libraries for driver version: %v", err)
|
||||
}
|
||||
|
||||
libraries := discover.NewMounts(
|
||||
logger,
|
||||
l.logger,
|
||||
lookup.NewFileLocator(
|
||||
lookup.WithLogger(logger),
|
||||
lookup.WithRoot(driver.Root),
|
||||
lookup.WithLogger(l.logger),
|
||||
lookup.WithRoot(l.driver.Root),
|
||||
),
|
||||
driver.Root,
|
||||
l.driver.Root,
|
||||
libraryPaths,
|
||||
)
|
||||
|
||||
updateLDCache, _ := discover.NewLDCacheUpdateHook(logger, libraries, nvidiaCDIHookPath, ldconfigPath)
|
||||
var discoverers []discover.Discover
|
||||
|
||||
d := discover.Merge(
|
||||
discover.WithDriverDotSoSymlinks(
|
||||
libraries,
|
||||
version,
|
||||
nvidiaCDIHookPath,
|
||||
),
|
||||
updateLDCache,
|
||||
driverDotSoSymlinksDiscoverer := discover.WithDriverDotSoSymlinks(
|
||||
libraries,
|
||||
version,
|
||||
l.nvidiaCDIHookPath,
|
||||
)
|
||||
discoverers = append(discoverers, driverDotSoSymlinksDiscoverer)
|
||||
|
||||
if l.HookIsSupported(HookEnableCudaCompat) {
|
||||
// TODO: The following should use the version directly.
|
||||
cudaCompatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(l.logger, l.nvidiaCDIHookPath, l.driver)
|
||||
discoverers = append(discoverers, cudaCompatLibHookDiscoverer)
|
||||
}
|
||||
|
||||
updateLDCache, _ := discover.NewLDCacheUpdateHook(l.logger, libraries, l.nvidiaCDIHookPath, l.ldconfigPath)
|
||||
discoverers = append(discoverers, updateLDCache)
|
||||
|
||||
d := discover.Merge(discoverers...)
|
||||
|
||||
return d, nil
|
||||
}
|
||||
@@ -184,6 +193,8 @@ func NewDriverBinariesDiscoverer(logger logger.Interface, driverRoot string) dis
|
||||
"nvidia-persistenced", /* Persistence mode utility */
|
||||
"nvidia-cuda-mps-control", /* Multi process service CLI */
|
||||
"nvidia-cuda-mps-server", /* Multi process service server */
|
||||
"nvidia-imex", /* NVIDIA IMEX Daemon */
|
||||
"nvidia-imex-ctl", /* NVIDIA IMEX control */
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
@@ -95,6 +95,7 @@ func TestNvidiaSMISymlinkHook(t *testing.T) {
|
||||
Path: "nvidia-cdi-hook",
|
||||
Args: []string{"nvidia-cdi-hook", "create-symlinks",
|
||||
"--link", "nvidia-smi::/usr/bin/nvidia-smi"},
|
||||
Env: []string{"NVIDIA_CTK_DEBUG=false"},
|
||||
},
|
||||
},
|
||||
},
|
||||
@@ -115,6 +116,7 @@ func TestNvidiaSMISymlinkHook(t *testing.T) {
|
||||
Path: "nvidia-cdi-hook",
|
||||
Args: []string{"nvidia-cdi-hook", "create-symlinks",
|
||||
"--link", "/some/path/nvidia-smi::/usr/bin/nvidia-smi"},
|
||||
Env: []string{"NVIDIA_CTK_DEBUG=false"},
|
||||
},
|
||||
},
|
||||
},
|
||||
@@ -135,6 +137,7 @@ func TestNvidiaSMISymlinkHook(t *testing.T) {
|
||||
Path: "nvidia-cdi-hook",
|
||||
Args: []string{"nvidia-cdi-hook", "create-symlinks",
|
||||
"--link", "/some/path/nvidia-smi::/usr/bin/nvidia-smi"},
|
||||
Env: []string{"NVIDIA_CTK_DEBUG=false"},
|
||||
},
|
||||
},
|
||||
},
|
||||
|
||||
30
pkg/nvcdi/hooks.go
Normal file
30
pkg/nvcdi/hooks.go
Normal file
@@ -0,0 +1,30 @@
|
||||
/**
|
||||
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
**/
|
||||
|
||||
package nvcdi
|
||||
|
||||
// disabledHooks allows individual hooks to be disabled.
|
||||
type disabledHooks map[HookName]bool
|
||||
|
||||
// HookIsSupported checks whether a hook of the specified name is supported.
|
||||
// Hooks must be explicitly disabled, meaning that if no disabled hooks are
|
||||
// all hooks are supported.
|
||||
func (l *nvcdilib) HookIsSupported(h HookName) bool {
|
||||
if len(l.disabledHooks) == 0 {
|
||||
return true
|
||||
}
|
||||
return !l.disabledHooks[h]
|
||||
}
|
||||
@@ -66,11 +66,15 @@ type nvcdilib struct {
|
||||
infolib info.Interface
|
||||
|
||||
mergedDeviceOptions []transform.MergedDeviceOption
|
||||
|
||||
disabledHooks disabledHooks
|
||||
}
|
||||
|
||||
// New creates a new nvcdi library
|
||||
func New(opts ...Option) (Interface, error) {
|
||||
l := &nvcdilib{}
|
||||
l := &nvcdilib{
|
||||
disabledHooks: make(disabledHooks),
|
||||
}
|
||||
for _, opt := range opts {
|
||||
opt(l)
|
||||
}
|
||||
@@ -111,19 +115,24 @@ func New(opts ...Option) (Interface, error) {
|
||||
}
|
||||
l.nvmllib = nvml.New(nvmlOpts...)
|
||||
}
|
||||
if l.nvsandboxutilslib == nil {
|
||||
var nvsandboxutilsOpts []nvsandboxutils.LibraryOption
|
||||
// Set the library path for libnvidia-sandboxutils
|
||||
candidates, err := l.driver.Libraries().Locate("libnvidia-sandboxutils.so.1")
|
||||
if err != nil {
|
||||
l.logger.Warningf("Ignoring error in locating libnvidia-sandboxutils.so.1: %v", err)
|
||||
} else {
|
||||
libNvidiaSandboxutilsPath := candidates[0]
|
||||
l.logger.Infof("Using %v", libNvidiaSandboxutilsPath)
|
||||
nvsandboxutilsOpts = append(nvsandboxutilsOpts, nvsandboxutils.WithLibraryPath(libNvidiaSandboxutilsPath))
|
||||
}
|
||||
l.nvsandboxutilslib = nvsandboxutils.New(nvsandboxutilsOpts...)
|
||||
}
|
||||
// TODO: Repeated calls to nvsandboxutils.Init and Shutdown are causing
|
||||
// segmentation violations. Here we disabled nvsandbox utils unless explicitly
|
||||
// specified.
|
||||
// This will be reenabled as soon as we have more visibility into why this is
|
||||
// happening and a mechanism to detect and disable this if required.
|
||||
// if l.nvsandboxutilslib == nil {
|
||||
// var nvsandboxutilsOpts []nvsandboxutils.LibraryOption
|
||||
// // Set the library path for libnvidia-sandboxutils
|
||||
// candidates, err := l.driver.Libraries().Locate("libnvidia-sandboxutils.so.1")
|
||||
// if err != nil {
|
||||
// l.logger.Warningf("Ignoring error in locating libnvidia-sandboxutils.so.1: %v", err)
|
||||
// } else {
|
||||
// libNvidiaSandboxutilsPath := candidates[0]
|
||||
// l.logger.Infof("Using %v", libNvidiaSandboxutilsPath)
|
||||
// nvsandboxutilsOpts = append(nvsandboxutilsOpts, nvsandboxutils.WithLibraryPath(libNvidiaSandboxutilsPath))
|
||||
// }
|
||||
// l.nvsandboxutilslib = nvsandboxutils.New(nvsandboxutilsOpts...)
|
||||
// }
|
||||
if l.devicelib == nil {
|
||||
l.devicelib = device.New(l.nvmllib)
|
||||
}
|
||||
@@ -147,6 +156,8 @@ func New(opts ...Option) (Interface, error) {
|
||||
if l.vendor == "" {
|
||||
l.vendor = "management.nvidia.com"
|
||||
}
|
||||
// Management containers in general do not require CUDA Forward compatibility.
|
||||
l.disabledHooks[HookEnableCudaCompat] = true
|
||||
lib = (*managementlib)(l)
|
||||
case ModeNvml:
|
||||
lib = (*nvmllib)(l)
|
||||
|
||||
@@ -80,7 +80,7 @@ func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) {
|
||||
return nil, fmt.Errorf("failed to get CUDA version: %v", err)
|
||||
}
|
||||
|
||||
driver, err := newDriverVersionDiscoverer(m.logger, m.driver, m.nvidiaCDIHookPath, m.ldconfigPath, version)
|
||||
driver, err := (*nvcdilib)(m).newDriverVersionDiscoverer(version)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create driver library discoverer: %v", err)
|
||||
}
|
||||
|
||||
@@ -155,3 +155,14 @@ func WithLibrarySearchPaths(paths []string) Option {
|
||||
o.librarySearchPaths = paths
|
||||
}
|
||||
}
|
||||
|
||||
// WithDisabledHook allows specific hooks to the disabled.
|
||||
// This option can be specified multiple times for each hook.
|
||||
func WithDisabledHook(hook HookName) Option {
|
||||
return func(o *nvcdilib) {
|
||||
if o.disabledHooks == nil {
|
||||
o.disabledHooks = make(map[HookName]bool)
|
||||
}
|
||||
o.disabledHooks[hook] = true
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,3 @@
|
||||
FROM quay.io/centos/centos:stream8
|
||||
|
||||
RUN sed -i -e "s|mirrorlist=|#mirrorlist=|g" \
|
||||
-e "s|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g" \
|
||||
/etc/yum.repos.d/CentOS-Stream-*
|
||||
FROM quay.io/centos/centos:stream9
|
||||
|
||||
RUN yum install -y createrepo rpm-sign pinentry
|
||||
|
||||
@@ -1,76 +0,0 @@
|
||||
#! /bin/bash
|
||||
# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
testing::toolkit::install() {
|
||||
local -r uid=$(id -u)
|
||||
local -r gid=$(id -g)
|
||||
|
||||
local READLINK="readlink"
|
||||
local -r platform=$(uname)
|
||||
if [[ "${platform}" == "Darwin" ]]; then
|
||||
READLINK="greadlink"
|
||||
fi
|
||||
|
||||
testing::docker_run::toolkit::shell 'toolkit install --toolkit-root=/usr/local/nvidia/toolkit'
|
||||
docker run --rm -v "${shared_dir}:/work" alpine sh -c "chown -R ${uid}:${gid} /work/"
|
||||
|
||||
# Ensure toolkit dir is correctly setup
|
||||
test ! -z "$(ls -A "${shared_dir}/usr/local/nvidia/toolkit")"
|
||||
|
||||
test -L "${shared_dir}/usr/local/nvidia/toolkit/libnvidia-container.so.1"
|
||||
test -e "$(${READLINK} -f "${shared_dir}/usr/local/nvidia/toolkit/libnvidia-container.so.1")"
|
||||
test -L "${shared_dir}/usr/local/nvidia/toolkit/libnvidia-container-go.so.1"
|
||||
test -e "$(${READLINK} -f "${shared_dir}/usr/local/nvidia/toolkit/libnvidia-container-go.so.1")"
|
||||
|
||||
test -e "${shared_dir}/usr/local/nvidia/toolkit/nvidia-container-cli"
|
||||
test -e "${shared_dir}/usr/local/nvidia/toolkit/nvidia-container-runtime-hook"
|
||||
test -L "${shared_dir}/usr/local/nvidia/toolkit/nvidia-container-toolkit"
|
||||
test -e "${shared_dir}/usr/local/nvidia/toolkit/nvidia-container-runtime"
|
||||
|
||||
grep -q -E "nvidia driver modules are not yet loaded, invoking runc directly" "${shared_dir}/usr/local/nvidia/toolkit/nvidia-container-runtime"
|
||||
grep -q -E "exec runc \".@\"" "${shared_dir}/usr/local/nvidia/toolkit/nvidia-container-runtime"
|
||||
|
||||
test -e "${shared_dir}/usr/local/nvidia/toolkit/nvidia-container-cli.real"
|
||||
test -e "${shared_dir}/usr/local/nvidia/toolkit/nvidia-container-runtime-hook.real"
|
||||
test -e "${shared_dir}/usr/local/nvidia/toolkit/nvidia-container-runtime.real"
|
||||
|
||||
test -e "${shared_dir}/usr/local/nvidia/toolkit/.config/nvidia-container-runtime/config.toml"
|
||||
|
||||
# Ensure that the config file has the required contents.
|
||||
# NOTE: This assumes that RUN_DIR is '/run/nvidia'
|
||||
local -r nvidia_run_dir="/run/nvidia"
|
||||
grep -q -E "^\s*ldconfig = \"@${nvidia_run_dir}/driver/sbin/ldconfig(.real)?\"" "${shared_dir}/usr/local/nvidia/toolkit/.config/nvidia-container-runtime/config.toml"
|
||||
grep -q -E "^\s*root = \"${nvidia_run_dir}/driver\"" "${shared_dir}/usr/local/nvidia/toolkit/.config/nvidia-container-runtime/config.toml"
|
||||
grep -q -E "^\s*path = \"/usr/local/nvidia/toolkit/nvidia-container-cli\"" "${shared_dir}/usr/local/nvidia/toolkit/.config/nvidia-container-runtime/config.toml"
|
||||
grep -q -E "^\s*path = \"/usr/local/nvidia/toolkit/nvidia-ctk\"" "${shared_dir}/usr/local/nvidia/toolkit/.config/nvidia-container-runtime/config.toml"
|
||||
}
|
||||
|
||||
testing::toolkit::delete() {
|
||||
testing::docker_run::toolkit::shell 'mkdir -p /usr/local/nvidia/delete-toolkit'
|
||||
testing::docker_run::toolkit::shell 'touch /usr/local/nvidia/delete-toolkit/test.file'
|
||||
testing::docker_run::toolkit::shell 'toolkit delete --toolkit-root=/usr/local/nvidia/delete-toolkit'
|
||||
|
||||
test ! -z "$(ls -A "${shared_dir}/usr/local/nvidia")"
|
||||
test ! -e "${shared_dir}/usr/local/nvidia/delete-toolkit"
|
||||
}
|
||||
|
||||
testing::toolkit::main() {
|
||||
testing::toolkit::install
|
||||
testing::toolkit::delete
|
||||
}
|
||||
|
||||
testing::toolkit::cleanup() {
|
||||
:
|
||||
}
|
||||
@@ -19,7 +19,6 @@ shopt -s lastpipe
|
||||
readonly basedir="$(dirname "$(realpath "$0")")"
|
||||
source "${basedir}/common.sh"
|
||||
|
||||
source "${basedir}/toolkit_test.sh"
|
||||
source "${basedir}/docker_test.sh"
|
||||
source "${basedir}/crio_test.sh"
|
||||
source "${basedir}/containerd_test.sh"
|
||||
@@ -66,7 +65,7 @@ done
|
||||
|
||||
trap '"$CLEANUP" && testing::cleanup' ERR
|
||||
|
||||
readonly test_cases="${TEST_CASES:-toolkit docker crio containerd}"
|
||||
readonly test_cases="${TEST_CASES:-docker crio containerd}"
|
||||
|
||||
testing::cleanup
|
||||
for tc in ${test_cases}; do
|
||||
45
tests/e2e/Makefile
Normal file
45
tests/e2e/Makefile
Normal file
@@ -0,0 +1,45 @@
|
||||
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
GO_CMD ?= go
|
||||
|
||||
include $(CURDIR)/versions.mk
|
||||
|
||||
E2E_RUNTIME ?= docker
|
||||
|
||||
E2E_INSTALL_CTK ?= false
|
||||
|
||||
ifeq ($($(DIST)),)
|
||||
DIST ?= ubuntu20.04
|
||||
endif
|
||||
IMAGE_TAG ?= $(VERSION)-$(DIST)
|
||||
IMAGE = $(IMAGE_NAME):$(IMAGE_TAG)
|
||||
|
||||
E2E_SSH_KEY ?=
|
||||
E2E_SSH_USER ?=
|
||||
E2E_SSH_HOST ?=
|
||||
E2E_SSH_PORT ?= 22
|
||||
|
||||
.PHONY: test
|
||||
test:
|
||||
cd $(CURDIR)/tests/e2e && $(GO_CMD) test -v . -args \
|
||||
-ginkgo.focus="$(E2E_RUNTIME)" \
|
||||
-test.timeout=1h \
|
||||
-ginkgo.v \
|
||||
-install-ctk=$(E2E_INSTALL_CTK) \
|
||||
-toolkit-image=$(IMAGE) \
|
||||
-ssh-key=$(E2E_SSH_KEY) \
|
||||
-ssh-user=$(E2E_SSH_USER) \
|
||||
-remote-host=$(E2E_SSH_HOST) \
|
||||
-remote-port=$(E2E_SSH_PORT)
|
||||
63
tests/e2e/e2e_test.go
Normal file
63
tests/e2e/e2e_test.go
Normal file
@@ -0,0 +1,63 @@
|
||||
/*
|
||||
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package e2e
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"testing"
|
||||
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
// Test context
|
||||
var (
|
||||
ctx context.Context
|
||||
|
||||
installCTK bool
|
||||
|
||||
image string
|
||||
|
||||
sshKey string
|
||||
sshUser string
|
||||
host string
|
||||
sshPort string
|
||||
)
|
||||
|
||||
func init() {
|
||||
flag.BoolVar(&installCTK, "install-ctk", false, "Install the NVIDIA Container Toolkit")
|
||||
flag.StringVar(&image, "toolkit-image", "", "Repository of the image to test")
|
||||
flag.StringVar(&sshKey, "ssh-key", "", "SSH key to use for remote login")
|
||||
flag.StringVar(&sshUser, "ssh-user", "", "SSH user to use for remote login")
|
||||
flag.StringVar(&host, "remote-host", "", "Hostname of the remote machine")
|
||||
flag.StringVar(&sshPort, "remote-port", "22", "SSH port to use for remote login")
|
||||
}
|
||||
|
||||
func TestMain(t *testing.T) {
|
||||
suiteName := "NVIDIA Container Toolkit E2E"
|
||||
|
||||
RegisterFailHandler(Fail)
|
||||
RunSpecs(t,
|
||||
suiteName,
|
||||
)
|
||||
}
|
||||
|
||||
// BeforeSuite runs before the test suite
|
||||
var _ = BeforeSuite(func() {
|
||||
ctx = context.Background()
|
||||
})
|
||||
30
tests/e2e/infra/aws.yaml
Normal file
30
tests/e2e/infra/aws.yaml
Normal file
@@ -0,0 +1,30 @@
|
||||
apiVersion: holodeck.nvidia.com/v1alpha1
|
||||
kind: Environment
|
||||
metadata:
|
||||
name: HOLODECK_NAME
|
||||
description: "end-to-end test infrastructure"
|
||||
spec:
|
||||
provider: aws
|
||||
auth:
|
||||
keyName: cnt-ci
|
||||
privateKey: HOLODECK_PRIVATE_KEY
|
||||
instance:
|
||||
type: g4dn.xlarge
|
||||
region: us-west-1
|
||||
ingressIpRanges:
|
||||
- 18.190.12.32/32
|
||||
- 3.143.46.93/32
|
||||
- 44.230.241.223/32
|
||||
- 44.235.4.62/32
|
||||
- 52.15.119.136/32
|
||||
- 52.24.205.48/32
|
||||
image:
|
||||
architecture: amd64
|
||||
imageId: ami-0ce2cb35386fc22e9
|
||||
containerRuntime:
|
||||
install: true
|
||||
name: docker
|
||||
nvidiaContainerToolkit:
|
||||
install: false
|
||||
nvidiaDriver:
|
||||
install: true
|
||||
118
tests/e2e/installer.go
Normal file
118
tests/e2e/installer.go
Normal file
@@ -0,0 +1,118 @@
|
||||
/*
|
||||
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package e2e
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"text/template"
|
||||
)
|
||||
|
||||
// dockerInstallTemplate is a template for installing the NVIDIA Container Toolkit
|
||||
// on a host using Docker.
|
||||
var dockerInstallTemplate = `
|
||||
#! /usr/bin/env bash
|
||||
set -xe
|
||||
|
||||
: ${IMAGE:={{.Image}}}
|
||||
|
||||
# Create a temporary directory
|
||||
TEMP_DIR="/tmp/ctk_e2e.$(date +%s)_$RANDOM"
|
||||
mkdir -p "$TEMP_DIR"
|
||||
|
||||
# Given that docker has an init function that checks for the existence of the
|
||||
# nvidia-container-toolkit, we need to create a symlink to the nvidia-container-runtime-hook
|
||||
# in the /usr/bin directory.
|
||||
# See https://github.com/moby/moby/blob/20a05dabf44934447d1a66cdd616cc803b81d4e2/daemon/nvidia_linux.go#L32-L46
|
||||
sudo rm -f /usr/bin/nvidia-container-runtime-hook
|
||||
sudo ln -s "$TEMP_DIR/toolkit/nvidia-container-runtime-hook" /usr/bin/nvidia-container-runtime-hook
|
||||
|
||||
docker run --pid=host --rm -i --privileged \
|
||||
-v /:/host \
|
||||
-v /var/run/docker.sock:/var/run/docker.sock \
|
||||
-v "$TEMP_DIR:$TEMP_DIR" \
|
||||
-v /etc/docker:/config-root \
|
||||
${IMAGE} \
|
||||
--root "$TEMP_DIR" \
|
||||
--runtime=docker \
|
||||
--config=/config-root/daemon.json \
|
||||
--driver-root=/ \
|
||||
--no-daemon \
|
||||
--restart-mode=systemd
|
||||
`
|
||||
|
||||
type ToolkitInstaller struct {
|
||||
runner Runner
|
||||
template string
|
||||
|
||||
Image string
|
||||
}
|
||||
|
||||
type installerOption func(*ToolkitInstaller)
|
||||
|
||||
func WithRunner(r Runner) installerOption {
|
||||
return func(i *ToolkitInstaller) {
|
||||
i.runner = r
|
||||
}
|
||||
}
|
||||
|
||||
func WithImage(image string) installerOption {
|
||||
return func(i *ToolkitInstaller) {
|
||||
i.Image = image
|
||||
}
|
||||
}
|
||||
|
||||
func WithTemplate(template string) installerOption {
|
||||
return func(i *ToolkitInstaller) {
|
||||
i.template = template
|
||||
}
|
||||
}
|
||||
|
||||
func NewToolkitInstaller(opts ...installerOption) (*ToolkitInstaller, error) {
|
||||
i := &ToolkitInstaller{
|
||||
runner: localRunner{},
|
||||
template: dockerInstallTemplate,
|
||||
}
|
||||
|
||||
for _, opt := range opts {
|
||||
opt(i)
|
||||
}
|
||||
|
||||
if i.Image == "" {
|
||||
return nil, fmt.Errorf("image is required")
|
||||
}
|
||||
|
||||
return i, nil
|
||||
}
|
||||
|
||||
func (i *ToolkitInstaller) Install() error {
|
||||
// Parse the combined template
|
||||
tmpl, err := template.New("installScript").Parse(i.template)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error parsing template: %w", err)
|
||||
}
|
||||
|
||||
// Execute the template
|
||||
var renderedScript bytes.Buffer
|
||||
err = tmpl.Execute(&renderedScript, i)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error executing template: %w", err)
|
||||
}
|
||||
|
||||
_, _, err = i.runner.Run(renderedScript.String())
|
||||
return err
|
||||
}
|
||||
218
tests/e2e/nvidia-container-toolkit_test.go
Normal file
218
tests/e2e/nvidia-container-toolkit_test.go
Normal file
@@ -0,0 +1,218 @@
|
||||
/*
|
||||
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package e2e
|
||||
|
||||
import (
|
||||
"context"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
// Integration tests for Docker runtime
|
||||
var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
|
||||
var r Runner
|
||||
|
||||
// Install the NVIDIA Container Toolkit
|
||||
BeforeAll(func(ctx context.Context) {
|
||||
r = NewRunner(
|
||||
WithHost(host),
|
||||
WithPort(sshPort),
|
||||
WithSshKey(sshKey),
|
||||
WithSshUser(sshUser),
|
||||
)
|
||||
if installCTK {
|
||||
installer, err := NewToolkitInstaller(
|
||||
WithRunner(r),
|
||||
WithImage(image),
|
||||
WithTemplate(dockerInstallTemplate),
|
||||
)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
err = installer.Install()
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
}
|
||||
})
|
||||
|
||||
// GPUs are accessible in a container: Running nvidia-smi -L inside the
|
||||
// container shows the same output inside the container as outside the
|
||||
// container. This means that the following commands must all produce
|
||||
// the same output
|
||||
When("running nvidia-smi -L", Ordered, func() {
|
||||
var hostOutput string
|
||||
|
||||
BeforeAll(func(ctx context.Context) {
|
||||
_, _, err := r.Run("docker pull ubuntu")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
hostOutput, _, err = r.Run("nvidia-smi -L")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
})
|
||||
|
||||
It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) {
|
||||
containerOutput, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all ubuntu nvidia-smi -L")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(containerOutput).To(Equal(hostOutput))
|
||||
})
|
||||
|
||||
It("should support automatic CDI spec generation", func(ctx context.Context) {
|
||||
containerOutput, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all ubuntu nvidia-smi -L")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(containerOutput).To(Equal(hostOutput))
|
||||
})
|
||||
|
||||
It("should support automatic CDI spec generation with the --gpus flag", func(ctx context.Context) {
|
||||
containerOutput, _, err := r.Run("docker run --rm -i --gpus=all --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all ubuntu nvidia-smi -L")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(containerOutput).To(Equal(hostOutput))
|
||||
})
|
||||
|
||||
It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) {
|
||||
containerOutput, _, err := r.Run("docker run --rm -i --runtime=nvidia --gpus all ubuntu nvidia-smi -L")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(containerOutput).To(Equal(hostOutput))
|
||||
})
|
||||
|
||||
It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) {
|
||||
containerOutput, _, err := r.Run("docker run --rm -i --gpus all ubuntu nvidia-smi -L")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(containerOutput).To(Equal(hostOutput))
|
||||
})
|
||||
})
|
||||
|
||||
// A vectorAdd sample runs in a container with access to all GPUs.
|
||||
// The following should all produce the same result.
|
||||
When("Running the cuda-vectorAdd sample", Ordered, func() {
|
||||
BeforeAll(func(ctx context.Context) {
|
||||
_, _, err := r.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
})
|
||||
|
||||
var referenceOutput string
|
||||
|
||||
It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) {
|
||||
var err error
|
||||
referenceOutput, _, err = r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
Expect(referenceOutput).To(ContainSubstring("Test PASSED"))
|
||||
})
|
||||
|
||||
It("should support automatic CDI spec generation", func(ctx context.Context) {
|
||||
out2, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(referenceOutput).To(Equal(out2))
|
||||
})
|
||||
|
||||
It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) {
|
||||
out3, _, err := r.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(referenceOutput).To(Equal(out3))
|
||||
})
|
||||
|
||||
It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) {
|
||||
out4, _, err := r.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(referenceOutput).To(Equal(out4))
|
||||
})
|
||||
})
|
||||
|
||||
// A deviceQuery sample runs in a container with access to all GPUs
|
||||
// The following should all produce the same result.
|
||||
When("Running the cuda-deviceQuery sample", Ordered, func() {
|
||||
BeforeAll(func(ctx context.Context) {
|
||||
_, _, err := r.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
})
|
||||
|
||||
var referenceOutput string
|
||||
|
||||
It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) {
|
||||
var err error
|
||||
referenceOutput, _, err = r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
Expect(referenceOutput).To(ContainSubstring("Result = PASS"))
|
||||
})
|
||||
|
||||
It("should support automatic CDI spec generation", func(ctx context.Context) {
|
||||
out2, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(referenceOutput).To(Equal(out2))
|
||||
})
|
||||
|
||||
It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) {
|
||||
out3, _, err := r.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(referenceOutput).To(Equal(out3))
|
||||
})
|
||||
|
||||
It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) {
|
||||
out4, _, err := r.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(referenceOutput).To(Equal(out4))
|
||||
})
|
||||
})
|
||||
|
||||
Describe("CUDA Forward compatibility", Ordered, func() {
|
||||
BeforeAll(func(ctx context.Context) {
|
||||
_, _, err := r.Run("docker pull nvcr.io/nvidia/cuda:12.8.0-base-ubi8")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
})
|
||||
|
||||
BeforeAll(func(ctx context.Context) {
|
||||
compatOutput, _, err := r.Run("docker run --rm -i -e NVIDIA_VISIBLE_DEVICES=void nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ls /usr/local/cuda/compat/libcuda.*.*\"")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(compatOutput).ToNot(BeEmpty())
|
||||
compatDriverVersion := strings.TrimPrefix(filepath.Base(compatOutput), "libcuda.so.")
|
||||
compatMajor := strings.SplitN(compatDriverVersion, ".", 2)[0]
|
||||
|
||||
driverOutput, _, err := r.Run("nvidia-smi -q | grep \"Driver Version\"")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
parts := strings.SplitN(driverOutput, ":", 2)
|
||||
Expect(parts).To(HaveLen(2))
|
||||
|
||||
hostDriverVersion := strings.TrimSpace(parts[1])
|
||||
Expect(hostDriverVersion).ToNot(BeEmpty())
|
||||
driverMajor := strings.SplitN(hostDriverVersion, ".", 2)[0]
|
||||
|
||||
if driverMajor >= compatMajor {
|
||||
GinkgoLogr.Info("CUDA Forward Compatibility tests require an older driver version", "hostDriverVersion", hostDriverVersion, "compatDriverVersion", compatDriverVersion)
|
||||
Skip("CUDA Forward Compatibility tests require an older driver version")
|
||||
}
|
||||
})
|
||||
|
||||
It("should work with the nvidia runtime in legacy mode", func(ctx context.Context) {
|
||||
ldconfigOut, _, err := r.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat"))
|
||||
})
|
||||
|
||||
It("should work with the nvidia runtime in CDI mode", func(ctx context.Context) {
|
||||
ldconfigOut, _, err := r.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat"))
|
||||
})
|
||||
|
||||
It("should NOT work with nvidia-container-runtime-hook", func(ctx context.Context) {
|
||||
ldconfigOut, _, err := r.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=runc --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(ldconfigOut).To(ContainSubstring("/usr/lib64"))
|
||||
})
|
||||
})
|
||||
})
|
||||
171
tests/e2e/runner.go
Normal file
171
tests/e2e/runner.go
Normal file
@@ -0,0 +1,171 @@
|
||||
/*
|
||||
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package e2e
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"time"
|
||||
|
||||
"golang.org/x/crypto/ssh"
|
||||
)
|
||||
|
||||
type localRunner struct{}
|
||||
type remoteRunner struct {
|
||||
sshKey string
|
||||
sshUser string
|
||||
host string
|
||||
port string
|
||||
}
|
||||
|
||||
type runnerOption func(*remoteRunner)
|
||||
|
||||
type Runner interface {
|
||||
Run(script string) (string, string, error)
|
||||
}
|
||||
|
||||
func WithSshKey(key string) runnerOption {
|
||||
return func(r *remoteRunner) {
|
||||
r.sshKey = key
|
||||
}
|
||||
}
|
||||
|
||||
func WithSshUser(user string) runnerOption {
|
||||
return func(r *remoteRunner) {
|
||||
r.sshUser = user
|
||||
}
|
||||
}
|
||||
|
||||
func WithHost(host string) runnerOption {
|
||||
return func(r *remoteRunner) {
|
||||
r.host = host
|
||||
}
|
||||
}
|
||||
|
||||
func WithPort(port string) runnerOption {
|
||||
return func(r *remoteRunner) {
|
||||
r.port = port
|
||||
}
|
||||
}
|
||||
|
||||
func NewRunner(opts ...runnerOption) Runner {
|
||||
r := &remoteRunner{}
|
||||
for _, opt := range opts {
|
||||
opt(r)
|
||||
}
|
||||
|
||||
// If the Host is empty, return a local runner
|
||||
if r.host == "" {
|
||||
return localRunner{}
|
||||
}
|
||||
|
||||
// Otherwise, return a remote runner
|
||||
return r
|
||||
}
|
||||
|
||||
func (l localRunner) Run(script string) (string, string, error) {
|
||||
// Create a command to run the script using bash
|
||||
cmd := exec.Command("bash", "-c", script)
|
||||
|
||||
// Buffer to capture standard output
|
||||
var stdout bytes.Buffer
|
||||
cmd.Stdout = &stdout
|
||||
|
||||
// Buffer to capture standard error
|
||||
var stderr bytes.Buffer
|
||||
cmd.Stderr = &stderr
|
||||
|
||||
// Run the command
|
||||
err := cmd.Run()
|
||||
if err != nil {
|
||||
return "", "", fmt.Errorf("script execution failed: %v\nSTDOUT: %s\nSTDERR: %s", err, stdout.String(), stderr.String())
|
||||
}
|
||||
|
||||
// Return the captured stdout and nil error
|
||||
return stdout.String(), "", nil
|
||||
}
|
||||
|
||||
func (r remoteRunner) Run(script string) (string, string, error) {
|
||||
// Create a new SSH connection
|
||||
client, err := connectOrDie(r.sshKey, r.sshUser, r.host, r.port)
|
||||
if err != nil {
|
||||
return "", "", fmt.Errorf("failed to connect to %s: %v", r.host, err)
|
||||
}
|
||||
defer client.Close()
|
||||
|
||||
// Create a session
|
||||
session, err := client.NewSession()
|
||||
if err != nil {
|
||||
return "", "", fmt.Errorf("failed to create session: %v", err)
|
||||
}
|
||||
defer session.Close()
|
||||
|
||||
// Capture stdout and stderr
|
||||
var stdout, stderr bytes.Buffer
|
||||
session.Stdout = &stdout
|
||||
session.Stderr = &stderr
|
||||
|
||||
// Run the script
|
||||
err = session.Run(script)
|
||||
if err != nil {
|
||||
return "", "", fmt.Errorf("script execution failed: %v\nSTDOUT: %s\nSTDERR: %s", err, stdout.String(), stderr.String())
|
||||
}
|
||||
|
||||
// Return stdout as string if no errors
|
||||
return stdout.String(), "", nil
|
||||
}
|
||||
|
||||
// createSshClient creates a ssh client, and retries if it fails to connect
|
||||
func connectOrDie(sshKey, sshUser, host, port string) (*ssh.Client, error) {
|
||||
var client *ssh.Client
|
||||
var err error
|
||||
key, err := os.ReadFile(sshKey)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read key file: %v", err)
|
||||
}
|
||||
signer, err := ssh.ParsePrivateKey(key)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse private key: %v", err)
|
||||
}
|
||||
sshConfig := &ssh.ClientConfig{
|
||||
User: sshUser,
|
||||
Auth: []ssh.AuthMethod{
|
||||
ssh.PublicKeys(signer),
|
||||
},
|
||||
HostKeyCallback: ssh.InsecureIgnoreHostKey(),
|
||||
}
|
||||
|
||||
connectionFailed := false
|
||||
for i := 0; i < 20; i++ {
|
||||
client, err = ssh.Dial("tcp", host+":"+port, sshConfig)
|
||||
if err == nil {
|
||||
return client, nil // Connection succeeded, return the client.
|
||||
}
|
||||
connectionFailed = true
|
||||
// Sleep for a brief moment before retrying.
|
||||
// You can adjust the duration based on your requirements.
|
||||
time.Sleep(1 * time.Second)
|
||||
}
|
||||
|
||||
if connectionFailed {
|
||||
return nil, fmt.Errorf("failed to connect to %s after 10 retries, giving up", host)
|
||||
}
|
||||
|
||||
return client, nil
|
||||
}
|
||||
21
tests/go.mod
Normal file
21
tests/go.mod
Normal file
@@ -0,0 +1,21 @@
|
||||
module github.com/NVIDIA/nvidia-container-toolkit/tests
|
||||
|
||||
go 1.23.2
|
||||
|
||||
require (
|
||||
github.com/onsi/ginkgo/v2 v2.22.2
|
||||
github.com/onsi/gomega v1.36.2
|
||||
golang.org/x/crypto v0.35.0
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/go-logr/logr v1.4.2 // indirect
|
||||
github.com/go-task/slim-sprig/v3 v3.0.0 // indirect
|
||||
github.com/google/go-cmp v0.6.0 // indirect
|
||||
github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad // indirect
|
||||
golang.org/x/net v0.33.0 // indirect
|
||||
golang.org/x/sys v0.30.0 // indirect
|
||||
golang.org/x/text v0.22.0 // indirect
|
||||
golang.org/x/tools v0.28.0 // indirect
|
||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||
)
|
||||
36
tests/go.sum
Normal file
36
tests/go.sum
Normal file
@@ -0,0 +1,36 @@
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY=
|
||||
github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
|
||||
github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI=
|
||||
github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8=
|
||||
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
|
||||
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
||||
github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad h1:a6HEuzUHeKH6hwfN/ZoQgRgVIWFJljSWa/zetS2WTvg=
|
||||
github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144=
|
||||
github.com/onsi/ginkgo/v2 v2.22.2 h1:/3X8Panh8/WwhU/3Ssa6rCKqPLuAkVY2I0RoyDLySlU=
|
||||
github.com/onsi/ginkgo/v2 v2.22.2/go.mod h1:oeMosUL+8LtarXBHu/c0bx2D/K9zyQ6uX3cTyztHwsk=
|
||||
github.com/onsi/gomega v1.36.2 h1:koNYke6TVk6ZmnyHrCXba/T/MoLBXFjeC1PtvYgw0A8=
|
||||
github.com/onsi/gomega v1.36.2/go.mod h1:DdwyADRjrc825LhMEkD76cHR5+pUnjhUN8GlHlRPHzY=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
|
||||
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
|
||||
golang.org/x/crypto v0.35.0 h1:b15kiHdrGCHrP6LvwaQ3c03kgNhhiMgvlhxHQhmg2Xs=
|
||||
golang.org/x/crypto v0.35.0/go.mod h1:dy7dXNW32cAb/6/PRuTNsix8T+vJAqvuIy5Bli/x0YQ=
|
||||
golang.org/x/net v0.33.0 h1:74SYHlV8BIgHIFC/LrYkOGIwL19eTYXQ5wc6TBuO36I=
|
||||
golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
|
||||
golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc=
|
||||
golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/term v0.29.0 h1:L6pJp37ocefwRRtYPKSWOWzOtWSxVajvz2ldH/xi3iU=
|
||||
golang.org/x/term v0.29.0/go.mod h1:6bl4lRlvVuDgSf3179VpIxBF0o10JUpXWOnI7nErv7s=
|
||||
golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM=
|
||||
golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY=
|
||||
golang.org/x/tools v0.28.0 h1:WuB6qZ4RPCQo5aP3WdKZS7i595EdWqWR8vqJTlwTVK8=
|
||||
golang.org/x/tools v0.28.0/go.mod h1:dcIOrVd3mfQKTgrDVQHqCPMWy6lnhfhtX3hLXYVLfRw=
|
||||
google.golang.org/protobuf v1.36.1 h1:yBPeRvTftaleIgM3PZ/WBIZ7XM/eEYAaEyCwvyjq/gk=
|
||||
google.golang.org/protobuf v1.36.1/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user