Compare commits


27 Commits

Author SHA1 Message Date
Evan Lezar
8551c71a8a Merge branch 'master' into experimental 2022-01-31 14:07:36 +01:00
Evan Lezar
7ee81e91de Merge branch 'fix-dev-null-typo' into 'experimental'
Correct typo in check for null device

See merge request nvidia/container-toolkit/container-toolkit!101
2022-01-25 11:03:00 +00:00
Evan Lezar
5a6860adbf Correct typo in check for null device
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2022-01-25 11:18:28 +01:00
Evan Lezar
ce2b0dfdf0 Merge branch 'fix-log-level' into 'experimental'
Correctly set log-level from config.toml

See merge request nvidia/container-toolkit/container-toolkit!94
2022-01-14 12:08:45 +00:00
Evan Lezar
6f0129bdf2 Remove amazonlinux1 package builds from CI
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2022-01-14 12:03:43 +01:00
Evan Lezar
ffd98424d8 Correctly set log-level from config.toml
This change contains a bugfix where the log-level (if specified in the config.toml file)
would be assigned to config.Root instead of config.LogLevel.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
2022-01-14 10:29:12 +01:00
Evan Lezar
cf192169a8 Add basic README for experimental runtime
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2021-12-08 21:43:06 +01:00
Evan Lezar
eeabd2b94d Merge branch 'master' into experimental 2021-12-08 20:44:10 +01:00
Evan Lezar
d81329da1f Merge branch 'master' into experimental 2021-11-16 13:52:58 +01:00
Evan Lezar
3b84f527cc Merge branch 'master' into experimental
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2021-11-12 11:17:02 +01:00
Evan Lezar
8da6e42d88 Merge branch 'master' into experimental 2021-10-27 15:00:41 +02:00
Evan Lezar
1a755d471a Use functions in oci package to parse arguments
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2021-10-18 14:36:10 +02:00
Evan Lezar
c05f9dffc5 Update imports
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2021-10-18 14:36:10 +02:00
Evan Lezar
7c5b913e09 Move pkg/* to internal/*
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2021-10-18 14:36:10 +02:00
Evan Lezar
c0d2d41f23 Remove unused internal/oci package
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2021-10-18 14:36:09 +02:00
Evan Lezar
08cf87eb21 Use oci package from experimental branch for nvidia-container-runtime
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2021-10-18 14:36:09 +02:00
Evan Lezar
d3a9f512c4 Merge branch 'merge-into-experimental' into 'experimental'
Merge master into experimental

See merge request nvidia/container-toolkit/container-toolkit!56
2021-10-18 12:33:55 +00:00
Evan Lezar
73baa74ea8 Merge master into experimental 2021-10-18 12:33:55 +00:00
Evan Lezar
39aa24690c Merge branch 'master' into experimental 2021-09-28 16:32:40 +02:00
Evan Lezar
dc6b7c703b Merge branch 'master' into experimental 2021-09-09 15:42:20 +02:00
Evan Lezar
b044b56c6e Merge branch 'migrate-experimental-runtime' into 'experimental'
Merge code from experimental runtime

See merge request nvidia/container-toolkit/container-toolkit!37
2021-07-05 11:49:44 +00:00
Evan Lezar
46cdf8c41a Address ineffectual assignment error
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2021-07-01 16:13:22 +02:00
Evan Lezar
22ee5eb64f Run CI checks in development image
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2021-07-01 15:59:20 +02:00
Evan Lezar
a7f3398a68 Update makefile with Development docker image
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2021-06-30 14:08:06 +02:00
Evan Lezar
72fe259745 Run go mod vendor and tidy
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2021-06-30 14:07:36 +02:00
Evan Lezar
3b27d91026 Update package references
This change replaces the reference to gitlab.com/nvidia/cloud-native/container-toolkit
with github.com/NVIDIA/nvidia-container-toolkit matching this project.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
2021-06-30 14:07:36 +02:00
Evan Lezar
d3997eceb2 Copy files from nvidia-container-toolkit
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2021-06-30 14:07:34 +02:00
750 changed files with 36772 additions and 91007 deletions

View File

@@ -12,15 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
default:
image: docker
image: docker:stable
services:
- name: docker:dind
- name: docker:stable-dind
command: ["--experimental"]
variables:
GIT_SUBMODULE_STRATEGY: recursive
BUILDIMAGE: "${CI_REGISTRY_IMAGE}/build:${CI_COMMIT_SHORT_SHA}"
BUILD_MULTI_ARCH_IMAGES: "true"
stages:
- image
@@ -33,81 +32,45 @@ stages:
- test
- scan
- release
.main-or-manual:
rules:
- if: $CI_COMMIT_BRANCH == "main"
- if: $CI_COMMIT_TAG && $CI_COMMIT_TAG != ""
- if: $CI_PIPELINE_SOURCE == "schedule"
when: manual
- build-all
# Define the distribution targets
.dist-amazonlinux2:
rules:
- !reference [.main-or-manual, rules]
variables:
DIST: amazonlinux2
.dist-centos7:
rules:
- !reference [.main-or-manual, rules]
variables:
DIST: centos7
CVE_UPDATES: "cyrus-sasl-lib"
CVE_UPDATES: "nss"
.dist-centos8:
variables:
DIST: centos8
CVE_UPDATES: "cyrus-sasl-lib"
.dist-debian10:
rules:
- !reference [.main-or-manual, rules]
variables:
DIST: debian10
.dist-debian9:
rules:
- !reference [.main-or-manual, rules]
variables:
DIST: debian9
.dist-fedora35:
rules:
- !reference [.main-or-manual, rules]
variables:
DIST: fedora35
.dist-opensuse-leap15.1:
rules:
- !reference [.main-or-manual, rules]
variables:
DIST: opensuse-leap15.1
.dist-ubi8:
rules:
- !reference [.main-or-manual, rules]
variables:
DIST: ubi8
CVE_UPDATES: "cyrus-sasl-lib"
.dist-ubuntu16.04:
rules:
- !reference [.main-or-manual, rules]
variables:
DIST: ubuntu16.04
.dist-ubuntu18.04:
variables:
DIST: ubuntu18.04
CVE_UPDATES: "libsasl2-2 libsasl2-modules-db"
.dist-ubuntu20.04:
rules:
- !reference [.main-or-manual, rules]
variables:
DIST: ubuntu20.04
CVE_UPDATES: "libsasl2-2 libsasl2-modules-db"
.dist-packaging:
variables:
@@ -127,8 +90,6 @@ stages:
ARCH: arm64
.arch-ppc64le:
rules:
- !reference [.main-or-manual, rules]
variables:
ARCH: ppc64le
@@ -136,15 +97,6 @@ stages:
variables:
ARCH: x86_64
# Define the platform targets
.platform-amd64:
variables:
PLATFORM: linux/amd64
.platform-arm64:
variables:
PLATFORM: linux/arm64
# Define test helpers
.integration:
stage: test
@@ -166,30 +118,20 @@ test-packaging:
needs:
- image-packaging
# Download the regctl binary for use in the release steps
.regctl-setup:
before_script:
- export REGCTL_VERSION=v0.3.10
- apk add --no-cache curl
- mkdir -p bin
- curl -sSLo bin/regctl https://github.com/regclient/regclient/releases/download/${REGCTL_VERSION}/regctl-linux-amd64
- chmod a+x bin/regctl
- export PATH=$(pwd)/bin:${PATH}
# .release forms the base of the deployment jobs which push images to the CI registry.
# This is extended with the version to be deployed (e.g. the SHA or TAG) and the
# target os.
.release:
stage: release
stage:
release
variables:
# Define the source image for the release
IMAGE_NAME: "${CI_REGISTRY_IMAGE}/container-toolkit"
VERSION: "${CI_COMMIT_SHORT_SHA}"
# OUT_IMAGE_VERSION is overridden for external releases
OUT_IMAGE_VERSION: "${CI_COMMIT_SHORT_SHA}"
stage: release
before_script:
- !reference [.regctl-setup, before_script]
# We ensure that the OUT_IMAGE_VERSION is set
- 'echo Version: ${OUT_IMAGE_VERSION} ; [[ -n "${OUT_IMAGE_VERSION}" ]] || exit 1'
@@ -197,16 +139,16 @@ test-packaging:
# need to tag the image.
# Note: a leading 'v' is stripped from the version if present
- apk add --no-cache make bash
script:
# Log in to the "output" registry, tag the image and push the image
- 'echo "Logging in to CI registry ${CI_REGISTRY}"'
- regctl registry login "${CI_REGISTRY}" -u "${CI_REGISTRY_USER}" -p "${CI_REGISTRY_PASSWORD}"
- '[ ${CI_REGISTRY} = ${OUT_REGISTRY} ] || echo "Logging in to output registry ${OUT_REGISTRY}"'
- '[ ${CI_REGISTRY} = ${OUT_REGISTRY} ] || regctl registry login "${OUT_REGISTRY}" -u "${OUT_REGISTRY_USER}" -p "${OUT_REGISTRY_TOKEN}"'
# Since OUT_IMAGE_NAME and OUT_IMAGE_VERSION are set, this will push the CI image to the
# Target
- make -f build/container/Makefile push-${DIST}
- docker login -u "${CI_REGISTRY_USER}" -p "${CI_REGISTRY_PASSWORD}" "${CI_REGISTRY}"
- docker pull "${IMAGE_NAME}:${VERSION}-${DIST}"
script:
- docker tag "${IMAGE_NAME}:${VERSION}-${DIST}" "${OUT_IMAGE_NAME}:${OUT_IMAGE_VERSION}-${DIST}"
# Log in to the "output" registry, tag the image and push the image
- 'echo "Logging in to output registry ${OUT_REGISTRY}"'
- docker logout
- docker login -u "${OUT_REGISTRY_USER}" -p "${OUT_REGISTRY_TOKEN}" "${OUT_REGISTRY}"
- make IMAGE_NAME=${OUT_IMAGE_NAME} VERSION=${OUT_IMAGE_VERSION} -f build/container/Makefile push-${DIST}
# Define a staging release step that pushes an image to an internal "staging" repository
# This is triggered for all pipelines (i.e. not only tags) to test the pipeline steps
@@ -221,7 +163,7 @@ test-packaging:
OUT_IMAGE_NAME: "${CI_REGISTRY_IMAGE}/staging/container-toolkit"
# Define an external release step that pushes an image to an external repository.
# This includes a development image off main.
# This includes a development image off master.
.release:external:
extends:
- .release
@@ -241,6 +183,13 @@ release:staging-centos7:
needs:
- image-centos7
release:staging-centos8:
extends:
- .release:staging
- .dist-centos8
needs:
- image-centos8
release:staging-ubi8:
extends:
- .release:staging
@@ -258,13 +207,6 @@ release:staging-ubuntu18.04:
- test-crio-ubuntu18.04
- test-docker-ubuntu18.04
release:staging-ubuntu20.04:
extends:
- .release:staging
- .dist-ubuntu20.04
needs:
- image-ubuntu20.04
release:staging-packaging:
extends:
- .release:staging

.gitignore vendored
View File

@@ -1,11 +1,8 @@
dist
*.swp
*.swo
/coverage.out*
/coverage.out
/test/output/
/nvidia-container-runtime
/nvidia-container-runtime-hook
/nvidia-container-toolkit
/nvidia-ctk
/shared-*
/release-*

View File

@@ -94,9 +94,8 @@ unit-tests:
- .multi-arch-build
- .package-artifacts
stage: package-build
timeout: 3h
script:
- ./scripts/build-packages.sh ${DIST}-${ARCH}
- ./scripts/release.sh ${DIST}-${ARCH}
artifacts:
name: ${ARTIFACTS_NAME}
@@ -158,18 +157,6 @@ package-debian9-amd64:
- .dist-debian9
- .arch-amd64
package-fedora35-aarch64:
extends:
- .package-build
- .dist-fedora35
- .arch-aarch64
package-fedora35-x86_64:
extends:
- .package-build
- .dist-fedora35
- .arch-x86_64
package-opensuse-leap15.1-x86_64:
extends:
- .package-build
@@ -206,33 +193,19 @@ package-ubuntu18.04-ppc64le:
- .dist-ubuntu18.04
- .arch-ppc64le
.buildx-setup:
before_script:
- export BUILDX_VERSION=v0.6.3
- apk add --no-cache curl
- mkdir -p ~/.docker/cli-plugins
- curl -sSLo ~/.docker/cli-plugins/docker-buildx "https://github.com/docker/buildx/releases/download/${BUILDX_VERSION}/buildx-${BUILDX_VERSION}.linux-amd64"
- chmod a+x ~/.docker/cli-plugins/docker-buildx
- docker buildx create --use --platform=linux/amd64,linux/arm64
- '[[ -n "${SKIP_QEMU_SETUP}" ]] || docker run --rm --privileged multiarch/qemu-user-static --reset -p yes'
# Define the image build targets
.image-build:
stage: image-build
variables:
IMAGE_NAME: "${CI_REGISTRY_IMAGE}/container-toolkit"
VERSION: "${CI_COMMIT_SHORT_SHA}"
PUSH_ON_BUILD: "true"
before_script:
- !reference [.buildx-setup, before_script]
- apk add --no-cache bash make
- 'echo "Logging in to CI registry ${CI_REGISTRY}"'
- docker login -u "${CI_REGISTRY_USER}" -p "${CI_REGISTRY_PASSWORD}" "${CI_REGISTRY}"
script:
- make -f build/container/Makefile build-${DIST}
- make -f build/container/Makefile push-${DIST}
image-centos7:
extends:
@@ -243,33 +216,31 @@ image-centos7:
- package-centos7-ppc64le
- package-centos7-x86_64
image-centos8:
extends:
- .image-build
- .package-artifacts
- .dist-centos8
needs:
- package-centos8-aarch64
- package-centos8-x86_64
- package-centos8-ppc64le
image-ubi8:
extends:
- .image-build
- .package-artifacts
- .dist-ubi8
needs:
# Note: The ubi8 image uses the centos8 packages
- package-centos8-aarch64
- package-centos8-x86_64
- package-centos8-ppc64le
# Note: The ubi8 image currently uses the centos7 packages
- package-centos7-ppc64le
- package-centos7-x86_64
image-ubuntu18.04:
extends:
- .image-build
- .package-artifacts
- .dist-ubuntu18.04
needs:
- package-ubuntu18.04-amd64
- package-ubuntu18.04-arm64
- job: package-ubuntu18.04-ppc64le
optional: true
image-ubuntu20.04:
extends:
- .image-build
- .package-artifacts
- .dist-ubuntu20.04
needs:
- package-ubuntu18.04-amd64
- package-ubuntu18.04-arm64
@@ -282,36 +253,21 @@ image-packaging:
- .package-artifacts
- .dist-packaging
needs:
- job: package-centos8-aarch64
- job: package-centos8-x86_64
- job: package-ubuntu18.04-amd64
- job: package-ubuntu18.04-arm64
- job: package-amazonlinux2-aarch64
optional: true
- job: package-amazonlinux2-x86_64
optional: true
- job: package-centos7-ppc64le
optional: true
- job: package-centos7-x86_64
optional: true
- job: package-centos8-ppc64le
optional: true
- job: package-debian10-amd64
optional: true
- job: package-debian9-amd64
optional: true
- job: package-fedora35-aarch64
optional: true
- job: package-fedora35-x86_64
optional: true
- job: package-opensuse-leap15.1-x86_64
optional: true
- job: package-ubuntu16.04-amd64
optional: true
- job: package-ubuntu16.04-ppc64le
optional: true
- job: package-ubuntu18.04-ppc64le
optional: true
- package-amazonlinux2-aarch64
- package-amazonlinux2-x86_64
- package-centos7-ppc64le
- package-centos7-x86_64
- package-centos8-aarch64
- package-centos8-ppc64le
- package-centos8-x86_64
- package-debian10-amd64
- package-debian9-amd64
- package-opensuse-leap15.1-x86_64
- package-ubuntu16.04-amd64
- package-ubuntu16.04-ppc64le
- package-ubuntu18.04-amd64
- package-ubuntu18.04-arm64
- package-ubuntu18.04-ppc64le
# Define publish test helpers
.test:toolkit:
@@ -371,3 +327,46 @@ test-docker-ubuntu18.04:
needs:
- image-ubuntu18.04
# build-all jobs build packages for every OS / ARCH combination we support.
#
# They are run under two conditions:
# 1) Automatically whenever a new tag is pushed to the repo (e.g. v1.1.0)
# 2) Manually by a reviewer just before merging a MR.
.build-all-for-arch:
variables:
# Setting DIST=docker invokes the docker- release targets
DIST: docker
extends:
- .package-build
stage: build-all
timeout: 2h 30m
rules:
- if: $CI_COMMIT_TAG
when: always
# The full set of build-all jobs organized to
# have builds for each ARCH run in parallel.
build-all-amd64:
extends:
- .build-all-for-arch
- .arch-amd64
build-all-x86_64:
extends:
- .build-all-for-arch
- .arch-x86_64
build-all-ppc64le:
extends:
- .build-all-for-arch
- .arch-ppc64le
build-all-arm64:
extends:
- .build-all-for-arch
- .arch-arm64
build-all-aarch64:
extends:
- .build-all-for-arch
- .arch-aarch64

.gitmodules vendored
View File

@@ -1,12 +1,9 @@
[submodule "third_party/libnvidia-container"]
path = third_party/libnvidia-container
url = https://gitlab.com/nvidia/container-toolkit/libnvidia-container.git
branch = main
[submodule "third_party/nvidia-container-runtime"]
path = third_party/nvidia-container-runtime
url = https://gitlab.com/nvidia/container-toolkit/container-runtime.git
branch = main
[submodule "third_party/nvidia-docker"]
path = third_party/nvidia-docker
url = https://gitlab.com/nvidia/container-toolkit/nvidia-docker.git
branch = main

View File

@@ -27,8 +27,8 @@ default:
variables:
DOCKER_DRIVER: overlay2
DOCKER_TLS_CERTDIR: "/certs"
# Release "devel"-tagged images off the main branch
RELEASE_DEVEL_BRANCH: "main"
# Release "devel"-tagged images off the master branch
RELEASE_DEVEL_BRANCH: "master"
DEVEL_RELEASE_IMAGE_VERSION: "devel"
# On the multi-arch builder we don't need the qemu setup.
SKIP_QEMU_SETUP: "1"
@@ -46,11 +46,9 @@ variables:
OUT_REGISTRY_TOKEN: "${CI_REGISTRY_PASSWORD}"
OUT_REGISTRY: "${CI_REGISTRY}"
OUT_IMAGE_NAME: "${CI_REGISTRY_IMAGE}/container-toolkit"
PUSH_MULTIPLE_TAGS: "false"
# We delay the job start to allow the public pipeline to generate the required images.
rules:
- when: delayed
start_in: 30 minutes
when: delayed
start_in: 30 minutes
timeout: 30 minutes
retry:
max: 2
@@ -58,39 +56,39 @@ variables:
- job_execution_timeout
- stuck_or_timeout_failure
before_script:
- !reference [.regctl-setup, before_script]
- apk add --no-cache make bash
- >
regctl manifest get ${IN_REGISTRY}/${IN_IMAGE_NAME}:${IN_VERSION}-${DIST} --list > /dev/null && echo "${IN_REGISTRY}/${IN_IMAGE_NAME}:${IN_VERSION}-${DIST}" || ( echo "${IN_REGISTRY}/${IN_IMAGE_NAME}:${IN_VERSION}-${DIST} does not exist" && sleep infinity )
docker pull ${IN_REGISTRY}/${IN_IMAGE_NAME}:${IN_VERSION}-${DIST} > /dev/null && echo "${IN_REGISTRY}/${IN_IMAGE_NAME}:${IN_VERSION}-${DIST}" || ( echo "${IN_REGISTRY}/${IN_IMAGE_NAME}:${IN_VERSION}-${DIST} does not exist" && sleep infinity )
script:
- regctl registry login "${OUT_REGISTRY}" -u "${OUT_REGISTRY_USER}" -p "${OUT_REGISTRY_TOKEN}"
- make -f build/container/Makefile IMAGE=${IN_REGISTRY}/${IN_IMAGE_NAME}:${IN_VERSION}-${DIST} OUT_IMAGE=${OUT_IMAGE_NAME}:${CI_COMMIT_SHORT_SHA}-${DIST} push-${DIST}
- docker pull ${IN_REGISTRY}/${IN_IMAGE_NAME}:${IN_VERSION}-${DIST}
- docker tag ${IN_REGISTRY}/${IN_IMAGE_NAME}:${IN_VERSION}-${DIST} ${OUT_IMAGE_NAME}:${CI_COMMIT_SHORT_SHA}-${DIST}
- docker login -u "${OUT_REGISTRY_USER}" -p "${OUT_REGISTRY_TOKEN}" "${OUT_REGISTRY}"
- docker push ${OUT_IMAGE_NAME}:${CI_COMMIT_SHORT_SHA}-${DIST}
image-centos7:
extends:
- .dist-centos7
- .image-pull
- .dist-centos7
image-centos8:
extends:
- .image-pull
- .dist-centos8
image-ubi8:
extends:
- .dist-ubi8
- .image-pull
- .dist-ubi8
image-ubuntu18.04:
extends:
- .image-pull
- .dist-ubuntu18.04
- .image-pull
image-ubuntu20.04:
extends:
- .dist-ubuntu20.04
- .image-pull
# The DIST=packaging target creates an image containing all built packages
image-packaging:
extends:
- .dist-packaging
- .image-pull
- .dist-packaging
# We skip the integration tests for the internal CI:
.integration:
@@ -108,13 +106,13 @@ image-packaging:
variables:
IMAGE: "${CI_REGISTRY_IMAGE}/container-toolkit:${CI_COMMIT_SHORT_SHA}-${DIST}"
IMAGE_ARCHIVE: "container-toolkit.tar"
rules:
- if: $SKIP_SCANS != "yes"
- when: manual
except:
variables:
- $SKIP_SCANS && $SKIP_SCANS == "yes"
before_script:
- docker login -u "${CI_REGISTRY_USER}" -p "${CI_REGISTRY_PASSWORD}" "${CI_REGISTRY}"
# TODO: We should specify the architecture here and scan all architectures
- docker pull --platform="${PLATFORM}" "${IMAGE}"
- docker pull "${IMAGE}"
- docker save "${IMAGE}" -o "${IMAGE_ARCHIVE}"
- AuthHeader=$(echo -n $SSA_CLIENT_ID:$SSA_CLIENT_SECRET | base64 -w0)
- >
@@ -133,65 +131,34 @@ image-packaging:
- policy_evaluation.json
# Define the scan targets
scan-centos7-amd64:
scan-centos7:
extends:
- .dist-centos7
- .platform-amd64
- .scan
- .dist-centos7
needs:
- image-centos7
scan-centos7-arm64:
scan-centos8:
extends:
- .dist-centos7
- .platform-arm64
- .scan
- .dist-centos8
needs:
- image-centos7
- scan-centos7-amd64
- image-centos8
scan-ubuntu18.04-amd64:
scan-ubuntu18.04:
extends:
- .scan
- .dist-ubuntu18.04
- .platform-amd64
- .scan
needs:
- image-ubuntu18.04
scan-ubuntu20.04-amd64:
scan-ubi8:
extends:
- .dist-ubuntu20.04
- .platform-amd64
- .scan
needs:
- image-ubuntu20.04
scan-ubuntu20.04-arm64:
extends:
- .dist-ubuntu20.04
- .platform-arm64
- .scan
needs:
- image-ubuntu20.04
- scan-ubuntu20.04-amd64
scan-ubi8-amd64:
extends:
- .dist-ubi8
- .platform-amd64
- .scan
needs:
- image-ubi8
scan-ubi8-arm64:
extends:
- .dist-ubi8
- .platform-arm64
- .scan
needs:
- image-ubi8
- scan-ubi8-amd64
# Define external release helpers
.release:ngc:
extends:
@@ -202,6 +169,15 @@ scan-ubi8-arm64:
OUT_REGISTRY: "${NGC_REGISTRY}"
OUT_IMAGE_NAME: "${NGC_REGISTRY_IMAGE}"
.release:dockerhub:
extends:
- .release:external
variables:
OUT_REGISTRY_USER: "${REGISTRY_USER}"
OUT_REGISTRY_TOKEN: "${REGISTRY_TOKEN}"
OUT_REGISTRY: "${DOCKERHUB_REGISTRY}"
OUT_IMAGE_NAME: "${REGISTRY_IMAGE}"
release:staging-ubuntu18.04:
extends:
- .release:staging
@@ -213,20 +189,41 @@ release:staging-ubuntu18.04:
# Release to NGC
release:ngc-centos7:
extends:
- .release:ngc
- .dist-centos7
- .release:ngc
release:ngc-ubuntu18.04:
release:ngc-centos8:
extends:
- .release:ngc
- .dist-centos8
release:ngc-ubuntu18:
extends:
- .release:ngc
- .dist-ubuntu18.04
- .release:ngc
release:ngc-ubuntu20.04:
extends:
- .dist-ubuntu20.04
- .release:ngc
release:ngc-ubi8:
extends:
- .dist-ubi8
- .release:ngc
- .dist-ubi8
# Release to Dockerhub
release:dockerhub-centos7:
extends:
- .release:dockerhub
- .dist-centos7
release:dockerhub-centos8:
extends:
- .release:dockerhub
- .dist-centos8
release:dockerhub-ubuntu18:
extends:
- .release:dockerhub
- .dist-ubuntu18.04
release:dockerhub-ubi8:
extends:
- .release:dockerhub
- .dist-ubi8

View File

@@ -1,202 +0,0 @@
# NVIDIA Container Toolkit Changelog
## v1.12.0-rc.1
* Add support for multiple Docker Swarm resources
* Improve injection of Vulkan configurations and libraries
* Add `nvidia-ctk info generate-cdi` command to generate a CDI specification for available devices
* [libnvidia-container] Include NVVM compiler library in compute libs
## v1.11.0
* Promote v1.11.0-rc.3 to v1.11.0
## v1.11.0-rc.3
* Build fedora35 packages
* Introduce an `nvidia-container-toolkit-base` package for better dependency management
* Fix removal of `nvidia-container-runtime-hook` on RPM-based systems
* Inject platform files into container on Tegra-based systems
* [toolkit container] Update CUDA base images to 11.7.1
* [libnvidia-container] Preload libgcc_s.so.1 on arm64 systems
## v1.11.0-rc.2
* Allow `accept-nvidia-visible-devices-*` config options to be set by toolkit container
* [libnvidia-container] Fix bug where LDCache was not updated when the `--no-pivot-root` option was specified
## v1.11.0-rc.1
* Add discovery of GPUDirect Storage (`nvidia-fs*`) devices if the `NVIDIA_GDS` environment variable of the container is set to `enabled`
* Add discovery of MOFED Infiniband devices if the `NVIDIA_MOFED` environment variable of the container is set to `enabled`
* Fix bug in CSV mode where libraries listed as `sym` entries in mount specification are not added to the LDCache.
* Rename `nvidia-container-toolkit` executable to `nvidia-container-runtime-hook` and create `nvidia-container-toolkit` as a symlink to `nvidia-container-runtime-hook` instead.
* Add `nvidia-ctk runtime configure` command to configure the Docker config file (e.g. `/etc/docker/daemon.json`) for use with the NVIDIA Container Runtime.
## v1.10.0
* Promote v1.10.0-rc.3 to v1.10.0
## v1.10.0-rc.3
* Use default config instead of raising an error if config file cannot be found
* Ignore NVIDIA_REQUIRE_JETPACK* environment variables for requirement checks
* Fix bug in detection of Tegra systems where `/sys/devices/soc0/family` is ignored
* Fix bug where links to devices were detected as devices
* [libnvidia-container] Fix bug introduced when adding libcudadebugger.so to list of libraries
## v1.10.0-rc.2
* Add support for NVIDIA_REQUIRE_* checks for cuda version and arch to csv mode
* Switch to debug logging to reduce log verbosity
* Support logging to logs requested in command line
* Fix bug when launching containers with relative root path (e.g. using containerd)
* Allow low-level runtime path to be set explicitly as nvidia-container-runtime.runtimes option
* Fix failure to locate low-level runtime if PATH envvar is unset
* Replace experimental option for NVIDIA Container Runtime with nvidia-container-runtime.mode = csv option
* Use csv as default mode on Tegra systems without NVML
* Add --version flag to all CLIs
* [libnvidia-container] Bump libtirpc to 1.3.2
* [libnvidia-container] Fix bug when running host ldconfig using glibc compiled with a non-standard prefix
* [libnvidia-container] Add libcudadebugger.so to list of compute libraries
## v1.10.0-rc.1
* Include nvidia-ctk CLI in installed binaries
* Add experimental option to NVIDIA Container Runtime
## v1.9.0
* [libnvidia-container] Add additional check for Tegra in /sys/.../family file in CLI
* [libnvidia-container] Update jetpack-specific CLI option to only load Base CSV files by default
* [libnvidia-container] Fix bug (from 1.8.0) when mounting GSP firmware into containers without /lib to /usr/lib symlinks
* [libnvidia-container] Update nvml.h to CUDA 11.6.1 nvML_DEV 11.6.55
* [libnvidia-container] Update switch statement to include new brands from latest nvml.h
* [libnvidia-container] Process all --require flags on Jetson platforms
* [libnvidia-container] Fix long-standing issue with running ldconfig on Debian systems
## v1.8.1
* [libnvidia-container] Fix bug in determining cgroup root when running in nested containers
* [libnvidia-container] Fix permission issue when determining cgroup version
## v1.8.0
* Promote 1.8.0-rc.2-1 to 1.8.0
## v1.8.0-rc.2
* Remove support for building amazonlinux1 packages
## v1.8.0-rc.1
* [libnvidia-container] Add support for cgroupv2
* Release toolkit-container images from nvidia-container-toolkit repository
## v1.7.0
* Promote 1.7.0-rc.1-1 to 1.7.0
* Bump Golang version to 1.16.4
## v1.7.0-rc.1
* Specify containerd runtime type as string in config tools to remove dependency on containerd package
* Add supported-driver-capabilities config option to allow for a subset of all driver capabilities to be specified
## v1.6.0
* Promote 1.6.0-rc.3-1 to 1.6.0
* Fix unnecessary logging to stderr instead of configured nvidia-container-runtime log file
## v1.6.0-rc.3
* Add supported-driver-capabilities config option to the nvidia-container-toolkit
* Move OCI and command line checks for runtime to internal oci package
## v1.6.0-rc.2
* Use relative path to OCI specification file (config.json) if bundle path is not specified as an argument to the nvidia-container-runtime
## v1.6.0-rc.1
* Add AARCH64 package for Amazon Linux 2
* Include nvidia-container-runtime into nvidia-container-toolkit package
## v1.5.1
* Fix bug where Docker Swarm device selection is ignored if NVIDIA_VISIBLE_DEVICES is also set
* Improve unit testing by using require package and adding coverage reports
* Remove unneeded go dependencies by running go mod tidy
* Move contents of pkg directory to cmd for CLI tools
* Ensure make binary target explicitly sets GOOS
## v1.5.0
* Add dependence on libnvidia-container-tools >= 1.4.0
* Add golang check targets to Makefile
* Add Jenkinsfile definition for build targets
* Move docker.mk to docker folder
## v1.4.2
* Add dependence on libnvidia-container-tools >= 1.3.3
## v1.4.1
* Ignore NVIDIA_VISIBLE_DEVICES for containers with insufficient privileges
* Add dependence on libnvidia-container-tools >= 1.3.2
## v1.4.0
* Add 'compute' capability to list of defaults
* Add dependence on libnvidia-container-tools >= 1.3.1
## v1.3.0
* Promote 1.3.0-rc.2-1 to 1.3.0
* Add dependence on libnvidia-container-tools >= 1.3.0
## v1.3.0-rc.2
* 2c180947 Add more tests for new semantics with device list from volume mounts
* 7c003857 Refactor accepting device lists from volume mounts as a boolean
## v1.3.0-rc.1
* b50d86c1 Update build system to accept a TAG variable for things like rc.x
* fe65573b Add common CI tests for things like golint, gofmt, unit tests, etc.
* da6fbb34 Revert "Add ability to merge envars of the form NVIDIA_VISIBLE_DEVICES_*"
* a7fb3330 Flip build-all targets to run automatically on merge requests
* 8b248b66 Rename github.com/NVIDIA/container-toolkit to nvidia-container-toolkit
* da36874e Add new config options to pull device list from mounted files instead of ENVVAR
## v1.2.1
* 4e6e0ed4 Add 'ngx' to list of *all* driver capabilities
* 2f4af743 List config.toml as a config file in the RPM SPEC
## v1.2.0
* 8e0aab46 Fix repo listed in changelog for debian distributions
* 320bb6e4 Update dependence on libnvidia-container to 1.2.0
* 6cfc8097 Update package license to match source license
* e7dc3cbb Fix debian copyright file
* d3aee3e0 Add the 'ngx' driver capability
## v1.1.2
* c32237f3 Add support for parsing Linux Capabilities for older OCI specs
## v1.1.1
* d202aded Update dependence to libnvidia-container 1.1.1
## v1.1.0
* 4e4de762 Update build system to support multi-arch builds
* fcc1d116 Add support for MIG (Multi-Instance GPUs)
* d4ff0416 Add ability to merge envars of the form NVIDIA_VISIBLE_DEVICES_*
* 60f165ad Add no-pivot option to toolkit
## v1.0.5
* Initial release. Replaces older package nvidia-container-runtime-hook. (Closes: #XXXXXX)

View File

@@ -13,7 +13,7 @@ The `nvidia-container-toolkit` resides in this repo directly.
In order to build the packages, the following command is executed:
```sh
./scripts/build-packages.sh TARGET
./scripts/build-all-components.sh TARGET
```
where `TARGET` is a make target that is valid for each of the sub-components.
@@ -21,8 +21,6 @@ These include:
* `ubuntu18.04-amd64`
* `centos8-x86_64`
If no `TARGET` is specified, all valid release targets are built.
The packages are generated in the `dist` folder.
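For example, a single distribution-architecture target can be built as follows (a sketch using one of the targets listed above, with the script name as shown in the command block):
```sh
./scripts/build-all-components.sh ubuntu18.04-amd64
```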
## Testing local changes
@@ -39,23 +37,9 @@ The [test/release](./test/release/) folder contains documentation on how the ins
## Releasing
In order to release packages required for a release, a utility script
[`scripts/release-packages.sh`](./scripts/release-packages.sh) is provided.
This script can be executed as follows:
```bash
GPG_LOCAL_USER="GPG_USER" \
MASTER_KEY_PATH=/path/to/gpg-master.key \
SUB_KEY_PATH=/path/to/gpg-subkey.key \
./scripts/release-packages.sh REPO PACKAGE_REPO_ROOT [REFERENCE]
A utility script [`scripts/release.sh`](./scripts/release.sh) is provided to build
packages required for release. If run without arguments, all supported distribution-architecture combinations are built. A specific distribution-architecture pair can also be provided:
```sh
./scripts/release.sh ubuntu18.04-amd64
```
Where `REPO` is one of `stable` or `experimental`, `PACKAGE_REPO_ROOT` is the local path to the `libnvidia-container` repository checked out to the `gh-pages` branch, and `REFERENCE` is the git SHA that is to be released. If no reference is specified, `HEAD` is assumed.
This script performs the following basic functions:
* Pulls the package image defined by the `REFERENCE` git SHA from the staging registry,
* Copies the required packages to the package repository at `PACKAGE_REPO_ROOT/REPO`,
* Signs the packages using the specified GPG keys
While the last two steps are performed, commits are added to the package repository. These can then be pushed to the relevant repository.
where the `amd64` builds for `ubuntu18.04` are provided as an example.

View File

@@ -38,21 +38,16 @@ EXAMPLE_TARGETS := $(patsubst %,example-%, $(EXAMPLES))
CMDS := $(patsubst ./cmd/%/,%,$(sort $(dir $(wildcard ./cmd/*/))))
CMD_TARGETS := $(patsubst %,cmd-%, $(CMDS))
$(info CMD_TARGETS=$(CMD_TARGETS))
CHECK_TARGETS := assert-fmt vet lint ineffassign misspell
MAKE_TARGETS := binaries build check fmt lint-internal test examples cmds coverage generate licenses $(CHECK_TARGETS)
MAKE_TARGETS := binaries build check fmt lint-internal test examples cmds coverage generate $(CHECK_TARGETS)
TARGETS := $(MAKE_TARGETS) $(EXAMPLE_TARGETS) $(CMD_TARGETS)
DOCKER_TARGETS := $(patsubst %,docker-%, $(TARGETS))
.PHONY: $(TARGETS) $(DOCKER_TARGETS)
ifeq ($(VERSION),)
CLI_VERSION = $(LIB_VERSION)$(if $(LIB_TAG),-$(LIB_TAG))
else
CLI_VERSION = $(VERSION)
endif
CLI_VERSION_PACKAGE = github.com/NVIDIA/nvidia-container-toolkit/internal/info
GOOS ?= linux
binaries: cmds
@@ -61,7 +56,7 @@ cmd-%: COMMAND_BUILD_OPTIONS = -o $(PREFIX)/$(*)
endif
cmds: $(CMD_TARGETS)
$(CMD_TARGETS): cmd-%:
GOOS=$(GOOS) go build -ldflags "-s -w -X $(CLI_VERSION_PACKAGE).gitCommit=$(GIT_COMMIT) -X $(CLI_VERSION_PACKAGE).version=$(CLI_VERSION)" $(COMMAND_BUILD_OPTIONS) $(MODULE)/cmd/$(*)
GOOS=$(GOOS) go build -ldflags "-s -w" $(COMMAND_BUILD_OPTIONS) $(MODULE)/cmd/$(*)
build:
GOOS=$(GOOS) go build ./...
@@ -95,7 +90,11 @@ ineffassign:
lint:
# We use `go list -f '{{.Dir}}' $(MODULE)/...` to skip the `vendor` folder.
go list -f '{{.Dir}}' $(MODULE)/... | xargs golint -set_exit_status
go list -f '{{.Dir}}' $(MODULE)/... | grep -v /internal/ | xargs golint -set_exit_status
lint-internal:
# We use `go list -f '{{.Dir}}' $(MODULE)/...` to skip the `vendor` folder.
go list -f '{{.Dir}}' $(MODULE)/internal/... | xargs golint -set_exit_status
misspell:
misspell $(MODULE)/...
@@ -103,9 +102,6 @@ misspell:
vet:
go vet $(MODULE)/...
licenses:
go-licenses csv $(MODULE)/...
COVERAGE_FILE := coverage.out
test: build cmds
go test -v -coverprofile=$(COVERAGE_FILE) $(MODULE)/...
@@ -146,15 +142,3 @@ $(DOCKER_TARGETS): docker-%: .build-image
--user $$(id -u):$$(id -g) \
$(BUILDIMAGE) \
make $(*)
# Start an interactive shell using the development image.
PHONY: .shell
.shell:
$(DOCKER) run \
--rm \
-ti \
-e GOCACHE=/tmp/.cache \
-v $(PWD):$(PWD) \
-w $(PWD) \
--user $$(id -u):$$(id -g) \
$(BUILDIMAGE)

View File

@@ -1,6 +1,6 @@
# NVIDIA Container Toolkit
[![GitHub license](https://img.shields.io/github/license/NVIDIA/nvidia-container-toolkit?style=flat-square)](https://raw.githubusercontent.com/NVIDIA/nvidia-container-toolkit/main/LICENSE)
[![GitHub license](https://img.shields.io/github/license/NVIDIA/nvidia-container-toolkit?style=flat-square)](https://raw.githubusercontent.com/NVIDIA/nvidia-container-toolkit/master/LICENSE)
[![Documentation](https://img.shields.io/badge/documentation-wiki-blue.svg?style=flat-square)](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/overview.html)
[![Package repository](https://img.shields.io/badge/packages-repository-b956e8.svg?style=flat-square)](https://nvidia.github.io/libnvidia-container)

View File

@@ -42,16 +42,6 @@ RUN GOPATH=/artifacts go install -ldflags="-s -w -X 'main.Version=${VERSION}'" .
FROM nvidia/cuda:${CUDA_VERSION}-base-${BASE_DIST}
ARG BASE_DIST
# See https://www.centos.org/centos-linux-eol/
# and https://stackoverflow.com/a/70930049 for move to vault.centos.org
# and https://serverfault.com/questions/1093922/failing-to-run-yum-update-in-centos-8 for move to vault.epel.cloud
RUN [[ "${BASE_DIST}" != "centos8" ]] || \
( \
sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-Linux-* && \
sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.epel.cloud|g' /etc/yum.repos.d/CentOS-Linux-* \
)
ENV NVIDIA_DISABLE_REQUIRE="true"
ENV NVIDIA_VISIBLE_DEVICES=all
ENV NVIDIA_DRIVER_CAPABILITIES=utility
@@ -63,13 +53,11 @@ COPY ${ARTIFACTS_ROOT}/${PACKAGE_DIST} /artifacts/packages/${PACKAGE_DIST}
WORKDIR /artifacts/packages
ARG PACKAGE_VERSION
ARG TARGETARCH
ENV PACKAGE_ARCH ${TARGETARCH}
RUN PACKAGE_ARCH=${PACKAGE_ARCH/amd64/x86_64} && PACKAGE_ARCH=${PACKAGE_ARCH/arm64/aarch64} && \
yum localinstall -y \
${PACKAGE_DIST}/${PACKAGE_ARCH}/libnvidia-container1-1.*.rpm \
${PACKAGE_DIST}/${PACKAGE_ARCH}/libnvidia-container-tools-1.*.rpm \
${PACKAGE_DIST}/${PACKAGE_ARCH}/nvidia-container-toolkit*-${PACKAGE_VERSION}*.rpm
ARG PACKAGE_ARCH
RUN yum localinstall -y \
${PACKAGE_DIST}/${PACKAGE_ARCH}/libnvidia-container1-${PACKAGE_VERSION}*.rpm \
${PACKAGE_DIST}/${PACKAGE_ARCH}/libnvidia-container-tools-${PACKAGE_VERSION}*.rpm \
${PACKAGE_DIST}/${PACKAGE_ARCH}/nvidia-container-toolkit-${PACKAGE_VERSION}*.rpm
WORKDIR /work
@@ -85,7 +73,7 @@ LABEL release="N/A"
LABEL summary="Automatically Configure your Container Runtime for GPU support."
LABEL description="See summary"
RUN mkdir /licenses && mv /NGC-DL-CONTAINER-LICENSE /licenses/NGC-DL-CONTAINER-LICENSE
COPY ./LICENSE /licenses/LICENSE
# Install / upgrade packages here that are required to resolve CVEs
ARG CVE_UPDATES

View File

@@ -26,4 +26,4 @@ COPY ${ARTIFACTS_ROOT} /artifacts/packages/
WORKDIR /artifacts/packages
RUN mkdir /licenses && mv /NGC-DL-CONTAINER-LICENSE /licenses/NGC-DL-CONTAINER-LICENSE
COPY ./LICENSE /licenses/LICENSE

View File

@@ -40,15 +40,11 @@ COPY . .
RUN GOPATH=/artifacts go install -ldflags="-s -w -X 'main.Version=${VERSION}'" ./tools/...
FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-base-${BASE_DIST}
# Remove the CUDA repository configurations to avoid issues with rotated GPG keys
RUN rm -f /etc/apt/sources.list.d/cuda.list
FROM nvidia/cuda:${CUDA_VERSION}-base-${BASE_DIST}
ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y --no-install-recommends \
libcap2 \
curl \
&& \
rm -rf /var/lib/apt/lists/*
@@ -63,21 +59,11 @@ COPY ${ARTIFACTS_ROOT}/${PACKAGE_DIST} /artifacts/packages/${PACKAGE_DIST}
WORKDIR /artifacts/packages
ARG PACKAGE_VERSION
ARG TARGETARCH
ENV PACKAGE_ARCH ${TARGETARCH}
ARG LIBNVIDIA_CONTAINER_REPO="https://nvidia.github.io/libnvidia-container"
ARG LIBNVIDIA_CONTAINER0_VERSION
RUN if [ "${PACKAGE_ARCH}" = "arm64" ]; then \
curl -L ${LIBNVIDIA_CONTAINER_REPO}/${PACKAGE_DIST}/${PACKAGE_ARCH}/libnvidia-container0_${LIBNVIDIA_CONTAINER0_VERSION}_${PACKAGE_ARCH}.deb \
--output ${PACKAGE_DIST}/${PACKAGE_ARCH}/libnvidia-container0_${LIBNVIDIA_CONTAINER0_VERSION}_${PACKAGE_ARCH}.deb && \
dpkg -i ${PACKAGE_DIST}/${PACKAGE_ARCH}/libnvidia-container0_${LIBNVIDIA_CONTAINER0_VERSION}_${PACKAGE_ARCH}.deb; \
fi
ARG PACKAGE_ARCH
RUN dpkg -i \
${PACKAGE_DIST}/${PACKAGE_ARCH}/libnvidia-container1_1.*.deb \
${PACKAGE_DIST}/${PACKAGE_ARCH}/libnvidia-container-tools_1.*.deb \
${PACKAGE_DIST}/${PACKAGE_ARCH}/nvidia-container-toolkit*_${PACKAGE_VERSION}*.deb
${PACKAGE_DIST}/${PACKAGE_ARCH}/libnvidia-container1_${PACKAGE_VERSION}*.deb \
${PACKAGE_DIST}/${PACKAGE_ARCH}/libnvidia-container-tools_${PACKAGE_VERSION}*.deb \
${PACKAGE_DIST}/${PACKAGE_ARCH}/nvidia-container-toolkit_${PACKAGE_VERSION}*.deb
WORKDIR /work
@@ -93,13 +79,6 @@ LABEL release="N/A"
LABEL summary="Automatically Configure your Container Runtime for GPU support."
LABEL description="See summary"
RUN mkdir /licenses && mv /NGC-DL-CONTAINER-LICENSE /licenses/NGC-DL-CONTAINER-LICENSE
# Install / upgrade packages here that are required to resolve CVEs
ARG CVE_UPDATES
RUN if [ -n "${CVE_UPDATES}" ]; then \
apt-get update && apt-get upgrade -y ${CVE_UPDATES} && \
rm -rf /var/lib/apt/lists/*; \
fi
COPY ./LICENSE /licenses/LICENSE
ENTRYPOINT ["/work/nvidia-toolkit"]

View File

@@ -12,14 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
BUILD_MULTI_ARCH_IMAGES ?= false
DOCKER ?= docker
BUILDX =
ifeq ($(BUILD_MULTI_ARCH_IMAGES),true)
BUILDX = buildx
endif
DOCKER ?= docker
MKDIR ?= mkdir
DIST_DIR ?= $(CURDIR)/dist
@@ -32,47 +25,36 @@ IMAGE_NAME := $(REGISTRY)/container-toolkit
endif
VERSION ?= $(LIB_VERSION)$(if $(LIB_TAG),-$(LIB_TAG))
IMAGE_VERSION := $(VERSION)
IMAGE_TAG ?= $(VERSION)-$(DIST)
IMAGE = $(IMAGE_NAME):$(IMAGE_TAG)
OUT_IMAGE_NAME ?= $(IMAGE_NAME)
OUT_IMAGE_VERSION ?= $(IMAGE_VERSION)
OUT_IMAGE_TAG = $(OUT_IMAGE_VERSION)-$(DIST)
OUT_IMAGE = $(OUT_IMAGE_NAME):$(OUT_IMAGE_TAG)
##### Public rules #####
DEFAULT_PUSH_TARGET := ubuntu20.04
DISTRIBUTIONS := ubuntu20.04 ubuntu18.04 ubi8 centos7
DEFAULT_PUSH_TARGET := ubuntu18.04
TARGETS := ubuntu20.04 ubuntu18.04 ubi8 centos7 centos8
META_TARGETS := packaging
BUILD_TARGETS := $(patsubst %,build-%,$(DISTRIBUTIONS) $(META_TARGETS))
PUSH_TARGETS := $(patsubst %,push-%,$(DISTRIBUTIONS) $(META_TARGETS))
TEST_TARGETS := $(patsubst %,test-%, $(DISTRIBUTIONS))
BUILD_TARGETS := $(patsubst %,build-%,$(TARGETS) $(META_TARGETS))
PUSH_TARGETS := $(patsubst %,push-%,$(TARGETS) $(META_TARGETS))
TEST_TARGETS := $(patsubst %,test-%, $(TARGETS))
.PHONY: $(DISTRIBUTIONS) $(PUSH_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
.PHONY: $(TARGETS) $(PUSH_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
ifneq ($(BUILD_MULTI_ARCH_IMAGES),true)
include $(CURDIR)/build/container/native-only.mk
else
include $(CURDIR)/build/container/multi-arch.mk
endif
push-%: DIST = $(*)
$(PUSH_TARGETS): push-%:
$(DOCKER) push "$(IMAGE_NAME):$(IMAGE_TAG)"
# For the default push target we also push a short tag equal to the version.
# We skip this for the development release
DEVEL_RELEASE_IMAGE_VERSION ?= devel
PUSH_MULTIPLE_TAGS ?= true
ifeq ($(strip $(OUT_IMAGE_VERSION)),$(DEVEL_RELEASE_IMAGE_VERSION))
PUSH_MULTIPLE_TAGS = false
endif
ifeq ($(PUSH_MULTIPLE_TAGS),true)
ifneq ($(strip $(VERSION)),$(DEVEL_RELEASE_IMAGE_VERSION))
push-$(DEFAULT_PUSH_TARGET): push-short
endif
push-short:
$(DOCKER) tag "$(IMAGE_NAME):$(VERSION)-$(DEFAULT_PUSH_TARGET)" "$(IMAGE_NAME):$(VERSION)"
$(DOCKER) push "$(IMAGE_NAME):$(VERSION)"
push-%: DIST = $(*)
push-short: DIST = $(DEFAULT_PUSH_TARGET)
build-%: DIST = $(*)
build-%: DOCKERFILE = $(CURDIR)/build/container/Dockerfile.$(DOCKERFILE_SUFFIX)
@@ -82,17 +64,16 @@ ARTIFACTS_ROOT ?= $(shell realpath --relative-to=$(CURDIR) $(DIST_DIR))
# Use a generic build target to build the relevant images
$(BUILD_TARGETS): build-%: $(ARTIFACTS_ROOT)
DOCKER_BUILDKIT=1 \
$(DOCKER) $(BUILDX) build --pull \
$(DOCKER_BUILD_OPTIONS) \
$(DOCKER_BUILD_PLATFORM_OPTIONS) \
$(DOCKER) build --pull \
--platform=linux/amd64 \
--tag $(IMAGE) \
--build-arg ARTIFACTS_ROOT="$(ARTIFACTS_ROOT)" \
--build-arg BASE_DIST="$(BASE_DIST)" \
--build-arg CUDA_VERSION="$(CUDA_VERSION)" \
--build-arg GOLANG_VERSION="$(GOLANG_VERSION)" \
--build-arg LIBNVIDIA_CONTAINER0_VERSION="$(LIBNVIDIA_CONTAINER0_DEPENDENCY)" \
--build-arg PACKAGE_DIST="$(PACKAGE_DIST)" \
--build-arg PACKAGE_VERSION="$(PACKAGE_VERSION)" \
--build-arg PACKAGE_ARCH="$(PACKAGE_ARCH)" \
--build-arg VERSION="$(VERSION)" \
--build-arg CVE_UPDATES="$(CVE_UPDATES)" \
-f $(DOCKERFILE) \
@@ -101,19 +82,22 @@ $(BUILD_TARGETS): build-%: $(ARTIFACTS_ROOT)
build-ubuntu%: BASE_DIST = $(*)
build-ubuntu%: DOCKERFILE_SUFFIX := ubuntu
build-ubuntu%: PACKAGE_DIST = ubuntu18.04
build-ubuntu%: PACKAGE_ARCH := amd64
build-ubuntu%: PACKAGE_DIST = $(BASE_DIST)
build-ubuntu%: PACKAGE_VERSION := $(LIB_VERSION)$(if $(LIB_TAG),~$(LIB_TAG))
build-ubuntu%: LIBNVIDIA_CONTAINER0_DEPENDENCY=$(LIBNVIDIA_CONTAINER0_VERSION)
# TODO: Update this to use the centos8 packages
build-ubi8: BASE_DIST := ubi8
build-ubi8: DOCKERFILE_SUFFIX := centos
build-ubi8: PACKAGE_DIST = centos8
build-ubi8: PACKAGE_ARCH := x86_64
build-ubi8: PACKAGE_DIST = centos7
build-ubi8: PACKAGE_VERSION := $(LIB_VERSION)-$(if $(LIB_TAG),0.1.$(LIB_TAG),1)
build-centos7: BASE_DIST = $(*)
build-centos7: DOCKERFILE_SUFFIX := centos
build-centos7: PACKAGE_DIST = $(BASE_DIST)
build-centos7: PACKAGE_VERSION := $(LIB_VERSION)-$(if $(LIB_TAG),0.1.$(LIB_TAG),1)
build-centos%: BASE_DIST = $(*)
build-centos%: DOCKERFILE_SUFFIX := centos
build-centos%: PACKAGE_ARCH := x86_64
build-centos%: PACKAGE_DIST = $(BASE_DIST)
build-centos%: PACKAGE_VERSION := $(LIB_VERSION)-$(if $(LIB_TAG),0.1.$(LIB_TAG),1)
build-packaging: BASE_DIST := ubuntu20.04
build-packaging: DOCKERFILE_SUFFIX := packaging

View File

@@ -1,37 +0,0 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
PUSH_ON_BUILD ?= false
DOCKER_BUILD_OPTIONS = --output=type=image,push=$(PUSH_ON_BUILD)
DOCKER_BUILD_PLATFORM_OPTIONS = --platform=linux/amd64,linux/arm64
REGCTL ?= regctl
$(PUSH_TARGETS): push-%:
$(REGCTL) \
image copy \
$(IMAGE) $(OUT_IMAGE)
push-short:
$(REGCTL) \
image copy \
$(IMAGE) $(OUT_IMAGE_NAME):$(OUT_IMAGE_VERSION)
# We only have x86_64 packages for centos7
build-centos7: DOCKER_BUILD_PLATFORM_OPTIONS = --platform=linux/amd64
# We only generate amd64 image for ubuntu18.04
build-ubuntu18.04: DOCKER_BUILD_PLATFORM_OPTIONS = --platform=linux/amd64
# We only generate a single image for packaging targets
build-packaging: DOCKER_BUILD_PLATFORM_OPTIONS = --platform=linux/amd64

View File

@@ -0,0 +1,170 @@
# The Experimental NVIDIA Container Runtime
## Introduction
The experimental NVIDIA Container Runtime is a proof-of-concept runtime that
approaches the problem of making GPUs (or other NVIDIA devices) available in
containerized environments in a different manner to the existing
[NVIDIA Container Runtime](../nvidia-container-runtime). Whereas the current
runtime relies on the [NVIDIA Container Library](https://github.com/NVIDIA/libnvidia-container)
to perform the modifications to a container, the experimental runtime aims to
express the required modifications in terms of changes to a container's [OCI
runtime specification](https://github.com/opencontainers/runtime-spec). This
also aligns with open initiatives such as the [Container Device Interface (CDI)](https://github.com/container-orchestrated-devices/container-device-interface).
## Known Limitations
* The paths of NVIDIA CUDA libraries / binaries injected into the container currently match those of the host system. This means that on an Ubuntu-based host system these would be at `/usr/lib/x86_64-linux-gnu`
even if the container distribution would normally expect them at another location (e.g. `/usr/lib64`)
* Tools such as `nvidia-smi` may create additional device nodes in the container when run. This is
prevented in the "classic" runtime (and the NVIDIA Container Library) by modifying
the `/proc/driver/nvidia/params` file in the container.
* Other `NVIDIA_*` environment variables (e.g. `NVIDIA_DRIVER_CAPABILITIES`) are
not considered to filter mounted libraries or binaries. This is equivalent to
always using `NVIDIA_DRIVER_CAPABILITIES=all`.
## Building / Installing
The experimental NVIDIA Container Runtime is a self-contained golang binary and
can thus be built from source, or installed directly using `go install`.
### From source
After cloning the `nvidia-container-toolkit` repository from [GitLab](https://gitlab.com/nvidia/container-toolkit/container-toolkit)
or from the read-only mirror on [GitHub](https://github.com/NVIDIA/nvidia-container-toolkit),
running the following make command in the repository root:
```bash
make cmd-nvidia-container-runtime.experimental
```
will create an executable file `nvidia-container-runtime.experimental` in the
root.
A dockerized target:
```bash
make docker-cmd-nvidia-container-runtime.experimental
```
will also create the executable file `nvidia-container-runtime.experimental`
without requiring the setup of a development environment (with the exception)
of having `make` and `docker` installed.
### Go install
The experimental NVIDIA Container Runtime can also be installed with `go install` by running
the following command:
```bash
go install github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-container-runtime.experimental@experimental
```
which will build and install the `nvidia-container-runtime.experimental`
executable in the `${GOPATH}/bin` folder.
## Using the Runtime
The experimental NVIDIA Container Runtime is intended as a drop-in replacement
for the "classic" NVIDIA Container Runtime. As such it is used in the same
way (with the exception of the known limitations noted above).
In general terms, to use the experimental NVIDIA Container Runtime to launch a
container with GPU support, it should be inserted as a shim for the desired
low-level OCI-compliant runtime (e.g. `runc` or `crun`). How this is achieved
depends on how containers are being launched.
### Docker
In the case of `docker` for example, the runtime must be registered with the
Docker daemon. This can be done by modifying the `/etc/docker/daemon.json` file
to contain the following:
```json
{
    "runtimes": {
        "nvidia-experimental": {
            "path": "nvidia-container-runtime.experimental",
            "runtimeArgs": []
        }
    }
}
```
This can then be invoked from docker by including the `--runtime=nvidia-experimental`
option when executing a `docker run` command.
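For example (a minimal sketch; the CUDA image and the `nvidia-smi` command are illustrative only):
```bash
sudo docker run --rm --runtime=nvidia-experimental \
    -e NVIDIA_VISIBLE_DEVICES=all \
    nvidia/cuda:11.0-base nvidia-smi
```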
### Runc
If `runc` is being used to run a container directly, substituting
`nvidia-container-runtime.experimental` for the `runc` command should be
sufficient, as the experimental runtime will `exec` to `runc` once the required
(in-place) modifications have been made to the container's OCI spec
(`config.json` file). A sketch is shown below.
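```bash
# Instead of: runc run --bundle /path/to/bundle <container-id>
# (the bundle path and container ID are placeholders)
nvidia-container-runtime.experimental run --bundle /path/to/bundle <container-id>
```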
## Configuration
### Runtime Path
The experimental NVIDIA Container Runtime allows for the path to the low-level
runtime to be specified. This is done by setting the following option in the
`/etc/nvidia-container-runtime/config.toml` file or setting the
`NVIDIA_CONTAINER_RUNTIME_PATH` environment variable.
```toml
[nvidia-container-runtime.experimental]
runtime-path = "/path/to/low-level-runtime"
```
This can be set to the path of `runc` or `crun` on the system; if a relative
path is specified, the `PATH` is searched for a matching executable.
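The environment variable takes the same value, for example (a sketch; the `crun` path is illustrative):
```bash
NVIDIA_CONTAINER_RUNTIME_PATH=/usr/bin/crun \
    nvidia-container-runtime.experimental run --bundle /path/to/bundle <container-id>
```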
### Device Selection
In order to select a specific device, the experimental NVIDIA Container Runtime
mimics the behaviour of the "classic" runtime. That is to say that the values of
certain [environment variables](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/user-guide.html#environment-variables-oci-spec)
**in the container's OCI specification** control the behaviour of the runtime.
#### `NVIDIA_VISIBLE_DEVICES`
This variable controls which GPUs will be made accessible inside the container.
##### Possible values
* `0,1,2`, `GPU-fef8089b` …: a comma-separated list of GPU UUID(s) or index(es).
* `all`: all GPUs will be accessible; this is the default value in our container images.
* `none`: no GPU will be accessible, but driver capabilities will be enabled.
* `void` or *empty* or *unset*: `nvidia-container-runtime` will have the same behavior as `runc`.
**Note**: When running on a MIG capable device, the following values will also be available:
* `0:0,0:1,1:0`, `MIG-GPU-fef8089b/0/1` …: a comma-separated list of MIG Device UUID(s) or index(es).
Where the MIG device indices have the form `<GPU Device Index>:<MIG Device Index>` as seen in the example output:
```
$ nvidia-smi -L
GPU 0: Graphics Device (UUID: GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5)
MIG Device 0: (UUID: MIG-GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5/1/0)
MIG Device 1: (UUID: MIG-GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5/1/1)
MIG Device 2: (UUID: MIG-GPU-b8ea3855-276c-c9cb-b366-c6fa655957c5/11/0)
```
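As an illustration, a single GPU could be selected when launching via Docker (a sketch; the image and command are illustrative):
```bash
# Make only GPU 0 visible inside the container
sudo docker run --rm --runtime=nvidia-experimental \
    -e NVIDIA_VISIBLE_DEVICES=0 \
    nvidia/cuda:11.0-base nvidia-smi
```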
#### `NVIDIA_MIG_CONFIG_DEVICES`
This variable controls which of the visible GPUs can have their MIG
configuration managed from within the container. This includes enabling and
disabling MIG mode, creating and destroying GPU Instances and Compute
Instances, etc.
##### Possible values
* `all`: Allow all MIG-capable GPUs in the visible device list to have their
MIG configurations managed.
**Note**:
* This feature is only available on MIG capable devices (e.g. the A100).
* To use this feature, the container must be started with `CAP_SYS_ADMIN` privileges.
* When not running as `root`, the container user must have read access to the
`/proc/driver/nvidia/capabilities/mig/config` file on the host.
#### `NVIDIA_MIG_MONITOR_DEVICES`
This variable controls which of the visible GPUs can have aggregate information
about all of their MIG devices monitored from within the container. This
includes inspecting the aggregate memory usage, listing the aggregate running
processes, etc.
##### Possible values
* `all`: Allow all MIG-capable GPUs in the visible device list to have their
MIG devices monitored.
**Note**:
* This feature is only available on MIG capable devices (e.g. the A100).
* To use this feature, the container must be started with `CAP_SYS_ADMIN` privileges.
* When not running as `root`, the container user must have read access to the
`/proc/driver/nvidia/capabilities/mig/monitor` file on the host.
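Putting the notes above together, a monitoring container might be launched as follows (a sketch; the image and command are illustrative):
```bash
# CAP_SYS_ADMIN is required for MIG monitoring, as noted above
sudo docker run --rm --runtime=nvidia-experimental \
    --cap-add=SYS_ADMIN \
    -e NVIDIA_VISIBLE_DEVICES=all \
    -e NVIDIA_MIG_MONITOR_DEVICES=all \
    nvidia/cuda:11.0-base nvidia-smi
```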

View File

@@ -0,0 +1,52 @@
/*
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
package config
import (
"fmt"
log "github.com/sirupsen/logrus"
)
// list represents a list of config updaters that are applied in order, with later
// configs having preference.
type list struct {
logger *log.Logger
configs []configUpdater
}
var _ configUpdater = (*list)(nil)
func newListWithLogger(logger *log.Logger, ci ...configUpdater) configUpdater {
c := list{
logger: logger,
configs: ci,
}
return &c
}
func (c list) Update(cfg *Config) error {
for i, u := range c.configs {
c.logger.Debugf("Applying config %v: %v", i, u)
err := u.Update(cfg)
if err != nil {
return fmt.Errorf("error applying config %v: %v", i, err)
}
}
return nil
}

View File

@@ -0,0 +1,100 @@
/*
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
package config
import (
"fmt"
"testing"
testlog "github.com/sirupsen/logrus/hooks/test"
"github.com/stretchr/testify/require"
)
func TestComposite(t *testing.T) {
logger, _ := testlog.NewNullLogger()
// Empty list
c := newListWithLogger(logger)
cfg := &Config{}
err := c.Update(cfg)
require.NoError(t, err)
require.EqualValues(t, &Config{}, cfg)
// Add a single mock config
mockConfigUpdater := &configUpdaterMock{
UpdateFunc: func(cfg *Config) error {
cfg.DebugFilePath = "updated"
return nil
},
}
c = newListWithLogger(logger, mockConfigUpdater)
cfg = &Config{}
err = c.Update(cfg)
require.NoError(t, err)
require.EqualValues(t, &Config{DebugFilePath: "updated"}, cfg)
require.Len(t, mockConfigUpdater.UpdateCalls(), 1)
// Reset the calls
mockConfigUpdater.calls = struct{ Update []struct{ Config *Config } }{}
// define an error config
errorConfigUpdater := &configUpdaterMock{
UpdateFunc: func(cfg *Config) error {
return fmt.Errorf("mock error")
},
}
c = newListWithLogger(logger, errorConfigUpdater, mockConfigUpdater)
cfg = &Config{}
err = c.Update(cfg)
require.Error(t, err)
require.EqualValues(t, &Config{}, cfg)
require.Len(t, errorConfigUpdater.UpdateCalls(), 1)
require.Len(t, mockConfigUpdater.UpdateCalls(), 0)
// Reset the calls
mockConfigUpdater.calls = struct{ Update []struct{ Config *Config } }{}
errorConfigUpdater.calls = struct{ Update []struct{ Config *Config } }{}
// Change the order of the config and error
c = newListWithLogger(logger, mockConfigUpdater, errorConfigUpdater)
cfg = &Config{}
err = c.Update(cfg)
require.Error(t, err)
require.EqualValues(t, &Config{DebugFilePath: "updated"}, cfg)
require.Len(t, errorConfigUpdater.UpdateCalls(), 1)
require.Len(t, mockConfigUpdater.UpdateCalls(), 1)
// Reset the calls
mockConfigUpdater.calls = struct{ Update []struct{ Config *Config } }{}
errorConfigUpdater.calls = struct{ Update []struct{ Config *Config } }{}
// Call the mock twice
c = newListWithLogger(logger, mockConfigUpdater, mockConfigUpdater)
cfg = &Config{}
err = c.Update(cfg)
require.NoError(t, err)
require.EqualValues(t, &Config{DebugFilePath: "updated"}, cfg)
require.Len(t, mockConfigUpdater.UpdateCalls(), 2)
}

View File

@@ -0,0 +1,63 @@
/*
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
package config
import (
"fmt"
log "github.com/sirupsen/logrus"
)
// Config defines the configuration options for the NVIDIA Container Runtime
type Config struct {
// Root defines the root of the file system to be used for locating mounts
Root string
// DebugFilePath defines a log file to print debug output to
DebugFilePath string
// RuntimePath defines the path to an OCI compliant runtime
RuntimePath string
// LogLevel defines the logging level for the application
LogLevel string
}
//go:generate moq -stub -out config_mock.go . configUpdater
// configUpdater represents an interface for applying updates to a config.
type configUpdater interface {
Update(*Config) error
}
// GetConfig returns a config struct with the values resolved. The values are defined in order of
// priority:
// 1. From the associated environment variables
// 2. From the loaded config file
// 3. From the default values defined in the `defaultConfig` function
func GetConfig(logger *log.Logger) (*Config, error) {
cfg := &Config{}
configs := newListWithLogger(logger,
newDefaultConfig(),
newDefaultConfigFileWithLogger(logger),
newConfigFromEnvironment(),
)
err := configs.Update(cfg)
if err != nil {
return nil, fmt.Errorf("error getting config: %v", err)
}
return cfg, nil
}


@@ -0,0 +1,76 @@
// Code generated by moq; DO NOT EDIT.
// github.com/matryer/moq
package config
import (
"sync"
)
// Ensure, that configUpdaterMock does implement configUpdater.
// If this is not the case, regenerate this file with moq.
var _ configUpdater = &configUpdaterMock{}
// configUpdaterMock is a mock implementation of configUpdater.
//
// func TestSomethingThatUsesconfigUpdater(t *testing.T) {
//
// // make and configure a mocked configUpdater
// mockedconfigUpdater := &configUpdaterMock{
// UpdateFunc: func(config *Config) error {
// panic("mock out the Update method")
// },
// }
//
// // use mockedconfigUpdater in code that requires configUpdater
// // and then make assertions.
//
// }
type configUpdaterMock struct {
// UpdateFunc mocks the Update method.
UpdateFunc func(config *Config) error
// calls tracks calls to the methods.
calls struct {
// Update holds details about calls to the Update method.
Update []struct {
// Config is the config argument value.
Config *Config
}
}
lockUpdate sync.RWMutex
}
// Update calls UpdateFunc.
func (mock *configUpdaterMock) Update(config *Config) error {
callInfo := struct {
Config *Config
}{
Config: config,
}
mock.lockUpdate.Lock()
mock.calls.Update = append(mock.calls.Update, callInfo)
mock.lockUpdate.Unlock()
if mock.UpdateFunc == nil {
var (
errOut error
)
return errOut
}
return mock.UpdateFunc(config)
}
// UpdateCalls gets all the calls that were made to Update.
// Check the length with:
// len(mockedconfigUpdater.UpdateCalls())
func (mock *configUpdaterMock) UpdateCalls() []struct {
Config *Config
} {
var calls []struct {
Config *Config
}
mock.lockUpdate.RLock()
calls = mock.calls.Update
mock.lockUpdate.RUnlock()
return calls
}


@@ -0,0 +1,58 @@
/*
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
package config
import (
"io/ioutil"
"os"
"path"
"path/filepath"
"testing"
testlog "github.com/sirupsen/logrus/hooks/test"
"github.com/stretchr/testify/require"
)
func TestGetConfigWithCustomConfig(t *testing.T) {
wd, err := os.Getwd()
require.NoError(t, err)
// By default debug is disabled
contents := []byte("[nvidia-container-runtime]\ndebug = \"/nvidia-container-toolkit.log\"")
testDir := path.Join(wd, "test")
filename := path.Join(testDir, configFileRelativePath)
previousConfig, present := os.LookupEnv(configOverride)
os.Setenv(configOverride, testDir)
defer func() {
if present {
os.Setenv(configOverride, previousConfig)
} else {
os.Unsetenv(configOverride)
}
}()
require.NoError(t, os.MkdirAll(filepath.Dir(filename), 0766))
require.NoError(t, ioutil.WriteFile(filename, contents, 0766))
defer func() { require.NoError(t, os.RemoveAll(testDir)) }()
logger, _ := testlog.NewNullLogger()
cfg, err := GetConfig(logger)
require.NoError(t, err)
require.Equal(t, "/nvidia-container-toolkit.log", cfg.DebugFilePath)
}


@@ -1,5 +1,4 @@
#!/usr/bin/env bash
/*
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,13 +12,36 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
# This script is used to build the packages for the components of the NVIDIA
# Container Stack. These include the nvidia-container-toolkit in this repository
# as well as the components included in the third_party folder.
# All required packages are generated in the specified dist folder.
test_repo=$1
echo "Setting up TEST repo: ${test_repo}"
sed -i -e "s#nvidia\.github\.io/libnvidia-container#${test_repo}/libnvidia-container#g" /etc/yum.repos.d/nvidia-container-toolkit.repo
yum-config-manager --enable libnvidia-container-experimental
package config
import (
log "github.com/sirupsen/logrus"
)
type defaultConfig struct{}
var _ configUpdater = (*defaultConfig)(nil)
func newDefaultConfig() configUpdater {
c := defaultConfig{}
return &c
}
// Update defines the default values for the config options
func (c defaultConfig) Update(cfg *Config) error {
*cfg = Config{
DebugFilePath: "/dev/null",
LogLevel: log.InfoLevel.String(),
}
return nil
}
func getDefaultConfig() *Config {
cfg := &Config{}
defaultConfig{}.Update(cfg)
return cfg
}


@@ -0,0 +1,37 @@
/*
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
package config
import (
"testing"
"github.com/stretchr/testify/require"
)
func TestDefaultConfigUpdate(t *testing.T) {
cfg := &Config{}
require.Empty(t, cfg.DebugFilePath)
c := defaultConfig{}
err := c.Update(cfg)
require.NoError(t, err)
require.Equal(t, "/dev/null", cfg.DebugFilePath)
require.Equal(t, "", cfg.Root)
}


@@ -0,0 +1,61 @@
/*
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
package config
import (
"os"
"strings"
)
const (
debugFilePathEnvvarName = "NVIDIA_CONTAINER_RUNTIME_DEBUG"
runtimePathEnvvarName = "NVIDIA_CONTAINER_RUNTIME_PATH"
rootEnvvarName = "NVIDIA_CONTAINER_RUNTIME_ROOT"
logLevelEnvvarName = "NVIDIA_CONTAINER_RUNTIME_LOG_LEVEL"
)
type envConfig struct{}
func newConfigFromEnvironment() configUpdater {
c := envConfig{}
return &c
}
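// Update overrides config options with the values of their associated environment
// variables, ignoring any variable that is unset or contains only whitespace.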
func (c envConfig) Update(cfg *Config) error {
debugFilePathEnvvar, exists := os.LookupEnv(debugFilePathEnvvarName)
if exists && strings.TrimSpace(debugFilePathEnvvar) != "" {
cfg.DebugFilePath = debugFilePathEnvvar
}
runtimePathEnvvar, exists := os.LookupEnv(runtimePathEnvvarName)
if exists && strings.TrimSpace(runtimePathEnvvar) != "" {
cfg.RuntimePath = runtimePathEnvvar
}
rootEnvvar, exists := os.LookupEnv(rootEnvvarName)
if exists && strings.TrimSpace(rootEnvvar) != "" {
cfg.Root = rootEnvvar
}
logLevelEnvvar, exists := os.LookupEnv(logLevelEnvvarName)
if exists && strings.TrimSpace(logLevelEnvvar) != "" {
cfg.LogLevel = logLevelEnvvar
}
return nil
}


@@ -1,5 +1,5 @@
/**
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
/*
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -12,13 +12,18 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
*/
package constraints
//go:generate moq -stub -out constraint_mock.go . Constraint
// Constraint represents a constraint that is to be evaluated
type Constraint interface {
String() string
Assert() error
}
package config
// noop implements a configUpdater that does not update a config.
type noop struct{}
func newNoopConfigUpdater() configUpdater {
c := noop{}
return &c
}
func (c noop) Update(cfg *Config) error {
return nil
}


@@ -0,0 +1,36 @@
/*
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
package config
import (
"testing"
"github.com/stretchr/testify/require"
)
func TestNoopDoesNotModifyConfig(t *testing.T) {
cfg := &Config{
DebugFilePath: "test-path",
}
c := newNoopConfigUpdater()
err := c.Update(cfg)
require.NoError(t, err)
require.Equal(t, "test-path", cfg.DebugFilePath)
}


@@ -0,0 +1,158 @@
/*
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
package config
import (
"fmt"
"io"
"os"
"path/filepath"
"github.com/pelletier/go-toml"
log "github.com/sirupsen/logrus"
)
const (
configFileRelativePath = "nvidia-container-runtime/config.toml"
configOverride = "XDG_CONFIG_HOME"
defaultConfigRoot = "/etc"
nvidiaContainerCliSection = "nvidia-container-cli"
nvidiaContainerRuntimeConfigSection = "nvidia-container-runtime"
nvidiaContainerRuntimeExperimentalConfigSection = "nvidia-container-runtime.experimental"
)
type tomlConfig struct {
logger *log.Logger
path string
sections []tomlSection
}
type tomlSection struct {
section string
keys map[string]struct{}
}
func newDefaultConfigFileWithLogger(logger *log.Logger) configUpdater {
configDir := defaultConfigRoot
if XDGConfigDir := os.Getenv(configOverride); len(XDGConfigDir) != 0 {
configDir = XDGConfigDir
}
configFilePath := filepath.Join(configDir, configFileRelativePath)
return newConfigFromFileWithLogger(logger, configFilePath)
}
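// newConfigFromFileWithLogger constructs a configUpdater backed by the TOML file
// at the given path. If the file does not exist, a no-op updater is returned and
// the config is left unchanged.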
func newConfigFromFileWithLogger(logger *log.Logger, filepath string) configUpdater {
if _, err := os.Stat(filepath); os.IsNotExist(err) {
logger.Warnf("The config file '%v' does not exist", filepath)
return newNoopConfigUpdater()
}
sections := []tomlSection{
{
section: nvidiaContainerRuntimeConfigSection,
},
{
section: nvidiaContainerRuntimeExperimentalConfigSection,
},
{
section: nvidiaContainerCliSection,
keys: map[string]struct{}{
"root": {},
},
},
}
c := tomlConfig{
logger: logger,
path: filepath,
sections: sections,
}
return &c
}
func (c tomlConfig) Update(cfg *Config) error {
configFile, err := os.Open(c.path)
if err != nil {
return fmt.Errorf("error opening config file %v: %v", c.path, err)
}
defer configFile.Close()
return c.updateFromReader(cfg, configFile)
}
func (c tomlConfig) updateFromReader(cfg *Config, reader io.Reader) error {
toml, err := toml.LoadReader(reader)
if err != nil {
return fmt.Errorf("error reading TOML contents: %v", err)
}
for _, section := range c.sections {
if v, ok := section.GetStringFrom(toml, "debug"); ok {
cfg.DebugFilePath = v
}
if v, ok := section.GetStringFrom(toml, "runtime-path"); ok {
cfg.RuntimePath = v
}
if v, ok := section.GetStringFrom(toml, "root"); ok {
cfg.Root = v
}
if v, ok := section.GetStringFrom(toml, "log-level"); ok {
cfg.LogLevel = v
}
}
return nil
}
func (c tomlSection) GetStringFrom(toml *toml.Tree, key string) (string, bool) {
value := c.GetFrom(toml, key)
if value != nil {
if v, ok := value.(string); ok {
return v, ok
}
}
return "", false
}
func (c tomlSection) GetFrom(toml *toml.Tree, key string) interface{} {
if !c.validKey(key) {
return nil
}
return toml.Get(c.configKey(key))
}
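// validKey reports whether the given key may be read from this section; a nil
// key set allows all keys. For example, only the "root" key is read from the
// nvidia-container-cli section configured above.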
func (c tomlSection) validKey(key string) bool {
if c.keys == nil {
return true
}
_, exists := c.keys[key]
return exists
}
func (c tomlSection) configKey(key string) string {
if c.section == "" {
return key
}
return c.section + "." + key
}


@@ -0,0 +1,259 @@
/*
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
package config
import (
"fmt"
"io"
"io/ioutil"
"os"
"path"
"path/filepath"
"strings"
"testing"
"testing/iotest"
testlog "github.com/sirupsen/logrus/hooks/test"
"github.com/stretchr/testify/require"
)
func TestUpdateFromReader(t *testing.T) {
logger, _ := testlog.NewNullLogger()
testCases := []struct {
description string
readerError bool
lines []string
expected *Config
expectedError bool
}{
{
description: "reader error returns error",
readerError: true,
expectedError: true,
expected: getDefaultConfig(),
},
{
description: "empty config returns defaults",
lines: []string{},
expected: &Config{
DebugFilePath: "/dev/null",
LogLevel: "info",
},
},
{
description: "debug output is set",
lines: []string{"nvidia-container-runtime.debug=\"nvidia-container-toolkit.log\""},
expected: &Config{
DebugFilePath: "nvidia-container-toolkit.log",
LogLevel: "info",
},
},
{
description: "debug output is set in section",
lines: []string{"[nvidia-container-runtime]", "debug=\"nvidia-container-toolkit.log\""},
expected: &Config{
DebugFilePath: "nvidia-container-toolkit.log",
LogLevel: "info",
},
},
{
description: "blank debug is set",
lines: []string{"nvidia-container-runtime.debug=\"\""},
expected: &Config{
DebugFilePath: "",
LogLevel: "info",
},
},
{
description: "non-string debug is ignored",
lines: []string{"nvidia-container-runtime.debug=2"},
expected: &Config{
DebugFilePath: "/dev/null",
LogLevel: "info",
},
},
{
description: "log-level is set",
lines: []string{"nvidia-container-runtime.log-level=\"trace\""},
expected: &Config{
DebugFilePath: "/dev/null",
LogLevel: "trace",
},
},
}
for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
cfg := getDefaultConfig()
c := tomlConfig{
logger: logger,
sections: []tomlSection{
{section: nvidiaContainerRuntimeConfigSection},
},
}
var reader io.Reader
if tc.readerError {
reader = iotest.ErrReader(fmt.Errorf("error"))
} else {
reader = strings.NewReader(strings.Join(tc.lines, "\n"))
}
err := c.updateFromReader(cfg, reader)
if tc.expectedError {
require.Error(t, err)
} else {
require.NoError(t, err)
}
require.EqualValues(t, tc.expected, cfg)
})
}
}
func TestUpdateFromReaderExperimental(t *testing.T) {
logger, _ := testlog.NewNullLogger()
testCases := []struct {
readerError bool
lines []string
expected *Config
expectedError bool
}{
{
lines: []string{"nvidia-container-runtime.debug=\"nvidia-container-toolkit.log\""},
expected: getDefaultConfig(),
},
{
lines: []string{"[nvidia-container-runtime]", "debug=\"nvidia-container-toolkit.log\""},
expected: getDefaultConfig(),
},
{
lines: []string{"nvidia-container-runtime.experimental.debug=\"\""},
expected: &Config{
DebugFilePath: "",
LogLevel: "info",
},
},
{
lines: []string{"nvidia-container-runtime.experimental.debug=2"},
expected: getDefaultConfig(),
},
{
lines: []string{"nvidia-container-runtime.experimental.debug=\"nvidia-container-toolkit.log\""},
expected: &Config{
DebugFilePath: "nvidia-container-toolkit.log",
LogLevel: "info",
},
},
{
lines: []string{"[nvidia-container-runtime.experimental]", "debug=\"nvidia-container-toolkit.log\""},
expected: &Config{
DebugFilePath: "nvidia-container-toolkit.log",
LogLevel: "info",
},
},
{
lines: []string{
"nvidia-container-runtime.debug=\"nvidia-container-toolkit.log\"",
"nvidia-container-runtime.experimental.debug=\"nvidia-container-exp-toolkit.log\"",
},
expected: &Config{
DebugFilePath: "nvidia-container-exp-toolkit.log",
LogLevel: "info",
},
},
}
for i, tc := range testCases {
cfg := getDefaultConfig()
c := tomlConfig{
logger: logger,
sections: []tomlSection{
{section: nvidiaContainerRuntimeExperimentalConfigSection},
},
}
var reader io.Reader
if tc.readerError {
reader = iotest.ErrReader(fmt.Errorf("error"))
} else {
reader = strings.NewReader(strings.Join(tc.lines, "\n"))
}
err := c.updateFromReader(cfg, reader)
if tc.expectedError {
require.Error(t, err, "%d: %v", i, tc)
} else {
require.NoError(t, err, "%d: %v", i, tc)
}
require.EqualValues(t, tc.expected, cfg, "%d: %v", i, tc)
}
}
func TestConfigFromFile(t *testing.T) {
wd, err := os.Getwd()
require.NoError(t, err)
// By default debug is disabled
lines := []string{
"[nvidia-container-cli]",
"root = \"/run/nvidia/driver\"",
"[nvidia-container-runtime]",
"#debug = \"/nvidia-container-toolkit.log\"",
"",
"[nvidia-container-runtime.experimental]",
"debug = \"/nvidia-container-toolkit.experimental.log\"",
}
contents := []byte(strings.Join(lines, "\n"))
testDir := path.Join(wd, "test")
filename := path.Join(testDir, configFileRelativePath)
require.NoError(t, os.MkdirAll(filepath.Dir(filename), 0766))
require.NoError(t, ioutil.WriteFile(filename, contents, 0766))
defer func() { require.NoError(t, os.RemoveAll(testDir)) }()
logger, _ := testlog.NewNullLogger()
c := newConfigFromFileWithLogger(logger, filename)
cfg := getDefaultConfig()
err = c.Update(cfg)
require.NoError(t, err)
require.Equal(t, "/nvidia-container-toolkit.experimental.log", cfg.DebugFilePath)
require.Equal(t, "/run/nvidia/driver", cfg.Root)
}
func TestConfigFromNonexistentFileReturnsNoop(t *testing.T) {
logger, _ := testlog.NewNullLogger()
c := newConfigFromFileWithLogger(logger, "/does/not/exist")
n, ok := c.(*noop)
require.True(t, ok)
require.NotNil(t, n)
}
func TestGetConfigKey(t *testing.T) {
require.Equal(t, "key", tomlSection{}.configKey("key"))
require.Equal(t, "section.key", tomlSection{section: "section"}.configKey("key"))
}


@@ -0,0 +1,144 @@
/*
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
package main
import (
"fmt"
"os"
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
"github.com/NVIDIA/nvidia-container-toolkit/internal/ensure"
"github.com/NVIDIA/nvidia-container-toolkit/internal/filter"
"github.com/NVIDIA/nvidia-container-toolkit/internal/modify"
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
"github.com/NVIDIA/nvidia-container-toolkit/internal/runtime"
log "github.com/sirupsen/logrus"
"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-container-runtime.experimental/config"
)
const (
visibleDevicesEnvvar = "NVIDIA_VISIBLE_DEVICES"
visibleDevicesVoid = "void"
)
var logger = log.New()
func main() {
cfg, err := config.GetConfig(logger)
if err != nil {
logger.Errorf("Error loading config: %v", err)
os.Exit(1)
}
if cfg.DebugFilePath != "" && cfg.DebugFilePath != "/dev/null" {
logFile, err := os.OpenFile(cfg.DebugFilePath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
logger.Errorf("Error opening debug log file: %v", err)
os.Exit(1)
}
defer logFile.Close()
logger.SetOutput(logFile)
}
logLevel, err := log.ParseLevel(cfg.LogLevel)
if err == nil {
logger.SetLevel(logLevel)
} else {
logger.Warnf("Invalid log-level '%v'; using '%v'", cfg.LogLevel, logger.Level.String())
}
logger.Infof("Starting nvidia-container-runtime: %v", os.Args)
logger.Debugf("Using config=%+v", cfg)
if err := run(cfg, os.Args); err != nil {
logger.Errorf("Error running runtime: %v", err)
os.Exit(1)
}
}
func run(cfg *config.Config, args []string) error {
logger.Debugf("running with args=%v", args)
// We create a low-level runtime
lowLevelRuntime, err := createLowLevelRuntime(logger, cfg)
if err != nil {
return fmt.Errorf("error constructing low-level runtime: %v", err)
}
if !oci.HasCreateSubcommand(args) {
logger.Infof("No modification of OCI specification required")
logger.Infof("Forwarding command to runtime")
return lowLevelRuntime.Exec(args)
}
// We create the OCI spec that is to be modified
ociSpec, bundleDir, err := oci.NewSpecFromArgs(args)
if err != nil {
return fmt.Errorf("error constructing OCI spec: %v", err)
}
err = ociSpec.Load()
if err != nil {
return fmt.Errorf("error loading OCI specification: %v", err)
}
visibleDevices, exists := ociSpec.LookupEnv(visibleDevicesEnvvar)
if !exists || visibleDevices == "" || visibleDevices == visibleDevicesVoid {
logger.Infof("Using low-level runtime: %v=%v (exists=%v)", visibleDevicesEnvvar, visibleDevices, exists)
return lowLevelRuntime.Exec(os.Args)
}
// We create the modifier that will be applied by the Modifying Runtime Wrapper
modifier, err := createModifier(cfg.Root, bundleDir, visibleDevices, ociSpec)
if err != nil {
return fmt.Errorf("error constructing modifer: %v", err)
}
// We construct the Modifying runtime
r := runtime.NewModifyingRuntimeWrapperWithLogger(logger, lowLevelRuntime, ociSpec, modifier)
return r.Exec(os.Args)
}
func createLowLevelRuntime(logger *log.Logger, cfg *config.Config) (oci.Runtime, error) {
if cfg.RuntimePath == "" {
return oci.NewLowLevelRuntimeWithLogger(logger, "docker-runc", "runc")
}
logger.Infof("Creating runtime with path %v", cfg.RuntimePath)
return oci.NewRuntimeForPathWithLogger(logger, cfg.RuntimePath)
}
func createModifier(root string, bundleDir string, visibleDevices string, env filter.EnvLookup) (modify.Modifier, error) {
// We set up the modifier using discovery
discovered, err := discover.NewNVMLServerWithLogger(logger, root)
if err != nil {
return nil, fmt.Errorf("error discovering devices: %v", err)
}
// We apply a filter to the discovered devices
selected := filter.NewSelectDevicesFromWithLogger(logger, discovered, visibleDevices, env)
// We ensure that the selected devices are available
available := ensure.NewEnsureDevicesWithLogger(logger, selected, root)
// We construct the modifier for the OCI spec
modifier := modify.NewModifierWithLoggerFor(logger, available, root, bundleDir)
return modifier, nil
}


@@ -1,87 +0,0 @@
# The NVIDIA Container Runtime
The NVIDIA Container Runtime is a shim for OCI-compliant low-level runtimes such as [runc](https://github.com/opencontainers/runc). When a `create` command is detected, the incoming [OCI runtime specification](https://github.com/opencontainers/runtime-spec) is modified in place and the command is forwarded to the low-level runtime.
## Configuration
The NVIDIA Container Runtime uses file-based configuration, with the config stored in `/etc/nvidia-container-runtime/config.toml`. The `/etc` path can be overridden using the `XDG_CONFIG_HOME` environment variable; if this variable is set, the `${XDG_CONFIG_HOME}/nvidia-container-runtime/config.toml` file is used instead.
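For example, assuming a custom config root (the path here is purely illustrative):
```
$ export XDG_CONFIG_HOME=/custom/etc
```
With this set, the config is read from `/custom/etc/nvidia-container-runtime/config.toml` instead of `/etc/nvidia-container-runtime/config.toml`.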
This config file may contain options for other components of the NVIDIA container stack; for the NVIDIA Container Runtime, the relevant config section is `nvidia-container-runtime`.
### Logging
The `log-level` config option (default: `"info"`) specifies the log level to use, and the `debug` option, if set, specifies a log file to which logs for the NVIDIA Container Runtime are written.
In addition, the NVIDIA Container Runtime considers the `--log` and `--log-format` flags that may be passed to it by a container runtime such as docker or containerd. If the `--debug` flag is present, the log level specified in the config file is overridden to `"debug"`.
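As a minimal sketch (the log file path is illustrative), both options are set in the `nvidia-container-runtime` section of the config file:
```toml
[nvidia-container-runtime]
debug = "/var/log/nvidia-container-runtime.log"
log-level = "debug"
```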
### Low-level Runtime Path
The `runtimes` config option allows the low-level runtime to be specified. The first entry in this list that resolves to an existing executable file is used as the low-level runtime. If an entry is not a path, `PATH` is searched for a matching executable; if it is a path, that path is checked directly.
The default value for this setting is:
```toml
runtimes = [
"docker-runc",
"runc",
]
```
If, for example, `crun` is to be used instead, this can be changed to:
```toml
runtimes = [
"crun",
]
```
### Runtime Mode
The `mode` config option (default `"auto"`) controls the high-level behaviour of the runtime.
#### Auto Mode
When `mode` is set to `"auto"`, the runtime employs heuristics to determine which mode to use based on, for example, the platform where the runtime is being run.
#### Legacy Mode
When `mode` is set to `"legacy"`, the NVIDIA Container Runtime adds a [`prestart` hook](https://github.com/opencontainers/runtime-spec/blob/master/config.md#prestart) to the incomming OCI specification that invokes the NVIDIA Container Runtime Hook for all containers created. This hook checks whether NVIDIA devices are requested and ensures GPU access is configured using the `nvidia-container-cli` from the [libnvidia-container](https://github.com/NVIDIA/libnvidia-container) project.
#### CSV Mode
When `mode` is set to `"csv"`, CSV files at `/etc/nvidia-container-runtime/host-files-for-container.d` define the devices and mounts that are to be injected into a container when it is created. The search path for the files can be overridden by modifying the `nvidia-container-runtime.modes.csv.mount-spec-path` in the config as below:
```toml
[nvidia-container-runtime]
[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
```
This mode is primarily targeted at Tegra-based systems without NVML available.
### Notes on using the docker CLI
Note that only the `"legacy"` NVIDIA Container Runtime mode is directly compatible with the `--gpus` flag implemented by the `docker` CLI (assuming the NVIDIA Container Runtime is not used). The reason for this is that `docker` inserts the same NVIDIA Container Runtime Hook into the OCI runtime specification.
If a different mode is explicitly set or detected, the NVIDIA Container Runtime Hook will raise the following error when `--gpus` is set:
```
$ docker run --rm --gpus all ubuntu:18.04
docker: Error response from daemon: failed to create shim: OCI runtime create failed: container_linux.go:380: starting container process caused: process_linux.go:545: container init caused: Running hook #0:: error running hook: exit status 1, stdout: , stderr: Auto-detected mode as 'csv'
invoking the NVIDIA Container Runtime Hook directly (e.g. specifying the docker --gpus flag) is not supported. Please use the NVIDIA Container Runtime instead.: unknown.
```
Here the NVIDIA Container Runtime must be used explicitly. The recommended way to do this is to specify the `--runtime=nvidia` command-line argument as part of the `docker run` command as follows:
```
$ docker run --rm --gpus all --runtime=nvidia ubuntu:18.04
```
Alternatively the NVIDIA Container Runtime can be set as the default runtime for docker. This can be done by modifying the `/etc/docker/daemon.json` file as follows:
```json
{
"default-runtime": "nvidia",
"runtimes": {
"nvidia": {
"path": "nvidia-container-runtime",
"runtimeArgs": []
}
}
}
```
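For the new default runtime to take effect, the docker daemon typically needs to be restarted (for example with `sudo systemctl restart docker`).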


@@ -20,220 +20,60 @@ import (
"fmt"
"io"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"github.com/sirupsen/logrus"
"github.com/tsaikd/KDGoLib/logrusutil"
)
// Logger adds a way to manage output to a log file to a logrus.Logger
type Logger struct {
*logrus.Logger
previousLogger *logrus.Logger
logFiles []*os.File
}
// NewLogger creates an empty logger
func NewLogger() *Logger {
return &Logger{
Logger: logrus.New(),
}
}
// UpdateLogger constructs a Logger with a predefined formatter
func UpdateLogger(filename string, logLevel string, argv []string) (*Logger, error) {
configFromArgs := parseArgs(argv)
level, logLevelError := configFromArgs.getLevel(logLevel)
var logFiles []*os.File
var argLogFileError error
// We don't create log files if the version argument is supplied
if !configFromArgs.version {
configLogFile, err := createLogFile(filename)
if err != nil {
return logger, fmt.Errorf("error opening debug log file: %v", err)
}
if configLogFile != nil {
logFiles = append(logFiles, configLogFile)
}
argLogFile, err := createLogFile(configFromArgs.file)
if argLogFile != nil {
logFiles = append(logFiles, argLogFile)
}
argLogFileError = err
}
l := &Logger{
Logger: logrus.New(),
previousLogger: logger.Logger,
logFiles: logFiles,
}
l.SetLevel(level)
if level == logrus.DebugLevel {
logrus.SetReportCaller(true)
// Shorten function and file names reported by the logger, by
// trimming common "github.com/opencontainers/runc" prefix.
// This is only done for text formatter.
_, file, _, _ := runtime.Caller(0)
prefix := filepath.Dir(file) + "/"
logrus.SetFormatter(&logrus.TextFormatter{
CallerPrettyfier: func(f *runtime.Frame) (string, string) {
function := strings.TrimPrefix(f.Function, prefix) + "()"
fileLine := strings.TrimPrefix(f.File, prefix) + ":" + strconv.Itoa(f.Line)
return function, fileLine
},
})
}
if configFromArgs.format == "json" {
l.SetFormatter(new(logrus.JSONFormatter))
}
if len(logFiles) == 0 {
l.SetOutput(io.Discard)
} else if len(logFiles) == 1 {
l.SetOutput(logFiles[0])
} else if len(logFiles) > 1 {
var writers []io.Writer
for _, f := range logFiles {
writers = append(writers, f)
}
l.SetOutput(io.MultiWriter(writers...))
}
if logLevelError != nil {
l.Warn(logLevelError)
}
if argLogFileError != nil {
l.Warnf("Failed to open log file: %v", argLogFileError)
}
return l, nil
}
// Reset closes the log file (if any) and resets the logger output to what it
// was before UpdateLogger was called.
func (l *Logger) Reset() error {
defer func() {
previous := l.previousLogger
if previous == nil {
previous = logrus.New()
}
logger = &Logger{Logger: previous}
}()
var errs []error
for _, f := range l.logFiles {
err := f.Close()
if err != nil {
errs = append(errs, err)
}
}
var err error
for _, e := range errs {
if err == nil {
err = e
continue
}
return fmt.Errorf("%v; %w", e, err)
}
return err
}
func createLogFile(filename string) (*os.File, error) {
if filename != "" && filename != os.DevNull {
return os.OpenFile(filename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
}
return nil, nil
}
type loggerConfig struct {
file string
format string
debug bool
version bool
}
func (c loggerConfig) getLevel(logLevel string) (logrus.Level, error) {
if c.debug {
return logrus.DebugLevel, nil
}
if logLevel, err := logrus.ParseLevel(logLevel); err == nil {
return logLevel, nil
}
return logrus.InfoLevel, fmt.Errorf("invalid log-level '%v'", logLevel)
}
// Informed by https://github.com/opencontainers/runc/blob/7fd8b57001f5bfa102e89cb434d96bf71f7c1d35/main.go#L182
func parseArgs(args []string) loggerConfig {
c := loggerConfig{}
expected := map[string]*string{
"log-format": &c.format,
"log": &c.file,
}
found := make(map[string]bool)
for i := 0; i < len(args); i++ {
if len(found) == 4 {
break
}
param := args[i]
parts := strings.SplitN(param, "=", 2)
trimmed := strings.TrimLeft(parts[0], "-")
// If this is not a flag we continue
if parts[0] == trimmed {
continue
}
// Check the version flag
if trimmed == "version" {
c.version = true
found["version"] = true
// For the version flag we don't process any other flags
continue
}
// Check the debug flag
if trimmed == "debug" {
c.debug = true
found["debug"] = true
continue
}
destination, exists := expected[trimmed]
if !exists {
continue
}
var value string
if len(parts) == 2 {
value = parts[1]
} else if i+1 < len(args) {
value = args[i+1]
i++
} else {
continue
}
*destination = value
found[trimmed] = true
}
return c
}
// Logger adds a way to manage output to a log file to a logrus.Logger
type Logger struct {
*logrus.Logger
previousOutput io.Writer
logFile *os.File
}
// NewLogger constructs a Logger with a predefined formatter
func NewLogger() *Logger {
logrusLogger := logrus.New()
formatter := &logrusutil.ConsoleLogFormatter{
TimestampFormat: "2006/01/02 15:04:05",
Flag: logrusutil.Ltime,
}
logger := &Logger{
Logger: logrusLogger,
}
logger.SetFormatter(formatter)
return logger
}
// LogToFile opens the specified file for appending and sets the logger to
// output to the opened file. A reference to the file pointer is stored to
// allow this to be closed.
func (l *Logger) LogToFile(filename string) error {
logFile, err := os.OpenFile(filename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
return fmt.Errorf("error opening debug log file: %v", err)
}
l.logFile = logFile
l.previousOutput = l.Out
l.SetOutput(logFile)
return nil
}
// CloseFile closes the log file (if any) and resets the logger output to what it
// was before LogToFile was called.
func (l *Logger) CloseFile() error {
if l.logFile == nil {
return nil
}
logFile := l.logFile
l.SetOutput(l.previousOutput)
l.logFile = nil
return logFile.Close()
}


@@ -3,82 +3,87 @@ package main
import (
"fmt"
"os"
"strings"
"path"
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
"github.com/NVIDIA/nvidia-container-toolkit/internal/info"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/pelletier/go-toml"
)
// version must be set by go build's -X main.version= option in the Makefile.
var version = "unknown"
// gitCommit will be the hash that the binary was built from
// and will be populated by the Makefile
var gitCommit = ""
const (
configOverride = "XDG_CONFIG_HOME"
configFilePath = "nvidia-container-runtime/config.toml"
hookDefaultFilePath = "/usr/bin/nvidia-container-runtime-hook"
)
var (
configDir = "/etc/"
)
var logger = NewLogger()
func main() {
err := run(os.Args)
if err != nil {
logger.Errorf("%v", err)
logger.Errorf("Error running %v: %v", os.Args, err)
os.Exit(1)
}
}
// run is an entry point that allows for idiomatic handling of errors
// when calling from the main function.
func run(argv []string) (rerr error) {
printVersion := hasVersionFlag(argv)
if printVersion {
fmt.Printf("%v version %v\n", "NVIDIA Container Runtime", info.GetVersionString(fmt.Sprintf("spec: %v", specs.Version)))
}
cfg, err := config.GetConfig()
if err != nil {
return fmt.Errorf("error loading config: %v", err)
}
logger, err = UpdateLogger(
cfg.NVIDIAContainerRuntimeConfig.DebugFilePath,
cfg.NVIDIAContainerRuntimeConfig.LogLevel,
argv,
)
if err != nil {
return fmt.Errorf("failed to set up logger: %v", err)
}
defer logger.Reset()
logger.Debugf("Command line arguments: %v", argv)
runtime, err := newNVIDIAContainerRuntime(logger.Logger, cfg, argv)
if err != nil {
return fmt.Errorf("failed to create NVIDIA Container Runtime: %v", err)
}
if printVersion {
fmt.Print("\n")
}
return runtime.Exec(argv)
}
// TODO: This should be refactored / combined with parseArgs in logger.
func hasVersionFlag(args []string) bool {
for i := 0; i < len(args); i++ {
param := args[i]
parts := strings.SplitN(param, "=", 2)
trimmed := strings.TrimLeft(parts[0], "-")
// If this is not a flag we continue
if parts[0] == trimmed {
continue
}
// Check the version flag
if trimmed == "version" {
return true
}
}
return false
}
func run(argv []string) (err error) {
cfg, err := getConfig()
if err != nil {
return fmt.Errorf("error loading config: %v", err)
}
err = logger.LogToFile(cfg.debugFilePath)
if err != nil {
return fmt.Errorf("error opening debug log file: %v", err)
}
defer func() {
// We capture and log a returning error before closing the log file.
if err != nil {
logger.Errorf("Error running %v: %v", argv, err)
}
logger.CloseFile()
}()
logger.Debugf("Command line arguments: %v", argv)
r, err := newRuntime(argv)
if err != nil {
return fmt.Errorf("error creating runtime: %v", err)
}
logger.Printf("Running %s\n", argv[0])
return r.Exec(argv)
}
type config struct {
debugFilePath string
}
// getConfig sets up the config struct. Values are read from a toml file
// or set via the environment.
func getConfig() (*config, error) {
cfg := &config{}
if XDGConfigDir := os.Getenv(configOverride); len(XDGConfigDir) != 0 {
configDir = XDGConfigDir
}
configFilePath := path.Join(configDir, configFilePath)
tomlContent, err := os.ReadFile(configFilePath)
if err != nil {
return nil, err
}
toml, err := toml.Load(string(tomlContent))
if err != nil {
return nil, err
}
cfg.debugFilePath = toml.GetDefault("nvidia-container-runtime.debug", "/dev/null").(string)
return cfg, nil
}


@@ -3,15 +3,15 @@ package main
import (
"bytes"
"encoding/json"
"fmt"
"io/ioutil"
"os"
"os/exec"
"path/filepath"
"runtime"
"strings"
"testing"
"github.com/NVIDIA/nvidia-container-toolkit/internal/modifier"
"github.com/NVIDIA/nvidia-container-toolkit/internal/test"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/stretchr/testify/require"
)
@@ -24,10 +24,6 @@ const (
unmodifiedSpecFileSuffix = "test/input/test_spec.json"
)
const (
runcExecutableName = "runc"
)
type testConfig struct {
root string
binPath string
@@ -39,7 +35,7 @@ func TestMain(m *testing.M) {
// TEST SETUP
// Determine the module root and the test binary path
var err error
moduleRoot, err := test.GetModuleRoot()
moduleRoot, err := getModuleRoot()
if err != nil {
logger.Fatalf("error in test setup: could not get module root: %v", err)
}
@@ -47,7 +43,7 @@ func TestMain(m *testing.M) {
testInputPath := filepath.Join(moduleRoot, "test", "input")
// Set the environment variables for the test
os.Setenv("PATH", test.PrependToPath(testBinPath, moduleRoot))
os.Setenv("PATH", prependToPath(testBinPath, moduleRoot))
os.Setenv("XDG_CONFIG_HOME", testInputPath)
// Confirm that the environment is configured correctly
@@ -75,6 +71,31 @@ func TestMain(m *testing.M) {
os.Exit(exitCode)
}
func getModuleRoot() (string, error) {
_, filename, _, _ := runtime.Caller(0)
return hasGoMod(filename)
}
func hasGoMod(dir string) (string, error) {
if dir == "" || dir == "/" {
return "", fmt.Errorf("module root not found")
}
_, err := os.Stat(filepath.Join(dir, "go.mod"))
if err != nil {
return hasGoMod(filepath.Dir(dir))
}
return dir, nil
}
func prependToPath(additionalPaths ...string) string {
paths := strings.Split(os.Getenv("PATH"), ":")
paths = append(additionalPaths, paths...)
return strings.Join(paths, ":")
}
// case 1) nvidia-container-runtime run --bundle
// case 2) nvidia-container-runtime create --bundle
// - Confirm the runtime handles bad input correctly
@@ -84,6 +105,11 @@ func TestBadInput(t *testing.T) {
t.Fatal(err)
}
cmdRun := exec.Command(nvidiaRuntime, "run", "--bundle")
t.Logf("executing: %s\n", strings.Join(cmdRun.Args, " "))
output, err := cmdRun.CombinedOutput()
require.Errorf(t, err, "runtime should return an error", "output=%v", string(output))
cmdCreate := exec.Command(nvidiaRuntime, "create", "--bundle")
t.Logf("executing: %s\n", strings.Join(cmdCreate.Args, " "))
err = cmdCreate.Run()
@@ -167,11 +193,11 @@ func TestDuplicateHook(t *testing.T) {
require.Equal(t, 1, nvidiaHookCount(spec.Hooks), "exactly one nvidia prestart hook should be inserted correctly into config.json")
}
// addNVIDIAHook is a basic wrapper for an addHookModifier that is used for
// addNVIDIAHook is a basic wrapper for nvidiaContainerRuntime.addNVIDIAHook that is used for
// testing.
func addNVIDIAHook(spec *specs.Spec) error {
m := modifier.NewStableRuntimeModifier(logger.Logger)
return m.Modify(spec)
r := nvidiaContainerRuntime{logger: logger.Logger}
return r.addNVIDIAHook(spec)
}
func (c testConfig) getRuntimeSpec() (specs.Spec, error) {
@@ -244,3 +270,24 @@ func nvidiaHookCount(hooks *specs.Hooks) int {
}
return count
}
func TestGetConfigWithCustomConfig(t *testing.T) {
wd, err := os.Getwd()
require.NoError(t, err)
// By default debug is disabled
contents := []byte("[nvidia-container-runtime]\ndebug = \"/nvidia-container-toolkit.log\"")
testDir := filepath.Join(wd, "test")
filename := filepath.Join(testDir, configFilePath)
os.Setenv(configOverride, testDir)
require.NoError(t, os.MkdirAll(filepath.Dir(filename), 0766))
require.NoError(t, ioutil.WriteFile(filename, contents, 0766))
defer func() { require.NoError(t, os.RemoveAll(testDir)) }()
cfg, err := getConfig()
require.NoError(t, err)
require.Equal(t, cfg.debugFilePath, "/nvidia-container-toolkit.log")
}


@@ -0,0 +1,132 @@
/*
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
package main
import (
"fmt"
"os"
"os/exec"
"strings"
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
"github.com/opencontainers/runtime-spec/specs-go"
log "github.com/sirupsen/logrus"
)
// nvidiaContainerRuntime encapsulates the NVIDIA Container Runtime. It wraps the specified runtime, conditionally
// modifying the specified OCI specification before invoking the runtime.
type nvidiaContainerRuntime struct {
logger *log.Logger
runtime oci.Runtime
ociSpec oci.Spec
}
var _ oci.Runtime = (*nvidiaContainerRuntime)(nil)
// newNvidiaContainerRuntimeWithLogger is a constructor for a standard runtime shim.
func newNvidiaContainerRuntimeWithLogger(logger *log.Logger, runtime oci.Runtime, ociSpec oci.Spec) (oci.Runtime, error) {
r := nvidiaContainerRuntime{
logger: logger,
runtime: runtime,
ociSpec: ociSpec,
}
return &r, nil
}
// Exec defines the entrypoint for the NVIDIA Container Runtime. A check is performed to see whether modifications
// to the OCI spec are required -- and applicable modifications applied. The supplied arguments are then
// forwarded to the underlying runtime's Exec method.
func (r nvidiaContainerRuntime) Exec(args []string) error {
if r.modificationRequired(args) {
err := r.modifyOCISpec()
if err != nil {
return fmt.Errorf("error modifying OCI spec: %v", err)
}
}
r.logger.Println("Forwarding command to runtime")
return r.runtime.Exec(args)
}
// modificationRequired checks the input arguments to determine whether a modification
// to the OCI spec is required.
func (r nvidiaContainerRuntime) modificationRequired(args []string) bool {
if oci.HasCreateSubcommand(args) {
r.logger.Infof("'create' command detected; modification required")
return true
}
r.logger.Infof("No modification required")
return false
}
// modifyOCISpec loads and modifies the OCI spec specified in the nvidiaContainerRuntime
// struct. The spec is modified in-place and written to the same file as the input after
// modifications are applied.
func (r nvidiaContainerRuntime) modifyOCISpec() error {
err := r.ociSpec.Load()
if err != nil {
return fmt.Errorf("error loading OCI specification for modification: %v", err)
}
err = r.ociSpec.Modify(r.addNVIDIAHook)
if err != nil {
return fmt.Errorf("error injecting NVIDIA Container Runtime hook: %v", err)
}
err = r.ociSpec.Flush()
if err != nil {
return fmt.Errorf("error writing modified OCI specification: %v", err)
}
return nil
}
// addNVIDIAHook modifies the specified OCI specification in-place, inserting a
// prestart hook.
func (r nvidiaContainerRuntime) addNVIDIAHook(spec *specs.Spec) error {
path, err := exec.LookPath("nvidia-container-runtime-hook")
if err != nil {
path = hookDefaultFilePath
_, err = os.Stat(path)
if err != nil {
return err
}
}
r.logger.Printf("prestart hook path: %s\n", path)
args := []string{path}
if spec.Hooks == nil {
spec.Hooks = &specs.Hooks{}
} else if len(spec.Hooks.Prestart) != 0 {
for _, hook := range spec.Hooks.Prestart {
if !strings.Contains(hook.Path, "nvidia-container-runtime-hook") {
continue
}
r.logger.Println("existing nvidia prestart hook in OCI spec file")
return nil
}
}
spec.Hooks.Prestart = append(spec.Hooks.Prestart, specs.Hook{
Path: path,
Args: append(args, "prestart"),
})
return nil
}


@@ -0,0 +1,203 @@
/*
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
package main
import (
"fmt"
"strings"
"testing"
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
"github.com/opencontainers/runtime-spec/specs-go"
testlog "github.com/sirupsen/logrus/hooks/test"
"github.com/stretchr/testify/require"
)
func TestAddNvidiaHook(t *testing.T) {
logger, logHook := testlog.NewNullLogger()
shim := nvidiaContainerRuntime{
logger: logger,
}
testCases := []struct {
spec *specs.Spec
errorPrefix string
shouldNotAdd bool
}{
{
spec: &specs.Spec{},
},
{
spec: &specs.Spec{
Hooks: &specs.Hooks{},
},
},
{
spec: &specs.Spec{
Hooks: &specs.Hooks{
Prestart: []specs.Hook{{
Path: "some-hook",
}},
},
},
},
{
spec: &specs.Spec{
Hooks: &specs.Hooks{
Prestart: []specs.Hook{{
Path: "nvidia-container-runtime-hook",
}},
},
},
shouldNotAdd: true,
},
}
for i, tc := range testCases {
logHook.Reset()
var numPrestartHooks int
if tc.spec.Hooks != nil {
numPrestartHooks = len(tc.spec.Hooks.Prestart)
}
err := shim.addNVIDIAHook(tc.spec)
if tc.errorPrefix == "" {
require.NoErrorf(t, err, "%d: %v", i, tc)
} else {
require.Truef(t, strings.HasPrefix(err.Error(), tc.errorPrefix), "%d: %v", i, tc)
require.NotNilf(t, tc.spec.Hooks, "%d: %v", i, tc)
require.Equalf(t, 1, nvidiaHookCount(tc.spec.Hooks), "%d: %v", i, tc)
if tc.shouldNotAdd {
require.Equal(t, numPrestartHooks+1, len(tc.spec.Hooks.Poststart), "%d: %v", i, tc)
} else {
require.Equal(t, numPrestartHooks+1, len(tc.spec.Hooks.Poststart), "%d: %v", i, tc)
nvidiaHook := tc.spec.Hooks.Poststart[len(tc.spec.Hooks.Poststart)-1]
// TODO: This assumes that the hook has been set up in the makefile
expectedPath := "/usr/bin/nvidia-container-runtime-hook"
require.Equalf(t, expectedPath, nvidiaHook.Path, "%d: %v", i, tc)
require.Equalf(t, []string{expectedPath, "prestart"}, nvidiaHook.Args, "%d: %v", i, tc)
require.Emptyf(t, nvidiaHook.Env, "%d: %v", i, tc)
require.Nilf(t, nvidiaHook.Timeout, "%d: %v", i, tc)
}
}
}
}
func TestNvidiaContainerRuntime(t *testing.T) {
logger, hook := testlog.NewNullLogger()
testCases := []struct {
shim nvidiaContainerRuntime
shouldModify bool
args []string
modifyError error
writeError error
}{
{
shim: nvidiaContainerRuntime{},
shouldModify: false,
},
{
shim: nvidiaContainerRuntime{},
args: []string{"create"},
shouldModify: true,
},
{
shim: nvidiaContainerRuntime{},
args: []string{"--bundle=create"},
shouldModify: false,
},
{
shim: nvidiaContainerRuntime{},
args: []string{"--bundle", "create"},
shouldModify: false,
},
{
shim: nvidiaContainerRuntime{},
args: []string{"create"},
shouldModify: true,
},
{
shim: nvidiaContainerRuntime{},
args: []string{"create"},
modifyError: fmt.Errorf("error modifying"),
shouldModify: true,
},
{
shim: nvidiaContainerRuntime{},
args: []string{"create"},
writeError: fmt.Errorf("error writing"),
shouldModify: true,
},
}
for i, tc := range testCases {
tc.shim.logger = logger
hook.Reset()
ociMock := &oci.SpecMock{
ModifyFunc: func(specModifier oci.SpecModifier) error {
return tc.modifyError
},
FlushFunc: func() error {
return tc.writeError
},
}
require.Equal(t, tc.shouldModify, tc.shim.modificationRequired(tc.args), "%d: %v", i, tc)
tc.shim.ociSpec = ociMock
tc.shim.runtime = &MockShim{}
err := tc.shim.Exec(tc.args)
if tc.modifyError != nil || tc.writeError != nil {
require.Error(t, err, "%d: %v", i, tc)
} else {
require.NoError(t, err, "%d: %v", i, tc)
}
if tc.shouldModify {
require.Equal(t, 1, len(ociMock.ModifyCalls()), "%d: %v", i, tc)
} else {
require.Equal(t, 0, len(ociMock.ModifyCalls()), "%d: %v", i, tc)
}
writeExpected := tc.shouldModify && tc.modifyError == nil
if writeExpected {
require.Equal(t, 1, len(ociMock.FlushCalls()), "%d: %v", i, tc)
} else {
require.Equal(t, 0, len(ociMock.FlushCalls()), "%d: %v", i, tc)
}
}
}
type MockShim struct {
called bool
args []string
returnError error
}
func (m *MockShim) Exec(args []string) error {
m.called = true
m.args = args
return m.returnError
}


@@ -1,5 +1,5 @@
/*
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -19,93 +19,56 @@ package main
import (
"fmt"
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
"github.com/NVIDIA/nvidia-container-toolkit/internal/info"
"github.com/NVIDIA/nvidia-container-toolkit/internal/modifier"
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
"github.com/NVIDIA/nvidia-container-toolkit/internal/runtime"
"github.com/sirupsen/logrus"
)
// newNVIDIAContainerRuntime is a factory method that constructs a runtime based on the selected configuration and specified logger
func newNVIDIAContainerRuntime(logger *logrus.Logger, cfg *config.Config, argv []string) (oci.Runtime, error) {
lowLevelRuntime, err := oci.NewLowLevelRuntime(logger, cfg.NVIDIAContainerRuntimeConfig.Runtimes)
if err != nil {
return nil, fmt.Errorf("error constructing low-level runtime: %v", err)
}
if !oci.HasCreateSubcommand(argv) {
logger.Debugf("Skipping modifier for non-create subcommand")
return lowLevelRuntime, nil
}
ociSpec, err := oci.NewSpec(logger, argv)
if err != nil {
return nil, fmt.Errorf("error constructing OCI specification: %v", err)
}
specModifier, err := newSpecModifier(logger, cfg, ociSpec, argv)
if err != nil {
return nil, fmt.Errorf("failed to construct OCI spec modifier: %v", err)
}
// Create the wrapping runtime with the specified modifier
r := runtime.NewModifyingRuntimeWrapper(
logger,
lowLevelRuntime,
ociSpec,
specModifier,
)
return r, nil
}
// newSpecModifier is a factory method that constructs an OCI spec modifier based on the provided config.
func newSpecModifier(logger *logrus.Logger, cfg *config.Config, ociSpec oci.Spec, argv []string) (oci.SpecModifier, error) {
modeModifier, err := newModeModifier(logger, cfg, ociSpec, argv)
if err != nil {
return nil, err
}
graphicsModifier, err := modifier.NewGraphicsModifier(logger, cfg, ociSpec)
if err != nil {
return nil, err
}
gdsModifier, err := modifier.NewGDSModifier(logger, cfg, ociSpec)
if err != nil {
return nil, err
}
mofedModifier, err := modifier.NewMOFEDModifier(logger, cfg, ociSpec)
if err != nil {
return nil, err
}
tegraModifier, err := modifier.NewTegraPlatformFiles(logger)
if err != nil {
return nil, err
}
modifiers := modifier.Merge(
modeModifier,
graphicsModifier,
gdsModifier,
mofedModifier,
tegraModifier,
)
return modifiers, nil
}
func newModeModifier(logger *logrus.Logger, cfg *config.Config, ociSpec oci.Spec, argv []string) (oci.SpecModifier, error) {
switch info.ResolveAutoMode(logger, cfg.NVIDIAContainerRuntimeConfig.Mode) {
case "legacy":
return modifier.NewStableRuntimeModifier(logger), nil
case "csv":
return modifier.NewCSVModifier(logger, cfg, ociSpec)
case "cdi":
return modifier.NewCDIModifier(logger, cfg, ociSpec)
}
return nil, fmt.Errorf("invalid runtime mode: %v", cfg.NVIDIAContainerRuntimeConfig.Mode)
}
const (
ociSpecFileName = "config.json"
dockerRuncExecutableName = "docker-runc"
runcExecutableName = "runc"
)
// newRuntime is a factory method that constructs a runtime based on the selected configuration.
func newRuntime(argv []string) (oci.Runtime, error) {
ociSpec, err := newOCISpec(argv)
if err != nil {
return nil, fmt.Errorf("error constructing OCI specification: %v", err)
}
runc, err := newRuncRuntime()
if err != nil {
return nil, fmt.Errorf("error constructing runc runtime: %v", err)
}
r, err := newNvidiaContainerRuntimeWithLogger(logger.Logger, runc, ociSpec)
if err != nil {
return nil, fmt.Errorf("error constructing NVIDIA Container Runtime: %v", err)
}
return r, nil
}
// newOCISpec constructs an OCI spec for the provided arguments
func newOCISpec(argv []string) (oci.Spec, error) {
bundleDir, err := oci.GetBundleDir(argv)
if err != nil {
return nil, fmt.Errorf("error parsing command line arguments: %v", err)
}
logger.Infof("Using bundle directory: %v", bundleDir)
ociSpecPath := oci.GetSpecFilePath(bundleDir)
logger.Infof("Using OCI specification file path: %v", ociSpecPath)
ociSpec := oci.NewSpecFromFile(ociSpecPath)
return ociSpec, nil
}
// newRuncRuntime locates the runc binary and wraps it in a SyscallExecRuntime
func newRuncRuntime() (oci.Runtime, error) {
return oci.NewLowLevelRuntimeWithLogger(
logger.Logger,
dockerRuncExecutableName,
runcExecutableName,
)
}


@@ -17,113 +17,14 @@
package main
import (
"encoding/json"
"os"
"path/filepath"
"testing"
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
"github.com/opencontainers/runtime-spec/specs-go"
testlog "github.com/sirupsen/logrus/hooks/test"
"github.com/stretchr/testify/require"
)
func TestFactoryMethod(t *testing.T) {
logger, _ := testlog.NewNullLogger()
testCases := []struct {
description string
cfg *config.Config
spec *specs.Spec
expectedError bool
}{
{
description: "empty config raises error",
cfg: &config.Config{
NVIDIAContainerRuntimeConfig: config.RuntimeConfig{},
},
expectedError: true,
},
{
description: "config with runtime raises no error",
cfg: &config.Config{
NVIDIAContainerRuntimeConfig: config.RuntimeConfig{
Runtimes: []string{"runc"},
Mode: "legacy",
},
},
},
{
description: "csv mode is supported",
cfg: &config.Config{
NVIDIAContainerRuntimeConfig: config.RuntimeConfig{
Runtimes: []string{"runc"},
Mode: "csv",
},
},
spec: &specs.Spec{
Process: &specs.Process{
Env: []string{
"NVIDIA_VISIBLE_DEVICES=all",
},
},
},
},
{
description: "non-legacy discover mode raises error",
cfg: &config.Config{
NVIDIAContainerRuntimeConfig: config.RuntimeConfig{
Runtimes: []string{"runc"},
Mode: "non-legacy",
},
},
expectedError: true,
},
{
description: "legacy discover mode returns modifier",
cfg: &config.Config{
NVIDIAContainerRuntimeConfig: config.RuntimeConfig{
Runtimes: []string{"runc"},
Mode: "legacy",
},
},
},
{
description: "csv discover mode returns modifier",
cfg: &config.Config{
NVIDIAContainerRuntimeConfig: config.RuntimeConfig{
Runtimes: []string{"runc"},
Mode: "csv",
},
},
},
{
description: "empty mode raises error",
cfg: &config.Config{
NVIDIAContainerRuntimeConfig: config.RuntimeConfig{
Runtimes: []string{"runc"},
},
},
expectedError: true,
},
}
for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
bundleDir := t.TempDir()
specFile, err := os.Create(filepath.Join(bundleDir, "config.json"))
require.NoError(t, err)
require.NoError(t, json.NewEncoder(specFile).Encode(tc.spec))
argv := []string{"--bundle", bundleDir, "create"}
_, err = newNVIDIAContainerRuntime(logger, tc.cfg, argv)
if tc.expectedError {
require.Error(t, err)
} else {
require.NoError(t, err)
}
})
}
}
func TestConstructor(t *testing.T) {
shim, err := newRuntime([]string{})
require.NoError(t, err)
require.NotNil(t, shim)
}


@@ -7,12 +7,14 @@ import (
"os"
"path"
"path/filepath"
"strconv"
"strings"
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
"golang.org/x/mod/semver"
)
var envSwarmGPU *string
const (
envCUDAVersion = "CUDA_VERSION"
envNVRequirePrefix = "NVIDIA_REQUIRE_"
@@ -102,6 +104,32 @@ type HookState struct {
BundlePath string `json:"bundlePath"`
}
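// parseCudaVersion splits a CUDA version string into its numeric components,
// zeroing any missing fields: "11.4.2" yields (11, 4, 2) and "11.4" yields (11, 4, 0).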
func parseCudaVersion(cudaVersion string) (vmaj, vmin, vpatch uint32) {
if _, err := fmt.Sscanf(cudaVersion, "%d.%d.%d\n", &vmaj, &vmin, &vpatch); err != nil {
vpatch = 0
if _, err := fmt.Sscanf(cudaVersion, "%d.%d\n", &vmaj, &vmin); err != nil {
vmin = 0
if _, err := fmt.Sscanf(cudaVersion, "%d\n", &vmaj); err != nil {
log.Panicln("invalid CUDA version:", cudaVersion)
}
}
}
return
}
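// getEnvMap converts a list of KEY=VALUE strings (such as the output of
// os.Environ) into a map, panicking on any entry without an "=" separator.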
func getEnvMap(e []string) (m map[string]string) {
m = make(map[string]string)
for _, s := range e {
p := strings.SplitN(s, "=", 2)
if len(p) != 2 {
log.Panicln("environment error")
}
m[p[0]] = p[1]
}
return
}
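// Illustrative example: getEnvMap splits each entry on the first '=' only,
// so values that themselves contain '=' survive intact:
//
//	getEnvMap([]string{"NVIDIA_VISIBLE_DEVICES=all", "NVIDIA_REQUIRE_CUDA=cuda>=11.0"})
//	// => map[NVIDIA_REQUIRE_CUDA:cuda>=11.0 NVIDIA_VISIBLE_DEVICES:all]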
func loadSpec(path string) (spec *Spec) {
f, err := os.Open(path)
if err != nil {
@@ -163,18 +191,49 @@ func isPrivileged(s *Spec) bool {
return false
}
func getDevicesFromEnvvar(image image.CUDA, swarmResourceEnvvars []string) *string {
// Build a list of envvars to consider. Note that the Swarm Resource envvars have a higher precedence.
envVars := append(swarmResourceEnvvars, envNVVisibleDevices)
func isLegacyCUDAImage(env map[string]string) bool {
legacyCudaVersion := env[envCUDAVersion]
cudaRequire := env[envNVRequireCUDA]
return len(legacyCudaVersion) > 0 && len(cudaRequire) == 0
}
devices := image.DevicesFromEnvvars(envVars...)
if len(devices) == 0 {
func getDevicesFromEnvvar(env map[string]string, legacyImage bool) *string {
// Build a list of envvars to consider.
envVars := []string{envNVVisibleDevices}
if envSwarmGPU != nil {
// The Swarm envvar has higher precedence.
envVars = append([]string{*envSwarmGPU}, envVars...)
}
// Grab a reference to devices from the first envvar
// in the list that actually exists in the environment.
var devices *string
for _, envVar := range envVars {
if devs, ok := env[envVar]; ok {
devices = &devs
break
}
}
// Environment variable unset with legacy image: default to "all".
if devices == nil && legacyImage {
all := "all"
return &all
}
// Environment variable unset or empty or "void": return nil
if devices == nil || len(*devices) == 0 || *devices == "void" {
return nil
}
devicesString := strings.Join(devices, ",")
// Environment variable set to "none": reset to "".
if *devices == "none" {
empty := ""
return &empty
}
return &devicesString
// Any other value.
return devices
}
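// Illustrative summary of the precedence implemented above, with envSwarmGPU
// pointing at "DOCKER_RESOURCE_GPUS":
//
//	{DOCKER_RESOURCE_GPUS: "GPU-1", NVIDIA_VISIBLE_DEVICES: "all"} => "GPU-1"
//	{} with legacyImage == true                                    => "all"
//	{NVIDIA_VISIBLE_DEVICES: "void"}                               => nil
//	{NVIDIA_VISIBLE_DEVICES: "none"}                               => "" (empty)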
func getDevicesFromMounts(mounts []Mount) *string {
@@ -214,7 +273,7 @@ func getDevicesFromMounts(mounts []Mount) *string {
return &ret
}
func getDevices(hookConfig *HookConfig, image image.CUDA, mounts []Mount, privileged bool) *string {
func getDevices(hookConfig *HookConfig, env map[string]string, mounts []Mount, privileged bool, legacyImage bool) *string {
// If enabled, try and get the device list from volume mounts first
if hookConfig.AcceptDeviceListAsVolumeMounts {
devices := getDevicesFromMounts(mounts)
@@ -224,7 +283,7 @@ func getDevices(hookConfig *HookConfig, image image.CUDA, mounts []Mount, privil
}
// Fallback to reading from the environment variable if privileges are correct
devices := getDevicesFromEnvvar(image, hookConfig.getSwarmResourceEnvvars())
devices := getDevicesFromEnvvar(env, legacyImage)
if devices == nil {
return nil
}
@@ -276,11 +335,27 @@ func getDriverCapabilities(env map[string]string, supportedDriverCapabilities Dr
return capabilities
}
func getNvidiaConfig(hookConfig *HookConfig, image image.CUDA, mounts []Mount, privileged bool) *nvidiaConfig {
legacyImage := image.IsLegacy()
func getRequirements(env map[string]string, legacyImage bool) []string {
// All variables with the "NVIDIA_REQUIRE_" prefix are passed to nvidia-container-cli
var requirements []string
for name, value := range env {
if strings.HasPrefix(name, envNVRequirePrefix) {
requirements = append(requirements, value)
}
}
if legacyImage {
vmaj, vmin, _ := parseCudaVersion(env[envCUDAVersion])
cudaRequire := fmt.Sprintf("cuda>=%d.%d", vmaj, vmin)
requirements = append(requirements, cudaRequire)
}
return requirements
}
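// Illustrative example, using a hypothetical NVIDIA_REQUIRE_BRAND variable to
// show the prefix matching: for a legacy image both the passed-through
// requirement and the synthesized CUDA constraint are returned:
//
//	env = {CUDA_VERSION: "9.0", NVIDIA_REQUIRE_BRAND: "brand=tesla"}, legacyImage = true
//	=> ["brand=tesla", "cuda>=9.0"]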
func getNvidiaConfig(hookConfig *HookConfig, env map[string]string, mounts []Mount, privileged bool) *nvidiaConfig {
legacyImage := isLegacyCUDAImage(env)
var devices string
if d := getDevices(hookConfig, image, mounts, privileged); d != nil {
if d := getDevices(hookConfig, env, mounts, privileged, legacyImage); d != nil {
devices = *d
} else {
// 'nil' devices means this is not a GPU container.
@@ -288,7 +363,7 @@ func getNvidiaConfig(hookConfig *HookConfig, image image.CUDA, mounts []Mount, p
}
var migConfigDevices string
if d := getMigConfigDevices(image); d != nil {
if d := getMigConfigDevices(env); d != nil {
migConfigDevices = *d
}
if !privileged && migConfigDevices != "" {
@@ -296,21 +371,19 @@ func getNvidiaConfig(hookConfig *HookConfig, image image.CUDA, mounts []Mount, p
}
var migMonitorDevices string
if d := getMigMonitorDevices(image); d != nil {
if d := getMigMonitorDevices(env); d != nil {
migMonitorDevices = *d
}
if !privileged && migMonitorDevices != "" {
log.Panicln("cannot set MIG_MONITOR_DEVICES in non privileged container")
}
driverCapabilities := getDriverCapabilities(image, hookConfig.SupportedDriverCapabilities, legacyImage).String()
driverCapabilities := getDriverCapabilities(env, hookConfig.SupportedDriverCapabilities, legacyImage).String()
requirements, err := image.GetRequirements()
if err != nil {
log.Panicln("failed to get requirements", err)
}
requirements := getRequirements(env, legacyImage)
disableRequire := image.HasDisableRequire()
// Don't fail on invalid values.
disableRequire, _ := strconv.ParseBool(env[envNVDisableRequire])
return &nvidiaConfig{
Devices: devices,
@@ -336,16 +409,13 @@ func getContainerConfig(hook HookConfig) (config containerConfig) {
s := loadSpec(path.Join(b, "config.json"))
image, err := image.NewCUDAImageFromEnv(s.Process.Env)
if err != nil {
log.Panicln(err)
}
env := getEnvMap(s.Process.Env)
privileged := isPrivileged(s)
envSwarmGPU = hook.SwarmResource
return containerConfig{
Pid: h.Pid,
Rootfs: s.Root.Path,
Env: image,
Nvidia: getNvidiaConfig(&hook, image, s.Mounts, privileged),
Env: env,
Nvidia: getNvidiaConfig(&hook, env, s.Mounts, privileged),
}
}

View File

@@ -4,7 +4,6 @@ import (
"path/filepath"
"testing"
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
"github.com/stretchr/testify/require"
)
@@ -69,7 +68,7 @@ func TestGetNvidiaConfig(t *testing.T) {
description: "Legacy image, devices 'void', no capabilities, no requirements",
env: map[string]string{
envCUDAVersion: "9.0",
envNVVisibleDevices: "void",
envNVVisibleDevices: "",
},
privileged: false,
expectedConfig: nil,
@@ -226,7 +225,7 @@ func TestGetNvidiaConfig(t *testing.T) {
description: "Modern image, devices 'void', no capabilities, no requirements",
env: map[string]string{
envNVRequireCUDA: "cuda>=9.0",
envNVVisibleDevices: "void",
envNVVisibleDevices: "",
},
privileged: false,
expectedConfig: nil,
@@ -449,44 +448,6 @@ func TestGetNvidiaConfig(t *testing.T) {
DriverCapabilities: defaultDriverCapabilities.String(),
},
},
{
description: "Hook config set, swarmResource overrides device selection",
env: map[string]string{
envNVVisibleDevices: "all",
"DOCKER_SWARM_RESOURCE": "GPU1,GPU2",
},
privileged: true,
hookConfig: &HookConfig{
SwarmResource: func() *string {
s := "DOCKER_SWARM_RESOURCE"
return &s
}(),
SupportedDriverCapabilities: "video,display,utility,compute",
},
expectedConfig: &nvidiaConfig{
Devices: "GPU1,GPU2",
DriverCapabilities: defaultDriverCapabilities.String(),
},
},
{
description: "Hook config set, comma separated swarmResource is split and overrides device selection",
env: map[string]string{
envNVVisibleDevices: "all",
"DOCKER_SWARM_RESOURCE": "GPU1,GPU2",
},
privileged: true,
hookConfig: &HookConfig{
SwarmResource: func() *string {
s := "NOT_DOCKER_SWARM_RESOURCE,DOCKER_SWARM_RESOURCE"
return &s
}(),
SupportedDriverCapabilities: "video,display,utility,compute",
},
expectedConfig: &nvidiaConfig{
Devices: "GPU1,GPU2",
DriverCapabilities: defaultDriverCapabilities.String(),
},
},
}
for _, tc := range tests {
t.Run(tc.description, func(t *testing.T) {
@@ -710,7 +671,7 @@ func TestDeviceListSourcePriority(t *testing.T) {
hookConfig := getDefaultHookConfig()
hookConfig.AcceptEnvvarUnprivileged = tc.acceptUnprivileged
hookConfig.AcceptDeviceListAsVolumeMounts = tc.acceptMounts
devices = getDevices(&hookConfig, env, tc.mountDevices, tc.privileged)
devices = getDevices(&hookConfig, env, tc.mountDevices, tc.privileged, false)
}
// For all other tests, just grab the devices and check the results
@@ -727,13 +688,13 @@ func TestGetDevicesFromEnvvar(t *testing.T) {
envDockerResourceGPUs := "DOCKER_RESOURCE_GPUS"
gpuID := "GPU-12345"
anotherGPUID := "GPU-67890"
thirdGPUID := "MIG-12345"
var tests = []struct {
description string
swarmResourceEnvvars []string
env map[string]string
expectedDevices *string
description string
envSwarmGPU *string
env map[string]string
legacyImage bool
expectedDevices *string
}{
{
description: "empty env returns nil for non-legacy image",
@@ -768,15 +729,13 @@ func TestGetDevicesFromEnvvar(t *testing.T) {
description: "NVIDIA_VISIBLE_DEVICES set returns value for legacy image",
env: map[string]string{
envNVVisibleDevices: gpuID,
envCUDAVersion: "legacy",
},
legacyImage: true,
expectedDevices: &gpuID,
},
{
description: "empty env returns all for legacy image",
env: map[string]string{
envCUDAVersion: "legacy",
},
description: "empty env returns all for legacy image",
legacyImage: true,
expectedDevices: &all,
},
// Add the `DOCKER_RESOURCE_GPUS` envvar and ensure that this is ignored when
@@ -822,113 +781,86 @@ func TestGetDevicesFromEnvvar(t *testing.T) {
env: map[string]string{
envNVVisibleDevices: gpuID,
envDockerResourceGPUs: anotherGPUID,
envCUDAVersion: "legacy",
},
legacyImage: true,
expectedDevices: &gpuID,
},
{
description: "empty env returns all for legacy image",
env: map[string]string{
envDockerResourceGPUs: anotherGPUID,
envCUDAVersion: "legacy",
},
legacyImage: true,
expectedDevices: &all,
},
// Add the `DOCKER_RESOURCE_GPUS` envvar and ensure that this is selected when
// enabled
{
description: "empty env returns nil for non-legacy image",
swarmResourceEnvvars: []string{envDockerResourceGPUs},
description: "empty env returns nil for non-legacy image",
envSwarmGPU: &envDockerResourceGPUs,
},
{
description: "blank DOCKER_RESOURCE_GPUS returns nil for non-legacy image",
swarmResourceEnvvars: []string{envDockerResourceGPUs},
description: "blank DOCKER_RESOURCE_GPUS returns nil for non-legacy image",
envSwarmGPU: &envDockerResourceGPUs,
env: map[string]string{
envDockerResourceGPUs: "",
},
},
{
description: "'void' DOCKER_RESOURCE_GPUS returns nil for non-legacy image",
swarmResourceEnvvars: []string{envDockerResourceGPUs},
description: "'void' DOCKER_RESOURCE_GPUS returns nil for non-legacy image",
envSwarmGPU: &envDockerResourceGPUs,
env: map[string]string{
envDockerResourceGPUs: "void",
},
},
{
description: "'none' DOCKER_RESOURCE_GPUS returns empty for non-legacy image",
swarmResourceEnvvars: []string{envDockerResourceGPUs},
description: "'none' DOCKER_RESOURCE_GPUS returns empty for non-legacy image",
envSwarmGPU: &envDockerResourceGPUs,
env: map[string]string{
envDockerResourceGPUs: "none",
},
expectedDevices: &empty,
},
{
description: "DOCKER_RESOURCE_GPUS set returns value for non-legacy image",
swarmResourceEnvvars: []string{envDockerResourceGPUs},
description: "DOCKER_RESOURCE_GPUS set returns value for non-legacy image",
envSwarmGPU: &envDockerResourceGPUs,
env: map[string]string{
envDockerResourceGPUs: gpuID,
},
expectedDevices: &gpuID,
},
{
description: "DOCKER_RESOURCE_GPUS set returns value for legacy image",
swarmResourceEnvvars: []string{envDockerResourceGPUs},
description: "DOCKER_RESOURCE_GPUS set returns value for legacy image",
envSwarmGPU: &envDockerResourceGPUs,
env: map[string]string{
envDockerResourceGPUs: gpuID,
envCUDAVersion: "legacy",
},
legacyImage: true,
expectedDevices: &gpuID,
},
{
description: "DOCKER_RESOURCE_GPUS is selected if present",
swarmResourceEnvvars: []string{envDockerResourceGPUs},
description: "DOCKER_RESOURCE_GPUS is selected if present",
envSwarmGPU: &envDockerResourceGPUs,
env: map[string]string{
envDockerResourceGPUs: anotherGPUID,
},
expectedDevices: &anotherGPUID,
},
{
description: "DOCKER_RESOURCE_GPUS overrides NVIDIA_VISIBLE_DEVICES if present",
swarmResourceEnvvars: []string{envDockerResourceGPUs},
description: "DOCKER_RESOURCE_GPUS overrides NVIDIA_VISIBLE_DEVICES if present",
envSwarmGPU: &envDockerResourceGPUs,
env: map[string]string{
envNVVisibleDevices: gpuID,
envDockerResourceGPUs: anotherGPUID,
},
expectedDevices: &anotherGPUID,
},
{
description: "DOCKER_RESOURCE_GPUS_ADDITIONAL overrides NVIDIA_VISIBLE_DEVICES if present",
swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS_ADDITIONAL"},
env: map[string]string{
envNVVisibleDevices: gpuID,
"DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID,
},
expectedDevices: &anotherGPUID,
},
{
description: "First available swarm resource envvar is selected and overrides NVIDIA_VISIBLE_DEVICES if present",
swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"},
env: map[string]string{
envNVVisibleDevices: gpuID,
"DOCKER_RESOURCE_GPUS": thirdGPUID,
"DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID,
},
expectedDevices: &thirdGPUID,
},
{
description: "DOCKER_RESOURCE_GPUS_ADDITIONAL or DOCKER_RESOURCE_GPUS overrides NVIDIA_VISIBLE_DEVICES if present",
swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"},
env: map[string]string{
envNVVisibleDevices: gpuID,
"DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID,
},
expectedDevices: &anotherGPUID,
},
}
for i, tc := range tests {
t.Run(tc.description, func(t *testing.T) {
devices := getDevicesFromEnvvar(image.CUDA(tc.env), tc.swarmResourceEnvvars)
envSwarmGPU = tc.envSwarmGPU
devices := getDevicesFromEnvvar(tc.env, tc.legacyImage)
if tc.expectedDevices == nil {
require.Nil(t, devices, "%d: %v", i, tc)
return

View File

@@ -5,10 +5,8 @@ import (
"os"
"path"
"reflect"
"strings"
"github.com/BurntSushi/toml"
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
)
const (
@@ -35,7 +33,7 @@ type CLIConfig struct {
Ldconfig *string `toml:"ldconfig"`
}
// HookConfig : options for the nvidia-container-runtime-hook.
// HookConfig : options for the nvidia-container-toolkit.
type HookConfig struct {
DisableRequire bool `toml:"disable-require"`
SwarmResource *string `toml:"swarm-resource"`
@@ -43,11 +41,10 @@ type HookConfig struct {
AcceptDeviceListAsVolumeMounts bool `toml:"accept-nvidia-visible-devices-as-volume-mounts"`
SupportedDriverCapabilities DriverCapabilities `toml:"supported-driver-capabilities"`
NvidiaContainerCLI CLIConfig `toml:"nvidia-container-cli"`
NVIDIAContainerRuntime config.RuntimeConfig `toml:"nvidia-container-runtime"`
NvidiaContainerCLI CLIConfig `toml:"nvidia-container-cli"`
}
func getDefaultHookConfig() HookConfig {
func getDefaultHookConfig() (config HookConfig) {
return HookConfig{
DisableRequire: false,
SwarmResource: nil,
@@ -66,7 +63,6 @@ func getDefaultHookConfig() HookConfig {
User: nil,
Ldconfig: nil,
},
NVIDIAContainerRuntime: *config.GetDefaultRuntimeConfig(),
}
}
@@ -117,22 +113,3 @@ func (c HookConfig) getConfigOption(fieldName string) string {
}
return v
}
// getSwarmResourceEnvvars returns the swarm resource envvars for the config.
func (c *HookConfig) getSwarmResourceEnvvars() []string {
if c.SwarmResource == nil {
return nil
}
candidates := strings.Split(*c.SwarmResource, ",")
var envvars []string
for _, c := range candidates {
trimmed := strings.TrimSpace(c)
if len(trimmed) > 0 {
envvars = append(envvars, trimmed)
}
}
return envvars
}

View File

@@ -103,59 +103,3 @@ func TestGetHookConfig(t *testing.T) {
})
}
}
func TestGetSwarmResourceEnvvars(t *testing.T) {
testCases := []struct {
value string
expected []string
}{
{
value: "nil",
expected: nil,
},
{
value: "",
expected: nil,
},
{
value: " ",
expected: nil,
},
{
value: "single",
expected: []string{"single"},
},
{
value: "single ",
expected: []string{"single"},
},
{
value: "one,two",
expected: []string{"one", "two"},
},
{
value: "one ,two",
expected: []string{"one", "two"},
},
{
value: "one, two",
expected: []string{"one", "two"},
},
}
for i, tc := range testCases {
t.Run(fmt.Sprintf("%d", i), func(t *testing.T) {
c := &HookConfig{
SwarmResource: func() *string {
if tc.value == "nil" {
return nil
}
return &tc.value
}(),
}
envvars := c.getSwarmResourceEnvvars()
require.EqualValues(t, tc.expected, envvars)
})
}
}

View File

@@ -7,6 +7,51 @@ import (
"github.com/stretchr/testify/require"
)
func TestParseCudaVersionValid(t *testing.T) {
var tests = []struct {
version string
expected [3]uint32
}{
{"0", [3]uint32{0, 0, 0}},
{"8", [3]uint32{8, 0, 0}},
{"7.5", [3]uint32{7, 5, 0}},
{"9.0.116", [3]uint32{9, 0, 116}},
{"4294967295.4294967295.4294967295", [3]uint32{4294967295, 4294967295, 4294967295}},
}
for i, c := range tests {
vmaj, vmin, vpatch := parseCudaVersion(c.version)
version := [3]uint32{vmaj, vmin, vpatch}
require.Equal(t, c.expected, version, "%d: %v", i, c)
}
}
func TestParseCudaVersionInvalid(t *testing.T) {
var tests = []string{
"foo",
"foo.5.10",
"9.0.116.50",
"9.0.116foo",
"7.foo",
"9.0.bar",
"9.4294967296",
"9.0.116.",
"9..0",
"9.",
".5.10",
"-9",
"+9",
"-9.1.116",
"-9.-1.-116",
}
for _, c := range tests {
require.Panics(t, func() {
parseCudaVersion(c)
}, "parseCudaVersion(%v)", c)
}
}
func TestIsPrivileged(t *testing.T) {
var tests = []struct {
spec string

View File

@@ -6,21 +6,20 @@ import (
"log"
"os"
"os/exec"
"path"
"path/filepath"
"runtime"
"runtime/debug"
"strconv"
"strings"
"syscall"
"github.com/NVIDIA/nvidia-container-toolkit/internal/info"
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
)
var (
debugflag = flag.Bool("debug", false, "enable debug output")
versionflag = flag.Bool("version", false, "enable version output")
configflag = flag.String("config", "", "configuration file")
debugflag = flag.Bool("debug", false, "enable debug output")
configflag = flag.String("config", "", "configuration file")
defaultPATH = []string{"/usr/local/sbin", "/usr/local/bin", "/usr/sbin", "/usr/bin", "/sbin", "/bin"}
)
func exit() {
@@ -36,16 +35,28 @@ func exit() {
os.Exit(0)
}
func getPATH(config CLIConfig) string {
dirs := filepath.SplitList(os.Getenv("PATH"))
// directories from the hook environment have higher precedence
dirs = append(dirs, defaultPATH...)
if config.Root != nil {
rootDirs := []string{}
for _, dir := range dirs {
rootDirs = append(rootDirs, path.Join(*config.Root, dir))
}
// directories with the root prefix have higher precedence
dirs = append(rootDirs, dirs...)
}
return strings.Join(dirs, ":")
}
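// Illustrative example: with PATH=/usr/bin and config.Root set to
// "/run/nvidia/driver", the root-prefixed directories come first:
//
//	/run/nvidia/driver/usr/bin:/run/nvidia/driver/usr/local/sbin:...:/usr/bin:/usr/local/sbin:...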
func getCLIPath(config CLIConfig) string {
if config.Path != nil {
return *config.Path
}
var root string
if config.Root != nil {
root = *config.Root
}
if err := os.Setenv("PATH", lookup.GetPath(root)); err != nil {
if err := os.Setenv("PATH", getPATH(config)); err != nil {
log.Panicln("couldn't set PATH variable:", err)
}
@@ -74,10 +85,6 @@ func doPrestart() {
hook := getHookConfig()
cli := hook.NvidiaContainerCLI
if info.ResolveAutoMode(&logInterceptor{}, hook.NVIDIAContainerRuntime.Mode) != "legacy" {
log.Panicln("invoking the NVIDIA Container Runtime Hook directly (e.g. specifying the docker --gpus flag) is not supported. Please use the NVIDIA Container Runtime (e.g. specify the --runtime=nvidia flag) instead.")
}
container := getContainerConfig(hook)
nvidia := container.Nvidia
if nvidia == nil {
@@ -160,11 +167,6 @@ func main() {
flag.Usage = usage
flag.Parse()
if *versionflag {
fmt.Printf("%v version %v\n", "NVIDIA Container Runtime Hook", info.GetVersionString())
return
}
args := flag.Args()
if len(args) == 0 {
flag.Usage()
@@ -184,12 +186,3 @@ func main() {
os.Exit(2)
}
}
// logInterceptor implements the info.Logger interface to allow for logging from this function.
type logInterceptor struct{}
func (l *logInterceptor) Infof(format string, args ...interface{}) {
log.Printf(format, args...)
}
func (l *logInterceptor) Debugf(format string, args ...interface{}) {}

View File

@@ -1,17 +0,0 @@
# NVIDIA Container Toolkit CLI
The NVIDIA Container Toolkit CLI `nvidia-ctk` provides a number of utilities that are useful for working with the NVIDIA Container Toolkit.
## Functionality
### Configure runtimes
The `runtime` command of the `nvidia-ctk` CLI provides a set of utilities related to the configuration
and management of supported container engines.
For example, running the following command:
```bash
nvidia-ctk runtime configure --set-as-default
```
will ensure that the NVIDIA Container Runtime is added as the default runtime to the default container
engine.
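Based on the docker configuration helpers included in this changeset, the update leaves
`/etc/docker/daemon.json` looking roughly as follows (illustrative; the exact result depends
on any pre-existing configuration):
```json
{
    "default-runtime": "nvidia",
    "runtimes": {
        "nvidia": {
            "path": "nvidia-container-runtime",
            "args": []
        }
    }
}
```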

View File

@@ -1,229 +0,0 @@
/**
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package symlinks
import (
"fmt"
"os"
"path/filepath"
"strings"
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover/csv"
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
"github.com/sirupsen/logrus"
"github.com/urfave/cli/v2"
)
type command struct {
logger *logrus.Logger
}
type config struct {
hostRoot string
filenames cli.StringSlice
links cli.StringSlice
containerSpec string
}
// NewCommand constructs a hook command with the specified logger
func NewCommand(logger *logrus.Logger) *cli.Command {
c := command{
logger: logger,
}
return c.build()
}
// build the create-symlinks command
func (m command) build() *cli.Command {
cfg := config{}
// Create the 'create-symlinks' command
c := cli.Command{
Name: "create-symlinks",
Usage: "A hook to create symlinks in the container. This can be used to proces CSV mount specs",
Action: func(c *cli.Context) error {
return m.run(c, &cfg)
},
}
c.Flags = []cli.Flag{
&cli.StringFlag{
Name: "host-root",
Usage: "The root on the host filesystem to use to resolve symlinks",
Destination: &cfg.hostRoot,
},
&cli.StringSliceFlag{
Name: "csv-filename",
Usage: "Specify a (CSV) filename to process",
Destination: &cfg.filenames,
},
&cli.StringSliceFlag{
Name: "link",
Usage: "Specify a specific link to create. The link is specified as source:target",
Destination: &cfg.links,
},
&cli.StringFlag{
Name: "container-spec",
Usage: "Specify the path to the OCI container spec. If empty or '-' the spec will be read from STDIN",
Destination: &cfg.containerSpec,
},
}
return &c
}
func (m command) run(c *cli.Context, cfg *config) error {
s, err := oci.LoadContainerState(cfg.containerSpec)
if err != nil {
return fmt.Errorf("failed to load container state: %v", err)
}
containerRoot, err := s.GetContainerRoot()
if err != nil {
return fmt.Errorf("failed to determined container root: %v", err)
}
csvFiles := cfg.filenames.Value()
chainLocator := lookup.NewSymlinkChainLocator(m.logger, cfg.hostRoot)
var candidates []string
for _, file := range csvFiles {
mountSpecs, err := csv.NewCSVFileParser(m.logger, file).Parse()
if err != nil {
m.logger.Debugf("Skipping CSV file %v: %v", file, err)
continue
}
for _, ms := range mountSpecs {
if ms.Type != csv.MountSpecSym {
continue
}
targets, err := chainLocator.Locate(ms.Path)
if err != nil {
m.logger.Warnf("Failed to locate symlink %v", ms.Path)
}
candidates = append(candidates, targets...)
}
}
created := make(map[string]bool)
// candidates is a list of absolute paths to symlinks in a chain, or the final target of the chain.
for _, candidate := range candidates {
targets, err := m.Locate(candidate)
if err != nil {
m.logger.Debugf("Skipping invalid link: %v", err)
continue
} else if len(targets) != 1 {
m.logger.Debugf("Unexepected number of targets: %v", targets)
continue
} else if targets[0] == candidate {
m.logger.Debugf("%v is not a symlink", candidate)
continue
}
err = m.createLink(created, cfg.hostRoot, containerRoot, targets[0], candidate)
if err != nil {
m.logger.Warnf("Failed to create link %v: %v", []string{targets[0], candidate}, err)
}
}
links := cfg.links.Value()
for _, l := range links {
parts := strings.Split(l, ":")
if len(parts) != 2 {
m.logger.Warnf("Invalid link specification %v", l)
continue
}
err := m.createLink(created, cfg.hostRoot, containerRoot, parts[0], parts[1])
if err != nil {
m.logger.Warnf("Failed to create link %v: %v", parts, err)
}
}
return nil
}
func (m command) createLink(created map[string]bool, hostRoot string, containerRoot string, target string, link string) error {
linkPath, err := changeRoot(hostRoot, containerRoot, link)
if err != nil {
m.logger.Warnf("Failed to resolve path for link %v relative to %v: %v", link, containerRoot, err)
}
if created[linkPath] {
m.logger.Debugf("Link %v already created", linkPath)
return nil
}
targetPath, err := changeRoot(hostRoot, "/", target)
if err != nil {
m.logger.Warnf("Failed to resolve path for target %v relative to %v: %v", target, "/", err)
}
m.logger.Infof("Symlinking %v to %v", linkPath, targetPath)
err = os.MkdirAll(filepath.Dir(linkPath), 0755)
if err != nil {
return fmt.Errorf("failed to create directory: %v", err)
}
err = os.Symlink(target, linkPath)
if err != nil {
return fmt.Errorf("failed to create symlink: %v", err)
}
return nil
}
func changeRoot(current string, new string, path string) (string, error) {
if !filepath.IsAbs(path) {
return path, nil
}
relative := path
if current != "" {
r, err := filepath.Rel(current, path)
if err != nil {
return "", err
}
relative = r
}
return filepath.Join(new, relative), nil
}
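// Illustrative examples: changeRoot rebases an absolute path from one root
// onto another and leaves relative paths untouched:
//
//	changeRoot("/host", "/", "/host/usr/lib/libcuda.so")     // => "/usr/lib/libcuda.so"
//	changeRoot("", "/container-root", "/usr/lib/libcuda.so") // => "/container-root/usr/lib/libcuda.so"
//	changeRoot("/host", "/container-root", "relative/path")  // => "relative/path"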
// Locate returns the link target of the specified filename or an empty slice if the
// specified filename is not a symlink.
func (m command) Locate(filename string) ([]string, error) {
info, err := os.Lstat(filename)
if err != nil {
return nil, fmt.Errorf("failed to get file info: %v", info)
}
if info.Mode()&os.ModeSymlink == 0 {
m.logger.Debugf("%v is not a symlink", filename)
return nil, nil
}
target, err := os.Readlink(filename)
if err != nil {
return nil, fmt.Errorf("error checking symlink: %v", err)
}
m.logger.Debugf("Resolved link: '%v' => '%v'", filename, target)
return []string{target}, nil
}

View File

@@ -1,52 +0,0 @@
/**
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package hook
import (
symlinks "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk/hook/create-symlinks"
ldcache "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk/hook/update-ldcache"
"github.com/sirupsen/logrus"
"github.com/urfave/cli/v2"
)
type hookCommand struct {
logger *logrus.Logger
}
// NewCommand constructs a hook command with the specified logger
func NewCommand(logger *logrus.Logger) *cli.Command {
c := hookCommand{
logger: logger,
}
return c.build()
}
// build the hook command
func (m hookCommand) build() *cli.Command {
// Create the 'hook' command
hook := cli.Command{
Name: "hook",
Usage: "A collection of hooks that may be injected into an OCI spec",
}
hook.Subcommands = []*cli.Command{
ldcache.NewCommand(m.logger),
symlinks.NewCommand(m.logger),
}
return &hook
}

View File

@@ -1,129 +0,0 @@
/**
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package ldcache
import (
"fmt"
"os"
"path/filepath"
"syscall"
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
"github.com/sirupsen/logrus"
"github.com/urfave/cli/v2"
)
type command struct {
logger *logrus.Logger
}
type config struct {
folders cli.StringSlice
containerSpec string
}
// NewCommand constructs an update-ldcache command with the specified logger
func NewCommand(logger *logrus.Logger) *cli.Command {
c := command{
logger: logger,
}
return c.build()
}
// build the update-ldcache command
func (m command) build() *cli.Command {
cfg := config{}
// Create the 'update-ldcache' command
c := cli.Command{
Name: "update-ldcache",
Usage: "Update ldcache in a container by running ldconfig",
Action: func(c *cli.Context) error {
return m.run(c, &cfg)
},
}
c.Flags = []cli.Flag{
&cli.StringSliceFlag{
Name: "folder",
Usage: "Specifiy a folder to add to /etc/ld.so.conf before updating the ld cache",
Destination: &cfg.folders,
},
&cli.StringFlag{
Name: "container-spec",
Usage: "Specify the path to the OCI container spec. If empty or '-' the spec will be read from STDIN",
Destination: &cfg.containerSpec,
},
}
return &c
}
func (m command) run(c *cli.Context, cfg *config) error {
s, err := oci.LoadContainerState(cfg.containerSpec)
if err != nil {
return fmt.Errorf("failed to load container state: %v", err)
}
containerRoot, err := s.GetContainerRoot()
if err != nil {
return fmt.Errorf("failed to determined container root: %v", err)
}
err = m.createConfig(containerRoot, cfg.folders.Value())
if err != nil {
return fmt.Errorf("failed to update ld.so.conf: %v", err)
}
args := []string{"/sbin/ldconfig"}
if containerRoot != "" {
args = append(args, "-r", containerRoot)
}
return syscall.Exec(args[0], args, nil)
}
// createConfig creates (or updates) /etc/ld.so.conf.d/nvcr-<RANDOM_STRING>.conf in the container
// to include the required paths.
func (m command) createConfig(root string, folders []string) error {
if len(folders) == 0 {
m.logger.Debugf("No folders to add to /etc/ld.so.conf")
return nil
}
configFile, err := os.CreateTemp(filepath.Join(root, "/etc/ld.so.conf.d"), "nvcr-*.conf")
if err != nil {
return fmt.Errorf("failed to create config file: %v", err)
}
defer configFile.Close()
m.logger.Debugf("Adding folders %v to %v", folders, configFile.Name())
configured := make(map[string]bool)
for _, folder := range folders {
if configured[folder] {
continue
}
_, err = configFile.WriteString(fmt.Sprintf("%s\n", folder))
if err != nil {
return fmt.Errorf("failed to update ld.so.conf.d: %v", err)
}
configured[folder] = true
}
return nil
}
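// Illustrative example: createConfig(root, []string{"/usr/lib/x86_64-linux-gnu", "/usr/lib64"})
// writes a file such as <root>/etc/ld.so.conf.d/nvcr-abc123.conf (random suffix)
// containing each folder once, one per line:
//
//	/usr/lib/x86_64-linux-gnu
//	/usr/lib64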

View File

@@ -1,386 +0,0 @@
/**
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package cdi
import (
"fmt"
"io"
"os"
"path/filepath"
"strings"
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
"github.com/NVIDIA/nvidia-container-toolkit/internal/ldcache"
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
specs "github.com/container-orchestrated-devices/container-device-interface/specs-go"
"github.com/sirupsen/logrus"
"github.com/urfave/cli/v2"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
"sigs.k8s.io/yaml"
)
const (
nvidiaCTKExecutable = "nvidia-ctk"
nvidiaCTKDefaultFilePath = "/usr/bin/" + nvidiaCTKExecutable
)
type command struct {
logger *logrus.Logger
}
type config struct {
output string
jsonMode bool
}
// NewCommand constructs a generate-cdi command with the specified logger
func NewCommand(logger *logrus.Logger) *cli.Command {
c := command{
logger: logger,
}
return c.build()
}
// build creates the CLI command
func (m command) build() *cli.Command {
cfg := config{}
// Create the 'generate-cdi' command
c := cli.Command{
Name: "generate-cdi",
Usage: "Generate CDI specifications for use with CDI-enabled runtimes",
Action: func(c *cli.Context) error {
return m.run(c, &cfg)
},
}
c.Flags = []cli.Flag{
&cli.StringFlag{
Name: "output",
Usage: "Specify the file to output the generated CDI specification to. If this is '-' or '' the specification is output to STDOUT",
Destination: &cfg.output,
},
&cli.BoolFlag{
Name: "json",
Usage: "Output the generated CDI spec in JSON mode instead of YAML",
Destination: &cfg.jsonMode,
},
}
return &c
}
func (m command) run(c *cli.Context, cfg *config) error {
spec, err := m.generateSpec()
if err != nil {
return fmt.Errorf("failed to generate CDI spec: %v", err)
}
var outputTo io.Writer
if cfg.output == "" || cfg.output == "-" {
outputTo = os.Stdout
} else {
outputFile, err := os.Create(cfg.output)
if err != nil {
return fmt.Errorf("failed to create output file: %v", err)
}
defer outputFile.Close()
outputTo = outputFile
}
if filepath.Ext(cfg.output) == ".json" {
cfg.jsonMode = true
} else if filepath.Ext(cfg.output) == ".yaml" || filepath.Ext(cfg.output) == ".yml" {
cfg.jsonMode = false
}
data, err := yaml.Marshal(spec)
if err != nil {
return fmt.Errorf("failed to marshal CDI spec: %v", err)
}
if cfg.jsonMode {
data, err = yaml.YAMLToJSONStrict(data)
if err != nil {
return fmt.Errorf("failed to convert CDI spec from YAML to JSON: %v", err)
}
}
_, err = outputTo.Write(data)
if err != nil {
return fmt.Errorf("failed to write output: %v", err)
}
return nil
}
func (m command) generateSpec() (*specs.Spec, error) {
nvmllib := nvml.New()
if r := nvmllib.Init(); r != nvml.SUCCESS {
return nil, r
}
defer nvmllib.Shutdown()
devicelib := device.New(device.WithNvml(nvmllib))
spec := specs.Spec{
Version: "0.4.0",
Kind: "nvidia.com/gpu",
ContainerEdits: specs.ContainerEdits{},
}
err := devicelib.VisitDevices(func(i int, d device.Device) error {
isMig, err := d.IsMigEnabled()
if err != nil {
return fmt.Errorf("failed to check whether device is MIG device: %v", err)
}
if isMig {
return nil
}
device, err := generateEditsForDevice(newGPUDevice(i, d))
if err != nil {
return fmt.Errorf("failed to generate CDI spec for device %v: %v", i, err)
}
spec.Devices = append(spec.Devices, device)
return nil
})
if err != nil {
return nil, fmt.Errorf("failed to generate CDI spec for GPU devices: %v", err)
}
err = devicelib.VisitMigDevices(func(i int, d device.Device, j int, m device.MigDevice) error {
device, err := generateEditsForDevice(newMigDevice(i, j, m))
if err != nil {
return fmt.Errorf("failed to generate CDI spec for device %v: %v", i, err)
}
spec.Devices = append(spec.Devices, device)
return nil
})
if err != nil {
return nil, fmt.Errorf("falied to generate CDI spec for MIG devices: %v", err)
}
// We create an "all" device with all the discovered device nodes
var allDeviceNodes []*specs.DeviceNode
for _, d := range spec.Devices {
for _, dn := range d.ContainerEdits.DeviceNodes {
allDeviceNodes = append(allDeviceNodes, dn)
}
}
all := specs.Device{
Name: "all",
ContainerEdits: specs.ContainerEdits{
DeviceNodes: allDeviceNodes,
},
}
spec.Devices = append(spec.Devices, all)
spec.ContainerEdits.DeviceNodes = m.getExistingMetaDeviceNodes()
libraries, err := m.findLibs(nvmllib)
if err != nil {
return nil, fmt.Errorf("failed to locate driver libraries: %v", err)
}
binaries, err := m.findBinaries()
if err != nil {
return nil, fmt.Errorf("failed to locate driver binaries: %v", err)
}
ipcs, err := m.findIPC()
if err != nil {
return nil, fmt.Errorf("failed to locate driver IPC sockets: %v", err)
}
spec.ContainerEdits.Mounts = generateMountsForPaths(libraries, binaries, ipcs)
ldcacheUpdateHook := m.generateUpdateLdCacheHook(libraries)
spec.ContainerEdits.Hooks = []*specs.Hook{ldcacheUpdateHook}
return &spec, nil
}
func generateEditsForDevice(name string, d deviceInfo) (specs.Device, error) {
deviceNodePaths, err := d.GetDeviceNodes()
if err != nil {
return specs.Device{}, fmt.Errorf("failed to get paths for device: %v", err)
}
deviceNodes := getDeviceNodesFromPaths(deviceNodePaths)
device := specs.Device{
Name: name,
ContainerEdits: specs.ContainerEdits{
DeviceNodes: deviceNodes,
},
}
return device, nil
}
func (m command) getExistingMetaDeviceNodes() []*specs.DeviceNode {
metaDeviceNodePaths := []string{
"/dev/nvidia-modeset",
"/dev/nvidia-uvm-tools",
"/dev/nvidia-uvm",
"/dev/nvidiactl",
}
var existingDeviceNodePaths []string
for _, p := range metaDeviceNodePaths {
if _, err := os.Stat(p); err != nil {
m.logger.Infof("Ignoring missing meta device %v", p)
continue
}
existingDeviceNodePaths = append(existingDeviceNodePaths, p)
}
return getDeviceNodesFromPaths(existingDeviceNodePaths)
}
func getDeviceNodesFromPaths(deviceNodePaths []string) []*specs.DeviceNode {
var deviceNodes []*specs.DeviceNode
for _, p := range deviceNodePaths {
deviceNode := specs.DeviceNode{
Path: p,
}
deviceNodes = append(deviceNodes, &deviceNode)
}
return deviceNodes
}
func (m command) findLibs(nvmllib nvml.Interface) ([]string, error) {
version, r := nvmllib.SystemGetDriverVersion()
if r != nvml.SUCCESS {
return nil, fmt.Errorf("failed to determine driver version: %v", r)
}
m.logger.Infof("Using driver version %v", version)
cache, err := ldcache.New(m.logger, "")
if err != nil {
return nil, fmt.Errorf("failed to load ldcache: %v", err)
}
libs32, libs64 := cache.List()
var libs []string
for _, l := range libs64 {
if strings.HasSuffix(l, version) {
m.logger.Infof("found 64-bit driver lib: %v", l)
libs = append(libs, l)
}
}
for _, l := range libs32 {
if strings.HasSuffix(l, version) {
m.logger.Infof("found 32-bit driver lib: %v", l)
libs = append(libs, l)
}
}
return libs, nil
}
func (m command) findBinaries() ([]string, error) {
candidates := []string{
"nvidia-smi", /* System management interface */
"nvidia-debugdump", /* GPU coredump utility */
"nvidia-persistenced", /* Persistence mode utility */
"nvidia-cuda-mps-control", /* Multi process service CLI */
"nvidia-cuda-mps-server", /* Multi process service server */
}
locator := lookup.NewExecutableLocator(m.logger, "")
var binaries []string
for _, c := range candidates {
targets, err := locator.Locate(c)
if err != nil {
m.logger.Warningf("skipping %v: %v", c, err)
continue
}
binaries = append(binaries, targets[0])
}
return binaries, nil
}
func (m command) findIPC() ([]string, error) {
candidates := []string{
"/var/run/nvidia-persistenced/socket",
"/var/run/nvidia-fabricmanager/socket",
// TODO: This can be controlled by the NV_MPS_PIPE_DIR envvar
"/tmp/nvidia-mps",
}
locator := lookup.NewFileLocator(m.logger, "")
var ipcs []string
for _, c := range candidates {
targets, err := locator.Locate(c)
if err != nil {
m.logger.Warningf("skipping %v: %v", c, err)
continue
}
ipcs = append(ipcs, targets[0])
}
return ipcs, nil
}
func generateMountsForPaths(pathSets ...[]string) []*specs.Mount {
var mounts []*specs.Mount
for _, paths := range pathSets {
for _, p := range paths {
mount := specs.Mount{
HostPath: p,
// We may want to adjust the container path
ContainerPath: p,
Type: "bind",
Options: []string{
"ro",
"nosuid",
"nodev",
"bind",
},
}
mounts = append(mounts, &mount)
}
}
return mounts
}
func (m command) generateUpdateLdCacheHook(libraries []string) *specs.Hook {
locator := lookup.NewExecutableLocator(m.logger, "")
hook := discover.CreateLDCacheUpdateHook(
m.logger,
locator,
nvidiaCTKExecutable,
nvidiaCTKDefaultFilePath,
libraries,
)
return &specs.Hook{
HookName: hook.Lifecycle,
Path: hook.Path,
Args: hook.Args,
}
}

View File

@@ -1,123 +0,0 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY Type, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cdi
import (
"fmt"
"github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
)
// nvmlDevice wraps an nvml.Device with more functions.
type nvmlDevice struct {
nvml.Device
}
// nvmlMigDevice allows for specific functions of nvmlDevice to be overridden.
type nvmlMigDevice nvmlDevice
// deviceInfo defines the information required to construct a Device
type deviceInfo interface {
GetUUID() (string, error)
GetDeviceNodes() ([]string, error)
}
var _ deviceInfo = (*nvmlDevice)(nil)
var _ deviceInfo = (*nvmlMigDevice)(nil)
func newGPUDevice(i int, gpu device.Device) (string, nvmlDevice) {
return fmt.Sprintf("gpu%v", i), nvmlDevice{gpu}
}
func newMigDevice(i int, j int, mig device.MigDevice) (string, nvmlMigDevice) {
return fmt.Sprintf("mig%v:%v", i, j), nvmlMigDevice{mig}
}
// GetUUID returns the UUID of the device
func (d nvmlDevice) GetUUID() (string, error) {
uuid, ret := d.Device.GetUUID()
if ret != nvml.SUCCESS {
return "", ret
}
return uuid, nil
}
// GetUUID returns the UUID of the device
func (d nvmlMigDevice) GetUUID() (string, error) {
return nvmlDevice(d).GetUUID()
}
// GetDeviceNodes returns the device node paths for a GPU device
func (d nvmlDevice) GetDeviceNodes() ([]string, error) {
minor, ret := d.GetMinorNumber()
if ret != nvml.SUCCESS {
return nil, fmt.Errorf("error getting GPU device minor number: %v", ret)
}
path := fmt.Sprintf("/dev/nvidia%d", minor)
return []string{path}, nil
}
// GetDeviceNodes returns the device node paths for a MIG device
func (d nvmlMigDevice) GetDeviceNodes() ([]string, error) {
parent, ret := d.GetDeviceHandleFromMigDeviceHandle()
if ret != nvml.SUCCESS {
return nil, fmt.Errorf("error getting parent device: %v", ret)
}
minor, ret := parent.GetMinorNumber()
if ret != nvml.SUCCESS {
return nil, fmt.Errorf("error getting GPU device minor number: %v", ret)
}
parentPath := fmt.Sprintf("/dev/nvidia%d", minor)
migCaps, err := nvcaps.NewMigCaps()
if err != nil {
return nil, fmt.Errorf("error getting MIG capability device paths: %v", err)
}
gi, ret := d.GetGpuInstanceId()
if ret != nvml.SUCCESS {
return nil, fmt.Errorf("error getting GPU Instance ID: %v", ret)
}
ci, ret := d.GetComputeInstanceId()
if ret != nvml.SUCCESS {
return nil, fmt.Errorf("error getting Compute Instance ID: %v", ret)
}
giCap := nvcaps.NewGPUInstanceCap(minor, gi)
giCapDevicePath, err := migCaps.GetCapDevicePath(giCap)
if err != nil {
return nil, fmt.Errorf("failed to get GI cap device path: %v", err)
}
ciCap := nvcaps.NewComputeInstanceCap(minor, gi, ci)
ciCapDevicePath, err := migCaps.GetCapDevicePath(ciCap)
if err != nil {
return nil, fmt.Errorf("failed to get CI cap device path: %v", err)
}
devicePaths := []string{
parentPath,
giCapDevicePath,
ciCapDevicePath,
}
return devicePaths, nil
}

View File

@@ -1,50 +0,0 @@
/**
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package info
import (
cdi "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk/info/generate-cdi"
"github.com/sirupsen/logrus"
"github.com/urfave/cli/v2"
)
type command struct {
logger *logrus.Logger
}
// NewCommand constructs an info command with the specified logger
func NewCommand(logger *logrus.Logger) *cli.Command {
c := command{
logger: logger,
}
return c.build()
}
// build the info command
func (m command) build() *cli.Command {
// Create the 'info' command
hook := cli.Command{
Name: "info",
Usage: "Provide information about the system",
}
hook.Subcommands = []*cli.Command{
cdi.NewCommand(m.logger),
}
return &hook
}

View File

@@ -1,86 +0,0 @@
/**
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package main
import (
"os"
"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk/hook"
infoCLI "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk/info"
"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk/runtime"
"github.com/NVIDIA/nvidia-container-toolkit/internal/info"
log "github.com/sirupsen/logrus"
cli "github.com/urfave/cli/v2"
)
var logger = log.New()
// config defines the options that can be set for the CLI through config files,
// environment variables, or command line flags
type config struct {
// Debug indicates whether the CLI is started in "debug" mode
Debug bool
}
func main() {
// Create a config struct to hold the parsed environment variables or command line flags
config := config{}
// Create the top-level CLI
c := cli.NewApp()
c.Name = "NVIDIA Container Toolkit CLI"
c.UseShortOptionHandling = true
c.EnableBashCompletion = true
c.Usage = "Tools to configure the NVIDIA Container Toolkit"
c.Version = info.GetVersionString()
// Setup the flags for this command
c.Flags = []cli.Flag{
&cli.BoolFlag{
Name: "debug",
Aliases: []string{"d"},
Usage: "Enable debug-level logging",
Destination: &config.Debug,
EnvVars: []string{"NVIDIA_CTK_DEBUG"},
},
}
// Set log-level for all subcommands
c.Before = func(c *cli.Context) error {
logLevel := log.InfoLevel
if config.Debug {
logLevel = log.DebugLevel
}
logger.SetLevel(logLevel)
return nil
}
// Define the subcommands
c.Commands = []*cli.Command{
hook.NewCommand(logger),
runtime.NewCommand(logger),
infoCLI.NewCommand(logger),
}
// Run the CLI
err := c.Run(os.Args)
if err != nil {
log.Errorf("%v", err)
log.Exit(1)
}
}

View File

@@ -1,154 +0,0 @@
/**
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package configure
import (
"encoding/json"
"fmt"
"os"
"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk/runtime/nvidia"
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/docker"
"github.com/sirupsen/logrus"
"github.com/urfave/cli/v2"
)
const (
defaultRuntime = "docker"
defaultDockerConfigFilePath = "/etc/docker/daemon.json"
)
type command struct {
logger *logrus.Logger
}
// NewCommand constructs a configure command with the specified logger
func NewCommand(logger *logrus.Logger) *cli.Command {
c := command{
logger: logger,
}
return c.build()
}
// config defines the options that can be set for the CLI through config files,
// environment variables, or command line flags
type config struct {
dryRun bool
runtime string
configFilePath string
nvidiaOptions nvidia.Options
}
func (m command) build() *cli.Command {
// Create a config struct to hold the parsed environment variables or command line flags
config := config{}
// Create the 'configure' command
configure := cli.Command{
Name: "configure",
Usage: "Add a runtime to the specified container engine",
Action: func(c *cli.Context) error {
return m.configureWrapper(c, &config)
},
}
configure.Flags = []cli.Flag{
&cli.BoolFlag{
Name: "dry-run",
Usage: "update the runtime configuration as required but don't write changes to disk",
Destination: &config.dryRun,
},
&cli.StringFlag{
Name: "runtime",
Usage: "the target runtime engine. One of [docker]",
Value: defaultRuntime,
Destination: &config.runtime,
},
&cli.StringFlag{
Name: "config",
Usage: "path to the config file for the target runtime",
Destination: &config.configFilePath,
},
&cli.StringFlag{
Name: "nvidia-runtime-name",
Usage: "specify the name of the NVIDIA runtime that will be added",
Value: nvidia.RuntimeName,
Destination: &config.nvidiaOptions.RuntimeName,
},
&cli.StringFlag{
Name: "runtime-path",
Usage: "specify the path to the NVIDIA runtime executable",
Value: nvidia.RuntimeExecutable,
Destination: &config.nvidiaOptions.RuntimePath,
},
&cli.BoolFlag{
Name: "set-as-default",
Usage: "set the specified runtime as the default runtime",
Destination: &config.nvidiaOptions.SetAsDefault,
},
}
return &configure
}
func (m command) configureWrapper(c *cli.Context, config *config) error {
switch config.runtime {
case "docker":
return m.configureDocker(c, config)
}
return fmt.Errorf("unrecognized runtime '%v'", config.runtime)
}
// configureDocker updates the docker config to enable the NVIDIA Container Runtime
func (m command) configureDocker(c *cli.Context, config *config) error {
configFilePath := config.configFilePath
if configFilePath == "" {
configFilePath = defaultDockerConfigFilePath
}
cfg, err := docker.LoadConfig(configFilePath)
if err != nil {
return fmt.Errorf("unable to load config: %v", err)
}
defaultRuntime := config.nvidiaOptions.DefaultRuntime()
runtimeConfig := config.nvidiaOptions.Runtime().DockerRuntimesConfig()
err = docker.UpdateConfig(cfg, defaultRuntime, runtimeConfig)
if err != nil {
return fmt.Errorf("unable to update config: %v", err)
}
if config.dryRun {
output, err := json.MarshalIndent(cfg, "", " ")
if err != nil {
return fmt.Errorf("unable to convert to JSON: %v", err)
}
os.Stdout.WriteString(fmt.Sprintf("%s\n", output))
return nil
}
err = docker.FlushConfig(cfg, configFilePath)
if err != nil {
return fmt.Errorf("unable to flush config: %v", err)
}
m.logger.Infof("Wrote updated config to %v", configFilePath)
m.logger.Infof("It is recommended that the docker daemon be restarted.")
return nil
}

View File

@@ -1,75 +0,0 @@
/*
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
package nvidia
const (
// RuntimeName is the default name to use in configs for the NVIDIA Container Runtime
RuntimeName = "nvidia"
// RuntimeExecutable is the default NVIDIA Container Runtime executable file name
RuntimeExecutable = "nvidia-container-runtime"
)
// Options specifies the options for the NVIDIA Container Runtime w.r.t a container engine such as docker.
type Options struct {
SetAsDefault bool
RuntimeName string
RuntimePath string
}
// Runtime defines an NVIDIA runtime with a name and an executable
type Runtime struct {
Name string
Path string
}
// DefaultRuntime returns the default runtime for the configured options.
// If the configuration is invalid or the default runtime should not be set,
// the empty string is returned.
func (o Options) DefaultRuntime() string {
if !o.SetAsDefault {
return ""
}
return o.RuntimeName
}
// Runtime creates a runtime struct based on the options.
func (o Options) Runtime() Runtime {
path := o.RuntimePath
if o.RuntimePath == "" {
path = RuntimeExecutable
}
r := Runtime{
Name: o.RuntimeName,
Path: path,
}
return r
}
// DockerRuntimesConfig generates the expected docker config for the specified runtime
func (r Runtime) DockerRuntimesConfig() map[string]interface{} {
runtimes := make(map[string]interface{})
runtimes[r.Name] = map[string]interface{}{
"path": r.Path,
"args": []string{},
}
return runtimes
}

View File

@@ -1,49 +0,0 @@
/**
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package runtime
import (
"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk/runtime/configure"
"github.com/sirupsen/logrus"
"github.com/urfave/cli/v2"
)
type runtimeCommand struct {
logger *logrus.Logger
}
// NewCommand constructs a runtime command with the specified logger
func NewCommand(logger *logrus.Logger) *cli.Command {
c := runtimeCommand{
logger: logger,
}
return c.build()
}
func (m runtimeCommand) build() *cli.Command {
// Create the 'runtime' command
runtime := cli.Command{
Name: "runtime",
Usage: "A collection of runtime-related utilities for the NVIDIA Container Toolkit",
}
runtime.Subcommands = []*cli.Command{
configure.NewCommand(m.logger),
}
return &runtime
}

View File

@@ -16,17 +16,3 @@ ldconfig = "@/sbin/ldconfig"
[nvidia-container-runtime]
#debug = "/var/log/nvidia-container-runtime.log"
log-level = "info"
# Specify the runtimes to consider. This list is processed in order and the PATH
# searched for matching executables unless the entry is an absolute path.
runtimes = [
"docker-runc",
"runc",
]
mode = "auto"
[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"

config/config.toml.centos (new file, 18 lines)
View File

@@ -0,0 +1,18 @@
disable-require = false
#swarm-resource = "DOCKER_RESOURCE_GPU"
#accept-nvidia-visible-devices-envvar-when-unprivileged = true
#accept-nvidia-visible-devices-as-volume-mounts = false
[nvidia-container-cli]
#root = "/run/nvidia/driver"
#path = "/usr/bin/nvidia-container-cli"
environment = []
#debug = "/var/log/nvidia-container-toolkit.log"
#ldcache = "/etc/ld.so.cache"
load-kmods = true
#no-cgroups = false
#user = "root:video"
ldconfig = "@/sbin/ldconfig"
[nvidia-container-runtime]
#debug = "/var/log/nvidia-container-runtime.log"

View File

@@ -16,17 +16,3 @@ ldconfig = "@/sbin/ldconfig"
[nvidia-container-runtime]
#debug = "/var/log/nvidia-container-runtime.log"
log-level = "info"
# Specify the runtimes to consider. This list is processed in order and the PATH
# searched for matching executables unless the entry is an absolute path.
runtimes = [
"docker-runc",
"runc",
]
mode = "auto"
[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"

View File

@@ -16,17 +16,3 @@ ldconfig = "@/sbin/ldconfig"
[nvidia-container-runtime]
#debug = "/var/log/nvidia-container-runtime.log"
log-level = "info"
# Specify the runtimes to consider. This list is processed in order and the PATH
# searched for matching executables unless the entry is an absolute path.
runtimes = [
"docker-runc",
"runc",
]
mode = "auto"
[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"

View File

@@ -16,17 +16,3 @@ ldconfig = "@/sbin/ldconfig.real"
[nvidia-container-runtime]
#debug = "/var/log/nvidia-container-runtime.log"
log-level = "info"
# Specify the runtimes to consider. This list is processed in order and the PATH
# searched for matching executables unless the entry is an absolute path.
runtimes = [
"docker-runc",
"runc",
]
mode = "auto"
[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"

View File

@@ -0,0 +1,19 @@
disable-require = false
supported-driver-capabilities = "compute,compat32,graphics,utility,video,display"
#swarm-resource = "DOCKER_RESOURCE_GPU"
#accept-nvidia-visible-devices-envvar-when-unprivileged = true
#accept-nvidia-visible-devices-as-volume-mounts = false
[nvidia-container-cli]
#root = "/run/nvidia/driver"
#path = "/usr/bin/nvidia-container-cli"
environment = []
#debug = "/var/log/nvidia-container-toolkit.log"
#ldcache = "/etc/ld.so.cache"
load-kmods = true
#no-cgroups = false
#user = "root:video"
ldconfig = "@/sbin/ldconfig.real"
[nvidia-container-runtime]
#debug = "/var/log/nvidia-container-runtime.log"
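This config introduces a supported-driver-capabilities value. A self-contained sketch (the validation helper is an assumption for illustration, not the shipped implementation) of checking a requested capability set against such a list:

package main

import (
	"fmt"
	"strings"
)

// validateCapabilities rejects any requested capability that is not in the
// comma-separated supported list.
func validateCapabilities(supported, requested string) error {
	supportedSet := make(map[string]bool)
	for _, c := range strings.Split(supported, ",") {
		supportedSet[c] = true
	}
	for _, c := range strings.Split(requested, ",") {
		if !supportedSet[c] {
			return fmt.Errorf("unsupported driver capability: %q", c)
		}
	}
	return nil
}

func main() {
	supported := "compute,compat32,graphics,utility,video,display"
	fmt.Println(validateCapabilities(supported, "compute,utility")) // <nil>
	fmt.Println(validateCapabilities(supported, "ngx"))             // error
}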


@@ -1,29 +1,14 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This is the dockerfile for building packages on yum-based RPM systems.
ARG BASEIMAGE
FROM ${BASEIMAGE}
RUN yum install -y \
ca-certificates \
gcc \
wget \
git \
make \
rpm-build && \
rpm-build \
wget \
&& \
rm -rf /var/cache/yum/*
ARG GOLANG_VERSION=0.0.0
@@ -43,12 +28,11 @@ ENV GOPATH /go
ENV PATH $GOPATH/bin:/usr/local/go/bin:$PATH
# packaging
ARG PKG_NAME
ARG PKG_VERS
ARG PKG_REV
ENV PKG_NAME ${PKG_NAME}
ENV PKG_VERS ${PKG_VERS}
ENV PKG_REV ${PKG_REV}
ENV VERSION $PKG_VERS
ENV RELEASE $PKG_REV
# output directory
ENV DIST_DIR=/tmp/nvidia-container-toolkit-$PKG_VERS/SOURCES
@@ -58,8 +42,6 @@ RUN mkdir -p $DIST_DIR /dist
WORKDIR $GOPATH/src/nvidia-container-toolkit
COPY . .
ARG GIT_COMMIT
ENV GIT_COMMIT ${GIT_COMMIT}
RUN make PREFIX=${DIST_DIR} cmds
ARG CONFIG_TOML_SUFFIX
@@ -67,6 +49,8 @@ ENV CONFIG_TOML_SUFFIX ${CONFIG_TOML_SUFFIX}
COPY config/config.toml.${CONFIG_TOML_SUFFIX} $DIST_DIR/config.toml
# Hook for Project Atomic's fork of Docker: https://github.com/projectatomic/docker/tree/docker-1.13.1-rhel#add-dockerhooks-exec-custom-hooks-for-prestartpoststop-containerspatch
# This might not be useful on Amazon Linux, but it's simpler to keep the RHEL
# and Amazon Linux packages identical.
COPY oci-nvidia-hook $DIST_DIR/oci-nvidia-hook
# Hook for libpod/CRI-O: https://github.com/containers/libpod/blob/v0.8.5/pkg/hooks/docs/oci-hooks.5.md
@@ -75,16 +59,11 @@ COPY oci-nvidia-hook.json $DIST_DIR/oci-nvidia-hook.json
WORKDIR $DIST_DIR/..
COPY packaging/rpm .
ARG LIBNVIDIA_CONTAINER_TOOLS_VERSION
ENV LIBNVIDIA_CONTAINER_TOOLS_VERSION ${LIBNVIDIA_CONTAINER_TOOLS_VERSION}
CMD arch=$(uname -m) && \
rpmbuild --clean --target=$arch -bb \
-D "_topdir $PWD" \
-D "release_date $(date +'%a %b %d %Y')" \
-D "git_commit ${GIT_COMMIT}" \
-D "version ${PKG_VERS}" \
-D "libnvidia_container_tools_version ${LIBNVIDIA_CONTAINER_TOOLS_VERSION}" \
-D "release ${PKG_REV}" \
-D "version $VERSION" \
-D "libnvidia_container_version ${VERSION}-${RELEASE}" \
-D "release $RELEASE" \
SPECS/nvidia-container-toolkit.spec && \
mv RPMS/$arch/*.rpm /dist

docker/Dockerfile.centos Normal file

@@ -0,0 +1,67 @@
ARG BASEIMAGE
FROM ${BASEIMAGE}
RUN yum install -y \
ca-certificates \
gcc \
git \
make \
rpm-build \
wget \
&& \
rm -rf /var/cache/yum/*
ARG GOLANG_VERSION=0.0.0
RUN set -eux; \
\
arch="$(uname -m)"; \
case "${arch##*-}" in \
x86_64 | amd64) ARCH='amd64' ;; \
ppc64el | ppc64le) ARCH='ppc64le' ;; \
aarch64) ARCH='arm64' ;; \
*) echo "unsupported architecture"; exit 1 ;; \
esac; \
wget -nv -O - https://storage.googleapis.com/golang/go${GOLANG_VERSION}.linux-${ARCH}.tar.gz \
| tar -C /usr/local -xz
ENV GOPATH /go
ENV PATH $GOPATH/bin:/usr/local/go/bin:$PATH
# packaging
ARG PKG_VERS
ARG PKG_REV
ENV VERSION $PKG_VERS
ENV RELEASE $PKG_REV
# output directory
ENV DIST_DIR=/tmp/nvidia-container-toolkit-$PKG_VERS/SOURCES
RUN mkdir -p $DIST_DIR /dist
# nvidia-container-toolkit
WORKDIR $GOPATH/src/nvidia-container-toolkit
COPY . .
RUN make PREFIX=${DIST_DIR} cmds
ARG CONFIG_TOML_SUFFIX
ENV CONFIG_TOML_SUFFIX ${CONFIG_TOML_SUFFIX}
COPY config/config.toml.${CONFIG_TOML_SUFFIX} $DIST_DIR/config.toml
# Hook for Project Atomic's fork of Docker: https://github.com/projectatomic/docker/tree/docker-1.13.1-rhel#add-dockerhooks-exec-custom-hooks-for-prestartpoststop-containerspatch
COPY oci-nvidia-hook $DIST_DIR/oci-nvidia-hook
# Hook for libpod/CRI-O: https://github.com/containers/libpod/blob/v0.8.5/pkg/hooks/docs/oci-hooks.5.md
COPY oci-nvidia-hook.json $DIST_DIR/oci-nvidia-hook.json
WORKDIR $DIST_DIR/..
COPY packaging/rpm .
CMD arch=$(uname -m) && \
rpmbuild --clean --target=$arch -bb \
-D "_topdir $PWD" \
-D "version $VERSION" \
-D "libnvidia_container_version ${VERSION}-${RELEASE}" \
-D "release $RELEASE" \
SPECS/nvidia-container-toolkit.spec && \
mv RPMS/$arch/*.rpm /dist


@@ -32,7 +32,6 @@ ENV GOPATH /go
ENV PATH $GOPATH/bin:/usr/local/go/bin:$PATH
# packaging
ARG PKG_NAME
ARG PKG_VERS
ARG PKG_REV
@@ -49,8 +48,6 @@ RUN mkdir -p $DIST_DIR /dist
WORKDIR $GOPATH/src/nvidia-container-toolkit
COPY . .
ARG GIT_COMMIT
ENV GIT_COMMIT ${GIT_COMMIT}
RUN make PREFIX=${DIST_DIR} cmds
ARG CONFIG_TOML_SUFFIX
@@ -65,17 +62,12 @@ RUN if [ "$(lsb_release -cs)" = "jessie" ]; then \
WORKDIR $DIST_DIR
COPY packaging/debian ./debian
ARG LIBNVIDIA_CONTAINER_TOOLS_VERSION
ENV LIBNVIDIA_CONTAINER_TOOLS_VERSION ${LIBNVIDIA_CONTAINER_TOOLS_VERSION}
RUN dch --create --package="${PKG_NAME}" \
--newversion "${REVISION}" \
"See https://gitlab.com/nvidia/container-toolkit/container-toolkit/-/blob/${GIT_COMMIT}/CHANGELOG.md for the changelog" && \
dch --append "Bump libnvidia-container dependency to ${LIBNVIDIA_CONTAINER1_VERSION}" && \
dch -r "" && \
RUN sed -i "s;@VERSION@;${REVISION};" debian/changelog && \
dch --changelog debian/changelog --append "Bump libnvidia-container dependency to ${REVISION}" && \
dch --changelog debian/changelog -r "" && \
if [ "$REVISION" != "$(dpkg-parsechangelog --show-field=Version)" ]; then exit 1; fi
CMD export DISTRIB="$(lsb_release -cs)" && \
debuild -eDISTRIB -eSECTION -eLIBNVIDIA_CONTAINER_TOOLS_VERSION -eVERSION="${REVISION}" \
debuild -eDISTRIB -eSECTION -eLIBNVIDIA_CONTAINER_VERSION="${REVISION}" \
--dpkg-buildpackage-hook='sh debian/prepare' -i -us -uc -b && \
mv /tmp/*.deb /dist
mv /tmp/nvidia-container-toolkit_*.deb /dist


@@ -14,8 +14,7 @@
ARG GOLANG_VERSION=x.x.x
FROM golang:${GOLANG_VERSION}
RUN go install golang.org/x/lint/golint@latest
RUN go install github.com/matryer/moq@latest
RUN go install github.com/gordonklaus/ineffassign@latest
RUN go install github.com/client9/misspell/cmd/misspell@latest
RUN go install github.com/google/go-licenses@latest
RUN go get -u golang.org/x/lint/golint
RUN go get -u github.com/matryer/moq
RUN go get -u github.com/gordonklaus/ineffassign
RUN go get -u github.com/client9/misspell/cmd/misspell


@@ -25,12 +25,11 @@ ENV GOPATH /go
ENV PATH $GOPATH/bin:/usr/local/go/bin:$PATH
# packaging
ARG PKG_NAME
ARG PKG_VERS
ARG PKG_REV
ENV PKG_NAME ${PKG_NAME}
ENV PKG_VERS ${PKG_VERS}
ENV PKG_REV ${PKG_REV}
ENV VERSION $PKG_VERS
ENV RELEASE $PKG_REV
# output directory
ENV DIST_DIR=/tmp/nvidia-container-toolkit-$PKG_VERS/SOURCES
@@ -40,8 +39,6 @@ RUN mkdir -p $DIST_DIR /dist
WORKDIR $GOPATH/src/nvidia-container-toolkit
COPY . .
ARG GIT_COMMIT
ENV GIT_COMMIT ${GIT_COMMIT}
RUN make PREFIX=${DIST_DIR} cmds
# Hook for Project Atomic's fork of Docker: https://github.com/projectatomic/docker/tree/docker-1.13.1-rhel#add-dockerhooks-exec-custom-hooks-for-prestartpoststop-containerspatch
@@ -57,16 +54,11 @@ COPY config/config.toml.${CONFIG_TOML_SUFFIX} $DIST_DIR/config.toml
WORKDIR $DIST_DIR/..
COPY packaging/rpm .
ARG LIBNVIDIA_CONTAINER_TOOLS_VERSION
ENV LIBNVIDIA_CONTAINER_TOOLS_VERSION ${LIBNVIDIA_CONTAINER_TOOLS_VERSION}
CMD arch=$(uname -m) && \
rpmbuild --clean --target=$arch -bb \
-D "_topdir $PWD" \
-D "release_date $(date +'%a %b %d %Y')" \
-D "git_commit ${GIT_COMMIT}" \
-D "version ${PKG_VERS}" \
-D "libnvidia_container_tools_version ${LIBNVIDIA_CONTAINER_TOOLS_VERSION}" \
-D "release ${PKG_REV}" \
-D "version $VERSION" \
-D "libnvidia_container_version ${VERSION}-${RELEASE}" \
-D "release $RELEASE" \
SPECS/nvidia-container-toolkit.spec && \
mv RPMS/$arch/*.rpm /dist


@@ -30,7 +30,6 @@ ENV GOPATH /go
ENV PATH $GOPATH/bin:/usr/local/go/bin:$PATH
# packaging
ARG PKG_NAME
ARG PKG_VERS
ARG PKG_REV
@@ -47,8 +46,6 @@ RUN mkdir -p $DIST_DIR /dist
WORKDIR $GOPATH/src/nvidia-container-toolkit
COPY . .
ARG GIT_COMMIT
ENV GIT_COMMIT ${GIT_COMMIT}
RUN make PREFIX=${DIST_DIR} cmds
ARG CONFIG_TOML_SUFFIX
@@ -58,17 +55,12 @@ COPY config/config.toml.${CONFIG_TOML_SUFFIX} $DIST_DIR/config.toml
WORKDIR $DIST_DIR
COPY packaging/debian ./debian
ARG LIBNVIDIA_CONTAINER_TOOLS_VERSION
ENV LIBNVIDIA_CONTAINER_TOOLS_VERSION ${LIBNVIDIA_CONTAINER_TOOLS_VERSION}
RUN dch --create --package="${PKG_NAME}" \
--newversion "${REVISION}" \
"See https://gitlab.com/nvidia/container-toolkit/container-toolkit/-/blob/${GIT_COMMIT}/CHANGELOG.md for the changelog" && \
dch --append "Bump libnvidia-container dependency to ${LIBNVIDIA_CONTAINER_TOOLS_VERSION}" && \
dch -r "" && \
RUN sed -i "s;@VERSION@;${REVISION};" debian/changelog && \
dch --changelog debian/changelog --append "Bump libnvidia-container dependency to ${REVISION}" && \
dch --changelog debian/changelog -r "" && \
if [ "$REVISION" != "$(dpkg-parsechangelog --show-field=Version)" ]; then exit 1; fi
CMD export DISTRIB="$(lsb_release -cs)" && \
debuild -eDISTRIB -eSECTION -eLIBNVIDIA_CONTAINER_TOOLS_VERSION -eVERSION="${REVISION}" \
debuild -eDISTRIB -eSECTION -eLIBNVIDIA_CONTAINER_VERSION="${REVISION}" \
--dpkg-buildpackage-hook='sh debian/prepare' -i -us -uc -b && \
mv /tmp/*.deb /dist


@@ -14,10 +14,10 @@
# Supported OSs by architecture
AMD64_TARGETS := ubuntu20.04 ubuntu18.04 ubuntu16.04 debian10 debian9
X86_64_TARGETS := fedora35 centos7 centos8 rhel7 rhel8 amazonlinux2 opensuse-leap15.1
X86_64_TARGETS := centos7 centos8 rhel7 rhel8 amazonlinux2 opensuse-leap15.1
PPC64LE_TARGETS := ubuntu18.04 ubuntu16.04 centos7 centos8 rhel7 rhel8
ARM64_TARGETS := ubuntu20.04 ubuntu18.04
AARCH64_TARGETS := fedora35 centos8 rhel8 amazonlinux2
AARCH64_TARGETS := centos8 rhel8 amazonlinux2
# Define top-level build targets
docker%: SHELL:=/bin/bash
@@ -85,62 +85,34 @@ docker-all: $(AMD64_TARGETS) $(X86_64_TARGETS) \
--%: docker-build-%
@
LIBNVIDIA_CONTAINER_VERSION ?= $(LIB_VERSION)
LIBNVIDIA_CONTAINER_TAG ?= $(LIB_TAG)
# private ubuntu target
--ubuntu%: OS := ubuntu
--ubuntu%: LIB_VERSION := $(LIB_VERSION)$(if $(LIB_TAG),~$(LIB_TAG))
--ubuntu%: LIBNVIDIA_CONTAINER_TOOLS_VERSION := $(LIBNVIDIA_CONTAINER_VERSION)$(if $(LIBNVIDIA_CONTAINER_TAG),~$(LIBNVIDIA_CONTAINER_TAG))-1
--ubuntu%: PKG_REV := 1
# private debian target
--debian%: OS := debian
--debian%: LIB_VERSION := $(LIB_VERSION)$(if $(LIB_TAG),~$(LIB_TAG))
--debian%: LIBNVIDIA_CONTAINER_TOOLS_VERSION := $(LIBNVIDIA_CONTAINER_VERSION)$(if $(LIBNVIDIA_CONTAINER_TAG),~$(LIBNVIDIA_CONTAINER_TAG))-1
--debian%: PKG_REV := 1
# private centos target
--centos%: OS := centos
--centos%: PKG_REV := $(if $(LIB_TAG),0.1.$(LIB_TAG),1)
--centos%: LIBNVIDIA_CONTAINER_TOOLS_VERSION := $(LIBNVIDIA_CONTAINER_VERSION)-$(if $(LIBNVIDIA_CONTAINER_TAG),0.1.$(LIBNVIDIA_CONTAINER_TAG),1)
--centos%: DOCKERFILE = $(CURDIR)/docker/Dockerfile.rpm-yum
--centos%: CONFIG_TOML_SUFFIX := rpm-yum
--centos8%: BASEIMAGE = quay.io/centos/centos:stream8
# private fedora target
--fedora%: OS := fedora
--fedora%: PKG_REV := $(if $(LIB_TAG),0.1.$(LIB_TAG),1)
--fedora%: LIBNVIDIA_CONTAINER_TOOLS_VERSION := $(LIBNVIDIA_CONTAINER_VERSION)-$(if $(LIBNVIDIA_CONTAINER_TAG),0.1.$(LIBNVIDIA_CONTAINER_TAG),1)
--fedora%: DOCKERFILE = $(CURDIR)/docker/Dockerfile.rpm-yum
--fedora%: CONFIG_TOML_SUFFIX := rpm-yum
# The fedora(35) base image has very slow performance when building aarch64 packages.
# Since our primary concern here is glibc versions, we use the older glibc version available in centos8.
--fedora35%: BASEIMAGE = quay.io/centos/centos:stream8
# private amazonlinux target
--amazonlinux%: OS := amazonlinux
--amazonlinux%: LIBNVIDIA_CONTAINER_TOOLS_VERSION := $(LIBNVIDIA_CONTAINER_VERSION)-$(if $(LIBNVIDIA_CONTAINER_TAG),0.1.$(LIBNVIDIA_CONTAINER_TAG),1)
--amazonlinux%: PKG_REV := $(if $(LIB_TAG),0.1.$(LIB_TAG),1)
--amazonlinux%: DOCKERFILE = $(CURDIR)/docker/Dockerfile.rpm-yum
--amazonlinux%: CONFIG_TOML_SUFFIX := rpm-yum
# private opensuse-leap target
--opensuse-leap%: OS = opensuse-leap
--opensuse-leap%: BASEIMAGE = opensuse/leap:$(VERSION)
--opensuse-leap%: LIBNVIDIA_CONTAINER_TOOLS_VERSION := $(LIBNVIDIA_CONTAINER_VERSION)-$(if $(LIBNVIDIA_CONTAINER_TAG),0.1.$(LIBNVIDIA_CONTAINER_TAG),1)
--opensuse-leap%: PKG_REV := $(if $(LIB_TAG),0.1.$(LIB_TAG),1)
# private rhel target (actually built on centos)
--rhel%: OS := centos
--rhel%: LIBNVIDIA_CONTAINER_TOOLS_VERSION := $(LIBNVIDIA_CONTAINER_VERSION)-$(if $(LIBNVIDIA_CONTAINER_TAG),0.1.$(LIBNVIDIA_CONTAINER_TAG),1)
--rhel%: PKG_REV := $(if $(LIB_TAG),0.1.$(LIB_TAG),1)
--rhel%: VERSION = $(patsubst rhel%-$(ARCH),%,$(TARGET_PLATFORM))
--rhel%: ARTIFACTS_DIR = $(DIST_DIR)/rhel$(VERSION)/$(ARCH)
--rhel%: DOCKERFILE = $(CURDIR)/docker/Dockerfile.rpm-yum
--rhel%: CONFIG_TOML_SUFFIX := rpm-yum
--rhel8%: BASEIMAGE = quay.io/centos/centos:stream8
# We allow the CONFIG_TOML_SUFFIX to be overridden.
CONFIG_TOML_SUFFIX ?= $(OS)
@@ -150,20 +122,15 @@ docker-build-%:
docker pull --platform=linux/$(ARCH) $(BASEIMAGE)
DOCKER_BUILDKIT=1 \
$(DOCKER) build \
--platform=linux/$(ARCH) \
--progress=plain \
--build-arg BASEIMAGE="$(BASEIMAGE)" \
--build-arg GOLANG_VERSION="$(GOLANG_VERSION)" \
--build-arg PKG_NAME="$(LIB_NAME)" \
--build-arg PKG_VERS="$(LIB_VERSION)" \
--build-arg PKG_REV="$(PKG_REV)" \
--build-arg LIBNVIDIA_CONTAINER_TOOLS_VERSION="$(LIBNVIDIA_CONTAINER_TOOLS_VERSION)" \
--build-arg CONFIG_TOML_SUFFIX="$(CONFIG_TOML_SUFFIX)" \
--build-arg GIT_COMMIT="$(GIT_COMMIT)" \
--tag $(BUILDIMAGE) \
--file $(DOCKERFILE) .
$(DOCKER) run \
--platform=linux/$(ARCH) \
-e DISTRIB \
-e SECTION \
-v $(ARTIFACTS_DIR):/dist \


@@ -14,37 +14,43 @@
# limitations under the License.
*/
package lookup
package main
import (
"fmt"
"encoding/json"
"os"
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
log "github.com/sirupsen/logrus"
)
// NewDirectoryLocator creates a Locator that can be used to find directories at the specified root. A logger
// is also specified.
func NewDirectoryLocator(logger *log.Logger, root string) Locator {
l := file{
logger: logger,
prefixes: []string{root},
filter: assertDirectory,
}
func main() {
log.Infof("Starting device discovery with NVML")
return &l
}
// assertDirectory checks whether the specified path is a directory.
func assertDirectory(filename string) error {
info, err := os.Stat(filename)
d, err := discover.NewNVMLServer("")
if err != nil {
return fmt.Errorf("error getting info for %v: %v", filename, err)
log.Errorf("Error creating NVML Server: %v", err)
return
}
if !info.IsDir() {
return fmt.Errorf("specified path '%v' is not a directory", filename)
devices, err := d.Devices()
if err != nil {
log.Errorf("Error discovering devices: %v", err)
return
}
return nil
mounts, err := d.Mounts()
if err != nil {
log.Errorf("Error discovering mounts: %v", err)
return
}
enc := json.NewEncoder(os.Stdout)
enc.SetIndent("", " ")
log.Infof("Discovered devices:")
enc.Encode(devices)
log.Infof("Discovered libraries:")
enc.Encode(mounts)
}

examples/filter/main.go Normal file

@@ -0,0 +1,65 @@
/*
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
package main
import (
"encoding/json"
"os"
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
"github.com/NVIDIA/nvidia-container-toolkit/internal/filter"
log "github.com/sirupsen/logrus"
)
func main() {
d, err := discover.NewNVMLServer("")
if err != nil {
log.Errorf("Error discovering devices: %v", err)
}
selected := filter.NewSelectDevicesFrom(d, "all", nil)
devices, err := selected.Devices()
if err != nil {
log.Errorf("Error discovering devices: %v", err)
return
}
mounts, err := selected.Mounts()
if err != nil {
log.Errorf("Error discovering mounts: %v", err)
return
}
hooks, err := selected.Hooks()
if err != nil {
log.Errorf("Error discovering hooks: %v", err)
return
}
enc := json.NewEncoder(os.Stdout)
enc.SetIndent("", " ")
log.Infof("Discovered devices:")
enc.Encode(devices)
log.Infof("Discovered libraries:")
enc.Encode(mounts)
log.Infof("Discovered hook:")
enc.Encode(hooks)
}


@@ -0,0 +1,26 @@
package main
import (
"github.com/NVIDIA/nvidia-container-toolkit/internal/ldcache"
log "github.com/sirupsen/logrus"
)
var logger = log.StandardLogger()
func main() {
logger.SetLevel(log.DebugLevel)
logger.Infof("Starting device discovery with NVML")
cache, err := ldcache.NewLDCacheWithLogger(logger, "/run/nvidia/driver")
if err != nil {
logger.Errorf("Error loading ldcache: %v", err)
return
}
defer cache.Close()
libs32, libs64 := cache.Lookup("lib")
logger.Infof("32-bit: %v", libs32)
logger.Infof("64-bit: %v", libs64)
}

go.mod

@@ -1,41 +1,17 @@
module github.com/NVIDIA/nvidia-container-toolkit
go 1.17
go 1.14
require (
github.com/BurntSushi/toml v1.0.0
github.com/NVIDIA/go-nvml v0.11.6-0.0.20220823120812-7e2082095e82
github.com/container-orchestrated-devices/container-device-interface v0.5.2
github.com/opencontainers/runc v1.1.4
github.com/opencontainers/runtime-spec v1.0.3-0.20211214071223-8958f93039ab
github.com/pelletier/go-toml v1.9.4
github.com/sirupsen/logrus v1.9.0
github.com/BurntSushi/toml v0.3.1
github.com/NVIDIA/go-nvml v0.11.1-0
github.com/containers/podman/v2 v2.2.1
github.com/opencontainers/runtime-spec v1.0.3-0.20211101234015-a3c33d663ebc
github.com/pelletier/go-toml v1.9.3
github.com/sirupsen/logrus v1.8.1
github.com/stretchr/testify v1.7.0
github.com/tsaikd/KDGoLib v0.0.0-20191001134900-7f3cf518e07d
github.com/urfave/cli/v2 v2.3.0
gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20220922133427-1049a7fa76a9
golang.org/x/mod v0.5.0
golang.org/x/sys v0.0.0-20220927170352-d9d178bc13c6
sigs.k8s.io/yaml v1.3.0
)
require (
github.com/blang/semver v3.5.1+incompatible // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.1 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/fsnotify/fsnotify v1.5.4 // indirect
github.com/hashicorp/errwrap v1.1.0 // indirect
github.com/hashicorp/go-multierror v1.1.1 // indirect
github.com/kr/text v0.2.0 // indirect
github.com/opencontainers/runtime-tools v0.9.1-0.20220110225228-7e2d60f1e41f // indirect
github.com/opencontainers/selinux v1.10.1 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 // indirect
github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb // indirect
github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 // indirect
github.com/xeipuuv/gojsonschema v1.2.0 // indirect
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b // indirect
golang.org/x/mod v0.3.0
golang.org/x/sys v0.0.0-20210426230700-d19ff857e887
)

go.sum

File diff suppressed because it is too large


@@ -1,48 +0,0 @@
/**
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package config
import (
"github.com/pelletier/go-toml"
)
// ContainerCLIConfig stores the options for the nvidia-container-cli
type ContainerCLIConfig struct {
Root string
}
// getContainerCLIConfigFrom reads the nvidia-container-cli config from the specified toml Tree.
func getContainerCLIConfigFrom(toml *toml.Tree) *ContainerCLIConfig {
cfg := getDefaultContainerCLIConfig()
if toml == nil {
return cfg
}
cfg.Root = toml.GetDefault("nvidia-container-cli.root", cfg.Root).(string)
return cfg
}
// getDefaultContainerCLIConfig defines the default values for the config
func getDefaultContainerCLIConfig() *ContainerCLIConfig {
c := ContainerCLIConfig{
Root: "",
}
return &c
}
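A self-contained sketch of the go-toml lookup pattern used by getContainerCLIConfigFrom above (the TOML snippet is an example value):

package main

import (
	"fmt"

	"github.com/pelletier/go-toml"
)

func main() {
	tree, err := toml.Load(`[nvidia-container-cli]
root = "/run/nvidia/driver"`)
	if err != nil {
		panic(err)
	}

	// GetDefault returns the configured value, falling back to the default.
	root := tree.GetDefault("nvidia-container-cli.root", "").(string)
	fmt.Println(root) // /run/nvidia/driver
}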


@@ -1,114 +0,0 @@
/**
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package config
import (
"fmt"
"io"
"os"
"path"
"github.com/pelletier/go-toml"
)
const (
configOverride = "XDG_CONFIG_HOME"
configFilePath = "nvidia-container-runtime/config.toml"
)
var (
// DefaultExecutableDir specifies the default path to use for executables if they cannot be located in the path.
DefaultExecutableDir = "/usr/bin"
// NVIDIAContainerRuntimeHookExecutable is the executable name for the NVIDIA Container Runtime Hook
NVIDIAContainerRuntimeHookExecutable = "nvidia-container-runtime-hook"
// NVIDIAContainerToolkitExecutable is the executable name for the NVIDIA Container Toolkit (an alias for the NVIDIA Container Runtime Hook)
NVIDIAContainerToolkitExecutable = "nvidia-container-toolkit"
configDir = "/etc/"
)
// Config represents the contents of the config.toml file for the NVIDIA Container Toolkit
// Note: This is currently duplicated by the HookConfig in cmd/nvidia-container-toolkit/hook_config.go
type Config struct {
NVIDIAContainerCLIConfig ContainerCLIConfig `toml:"nvidia-container-cli"`
NVIDIACTKConfig CTKConfig `toml:"nvidia-ctk"`
NVIDIAContainerRuntimeConfig RuntimeConfig `toml:"nvidia-container-runtime"`
}
// GetConfig sets up the config struct. Values are read from a toml file
// or set via the environment.
func GetConfig() (*Config, error) {
if XDGConfigDir := os.Getenv(configOverride); len(XDGConfigDir) != 0 {
configDir = XDGConfigDir
}
configFilePath := path.Join(configDir, configFilePath)
tomlFile, err := os.Open(configFilePath)
if err != nil {
return getDefaultConfig(), nil
}
defer tomlFile.Close()
cfg, err := loadConfigFrom(tomlFile)
if err != nil {
return nil, fmt.Errorf("failed to read config values: %v", err)
}
return cfg, nil
}
// loadConfigFrom reads the config from the specified Reader
func loadConfigFrom(reader io.Reader) (*Config, error) {
toml, err := toml.LoadReader(reader)
if err != nil {
return nil, err
}
return getConfigFrom(toml)
}
// getConfigFrom reads the nvidia container runtime config from the specified toml Tree.
func getConfigFrom(toml *toml.Tree) (*Config, error) {
cfg := getDefaultConfig()
if toml == nil {
return cfg, nil
}
cfg.NVIDIAContainerCLIConfig = *getContainerCLIConfigFrom(toml)
cfg.NVIDIACTKConfig = *getCTKConfigFrom(toml)
runtimeConfig, err := getRuntimeConfigFrom(toml)
if err != nil {
return nil, fmt.Errorf("failed to load nvidia-container-runtime config: %v", err)
}
cfg.NVIDIAContainerRuntimeConfig = *runtimeConfig
return cfg, nil
}
// getDefaultConfig defines the default values for the config
func getDefaultConfig() *Config {
c := Config{
NVIDIAContainerCLIConfig: *getDefaultContainerCLIConfig(),
NVIDIACTKConfig: *getDefaultCTKConfig(),
NVIDIAContainerRuntimeConfig: *GetDefaultRuntimeConfig(),
}
return &c
}
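A self-contained sketch of the path resolution GetConfig performs above: the config directory defaults to /etc/ and can be overridden via XDG_CONFIG_HOME:

package main

import (
	"fmt"
	"os"
	"path"
)

func configPath() string {
	dir := "/etc/"
	if xdg := os.Getenv("XDG_CONFIG_HOME"); xdg != "" {
		dir = xdg
	}
	return path.Join(dir, "nvidia-container-runtime/config.toml")
}

func main() {
	fmt.Println(configPath()) // /etc/nvidia-container-runtime/config.toml by default
}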


@@ -1,165 +0,0 @@
/**
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package config
import (
"io/ioutil"
"os"
"path/filepath"
"strings"
"testing"
"github.com/stretchr/testify/require"
)
func TestGetConfigWithCustomConfig(t *testing.T) {
wd, err := os.Getwd()
require.NoError(t, err)
// By default debug is disabled
contents := []byte("[nvidia-container-runtime]\ndebug = \"/nvidia-container-toolkit.log\"")
testDir := filepath.Join(wd, "test")
filename := filepath.Join(testDir, configFilePath)
os.Setenv(configOverride, testDir)
require.NoError(t, os.MkdirAll(filepath.Dir(filename), 0766))
require.NoError(t, ioutil.WriteFile(filename, contents, 0766))
defer func() { require.NoError(t, os.RemoveAll(testDir)) }()
cfg, err := GetConfig()
require.NoError(t, err)
require.Equal(t, cfg.NVIDIAContainerRuntimeConfig.DebugFilePath, "/nvidia-container-toolkit.log")
}
func TestGetConfig(t *testing.T) {
testCases := []struct {
description string
contents []string
expectedError error
expectedConfig *Config
}{
{
description: "empty config is default",
expectedConfig: &Config{
NVIDIAContainerCLIConfig: ContainerCLIConfig{
Root: "",
},
NVIDIAContainerRuntimeConfig: RuntimeConfig{
DebugFilePath: "/dev/null",
LogLevel: "info",
Runtimes: []string{"docker-runc", "runc"},
Mode: "auto",
Modes: modesConfig{
CSV: csvModeConfig{
MountSpecPath: "/etc/nvidia-container-runtime/host-files-for-container.d",
},
},
},
NVIDIACTKConfig: CTKConfig{
Path: "nvidia-ctk",
},
},
},
{
description: "config options set inline",
contents: []string{
"nvidia-container-cli.root = \"/bar/baz\"",
"nvidia-container-runtime.debug = \"/foo/bar\"",
"nvidia-container-runtime.experimental = true",
"nvidia-container-runtime.discover-mode = \"not-legacy\"",
"nvidia-container-runtime.log-level = \"debug\"",
"nvidia-container-runtime.runtimes = [\"/some/runtime\",]",
"nvidia-container-runtime.mode = \"not-auto\"",
"nvidia-container-runtime.modes.csv.mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
"nvidia-ctk.path = \"/foo/bar/nvidia-ctk\"",
},
expectedConfig: &Config{
NVIDIAContainerCLIConfig: ContainerCLIConfig{
Root: "/bar/baz",
},
NVIDIAContainerRuntimeConfig: RuntimeConfig{
DebugFilePath: "/foo/bar",
LogLevel: "debug",
Runtimes: []string{"/some/runtime"},
Mode: "not-auto",
Modes: modesConfig{
CSV: csvModeConfig{
MountSpecPath: "/not/etc/nvidia-container-runtime/host-files-for-container.d",
},
},
},
NVIDIACTKConfig: CTKConfig{
Path: "/foo/bar/nvidia-ctk",
},
},
},
{
description: "config options set in section",
contents: []string{
"[nvidia-container-cli]",
"root = \"/bar/baz\"",
"[nvidia-container-runtime]",
"debug = \"/foo/bar\"",
"experimental = true",
"discover-mode = \"not-legacy\"",
"log-level = \"debug\"",
"runtimes = [\"/some/runtime\",]",
"mode = \"not-auto\"",
"[nvidia-container-runtime.modes.csv]",
"mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
"[nvidia-ctk]",
"path = \"/foo/bar/nvidia-ctk\"",
},
expectedConfig: &Config{
NVIDIAContainerCLIConfig: ContainerCLIConfig{
Root: "/bar/baz",
},
NVIDIAContainerRuntimeConfig: RuntimeConfig{
DebugFilePath: "/foo/bar",
LogLevel: "debug",
Runtimes: []string{"/some/runtime"},
Mode: "not-auto",
Modes: modesConfig{
CSV: csvModeConfig{
MountSpecPath: "/not/etc/nvidia-container-runtime/host-files-for-container.d",
},
},
},
NVIDIACTKConfig: CTKConfig{
Path: "/foo/bar/nvidia-ctk",
},
},
},
}
for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
reader := strings.NewReader(strings.Join(tc.contents, "\n"))
cfg, err := loadConfigFrom(reader)
if tc.expectedError != nil {
require.Error(t, err)
} else {
require.NoError(t, err)
}
require.EqualValues(t, tc.expectedConfig, cfg)
})
}
}


@@ -1,115 +0,0 @@
/**
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
package docker
import (
"bytes"
"encoding/json"
"fmt"
"io/ioutil"
"os"
log "github.com/sirupsen/logrus"
)
// LoadConfig loads the docker config from disk
func LoadConfig(configFilePath string) (map[string]interface{}, error) {
log.Infof("Loading docker config from %v", configFilePath)
info, err := os.Stat(configFilePath)
if err == nil && info.IsDir() {
return nil, fmt.Errorf("config file is a directory")
}
cfg := make(map[string]interface{})
if os.IsNotExist(err) {
log.Infof("Config file does not exist, creating new one")
return cfg, nil
}
readBytes, err := ioutil.ReadFile(configFilePath)
if err != nil {
return nil, fmt.Errorf("unable to read config: %v", err)
}
reader := bytes.NewReader(readBytes)
if err := json.NewDecoder(reader).Decode(&cfg); err != nil {
return nil, err
}
log.Infof("Successfully loaded config")
return cfg, nil
}
// UpdateConfig updates the docker config to include the nvidia runtimes
func UpdateConfig(config map[string]interface{}, defaultRuntime string, newRuntimes map[string]interface{}) error {
if defaultRuntime != "" {
config["default-runtime"] = defaultRuntime
}
// Read the existing runtimes
runtimes := make(map[string]interface{})
if _, exists := config["runtimes"]; exists {
runtimes = config["runtimes"].(map[string]interface{})
}
// Add / update the runtime definitions
for name, rt := range newRuntimes {
runtimes[name] = rt
}
// Update the runtimes definition
if len(runtimes) > 0 {
config["runtimes"] = runtimes
}
return nil
}
// FlushConfig flushes the updated/reverted config out to disk
func FlushConfig(cfg map[string]interface{}, configFilePath string) error {
log.Infof("Flushing docker config to %v", configFilePath)
output, err := json.MarshalIndent(cfg, "", " ")
if err != nil {
return fmt.Errorf("unable to convert to JSON: %v", err)
}
switch len(output) {
case 0:
err := os.Remove(configFilePath)
if err != nil {
return fmt.Errorf("unable to remove empty file: %v", err)
}
log.Infof("Config empty, removing file")
default:
f, err := os.Create(configFilePath)
if err != nil {
return fmt.Errorf("unable to open %v for writing: %v", configFilePath, err)
}
defer f.Close()
_, err = f.WriteString(string(output))
if err != nil {
return fmt.Errorf("unable to write output: %v", err)
}
}
log.Infof("Successfully flushed config")
return nil
}
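A self-contained sketch of what these helpers accomplish end to end: read daemon.json into a map, merge in an nvidia runtime entry, and serialize the result. The runtime path and names are example values:

package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	existing := []byte(`{"storage-driver": "overlay2"}`)

	cfg := make(map[string]interface{})
	if err := json.Unmarshal(existing, &cfg); err != nil {
		panic(err)
	}

	// Equivalent of UpdateConfig(cfg, "nvidia", runtimes) above.
	cfg["default-runtime"] = "nvidia"
	cfg["runtimes"] = map[string]interface{}{
		"nvidia": map[string]interface{}{
			"path": "/usr/bin/nvidia-container-runtime",
			"args": []string{},
		},
	}

	out, err := json.MarshalIndent(cfg, "", "    ")
	if err != nil {
		panic(err)
	}
	fmt.Println(string(out)) // what FlushConfig would write to disk
}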


@@ -1,228 +0,0 @@
/**
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
package docker
import (
"encoding/json"
"fmt"
"testing"
"github.com/stretchr/testify/require"
)
func TestUpdateConfigDefaultRuntime(t *testing.T) {
testCases := []struct {
config map[string]interface{}
defaultRuntime string
runtimeName string
expectedDefaultRuntimeName interface{}
}{
{
defaultRuntime: "",
expectedDefaultRuntimeName: nil,
},
{
defaultRuntime: "NAME",
expectedDefaultRuntimeName: "NAME",
},
{
config: map[string]interface{}{
"default-runtime": "ALREADY_SET",
},
defaultRuntime: "",
expectedDefaultRuntimeName: "ALREADY_SET",
},
{
config: map[string]interface{}{
"default-runtime": "ALREADY_SET",
},
defaultRuntime: "NAME",
expectedDefaultRuntimeName: "NAME",
},
}
for i, tc := range testCases {
t.Run(fmt.Sprintf("test case %d", i), func(t *testing.T) {
if tc.config == nil {
tc.config = make(map[string]interface{})
}
err := UpdateConfig(tc.config, tc.defaultRuntime, nil)
require.NoError(t, err)
defaultRuntimeName := tc.config["default-runtime"]
require.EqualValues(t, tc.expectedDefaultRuntimeName, defaultRuntimeName)
})
}
}
func TestUpdateConfigRuntimes(t *testing.T) {
testCases := []struct {
config map[string]interface{}
runtimes map[string]interface{}
expectedConfig map[string]interface{}
}{
{
config: map[string]interface{}{},
runtimes: map[string]interface{}{
"runtime1": map[string]interface{}{
"path": "/test/runtime/dir/runtime1",
"args": []string{},
},
"runtime2": map[string]interface{}{
"path": "/test/runtime/dir/runtime2",
"args": []string{},
},
},
expectedConfig: map[string]interface{}{
"runtimes": map[string]interface{}{
"runtime1": map[string]interface{}{
"path": "/test/runtime/dir/runtime1",
"args": []string{},
},
"runtime2": map[string]interface{}{
"path": "/test/runtime/dir/runtime2",
"args": []string{},
},
},
},
},
{
config: map[string]interface{}{
"runtimes": map[string]interface{}{
"runtime1": map[string]interface{}{
"path": "runtime1",
"args": []string{},
},
},
},
runtimes: map[string]interface{}{
"runtime1": map[string]interface{}{
"path": "/test/runtime/dir/runtime1",
"args": []string{},
},
"runtime2": map[string]interface{}{
"path": "/test/runtime/dir/runtime2",
"args": []string{},
},
},
expectedConfig: map[string]interface{}{
"runtimes": map[string]interface{}{
"runtime1": map[string]interface{}{
"path": "/test/runtime/dir/runtime1",
"args": []string{},
},
"runtime2": map[string]interface{}{
"path": "/test/runtime/dir/runtime2",
"args": []string{},
},
},
},
},
{
config: map[string]interface{}{
"runtimes": map[string]interface{}{
"not-nvidia": map[string]interface{}{
"path": "some-other-path",
"args": []string{},
},
},
},
runtimes: map[string]interface{}{
"runtime1": map[string]interface{}{
"path": "/test/runtime/dir/runtime1",
"args": []string{},
},
},
expectedConfig: map[string]interface{}{
"runtimes": map[string]interface{}{
"not-nvidia": map[string]interface{}{
"path": "some-other-path",
"args": []string{},
},
"runtime1": map[string]interface{}{
"path": "/test/runtime/dir/runtime1",
"args": []string{},
},
},
},
},
{
config: map[string]interface{}{
"exec-opts": []string{"native.cgroupdriver=systemd"},
"log-driver": "json-file",
"log-opts": map[string]string{
"max-size": "100m",
},
"storage-driver": "overlay2",
},
runtimes: map[string]interface{}{
"runtime1": map[string]interface{}{
"path": "/test/runtime/dir/runtime1",
"args": []string{},
},
},
expectedConfig: map[string]interface{}{
"exec-opts": []string{"native.cgroupdriver=systemd"},
"log-driver": "json-file",
"log-opts": map[string]string{
"max-size": "100m",
},
"storage-driver": "overlay2",
"runtimes": map[string]interface{}{
"runtime1": map[string]interface{}{
"path": "/test/runtime/dir/runtime1",
"args": []string{},
},
},
},
},
{
config: map[string]interface{}{
"exec-opts": []string{"native.cgroupdriver=systemd"},
"log-driver": "json-file",
"log-opts": map[string]string{
"max-size": "100m",
},
"storage-driver": "overlay2",
},
expectedConfig: map[string]interface{}{
"exec-opts": []string{"native.cgroupdriver=systemd"},
"log-driver": "json-file",
"log-opts": map[string]string{
"max-size": "100m",
},
"storage-driver": "overlay2",
},
},
}
for i, tc := range testCases {
t.Run(fmt.Sprintf("test case %d", i), func(t *testing.T) {
err := UpdateConfig(tc.config, "", tc.runtimes)
require.NoError(t, err)
configContent, err := json.MarshalIndent(tc.config, "", " ")
require.NoError(t, err)
expectedContent, err := json.MarshalIndent(tc.expectedConfig, "", " ")
require.NoError(t, err)
require.EqualValues(t, string(expectedContent), string(configContent))
})
}
}


@@ -1,174 +0,0 @@
/**
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package image
import (
"fmt"
"strconv"
"strings"
"github.com/opencontainers/runtime-spec/specs-go"
"golang.org/x/mod/semver"
)
const (
envCUDAVersion = "CUDA_VERSION"
envNVRequirePrefix = "NVIDIA_REQUIRE_"
envNVRequireCUDA = envNVRequirePrefix + "CUDA"
envNVRequireJetpack = envNVRequirePrefix + "JETPACK"
envNVDisableRequire = "NVIDIA_DISABLE_REQUIRE"
)
// CUDA represents a CUDA image that can be used for GPU computing. This wraps
// a map of environment variable to values that can be used to perform lookups
// such as requirements.
type CUDA map[string]string
// NewCUDAImageFromSpec creates a CUDA image from the input OCI runtime spec.
// The process environment is read (if present) to construct the CUDA image.
func NewCUDAImageFromSpec(spec *specs.Spec) (CUDA, error) {
if spec == nil || spec.Process == nil {
return NewCUDAImageFromEnv(nil)
}
return NewCUDAImageFromEnv(spec.Process.Env)
}
// NewCUDAImageFromEnv creates a CUDA image from the input environment. The environment
// is a list of strings of the form ENVAR=VALUE.
func NewCUDAImageFromEnv(env []string) (CUDA, error) {
c := make(CUDA)
for _, e := range env {
parts := strings.SplitN(e, "=", 2)
if len(parts) != 2 {
return nil, fmt.Errorf("invalid environment variable: %v", e)
}
c[parts[0]] = parts[1]
}
return c, nil
}
// IsLegacy returns whether the associated CUDA image is a "legacy" image. An
// image is considered legacy if it has a CUDA_VERSION environment variable defined
// and no NVIDIA_REQUIRE_CUDA environment variable defined.
func (i CUDA) IsLegacy() bool {
legacyCudaVersion := i[envCUDAVersion]
cudaRequire := i[envNVRequireCUDA]
return len(legacyCudaVersion) > 0 && len(cudaRequire) == 0
}
// GetRequirements returns the requirements from all NVIDIA_REQUIRE_ environment
// variables.
func (i CUDA) GetRequirements() ([]string, error) {
// TODO: We need not process this if disable require is set, but this will be done
// in a single follow-up to ensure that the behavioural change is accurately captured.
// if i.HasDisableRequire() {
// return nil, nil
// }
// All variables with the "NVIDIA_REQUIRE_" prefix are passed to nvidia-container-cli
var requirements []string
for name, value := range i {
if strings.HasPrefix(name, envNVRequirePrefix) && !strings.HasPrefix(name, envNVRequireJetpack) {
requirements = append(requirements, value)
}
}
if i.IsLegacy() {
v, err := i.legacyVersion()
if err != nil {
return nil, fmt.Errorf("failed to get version: %v", err)
}
cudaRequire := fmt.Sprintf("cuda>=%s", v)
requirements = append(requirements, cudaRequire)
}
return requirements, nil
}
// HasDisableRequire checks for the value of the NVIDIA_DISABLE_REQUIRE. If set
// to a valid (true) boolean value this can be used to disable the requirement checks
func (i CUDA) HasDisableRequire() bool {
if disable, exists := i[envNVDisableRequire]; exists {
// i.logger.Debugf("NVIDIA_DISABLE_REQUIRE=%v; skipping requirement checks", disable)
d, _ := strconv.ParseBool(disable)
return d
}
return false
}
// DevicesFromEnvvars returns the devices requested by the image through environment variables
func (i CUDA) DevicesFromEnvvars(envVars ...string) []string {
// Grab a reference to devices from the first envvar
// in the list that actually exists in the environment.
var devices *string
for _, envVar := range envVars {
if devs, ok := i[envVar]; ok {
devices = &devs
break
}
}
// Environment variable unset with legacy image: default to "all".
if devices == nil && i.IsLegacy() {
return []string{"all"}
}
// Environment variable unset or empty or "void": return nil
if devices == nil || len(*devices) == 0 || *devices == "void" {
return nil
}
// Environment variable set to "none": reset to "".
if *devices == "none" {
return []string{""}
}
return strings.Split(*devices, ",")
}
func (i CUDA) legacyVersion() (string, error) {
majorMinor, err := parseMajorMinorVersion(i[envCUDAVersion])
if err != nil {
return "", fmt.Errorf("invalid CUDA version: %v", err)
}
return majorMinor, nil
}
func parseMajorMinorVersion(version string) (string, error) {
vVersion := "v" + strings.TrimPrefix(version, "v")
if !semver.IsValid(vVersion) {
return "", fmt.Errorf("invalid version string")
}
majorMinor := strings.TrimPrefix(semver.MajorMinor(vVersion), "v")
parts := strings.Split(majorMinor, ".")
var err error
_, err = strconv.ParseUint(parts[0], 10, 32)
if err != nil {
return "", fmt.Errorf("invalid major version")
}
_, err = strconv.ParseUint(parts[1], 10, 32)
if err != nil {
return "", fmt.Errorf("invalid minor version")
}
return majorMinor, nil
}
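A self-contained sketch of the lookups the CUDA type above performs, using a plain map (the real type is also a map[string]string); the environment values are examples:

package main

import (
	"fmt"
	"strings"
)

func main() {
	env := []string{"CUDA_VERSION=11.6", "NVIDIA_VISIBLE_DEVICES=0,1"}

	img := make(map[string]string)
	for _, e := range env {
		parts := strings.SplitN(e, "=", 2)
		img[parts[0]] = parts[1]
	}

	// IsLegacy: CUDA_VERSION set and NVIDIA_REQUIRE_CUDA unset.
	isLegacy := img["CUDA_VERSION"] != "" && img["NVIDIA_REQUIRE_CUDA"] == ""
	fmt.Println("legacy image:", isLegacy) // true

	// DevicesFromEnvvars: split the matching envvar on commas.
	fmt.Println(strings.Split(img["NVIDIA_VISIBLE_DEVICES"], ",")) // [0 1]
}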


@@ -1,123 +0,0 @@
/**
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package image
import (
"testing"
"github.com/stretchr/testify/require"
)
func TestParseMajorMinorVersionValid(t *testing.T) {
var tests = []struct {
version string
expected string
}{
{"0", "0.0"},
{"8", "8.0"},
{"7.5", "7.5"},
{"9.0.116", "9.0"},
{"4294967295.4294967295.4294967295", "4294967295.4294967295"},
{"v11.6", "11.6"},
}
for _, c := range tests {
t.Run(c.version, func(t *testing.T) {
version, err := parseMajorMinorVersion(c.version)
require.NoError(t, err)
require.Equal(t, c.expected, version)
})
}
}
func TestParseMajorMinorVersionInvalid(t *testing.T) {
var tests = []string{
"foo",
"foo.5.10",
"9.0.116.50",
"9.0.116foo",
"7.foo",
"9.0.bar",
"9.4294967296",
"9.0.116.",
"9..0",
"9.",
".5.10",
"-9",
"+9",
"-9.1.116",
"-9.-1.-116",
}
for _, c := range tests {
t.Run(c, func(t *testing.T) {
_, err := parseMajorMinorVersion(c)
require.Error(t, err)
})
}
}
func TestGetRequirements(t *testing.T) {
testCases := []struct {
description string
env []string
requirements []string
}{
{
description: "NVIDIA_REQUIRE_JETPACK is ignored",
env: []string{"NVIDIA_REQUIRE_JETPACK=csv-mounts=all"},
requirements: nil,
},
{
description: "NVIDIA_REQUIRE_JETPACK_HOST_MOUNTS is ignored",
env: []string{"NVIDIA_REQUIRE_JETPACK_HOST_MOUNTS=base-only"},
requirements: nil,
},
{
description: "single requirement set",
env: []string{"NVIDIA_REQUIRE_CUDA=cuda>=11.6"},
requirements: []string{"cuda>=11.6"},
},
{
description: "requirements are concatenated requirement set",
env: []string{"NVIDIA_REQUIRE_CUDA=cuda>=11.6", "NVIDIA_REQUIRE_BRAND=brand=tesla"},
requirements: []string{"cuda>=11.6", "brand=tesla"},
},
{
description: "legacy image",
env: []string{"CUDA_VERSION=11.6"},
requirements: []string{"cuda>=11.6"},
},
{
description: "legacy image with additional requirement",
env: []string{"CUDA_VERSION=11.6", "NVIDIA_REQUIRE_BRAND=brand=tesla"},
requirements: []string{"cuda>=11.6", "brand=tesla"},
},
}
for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
image, err := NewCUDAImageFromEnv(tc.env)
require.NoError(t, err)
requirements, err := image.GetRequirements()
require.NoError(t, err)
require.ElementsMatch(t, tc.requirements, requirements)
})
}
}


@@ -1,101 +0,0 @@
/**
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package config
import (
"fmt"
"github.com/pelletier/go-toml"
"github.com/sirupsen/logrus"
)
const (
dockerRuncExecutableName = "docker-runc"
runcExecutableName = "runc"
auto = "auto"
)
// RuntimeConfig stores the config options for the NVIDIA Container Runtime
type RuntimeConfig struct {
DebugFilePath string `toml:"debug"`
// LogLevel defines the logging level for the application
LogLevel string `toml:"log-level"`
// Runtimes defines the candidates for the low-level runtime
Runtimes []string `toml:"runtimes"`
Mode string `toml:"mode"`
Modes modesConfig `toml:"modes"`
}
// modesConfig defines (optional) per-mode configs
type modesConfig struct {
CSV csvModeConfig `toml:"csv"`
CDI cdiModeConfig `toml:"cdi"`
}
type cdiModeConfig struct {
// SpecDirs allows for the default spec dirs for CDI to be overridden
SpecDirs []string `toml:"spec-dirs"`
}
type csvModeConfig struct {
MountSpecPath string `toml:"mount-spec-path"`
}
// dummy allows us to unmarshal only a RuntimeConfig from a *toml.Tree
type dummy struct {
Runtime RuntimeConfig `toml:"nvidia-container-runtime"`
}
// getRuntimeConfigFrom reads the nvidia container runtime config from the specified toml Tree.
func getRuntimeConfigFrom(toml *toml.Tree) (*RuntimeConfig, error) {
cfg := GetDefaultRuntimeConfig()
if toml == nil {
return cfg, nil
}
d := dummy{
Runtime: *cfg,
}
if err := toml.Unmarshal(&d); err != nil {
return nil, fmt.Errorf("failed to unmarshal runtime config: %v", err)
}
return &d.Runtime, nil
}
// GetDefaultRuntimeConfig defines the default values for the config
func GetDefaultRuntimeConfig() *RuntimeConfig {
c := RuntimeConfig{
DebugFilePath: "/dev/null",
LogLevel: logrus.InfoLevel.String(),
Runtimes: []string{
dockerRuncExecutableName,
runcExecutableName,
},
Mode: auto,
Modes: modesConfig{
CSV: csvModeConfig{
MountSpecPath: "/etc/nvidia-container-runtime/host-files-for-container.d",
},
},
}
return &c
}
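A self-contained sketch of the unmarshalling pattern getRuntimeConfigFrom uses above, with a trimmed-down struct (field set abbreviated for illustration):

package main

import (
	"fmt"

	"github.com/pelletier/go-toml"
)

type runtimeConfig struct {
	DebugFilePath string   `toml:"debug"`
	LogLevel      string   `toml:"log-level"`
	Runtimes      []string `toml:"runtimes"`
	Mode          string   `toml:"mode"`
}

// wrapper plays the role of the dummy type above: it scopes unmarshalling
// to the [nvidia-container-runtime] table.
type wrapper struct {
	Runtime runtimeConfig `toml:"nvidia-container-runtime"`
}

func main() {
	tree, err := toml.Load(`
[nvidia-container-runtime]
log-level = "debug"
runtimes = ["runc"]
`)
	if err != nil {
		panic(err)
	}

	var w wrapper
	if err := tree.Unmarshal(&w); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", w.Runtime)
}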


@@ -1,46 +0,0 @@
/**
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package config
import "github.com/pelletier/go-toml"
// CTKConfig stores the config options for the NVIDIA Container Toolkit CLI (nvidia-ctk)
type CTKConfig struct {
Path string `toml:"path"`
}
// getCTKConfigFrom reads the nvidia-ctk config from the specified toml Tree.
func getCTKConfigFrom(toml *toml.Tree) *CTKConfig {
cfg := getDefaultCTKConfig()
if toml == nil {
return cfg
}
cfg.Path = toml.GetDefault("nvidia-ctk.path", cfg.Path).(string)
return cfg
}
// getDefaultCTKConfig defines the default values for the config
func getDefaultCTKConfig() *CTKConfig {
c := CTKConfig{
Path: "nvidia-ctk",
}
return &c
}


@@ -1,137 +0,0 @@
/**
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package cuda
import (
"fmt"
"github.com/NVIDIA/go-nvml/pkg/dl"
)
/*
#cgo LDFLAGS: -Wl,--unresolved-symbols=ignore-in-object-files
#ifdef _WIN32
#define CUDAAPI __stdcall
#else
#define CUDAAPI
#endif
typedef int CUdevice;
typedef enum CUdevice_attribute_enum {
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76
} CUdevice_attribute;
typedef enum cudaError_enum {
CUDA_SUCCESS = 0
} CUresult;
CUresult CUDAAPI cuInit(unsigned int Flags);
CUresult CUDAAPI cuDriverGetVersion(int *driverVersion);
CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal);
CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
*/
import "C"
const (
libraryName = "libcuda.so.1"
libraryLoadFlags = dl.RTLD_LAZY | dl.RTLD_GLOBAL
)
// lib stores a reference to the CUDA dynamic library
var lib *dl.DynamicLibrary
// Version returns the CUDA version of the driver as a string or an error if this
// cannot be determined.
func Version() (string, error) {
lib, err := load()
if err != nil {
return "", err
}
defer lib.Close()
if err := lib.Lookup("cuDriverGetVersion"); err != nil {
return "", fmt.Errorf("failed to lookup symbol: %v", err)
}
var version C.int
if result := C.cuDriverGetVersion(&version); result != C.CUDA_SUCCESS {
return "", fmt.Errorf("failed to get CUDA version: result=%v", result)
}
major := version / 1000
minor := version % 100 / 10
return fmt.Sprintf("%d.%d", major, minor), nil
}
// ComputeCapability returns the CUDA compute capability of a device with the specified index as a string
// or an error if this cannot be determined.
func ComputeCapability(index int) (string, error) {
lib, err := load()
if err != nil {
return "", err
}
defer lib.Close()
if err := lib.Lookup("cuInit"); err != nil {
return "", fmt.Errorf("failed to lookup symbol: %v", err)
}
if err := lib.Lookup("cuDeviceGet"); err != nil {
return "", fmt.Errorf("failed to lookup symbol: %v", err)
}
if err := lib.Lookup("cuDeviceGetAttribute"); err != nil {
return "", fmt.Errorf("failed to lookup symbol: %v", err)
}
if result := C.cuInit(C.uint(0)); result != C.CUDA_SUCCESS {
return "", fmt.Errorf("failed to initialize CUDA: result=%v", result)
}
var device C.CUdevice
// Query the device at the specified index
if result := C.cuDeviceGet(&device, C.int(index)); result != C.CUDA_SUCCESS {
return "", fmt.Errorf("failed to get CUDA device %v: result=%v", index, result)
}
var major C.int
if result := C.cuDeviceGetAttribute(&major, C.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device); result != C.CUDA_SUCCESS {
return "", fmt.Errorf("failed to get CUDA compute capability major for device %v : result=%v", 0, result)
}
var minor C.int
if result := C.cuDeviceGetAttribute(&minor, C.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device); result != C.CUDA_SUCCESS {
return "", fmt.Errorf("failed to get CUDA compute capability minor for device %v: result=%v", 0, result)
}
return fmt.Sprintf("%d.%d", major, minor), nil
}
func load() (*dl.DynamicLibrary, error) {
lib := dl.New(libraryName, libraryLoadFlags)
if lib == nil {
return nil, fmt.Errorf("error instantiating DynamicLibrary for CUDA")
}
err := lib.Open()
if err != nil {
return nil, fmt.Errorf("error opening DynamicLibrary for CUDA: %v", err)
}
return lib, nil
}
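A self-contained illustration of the version arithmetic in Version above: cuDriverGetVersion reports, for example, 11060 for CUDA 11.6:

package main

import "fmt"

func main() {
	version := 11060 // example value as returned by cuDriverGetVersion
	major := version / 1000
	minor := version % 100 / 10
	fmt.Printf("%d.%d\n", major, minor) // 11.6
}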


@@ -0,0 +1,51 @@
/*
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
package discover
import (
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
log "github.com/sirupsen/logrus"
)
// NewBinaryMounts creates a discoverer for binaries using the specified root
func NewBinaryMounts(root string) Discover {
return NewBinaryMountsWithLogger(log.StandardLogger(), root)
}
// NewBinaryMountsWithLogger creates a mounts discoverer as NewBinaryMounts does,
// but with the specified logger
func NewBinaryMountsWithLogger(logger *log.Logger, root string) Discover {
d := mounts{
logger: logger,
lookup: lookup.NewPathLocatorWithLogger(logger, root),
required: requiredBinaries,
}
return &d
}
// requiredBinaries defines a set of binaries and their labels
var requiredBinaries = map[string][]string{
"utility": {
"nvidia-smi", /* System management interface */
"nvidia-debugdump", /* GPU coredump utility */
"nvidia-persistenced", /* Persistence mode utility */
},
"compute": {
"nvidia-cuda-mps-control", /* Multi process service CLI */
"nvidia-cuda-mps-server", /* Multi process service server */
},
}


@@ -0,0 +1,73 @@
/*
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
package discover
import (
"testing"
testlog "github.com/sirupsen/logrus/hooks/test"
"github.com/stretchr/testify/require"
)
func TestBinaries(t *testing.T) {
binaryLookup := map[string]string{
"nvidia-smi": "/usr/bin/nvidia-smi",
"nvidia-persistenced": "/usr/bin/nvidia-persistenced",
"nvidia-debugdump": "test-duplicates",
"nvidia-cuda-mps-control": "test-duplicates",
}
logger, _ := testlog.NewNullLogger()
d := NewBinaryMountsWithLogger(logger, "")
// Override lookup for testing
mockLocator := NewLocatorMockFromMap(binaryLookup)
d.(*mounts).lookup = mockLocator
mounts, err := d.Mounts()
require.NoError(t, err)
expected := []Mount{
{
Path: "/usr/bin/nvidia-smi",
},
{
Path: "/usr/bin/nvidia-persistenced",
},
{
Path: "test-duplicates",
},
}
require.Equal(t, len(expected), len(mounts))
devices, err := d.Devices()
require.NoError(t, err)
require.Empty(t, devices)
hooks, err := d.Hooks()
require.NoError(t, err)
require.Empty(t, hooks)
}
func TestNewBinariesConstructor(t *testing.T) {
b := NewBinaryMounts("").(*mounts)
require.NotNil(t, b.logger)
require.NotNil(t, b.lookup)
}


@@ -1,62 +0,0 @@
/**
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package discover
import (
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
"github.com/sirupsen/logrus"
)
// charDevices is a discoverer for a list of character devices
type charDevices mounts
var _ Discover = (*charDevices)(nil)
// NewCharDeviceDiscoverer creates a discoverer which locates the specified set of device nodes.
func NewCharDeviceDiscoverer(logger *logrus.Logger, devices []string, root string) Discover {
locator := lookup.NewCharDeviceLocator(logger, root)
return NewDeviceDiscoverer(logger, locator, root, devices)
}
// NewDeviceDiscoverer creates a discoverer which locates the specified set of device nodes using the specified locator.
func NewDeviceDiscoverer(logger *logrus.Logger, locator lookup.Locator, root string, devices []string) Discover {
m := NewMounts(logger, locator, root, devices).(*mounts)
return (*charDevices)(m)
}
// Mounts returns the discovered mounts for the charDevices.
// Since this explicitly specifies a device list, the mounts are nil.
func (d *charDevices) Mounts() ([]Mount, error) {
return nil, nil
}
// Devices returns the discovered devices for the charDevices.
// Here the device nodes are first discovered as mounts and these are converted to devices.
func (d *charDevices) Devices() ([]Device, error) {
devicesAsMounts, err := (*mounts)(d).Mounts()
if err != nil {
return nil, err
}
var devices []Device
for _, mount := range devicesAsMounts {
devices = append(devices, Device(mount))
}
return devices, nil
}
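
The (*charDevices)(m) conversion above relies on charDevices sharing the underlying type of mounts. A self-contained sketch of that pattern (stand-in types, not the real ones):

package main

import "fmt"

type mounts struct{ required []string }

// charDevices has the same underlying type as mounts, so pointers convert
// directly while the two names can carry different method sets.
type charDevices mounts

func main() {
	m := &mounts{required: []string{"/dev/nvidia0"}}
	d := (*charDevices)(m) // no copy; same underlying struct
	fmt.Println(d.required)
}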


@@ -1,83 +0,0 @@
/**
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package discover
import (
"fmt"
"testing"
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
testlog "github.com/sirupsen/logrus/hooks/test"
"github.com/stretchr/testify/require"
)
func TestCharDevices(t *testing.T) {
logger, logHook := testlog.NewNullLogger()
testCases := []struct {
description string
input *charDevices
expectedMounts []Mount
expectedMountsError error
expectedDevicesError error
expectedDevices []Device
}{
{
description: "dev mounts are empty",
input: (*charDevices)(
&mounts{
lookup: &lookup.LocatorMock{
LocateFunc: func(string) ([]string, error) {
return []string{"located"}, nil
},
},
required: []string{"required"},
},
),
expectedDevices: []Device{{Path: "located", HostPath: "located"}},
},
{
description: "dev devices returns error for nil lookup",
input: &charDevices{},
expectedDevicesError: fmt.Errorf("no lookup defined"),
},
}
for _, tc := range testCases {
logHook.Reset()
t.Run(tc.description, func(t *testing.T) {
tc.input.logger = logger
mounts, err := tc.input.Mounts()
if tc.expectedMountsError != nil {
require.Error(t, err)
} else {
require.NoError(t, err)
}
require.ElementsMatch(t, tc.expectedMounts, mounts)
devices, err := tc.input.Devices()
if tc.expectedDevicesError != nil {
require.Error(t, err)
} else {
require.NoError(t, err)
}
require.ElementsMatch(t, tc.expectedDevices, devices)
})
}
}


@@ -18,26 +18,16 @@ package discover
import "fmt"
// list is a discoverer that contains a list of Discoverers. The output of the
// Mounts functions is the concatenation of the output for each of the
// composite is a discoverer that contains a list of Discoverers. The output of the
// Devices, Mounts, and Hooks functions is the concatenation of the output for each of the
// elements in the list.
type list struct {
type composite struct {
discoverers []Discover
}
var _ Discover = (*list)(nil)
var _ Discover = (*composite)(nil)
// Merge creates a discoverer that is the composite of a list of discoverers.
func Merge(d ...Discover) Discover {
l := list{
discoverers: d,
}
return &l
}
// Devices returns all devices from the included discoverers
func (d list) Devices() ([]Device, error) {
func (d composite) Devices() ([]Device, error) {
var allDevices []Device
for i, di := range d.discoverers {
@@ -51,8 +41,7 @@ func (d list) Devices() ([]Device, error) {
return allDevices, nil
}
// Mounts returns all mounts from the included discoverers
func (d list) Mounts() ([]Mount, error) {
func (d composite) Mounts() ([]Mount, error) {
var allMounts []Mount
for i, di := range d.discoverers {
@@ -66,8 +55,7 @@ func (d list) Mounts() ([]Mount, error) {
return allMounts, nil
}
// Hooks returns all Hooks from the included discoverers
func (d list) Hooks() ([]Hook, error) {
func (d composite) Hooks() ([]Hook, error) {
var allHooks []Hook
for i, di := range d.discoverers {
@@ -80,3 +68,7 @@ func (d list) Hooks() ([]Hook, error) {
return allHooks, nil
}
func (d *composite) add(di ...Discover) {
d.discoverers = append(d.discoverers, di...)
}
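
A simplified, self-contained sketch of the composite pattern introduced here (the Discover interface is reduced to a single method for brevity):

package main

import "fmt"

// Discover is reduced to one method for this sketch.
type Discover interface {
	Devices() ([]string, error)
}

type static []string

func (s static) Devices() ([]string, error) { return []string(s), nil }

// composite concatenates the output of its child discoverers, as in the
// renamed type above.
type composite struct {
	discoverers []Discover
}

func (c *composite) add(d ...Discover) {
	c.discoverers = append(c.discoverers, d...)
}

func (c *composite) Devices() ([]string, error) {
	var all []string
	for _, d := range c.discoverers {
		devices, err := d.Devices()
		if err != nil {
			return nil, err
		}
		all = append(all, devices...)
	}
	return all, nil
}

func main() {
	c := &composite{}
	c.add(static{"/dev/nvidia0"}, static{"/dev/nvidiactl"})
	devices, _ := c.Devices()
	fmt.Println(devices) // [/dev/nvidia0 /dev/nvidiactl]
}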


@@ -1,107 +0,0 @@
/**
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package discover
import (
"fmt"
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover/csv"
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
"github.com/sirupsen/logrus"
)
// NewFromCSVFiles creates a discoverer for the specified CSV files. A logger is also supplied.
// The constructed discoverer is a list, with each element in the list associated with a
// single CSV file.
func NewFromCSVFiles(logger *logrus.Logger, files []string, root string) (Discover, error) {
if len(files) == 0 {
logger.Warnf("No CSV files specified")
return None{}, nil
}
symlinkLocator := lookup.NewSymlinkLocator(logger, root)
locators := map[csv.MountSpecType]lookup.Locator{
csv.MountSpecDev: lookup.NewCharDeviceLocator(logger, root),
csv.MountSpecDir: lookup.NewDirectoryLocator(logger, root),
// Libraries and symlinks are handled in the same way
csv.MountSpecLib: symlinkLocator,
csv.MountSpecSym: symlinkLocator,
}
var mountSpecs []*csv.MountSpec
for _, filename := range files {
targets, err := loadCSVFile(logger, filename)
if err != nil {
logger.Warnf("Skipping CSV file %v: %v", filename, err)
continue
}
mountSpecs = append(mountSpecs, targets...)
}
return newFromMountSpecs(logger, locators, root, mountSpecs)
}
// loadCSVFile loads the specified CSV file and returns the list of mount specs
func loadCSVFile(logger *logrus.Logger, filename string) ([]*csv.MountSpec, error) {
// Parse the CSV file into a list of mount specs
targets, err := csv.NewCSVFileParser(logger, filename).Parse()
if err != nil {
return nil, fmt.Errorf("failed to parse CSV file: %v", err)
}
if len(targets) == 0 {
return nil, fmt.Errorf("CSV file is empty")
}
return targets, nil
}
// newFromMountSpecs creates a discoverer from the specified mount specs. A logger is also supplied.
// A list discoverer is returned, with each element associated with a single MountSpecType.
func newFromMountSpecs(logger *logrus.Logger, locators map[csv.MountSpecType]lookup.Locator, root string, targets []*csv.MountSpec) (Discover, error) {
if len(targets) == 0 {
return &None{}, nil
}
var discoverers []Discover
var mountSpecTypes []csv.MountSpecType
candidatesByType := make(map[csv.MountSpecType][]string)
for _, t := range targets {
if _, exists := candidatesByType[t.Type]; !exists {
mountSpecTypes = append(mountSpecTypes, t.Type)
}
candidatesByType[t.Type] = append(candidatesByType[t.Type], t.Path)
}
for _, t := range mountSpecTypes {
locator, exists := locators[t]
if !exists {
return nil, fmt.Errorf("no locator defined for '%v'", t)
}
var m Discover
switch t {
case csv.MountSpecDev:
m = NewDeviceDiscoverer(logger, locator, root, candidatesByType[t])
default:
m = NewMounts(logger, locator, root, candidatesByType[t])
}
discoverers = append(discoverers, m)
}
return &list{discoverers: discoverers}, nil
}
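
The grouping logic in newFromMountSpecs preserves the first-seen order of mount-spec types; a self-contained sketch of that idiom under stand-in types:

package main

import "fmt"

type spec struct {
	Type string
	Path string
}

func main() {
	targets := []spec{{"dev", "dev0"}, {"lib", "lib0"}, {"dev", "dev1"}}
	candidatesByType := make(map[string][]string)
	var order []string // first-seen order of types, as in newFromMountSpecs
	for _, t := range targets {
		if _, exists := candidatesByType[t.Type]; !exists {
			order = append(order, t.Type)
		}
		candidatesByType[t.Type] = append(candidatesByType[t.Type], t.Path)
	}
	for _, t := range order {
		fmt.Println(t, candidatesByType[t])
	}
	// Output:
	// dev [dev0 dev1]
	// lib [lib0]
}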


@@ -1,131 +0,0 @@
/**
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package csv
import (
"bufio"
"errors"
"fmt"
"io"
"os"
"path/filepath"
"strings"
"github.com/sirupsen/logrus"
)
const (
// DefaultMountSpecPath is the default location of CSV files that define the modifications required to the OCI spec
DefaultMountSpecPath = "/etc/nvidia-container-runtime/host-files-for-container.d"
)
// GetFileList returns the (non-recursive) list of CSV files in the specified
// folder
func GetFileList(root string) ([]string, error) {
contents, err := os.ReadDir(root)
if err != nil && errors.Is(err, os.ErrNotExist) {
return nil, nil
} else if err != nil {
return nil, fmt.Errorf("failed to read the contents of %v: %v", root, err)
}
var csvFilePaths []string
for _, c := range contents {
if c.IsDir() {
continue
}
// Skip a file named exactly ".csv": filepath.Ext still reports ".csv"
// for it, but it has no usable base name.
if c.Name() == ".csv" {
continue
}
ext := strings.ToLower(filepath.Ext(c.Name()))
if ext != ".csv" {
continue
}
csvFilePaths = append(csvFilePaths, filepath.Join(root, c.Name()))
}
return csvFilePaths, nil
}
// BaseFilesOnly filters out non-base CSV files from the list of CSV files.
func BaseFilesOnly(filenames []string) []string {
filter := map[string]bool{
"l4t.csv": true,
"drivers.csv": true,
"devices.csv": true,
}
var selected []string
for _, file := range filenames {
base := filepath.Base(file)
if filter[base] {
selected = append(selected, file)
}
}
return selected
}
// Parser specifies an interface for parsing MountSpecs
type Parser interface {
Parse() ([]*MountSpec, error)
}
type csv struct {
logger *logrus.Logger
filename string
}
// NewCSVFileParser creates a new parser for reading MountSpecs from the specified CSV file
func NewCSVFileParser(logger *logrus.Logger, filename string) Parser {
p := csv{
logger: logger,
filename: filename,
}
return &p
}
// Parse parses the csv file and returns a list of MountSpecs in the file
func (p csv) Parse() ([]*MountSpec, error) {
reader, err := os.Open(p.filename)
if err != nil {
return nil, fmt.Errorf("failed to open %v for reading: %v", p.filename, err)
}
defer reader.Close()
return p.parseFromReader(reader), nil
}
// parseFromReader parses the contents of the specified reader and returns a list of required Jetson mounts
func (p csv) parseFromReader(reader io.Reader) []*MountSpec {
var targets []*MountSpec
scanner := bufio.NewScanner(reader)
for scanner.Scan() {
line := scanner.Text()
target, err := NewMountSpecFromLine(line)
if err != nil {
p.logger.Debugf("Skipping invalid mount spec '%v': %v", line, err)
continue
}
targets = append(targets, target)
}
return targets
}
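
A self-contained sketch of the line-oriented parsing that parseFromReader performs, with malformed lines skipped rather than failing the parse (stand-in logic, not the real MountSpec validation):

package main

import (
	"bufio"
	"fmt"
	"strings"
)

func main() {
	input := "dev, /dev/nvidia0\nlib, /usr/lib/libcuda.so\nmalformed-line\n"
	scanner := bufio.NewScanner(strings.NewReader(input))
	for scanner.Scan() {
		// Split into at most two fields: a type and a path that may
		// itself contain commas.
		parts := strings.SplitN(strings.TrimSpace(scanner.Text()), ",", 2)
		if len(parts) < 2 {
			fmt.Println("skipping invalid line:", parts[0])
			continue
		}
		fmt.Printf("type=%s path=%s\n", strings.TrimSpace(parts[0]), strings.TrimSpace(parts[1]))
	}
}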


@@ -1,83 +0,0 @@
/**
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package csv
import (
"path/filepath"
"testing"
"github.com/NVIDIA/nvidia-container-toolkit/internal/test"
"github.com/stretchr/testify/require"
)
func TestGetFileList(t *testing.T) {
moduleRoot, _ := test.GetModuleRoot()
testCases := []struct {
description string
root string
files []string
expectedError error
}{
{
description: "returns list of CSV files",
root: "test/input/csv_samples/",
files: []string{
"jetson.csv",
"simple_wrong.csv",
"simple.csv",
"spaced.csv",
},
},
{
description: "handles empty folder",
root: "test/input/csv_samples/empty",
},
{
description: "handles non-existent folder",
root: "test/input/csv_samples/NONEXISTENT",
},
{
description: "handles non-existent folder root",
root: "/NONEXISTENT/test/input/csv_samples/",
},
}
for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
root := filepath.Join(moduleRoot, tc.root)
files, err := GetFileList(root)
if tc.expectedError != nil {
require.Error(t, err)
require.Empty(t, files)
return
}
require.NoError(t, err)
var foundFiles []string
for _, f := range files {
require.Equal(t, root, filepath.Dir(f))
require.Equal(t, ".csv", filepath.Ext(f))
foundFiles = append(foundFiles, filepath.Base(f))
}
require.ElementsMatch(t, tc.files, foundFiles)
})
}
}


@@ -1,74 +0,0 @@
/**
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package csv
import (
"fmt"
"strings"
)
// MountSpecType defines the mount types allowed in a CSV file
type MountSpecType string
const (
// MountSpecDev is used for character devices
MountSpecDev = MountSpecType("dev")
// MountSpecDir is used for directories
MountSpecDir = MountSpecType("dir")
// MountSpecLib is used for libraries or regular files
MountSpecLib = MountSpecType("lib")
// MountSpecSym is used for symlinks.
MountSpecSym = MountSpecType("sym")
)
// MountSpec represents a Jetson mount consisting of a type and a path.
type MountSpec struct {
Type MountSpecType
Path string
}
// NewMountSpecFromLine parses the specified line and returns the MountSpec or an error if the line is malformed
func NewMountSpecFromLine(line string) (*MountSpec, error) {
parts := strings.SplitN(strings.TrimSpace(line), ",", 2)
if len(parts) < 2 {
return nil, fmt.Errorf("failed to parse line: %v", line)
}
mountType := strings.TrimSpace(parts[0])
path := strings.TrimSpace(parts[1])
return NewMountSpec(mountType, path)
}
// NewMountSpec creates a MountSpec with the specified type and path. An error is returned if the type is invalid.
func NewMountSpec(mountType string, path string) (*MountSpec, error) {
mt := MountSpecType(mountType)
switch mt {
case MountSpecDev, MountSpecLib, MountSpecSym, MountSpecDir:
default:
return nil, fmt.Errorf("unexpected mount type: %v", mt)
}
if path == "" {
return nil, fmt.Errorf("invalid path: %v", path)
}
mount := MountSpec{
Type: mt,
Path: path,
}
return &mount, nil
}
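
A usage sketch for the parsing entry point (only compilable from within this module, since the csv package is internal):

package main

import (
	"fmt"

	"github.com/NVIDIA/nvidia-container-toolkit/internal/discover/csv"
)

func main() {
	spec, err := csv.NewMountSpecFromLine("dev, /dev/nvidia0")
	if err != nil {
		panic(err)
	}
	fmt.Println(spec.Type, spec.Path) // dev /dev/nvidia0
}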


@@ -1,82 +0,0 @@
/**
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package csv
import (
"fmt"
"testing"
"github.com/stretchr/testify/require"
)
func TestNewMountSpecFromLine(t *testing.T) {
parseError := fmt.Errorf("failed to parse line")
unexpectedError := fmt.Errorf("unexpected mount type")
testCases := []struct {
line string
expectedError error
expectedValue MountSpec
}{
{
line: "",
expectedError: parseError,
},
{
line: "\t",
expectedError: parseError,
},
{
line: ",",
expectedError: parseError,
},
{
line: "dev,",
expectedError: parseError,
},
{
line: "dev ,/a/path",
expectedValue: MountSpec{
Path: "/a/path",
Type: "dev",
},
},
{
line: "dev ,/a/path,with,commas",
expectedValue: MountSpec{
Path: "/a/path,with,commas",
Type: "dev",
},
},
{
line: "not-dev ,/a/path",
expectedError: unexpectedError,
},
}
for i, tc := range testCases {
t.Run(fmt.Sprintf("test case %d", i), func(t *testing.T) {
target, err := NewMountSpecFromLine(tc.line)
if tc.expectedError != nil {
require.Error(t, err)
return
}
require.NoError(t, err)
require.EqualValues(t, &tc.expectedValue, target)
})
}
}


@@ -1,142 +0,0 @@
/**
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package discover
import (
"fmt"
"testing"
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover/csv"
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
testlog "github.com/sirupsen/logrus/hooks/test"
"github.com/stretchr/testify/require"
)
func TestNewFromMountSpec(t *testing.T) {
logger, _ := testlog.NewNullLogger()
locators := map[csv.MountSpecType]lookup.Locator{
"dev": &lookup.LocatorMock{},
"lib": &lookup.LocatorMock{},
}
testCases := []struct {
description string
root string
targets []*csv.MountSpec
expectedError error
expectedDiscoverer Discover
}{
{
description: "empty targets returns None discoverer list",
expectedDiscoverer: &None{},
},
{
description: "unexpected locator returns error",
targets: []*csv.MountSpec{
{
Type: "foo",
Path: "bar",
},
},
expectedError: fmt.Errorf("no locator defined for foo"),
},
{
description: "creates discoverers based on type",
targets: []*csv.MountSpec{
{
Type: "dev",
Path: "dev0",
},
{
Type: "lib",
Path: "lib0",
},
{
Type: "dev",
Path: "dev1",
},
},
expectedDiscoverer: &list{
discoverers: []Discover{
(*charDevices)(
&mounts{
logger: logger,
lookup: locators["dev"],
root: "/",
required: []string{"dev0", "dev1"},
},
),
&mounts{
logger: logger,
lookup: locators["lib"],
root: "/",
required: []string{"lib0"},
},
},
},
},
{
description: "sets root",
targets: []*csv.MountSpec{
{
Type: "dev",
Path: "dev0",
},
{
Type: "lib",
Path: "lib0",
},
{
Type: "dev",
Path: "dev1",
},
},
root: "/some/root",
expectedDiscoverer: &list{
discoverers: []Discover{
(*charDevices)(
&mounts{
logger: logger,
lookup: locators["dev"],
root: "/some/root",
required: []string{"dev0", "dev1"},
},
),
&mounts{
logger: logger,
lookup: locators["lib"],
root: "/some/root",
required: []string{"lib0"},
},
},
},
},
}
for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
discoverer, err := newFromMountSpecs(logger, locators, tc.root, tc.targets)
if tc.expectedError != nil {
require.Error(t, err)
return
}
require.NoError(t, err)
require.EqualValues(t, tc.expectedDiscoverer, discoverer)
})
}
}


@@ -1,5 +1,5 @@
/*
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -16,33 +16,48 @@
package discover
// Config represents the configuration options for discovery
type Config struct {
Root string
NVIDIAContainerToolkitCLIExecutablePath string
// DevicePath is a path in /dev associated with a device
type DevicePath string
// ProcPath is a path in /proc associated with a device
type ProcPath string
// PCIBusID is the ID on the PCI bus of a device
type PCIBusID string
// DeviceNode represents a device on the file system
type DeviceNode struct {
Path DevicePath
Major int
Minor int
}
// Device represents a discovered character device.
// Device represents a discovered device including identifiers (Index, UUID, PCI bus ID)
// for selection and paths in /dev and /proc associated with the device
type Device struct {
HostPath string
Path string
Index string
UUID string
PCIBusID PCIBusID
DeviceNodes []DeviceNode
ProcPaths []ProcPath
}
// Mount represents a discovered mount.
// Mount represents a discovered mount. This includes a set of labels
// for selection and the mount path
type Mount struct {
HostPath string
Path string
Path string
Labels map[string]string
}
// Hook represents a discovered hook.
// Hook represents a discovered hook
type Hook struct {
Lifecycle string
Path string
Args []string
Path string
Args []string
HookName string
Labels map[string]string
}
//go:generate moq -stub -out discover_mock.go . Discover
// Discover defines an interface for discovering the devices, mounts, and hooks available on a system
// Discover defines an interface for discovering the devices and mounts available on a system
type Discover interface {
Devices() ([]Device, error)
Mounts() ([]Mount, error)
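
The comment on Mount says the labels are intended for selection; a hedged sketch of what label-based selection could look like (the Mount type below is a stand-in mirroring the diff, and selectByLabel is a hypothetical helper, not part of the package):

package main

import "fmt"

// Mount mirrors the shape added in this diff; the real type lives in the
// internal discover package.
type Mount struct {
	HostPath string
	Path     string
	Labels   map[string]string
}

// selectByLabel illustrates one way the labels could drive selection.
func selectByLabel(mounts []Mount, key, value string) []Mount {
	var selected []Mount
	for _, m := range mounts {
		if m.Labels[key] == value {
			selected = append(selected, m)
		}
	}
	return selected
}

func main() {
	mounts := []Mount{
		{Path: "/usr/bin/nvidia-smi", Labels: map[string]string{"class": "utility"}},
		{Path: "/usr/bin/nvidia-cuda-mps-control", Labels: map[string]string{"class": "compute"}},
	}
	fmt.Println(selectByLabel(mounts, "class", "utility"))
}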


@@ -1,150 +0,0 @@
// Code generated by moq; DO NOT EDIT.
// github.com/matryer/moq
package discover
import (
"sync"
)
// Ensure, that DiscoverMock does implement Discover.
// If this is not the case, regenerate this file with moq.
var _ Discover = &DiscoverMock{}
// DiscoverMock is a mock implementation of Discover.
//
// func TestSomethingThatUsesDiscover(t *testing.T) {
//
// // make and configure a mocked Discover
// mockedDiscover := &DiscoverMock{
// DevicesFunc: func() ([]Device, error) {
// panic("mock out the Devices method")
// },
// HooksFunc: func() ([]Hook, error) {
// panic("mock out the Hooks method")
// },
// MountsFunc: func() ([]Mount, error) {
// panic("mock out the Mounts method")
// },
// }
//
// // use mockedDiscover in code that requires Discover
// // and then make assertions.
//
// }
type DiscoverMock struct {
// DevicesFunc mocks the Devices method.
DevicesFunc func() ([]Device, error)
// HooksFunc mocks the Hooks method.
HooksFunc func() ([]Hook, error)
// MountsFunc mocks the Mounts method.
MountsFunc func() ([]Mount, error)
// calls tracks calls to the methods.
calls struct {
// Devices holds details about calls to the Devices method.
Devices []struct {
}
// Hooks holds details about calls to the Hooks method.
Hooks []struct {
}
// Mounts holds details about calls to the Mounts method.
Mounts []struct {
}
}
lockDevices sync.RWMutex
lockHooks sync.RWMutex
lockMounts sync.RWMutex
}
// Devices calls DevicesFunc.
func (mock *DiscoverMock) Devices() ([]Device, error) {
callInfo := struct {
}{}
mock.lockDevices.Lock()
mock.calls.Devices = append(mock.calls.Devices, callInfo)
mock.lockDevices.Unlock()
if mock.DevicesFunc == nil {
var (
devicesOut []Device
errOut error
)
return devicesOut, errOut
}
return mock.DevicesFunc()
}
// DevicesCalls gets all the calls that were made to Devices.
// Check the length with:
// len(mockedDiscover.DevicesCalls())
func (mock *DiscoverMock) DevicesCalls() []struct {
} {
var calls []struct {
}
mock.lockDevices.RLock()
calls = mock.calls.Devices
mock.lockDevices.RUnlock()
return calls
}
// Hooks calls HooksFunc.
func (mock *DiscoverMock) Hooks() ([]Hook, error) {
callInfo := struct {
}{}
mock.lockHooks.Lock()
mock.calls.Hooks = append(mock.calls.Hooks, callInfo)
mock.lockHooks.Unlock()
if mock.HooksFunc == nil {
var (
hooksOut []Hook
errOut error
)
return hooksOut, errOut
}
return mock.HooksFunc()
}
// HooksCalls gets all the calls that were made to Hooks.
// Check the length with:
// len(mockedDiscover.HooksCalls())
func (mock *DiscoverMock) HooksCalls() []struct {
} {
var calls []struct {
}
mock.lockHooks.RLock()
calls = mock.calls.Hooks
mock.lockHooks.RUnlock()
return calls
}
// Mounts calls MountsFunc.
func (mock *DiscoverMock) Mounts() ([]Mount, error) {
callInfo := struct {
}{}
mock.lockMounts.Lock()
mock.calls.Mounts = append(mock.calls.Mounts, callInfo)
mock.lockMounts.Unlock()
if mock.MountsFunc == nil {
var (
mountsOut []Mount
errOut error
)
return mountsOut, errOut
}
return mock.MountsFunc()
}
// MountsCalls gets all the calls that were made to Mounts.
// Check the length with:
// len(mockedDiscover.MountsCalls())
func (mock *DiscoverMock) MountsCalls() []struct {
} {
var calls []struct {
}
mock.lockMounts.RLock()
calls = mock.calls.Mounts
mock.lockMounts.RUnlock()
return calls
}
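
Before its removal in this diff, the generated mock could be configured per-method in tests; methods left nil fall back to the moq -stub zero-value behavior shown above. A sketch:

package discover

import "testing"

func TestUsesDiscoverMock(t *testing.T) {
	mock := &DiscoverMock{
		DevicesFunc: func() ([]Device, error) {
			return []Device{{}}, nil
		},
		// HooksFunc and MountsFunc are left nil and return zero values.
	}
	devices, err := mock.Devices()
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(devices) != 1 || len(mock.DevicesCalls()) != 1 {
		t.Fatalf("unexpected devices or call count")
	}
}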


@@ -0,0 +1,424 @@
/*
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
package discover
import (
"fmt"
"path/filepath"
"strings"
"github.com/NVIDIA/nvidia-container-toolkit/internal/nvml"
"github.com/NVIDIA/nvidia-container-toolkit/internal/proc"
log "github.com/sirupsen/logrus"
)
const (
// ControlDeviceUUID is used as the UUID for control devices such as nvidiactl or nvidia-modeset
ControlDeviceUUID = "CONTROL"
// MIGConfigDeviceUUID is used to indicate the MIG config control device
MIGConfigDeviceUUID = "CONFIG"
// MIGMonitorDeviceUUID is used to indicate the MIG monitor control device
MIGMonitorDeviceUUID = "MONITOR"
nvidiaGPUDeviceName = "nvidia-frontend"
nvidiaCapsDeviceName = "nvidia-caps"
nvidiaUVMDeviceName = "nvidia-uvm"
)
type nvmlDiscover struct {
None
logger *log.Logger
nvml nvml.Interface
migCaps map[ProcPath]DeviceNode
nvidiaDevices proc.NvidiaDevices
}
var _ Discover = (*nvmlDiscover)(nil)
// NewNVMLDiscover constructs a discoverer that uses NVML to find the devices
// available on a system.
func NewNVMLDiscover(nvml nvml.Interface) (Discover, error) {
return NewNVMLDiscoverWithLogger(log.StandardLogger(), nvml)
}
// NewNVMLDiscoverWithLogger constructs a discoverer as with NewNVMLDiscover using the specified
// logger
func NewNVMLDiscoverWithLogger(logger *log.Logger, nvml nvml.Interface) (Discover, error) {
nvidiaDevices, err := proc.GetNvidiaDevices()
if err != nil {
return nil, fmt.Errorf("error loading NVIDIA devices: %v", err)
}
var migCaps map[ProcPath]DeviceNode
nvcapsDevice, exists := nvidiaDevices.Get(nvidiaCapsDeviceName)
if !exists {
logger.Warnf("%v nvcaps device could not be found", nvidiaCapsDeviceName)
} else if migCaps, err = getMigCaps(nvcapsDevice.Major); err != nil {
logger.Warnf("Could not load MIG capability devices: %v", err)
migCaps = nil
}
discover := &nvmlDiscover{
logger: logger,
nvml: nvml,
migCaps: migCaps,
nvidiaDevices: nvidiaDevices,
}
return discover, nil
}
// hasMigSupport checks if MIG device discovery is supported.
// Cases where this is disabled include systems where no MIG minors file is
// present.
func (d nvmlDiscover) hasMigSupport() bool {
return len(d.migCaps) > 0
}
func (d *nvmlDiscover) Devices() ([]Device, error) {
ret := d.nvml.Init()
if ret.Value() != nvml.SUCCESS {
return nil, fmt.Errorf("error initializing NVML: %v", ret.Error())
}
defer d.tryShutdownNVML()
c, ret := d.nvml.DeviceGetCount()
if ret.Value() != nvml.SUCCESS {
return nil, fmt.Errorf("error getting device count: %v", ret.Error())
}
var handles []nvml.Device
for i := 0; i < c; i++ {
handle, ret := d.nvml.DeviceGetHandleByIndex(i)
if ret.Value() != nvml.SUCCESS {
return nil, fmt.Errorf("error getting device handle for device %v: %v", i, ret.Error())
}
if !d.hasMigSupport() {
handles = append(handles, handle)
continue
}
migHandles, err := getMIGHandlesForDevice(handle)
if err != nil {
return nil, fmt.Errorf("error getting MIG handles for device: %v", err)
}
if len(migHandles) == 0 {
handles = append(handles, handle)
}
handles = append(handles, migHandles...)
}
return d.devicesByHandle(handles)
}
func (d *nvmlDiscover) devicesByHandle(handles []nvml.Device) ([]Device, error) {
var devices []Device
var largestMinorNumber int
for _, h := range handles {
device, err := d.deviceFromNVMLHandle(h)
if err != nil {
return nil, fmt.Errorf("error constructing device from handle %v: %v", h, err)
}
devices = append(devices, device)
if largestMinorNumber < device.DeviceNodes[0].Minor {
largestMinorNumber = device.DeviceNodes[0].Minor
}
}
controlDevices, err := d.getControlDevices()
if err != nil {
return nil, fmt.Errorf("error getting control devices: %v", err)
}
devices = append(devices, controlDevices...)
if d.hasMigSupport() {
migControlDevices, err := d.getMigControlDevices(largestMinorNumber)
if err != nil {
return nil, fmt.Errorf("error getting MIG control devices: %v", err)
}
devices = append(devices, migControlDevices...)
}
return devices, nil
}
func (d *nvmlDiscover) deviceFromNVMLHandle(handle nvml.Device) (Device, error) {
if d.hasMigSupport() {
isMigDevice, ret := handle.IsMigDeviceHandle()
if ret.Value() != nvml.SUCCESS {
return Device{}, fmt.Errorf("error checking device handle: %v", ret.Error())
}
if isMigDevice {
return d.deviceFromMIGDeviceHandle(handle)
}
}
return d.deviceFromFullDeviceHandle(handle)
}
func (d *nvmlDiscover) deviceFromFullDeviceHandle(handle nvml.Device) (Device, error) {
index, ret := handle.GetIndex()
if ret.Value() != nvml.SUCCESS {
return Device{}, fmt.Errorf("error getting device index: %v", ret.Error())
}
uuid, ret := handle.GetUUID()
if ret.Value() != nvml.SUCCESS {
return Device{}, fmt.Errorf("error getting device UUID: %v", ret.Error())
}
pciInfo, ret := handle.GetPciInfo()
if ret.Value() != nvml.SUCCESS {
return Device{}, fmt.Errorf("error getting PCI info: %v", ret.Error())
}
busID := NewPCIBusID(pciInfo)
minor, ret := handle.GetMinorNumber()
if ret.Value() != nvml.SUCCESS {
return Device{}, fmt.Errorf("error getting minor number: %v", ret.Error())
}
nvidiaGPUDevice, exists := d.nvidiaDevices.Get(nvidiaGPUDeviceName)
if !exists {
return Device{}, fmt.Errorf("device for '%v' does not exist", nvidiaGPUDeviceName)
}
deviceNode := DeviceNode{
Path: DevicePath(fmt.Sprintf("/dev/nvidia%d", minor)),
Major: nvidiaGPUDevice.Major,
Minor: minor,
}
device := Device{
Index: fmt.Sprintf("%d", index),
PCIBusID: busID,
UUID: uuid,
DeviceNodes: []DeviceNode{deviceNode},
ProcPaths: []ProcPath{busID.GetProcPath()},
}
return device, nil
}
func (d *nvmlDiscover) deviceFromMIGDeviceHandle(handle nvml.Device) (Device, error) {
parent, ret := handle.GetDeviceHandleFromMigDeviceHandle()
if ret.Value() != nvml.SUCCESS {
return Device{}, fmt.Errorf("error getting parent device handle: %v", ret.Error())
}
gpu, ret := parent.GetMinorNumber()
if ret.Value() != nvml.SUCCESS {
return Device{}, fmt.Errorf("error getting GPU minor number: %v", ret.Error())
}
parentDevice, err := d.deviceFromFullDeviceHandle(parent)
if err != nil {
return Device{}, fmt.Errorf("error getting parent device: %v", err)
}
index, ret := handle.GetIndex()
if ret.Value() != nvml.SUCCESS {
return Device{}, fmt.Errorf("error getting device index: %v", ret.Error())
}
uuid, ret := handle.GetUUID()
if ret.Value() != nvml.SUCCESS {
return Device{}, fmt.Errorf("error getting device UUID: %v", ret.Error())
}
capDeviceNodes := []DeviceNode{}
procPaths, err := getProcPathsForMigDevice(gpu, handle)
if err != nil {
return Device{}, fmt.Errorf("error getting proc paths for MIG device: %v", err)
}
for _, p := range procPaths {
capDeviceNode, ok := d.migCaps[p]
if !ok {
return Device{}, fmt.Errorf("could not determine cap device path for %v", p)
}
capDeviceNodes = append(capDeviceNodes, capDeviceNode)
}
device := Device{
Index: fmt.Sprintf("%s:%d", parentDevice.Index, index),
UUID: uuid,
DeviceNodes: append(parentDevice.DeviceNodes, capDeviceNodes...),
ProcPaths: append(parentDevice.ProcPaths, procPaths...),
}
return device, nil
}
func (d *nvmlDiscover) getControlDevices() ([]Device, error) {
devices := []struct {
name string
path string
minor int
}{
// TODO: Where is the best place to find these device minors programmatically?
{nvidiaGPUDeviceName, "/dev/nvidia-modeset", 254},
{nvidiaGPUDeviceName, "/dev/nvidiactl", 255},
{nvidiaUVMDeviceName, "/dev/nvidia-uvm", 0},
{nvidiaUVMDeviceName, "/dev/nvidia-uvm-tools", 1},
}
var controlDevices []Device
for _, info := range devices {
device, exists := d.nvidiaDevices.Get(info.name)
if !exists {
d.logger.Warnf("device name %v not defined; skipping control device %v", info.name, info.path)
continue
}
deviceNode := DeviceNode{
Path: DevicePath(info.path),
Major: device.Major,
Minor: info.minor,
}
controlDevices = append(controlDevices, Device{
UUID: ControlDeviceUUID,
DeviceNodes: []DeviceNode{deviceNode},
ProcPaths: []ProcPath{},
})
}
return controlDevices, nil
}
func (d *nvmlDiscover) getMigControlDevices(numGpus int) ([]Device, error) {
targets := map[string]ProcPath{
MIGConfigDeviceUUID: ProcPath("/proc/driver/nvidia/capabilities/mig/config"),
MIGMonitorDeviceUUID: ProcPath("/proc/driver/nvidia/capabilities/mig/monitor"),
}
var devices []Device
for id, procPath := range targets {
deviceNode, exists := d.migCaps[procPath]
if !exists {
return nil, fmt.Errorf("device node for '%v' is undefined", procPath)
}
var procPaths []ProcPath
for gpu := 0; gpu <= numGpus; gpu++ {
procPaths = append(procPaths, ProcPath(fmt.Sprintf("/proc/driver/nvidia/capabilities/gpu%d/mig", gpu)))
}
procPaths = append(procPaths, procPath)
devices = append(devices, Device{
UUID: id,
DeviceNodes: []DeviceNode{deviceNode},
ProcPaths: procPaths,
})
}
return devices, nil
}
func getProcPathsForMigDevice(gpu int, handle nvml.Device) ([]ProcPath, error) {
gi, ret := handle.GetGPUInstanceId()
if ret.Value() != nvml.SUCCESS {
return nil, fmt.Errorf("error getting GPU instance ID: %v", ret.Error())
}
ci, ret := handle.GetComputeInstanceId()
if ret.Value() != nvml.SUCCESS {
return nil, fmt.Errorf("error getting compute instance ID: %v", ret.Error())
}
procPaths := []ProcPath{
ProcPath(fmt.Sprintf("/proc/driver/nvidia/capabilities/gpu%d/mig/gi%d/access", gpu, gi)),
ProcPath(fmt.Sprintf("/proc/driver/nvidia/capabilities/gpu%d/mig/gi%d/ci%d/access", gpu, gi, ci)),
}
return procPaths, nil
}
func getMIGHandlesForDevice(handle nvml.Device) ([]nvml.Device, error) {
currentMigMode, _, ret := handle.GetMigMode()
if ret.Value() == nvml.ERROR_NOT_SUPPORTED {
return nil, nil
}
if ret.Value() != nvml.SUCCESS {
return nil, fmt.Errorf("error getting MIG mode for device: %v", ret.Error())
}
if currentMigMode == nvml.DEVICE_MIG_DISABLE {
return nil, nil
}
maxMigDeviceCount, ret := handle.GetMaxMigDeviceCount()
if ret.Value() != nvml.SUCCESS {
return nil, fmt.Errorf("error getting number of MIG devices: %v", ret.Error())
}
var migHandles []nvml.Device
for mi := 0; mi < maxMigDeviceCount; mi++ {
migHandle, ret := handle.GetMigDeviceHandleByIndex(mi)
if ret.Value() == nvml.ERROR_NOT_FOUND {
continue
}
if ret.Value() != nvml.SUCCESS {
return nil, fmt.Errorf("error getting MIG device %v: %v", mi, ret.Error())
}
migHandles = append(migHandles, migHandle)
}
return migHandles, nil
}
func (d *nvmlDiscover) tryShutdownNVML() {
ret := d.nvml.Shutdown()
if ret.Value() != nvml.SUCCESS {
d.logger.Warnf("Could not shut down NVML: %v", ret.Error())
}
}
// NewPCIBusID constructs a PCIBusID from the bus ID in the specified PCI
// info, keeping only the bytes before the first NUL byte.
func NewPCIBusID(p nvml.PciInfo) PCIBusID {
var bytes []byte
for _, b := range p.BusId {
if byte(b) == '\x00' {
break
}
bytes = append(bytes, byte(b))
}
return PCIBusID(string(bytes))
}
// GetProcPath returns the path in /proc associated with the PCI bus ID
func (p PCIBusID) GetProcPath() ProcPath {
id := strings.ToLower(p.String())
if strings.HasPrefix(id, "0000") {
id = id[4:]
}
return ProcPath(filepath.Join("/proc/driver/nvidia/gpus", id))
}
func (p PCIBusID) String() string {
return string(p)
}
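
The bus-ID-to-proc-path mapping can be checked in isolation; a self-contained sketch mirroring GetProcPath (procPathForBusID is a stand-in name):

package main

import (
	"fmt"
	"path/filepath"
	"strings"
)

// procPathForBusID mirrors PCIBusID.GetProcPath above: lower-case the ID
// and drop a leading "0000" domain before joining with the driver root.
func procPathForBusID(busID string) string {
	id := strings.ToLower(busID)
	id = strings.TrimPrefix(id, "0000")
	return filepath.Join("/proc/driver/nvidia/gpus", id)
}

func main() {
	fmt.Println(procPathForBusID("0000FFFF:FF:FF.F"))
	// Output: /proc/driver/nvidia/gpus/ffff:ff:ff.f
}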


@@ -0,0 +1,44 @@
/*
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
package discover
import (
"fmt"
"github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps"
)
// getMigCaps returns a mapping of MIG capability paths to device nodes
func getMigCaps(capDeviceMajor int) (map[ProcPath]DeviceNode, error) {
migCaps, err := nvcaps.LoadMigMinors()
if err != nil {
return nil, fmt.Errorf("error loading MIG minors: %v", err)
}
return getMigCapsFromMigMinors(migCaps, capDeviceMajor), nil
}
func getMigCapsFromMigMinors(migCaps map[nvcaps.MigCap]nvcaps.MigMinor, capDeviceMajor int) map[ProcPath]DeviceNode {
capsDevicePaths := make(map[ProcPath]DeviceNode)
for cap, minor := range migCaps {
capsDevicePaths[ProcPath(cap.ProcPath())] = DeviceNode{
Path: DevicePath(minor.DevicePath()),
Major: capDeviceMajor,
Minor: int(minor),
}
}
return capsDevicePaths
}
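
A self-contained sketch of the mapping this helper builds, using stand-in types for nvcaps.MigCap and nvcaps.MigMinor and the 235 major number that appears in the adjacent tests:

package main

import "fmt"

// migMinor stands in for nvcaps.MigMinor; its device path follows the
// /dev/nvidia-caps/nvidia-cap<minor> convention seen in the tests.
type migMinor int

func (m migMinor) devicePath() string {
	return fmt.Sprintf("/dev/nvidia-caps/nvidia-cap%d", int(m))
}

func main() {
	const capsMajor = 235 // nvidia-caps major used in the tests; an assumption here
	caps := map[string]migMinor{"config": 1, "monitor": 2}
	for capName, minor := range caps {
		fmt.Printf("%s -> %s (major=%d, minor=%d)\n", capName, minor.devicePath(), capsMajor, int(minor))
	}
}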


@@ -0,0 +1,41 @@
/*
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
package discover
import (
"testing"
"github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps"
"github.com/stretchr/testify/require"
)
func TestGetMigCaps(t *testing.T) {
migMinors := map[nvcaps.MigCap]nvcaps.MigMinor{
"config": 1,
"monitor": 2,
"gpu0/gi0/access": 3,
"gpu0/gi0/ci0/access": 4,
}
migCapMajor := 999
migCaps := getMigCapsFromMigMinors(migMinors, migCapMajor)
require.Len(t, migCaps, len(migMinors))
for _, c := range migCaps {
require.Equal(t, migCapMajor, c.Major)
}
}


@@ -0,0 +1,219 @@
/*
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
package discover
import (
"testing"
"github.com/NVIDIA/nvidia-container-toolkit/internal/nvml"
"github.com/NVIDIA/nvidia-container-toolkit/internal/proc"
testlog "github.com/sirupsen/logrus/hooks/test"
"github.com/stretchr/testify/require"
)
const (
nvidiaGPUDeviceMajorDefault = 195
nvidiaCapsDeviceMajorDefault = 235
)
func newTestDiscover() nvmlDiscover {
logger, _ := testlog.NewNullLogger()
nvml := nvml.NewMockNVMLServer(nvml.NewMockA100Device(0))
return nvmlDiscover{
logger: logger,
nvml: nvml,
nvidiaDevices: proc.NewMockNvidiaDevices(
proc.Device{Name: nvidiaGPUDeviceName, Major: nvidiaGPUDeviceMajorDefault},
),
}
}
func newTestDiscoverWithMIG() nvmlDiscover {
device := &nvml.MockA100Device{
Index: 0,
MigMode: nvml.DEVICE_MIG_ENABLE,
GpuInstances: make(map[*nvml.MockA100GpuInstance]struct{}),
GpuInstanceCounter: 0,
}
// Create a single gi and ci on the device
gpuInstanceProfileInfo := &nvml.GpuInstanceProfileInfo{
Id: nvml.GPU_INSTANCE_PROFILE_7_SLICE,
}
gi, _ := device.CreateGpuInstance(gpuInstanceProfileInfo)
computeInstanceProfileInfo := &nvml.ComputeInstanceProfileInfo{
Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE,
}
_, _ = gi.CreateComputeInstance(computeInstanceProfileInfo)
logger, _ := testlog.NewNullLogger()
return nvmlDiscover{
logger: logger,
nvml: nvml.NewMockNVMLServer(device),
migCaps: map[ProcPath]DeviceNode{
ProcPath("/proc/driver/nvidia/capabilities/gpu0/mig/gi0/access"): {
Path: DevicePath("/dev/nvidia-caps/nvidia-cap3"),
Minor: 3,
Major: 235,
},
ProcPath("/proc/driver/nvidia/capabilities/gpu0/mig/gi0/ci0/access"): {
Path: DevicePath("/dev/nvidia-caps/nvidia-cap4"),
Minor: 4,
Major: 235,
},
ProcPath("/proc/driver/nvidia/capabilities/mig/config"): {
Path: DevicePath("/dev/nvidia-caps/nvidia-cap1"),
Minor: 1,
Major: 235,
},
ProcPath("/proc/driver/nvidia/capabilities/mig/monitor"): {
Path: DevicePath("/dev/nvidia-caps/nvidia-cap2"),
Minor: 2,
Major: 235,
},
},
nvidiaDevices: proc.NewMockNvidiaDevices(
proc.Device{Name: nvidiaGPUDeviceName, Major: nvidiaGPUDeviceMajorDefault},
proc.Device{Name: nvidiaCapsDeviceName, Major: nvidiaCapsDeviceMajorDefault},
),
}
}
func TestDiscoverNvmlDevices(t *testing.T) {
d := newTestDiscover()
devices, err := d.Devices()
require.NoError(t, err)
require.Len(t, devices, 3)
device := devices[0]
require.Equal(t, "0", device.Index)
require.Equal(t, "GPU-0", device.UUID)
require.Equal(t, "0000FFFF:FF:FF.F", device.PCIBusID.String())
expectedDeviceNodes := []DeviceNode{
{
Path: DevicePath("/dev/nvidia0"),
Minor: 0,
Major: nvidiaGPUDeviceMajorDefault,
},
}
require.Equal(t, expectedDeviceNodes, device.DeviceNodes)
expectedProcPaths := []ProcPath{ProcPath("/proc/driver/nvidia/gpus/ffff:ff:ff.f")}
require.Equal(t, expectedProcPaths, device.ProcPaths)
}
func TestDiscoverNvmlMigDevices(t *testing.T) {
d := newTestDiscoverWithMIG()
devices, err := d.Devices()
require.NoError(t, err)
require.Len(t, devices, 5)
mig := devices[0]
require.Equal(t, "0:0", mig.Index)
require.Equal(t, "MIG-0", mig.UUID)
require.Empty(t, mig.PCIBusID)
expectedDeviceNodes := []DeviceNode{
{
Path: DevicePath("/dev/nvidia0"),
Minor: 0,
Major: nvidiaGPUDeviceMajorDefault,
},
{
Path: DevicePath("/dev/nvidia-caps/nvidia-cap3"),
Minor: 3,
Major: nvidiaCapsDeviceMajorDefault,
},
{
Path: DevicePath("/dev/nvidia-caps/nvidia-cap4"),
Minor: 4,
Major: nvidiaCapsDeviceMajorDefault,
},
}
require.Equal(t, expectedDeviceNodes, mig.DeviceNodes)
expectedProcPaths := []ProcPath{
ProcPath("/proc/driver/nvidia/gpus/ffff:ff:ff.f"),
ProcPath("/proc/driver/nvidia/capabilities/gpu0/mig/gi0/access"),
ProcPath("/proc/driver/nvidia/capabilities/gpu0/mig/gi0/ci0/access"),
}
require.Equal(t, expectedProcPaths, mig.ProcPaths)
var config *Device
var monitor *Device
for i, d := range devices {
if d.UUID == "CONFIG" {
config = &devices[i]
}
if d.UUID == "MONITOR" {
monitor = &devices[i]
}
}
require.NotNil(t, config)
require.NotNil(t, monitor)
require.Equal(t, "CONFIG", config.UUID)
expectedDeviceNodes = []DeviceNode{
{
Path: DevicePath("/dev/nvidia-caps/nvidia-cap1"),
Minor: 1,
Major: nvidiaCapsDeviceMajorDefault,
},
}
require.Equal(t, expectedDeviceNodes, config.DeviceNodes)
require.Contains(t, config.ProcPaths, ProcPath("/proc/driver/nvidia/capabilities/mig/config"))
require.Len(t, config.ProcPaths, 2)
require.Equal(t, "MONITOR", monitor.UUID)
expectedDeviceNodes = []DeviceNode{
{
Path: DevicePath("/dev/nvidia-caps/nvidia-cap2"),
Minor: 2,
Major: nvidiaCapsDeviceMajorDefault,
},
}
require.Equal(t, expectedDeviceNodes, monitor.DeviceNodes)
require.Contains(t, monitor.ProcPaths, ProcPath("/proc/driver/nvidia/capabilities/mig/monitor"))
require.Len(t, monitor.ProcPaths, 2)
}
func TestPCIBusID(t *testing.T) {
testCases := map[string]ProcPath{
"0000FFFF:FF:FF.F": "/proc/driver/nvidia/gpus/ffff:ff:ff.f",
"FFFFFFFF:FF:FF.F": "/proc/driver/nvidia/gpus/ffffffff:ff:ff.f",
}
for busID, procPath := range testCases {
p := PCIBusID(busID)
require.Equal(t, busID, p.String())
require.Equal(t, procPath, p.GetProcPath())
}
}

Some files were not shown because too many files have changed in this diff.