Compare commits

..

74 Commits

Author SHA1 Message Date
Evan Lezar
28b70663f1 Merge branch 'skip-for-point-release' into 'main'
Skip components for patch releases

See merge request nvidia/container-toolkit/container-toolkit!374
2023-04-24 12:12:36 +00:00
Evan Lezar
c0fe8f27eb Skip components for patch releases
This change ensures that the nvidia-docker2 and nvidia-container-runtime
components are not build and distributed for patch releases.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-04-24 14:00:10 +02:00
Evan Lezar
926ac77bc0 Merge branch 'fix-cdi-spec-generation-on-debian' into 'main'
Resolve all symlinks when finding libraries in LDCache

See merge request nvidia/container-toolkit/container-toolkit!370
2023-04-24 10:09:37 +00:00
Evan Lezar
fc7c8f7520 Resolve all symlinks in ldcache
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-04-21 17:28:49 +02:00
Evan Lezar
46c1c45d85 Add /usr/lib/current to search path
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-04-21 11:47:42 +02:00
Evan Lezar
f99e863649 Merge branch 'CNT-4142/xorg-missing-not-fatal' into 'main'
Make discovery of Xorg libraries optional

See merge request nvidia/container-toolkit/container-toolkit!368
2023-04-21 09:47:10 +00:00
Evan Lezar
dcc21ece97 Merge branch 'add-debug-output' into 'main'
Fix target folder for kitmaker

See merge request nvidia/container-toolkit/container-toolkit!373
2023-04-20 18:38:05 +00:00
Evan Lezar
a53e3604a6 Fix target folder for kitmaker
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-04-20 20:37:46 +02:00
Evan Lezar
cfea6c1179 Merge branch 'add-debug-output' into 'main'
Properly create target folder for kitmaker

See merge request nvidia/container-toolkit/container-toolkit!372
2023-04-20 14:53:57 +00:00
Evan Lezar
4d1daa0b6c Properly create target folder for kitmaker
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-04-20 16:53:32 +02:00
Evan Lezar
df925bc7fd Merge branch 'include-libnvidia-container-in-kitmaker' into 'main'
Add a workaround to publish libnvidia-container0

See merge request nvidia/container-toolkit/container-toolkit!371
2023-04-20 08:29:02 +00:00
Evan Lezar
df22e37dfd Add a workaround to publish libnvidia-container0
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-04-18 23:23:36 +02:00
Evan Lezar
2136266d1d Make discovery of Xorg libraries optional
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-04-13 18:41:38 +02:00
Evan Lezar
a95232dd33 Merge branch 'CNT-4144/non-ldcache' into 'main'
Only update ldcache if it exists

See merge request nvidia/container-toolkit/container-toolkit!369
2023-04-13 16:12:22 +00:00
Evan Lezar
29c6288128 Only update ldcache if it exists
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-04-13 17:18:09 +02:00
Evan Lezar
cd6fcb5297 Merge branch 'bump-version-1.13.1' into 'main'
Bump version to 1.13.1

See merge request nvidia/container-toolkit/container-toolkit!367
2023-04-13 11:12:37 +00:00
Evan Lezar
36989deff7 Update libnvidia-container
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-04-13 13:11:39 +02:00
Evan Lezar
7f6c9851fe Bump version to 1.13.1
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-04-13 13:11:09 +02:00
Evan Lezar
b7079454b5 Merge branch 'bump-versions' into 'main'
Bump nvidia-docker and nvidia-container-runtime versions

See merge request nvidia/container-toolkit/container-toolkit!366
2023-03-31 13:00:58 +00:00
Evan Lezar
448bd45ab4 Bump nvidia-docker and nvidia-container-runtime versions
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-03-31 15:00:15 +02:00
Evan Lezar
dde6170df1 Merge branch 'bump-version-v1.13.0' into 'main'
Bump version to v1.13.0

See merge request nvidia/container-toolkit/container-toolkit!365
2023-03-31 10:58:18 +00:00
Evan Lezar
e4b9350e65 Update libnvidia-container to v1.13.0
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-03-31 11:38:01 +02:00
Evan Lezar
622a0649ce Bump version to v1.13.0
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-03-31 11:35:53 +02:00
Evan Lezar
f6983969ad Merge branch 'nvidia-ctk-cdi-transform' into 'main'
Add 'target-driver-root' option to 'nvidia-ctk cdi generate' to transform root...

See merge request nvidia/container-toolkit/container-toolkit!363
2023-03-28 20:05:12 +00:00
Evan Lezar
7f7fc35843 Move input and output to transform root subcommand
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-03-28 21:12:48 +02:00
Evan Lezar
8eef7e5406 Merge branch 'add-runtimes' into 'main'
Add nvidia-container-runtime.runtimes config option

See merge request nvidia/container-toolkit/container-toolkit!364
2023-03-28 18:58:46 +00:00
Evan Lezar
f27c33b45f Remove target-driver-root from generate
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-03-28 11:49:45 -07:00
Evan Lezar
6a83e2ebe5 Add nvidia-ctk cdi transform root command
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-03-28 11:45:58 -07:00
Christopher Desiniotis
ee5be5e3f2 Merge branch 'CNT-4056/add-cdi-annotations' into 'main'
Add nvidia-container-runtime.modes.cdi.annotation-prefixes config option.

See merge request nvidia/container-toolkit/container-toolkit!356
2023-03-28 16:47:51 +00:00
Evan Lezar
be0cc9dc6e Add nvidia-container-runtime.runtimes config option
This change adds an nvidia-container-runtime.runtimes config option.

If this is unset no changes are made to the config and the default values are used. This
allows this setting to be overridden in cases where this is required. One such example is
crio where crun is set as the default runtime.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-03-28 17:39:17 +02:00
Evan Lezar
7c5283bb97 Merge branch 'create-device-nodes' into 'main'
Add nvidia-ctk system create-device-nodes command

See merge request nvidia/container-toolkit/container-toolkit!362
2023-03-28 15:07:04 +00:00
Evan Lezar
4d5ba09d88 Add --ignore-errors option for testing
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-03-28 16:24:17 +02:00
Evan Lezar
149236b002 Configure containerd config based on specified annotation prefixes
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-03-28 16:22:48 +02:00
Evan Lezar
ee141f97dc Reorganise setting toolkit config options
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-03-28 16:22:48 +02:00
Evan Lezar
646503ff31 Set nvidia-container-runtime.modes.cdi.annotation-prefixes in toolkit-contianer
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-03-28 16:22:48 +02:00
Evan Lezar
cdaaf5e46f Generate device nodes when creating management spec
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-03-28 11:29:45 +02:00
Evan Lezar
e774c51c97 Add nvidia-ctk system create-device-nodes command
This change adds an nvidia-ctk system create-device-nodes command for
creating NVIDIA device nodes. Currently this is limited to control devices
(nvidia-uvm, nvidia-uvm-tools, nvidia-modeset, nvidiactl).

A --dry-run mode is included for outputing commands that would be executed and
the driver root can be specified.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-03-28 11:29:45 +02:00
Christopher Desiniotis
7f5c9abc1e Add ability to configure CDI kind with 'nvidia-ctk cdi generate'
Signed-off-by: Christopher Desiniotis <cdesiniotis@nvidia.com>
2023-03-27 23:12:00 -07:00
Christopher Desiniotis
92d82ceaee Add 'target-driver-root' option to 'nvidia-ctk cdi generate' to transform root paths in generated spec
Signed-off-by: Christopher Desiniotis <cdesiniotis@nvidia.com>
2023-03-27 22:22:36 -07:00
Evan Lezar
c46b118f37 Add nvidia-container-runtime.modes.cdi.annotation-prefixes config option.
This change adds an nvidia-container-runtime.modes.cdi.annotation-prefixes config
option that defaults to cdi.k8s.io/. This allows the annotation prefixes parsed
for CDI devices to be overridden in cases where CDI support in container engines such
as containerd or crio need to be overridden.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-03-27 16:36:54 +02:00
Evan Lezar
1722b07615 Merge branch 'CNT-2264/xorg-libs' into 'main'
Inject xorg libs and config in container

See merge request nvidia/container-toolkit/container-toolkit!328
2023-03-27 14:19:52 +00:00
Evan Lezar
c13c6ebadb Inject xorg libs and config in container
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-03-26 17:04:06 +02:00
Evan Lezar
2abe679dd1 Move libcuda locator to internal/lookup package
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-03-26 17:04:06 +02:00
Evan Lezar
9571513601 Merge branch 'update-changelog' into 'main'
Update changelog

See merge request nvidia/container-toolkit/container-toolkit!361
2023-03-26 15:03:28 +00:00
Evan Lezar
ff2767ee7b Reorder changelog
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-03-26 17:03:05 +02:00
Evan Lezar
56319475a6 Merge branch 'fix-changelog' into 'main'
Reorder changelog

See merge request nvidia/container-toolkit/container-toolkit!360
2023-03-26 14:52:27 +00:00
Evan Lezar
a3ee58a294 Reorder changelog
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-03-26 16:51:59 +02:00
Evan Lezar
7a533aeff3 Merge branch 'update-nvcdi-new-with-error' into 'main'
Allow nvcdi.Option to return an error

See merge request nvidia/container-toolkit/container-toolkit!352
2023-03-26 14:13:41 +00:00
Evan Lezar
226c54613e Also return an error from nvcdi.New
This change allows nvcdi.New to return an error in addition to the
constructed library instead of panicing.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-03-26 16:13:12 +02:00
Evan Lezar
1ebbebf5de Merge branch 'CNT-3932/deduplicate-entries-in-cdi-spec' into 'main'
Add transform to deduplicate entities in CDI spec

See merge request nvidia/container-toolkit/container-toolkit!345
2023-03-24 19:04:43 +00:00
Evan Lezar
33f6fe0217 Generate a simplified CDI spec by default
As simplified CDI spec has no duplicate entities in any single set of container edits.
Furthermore, contianer edits defined at a spec-level are not included in the container
edits for a device.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-03-24 11:01:46 +02:00
Evan Lezar
5ff206e1a9 Add transform to deduplicate entities in CDI spec
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-03-24 11:01:23 +02:00
Evan Lezar
df618d3cba Merge branch 'CNT-4052/fix-arm-management-containers' into 'main'
Fix generation of management CDI spec in containers

See merge request nvidia/container-toolkit/container-toolkit!354
2023-03-23 16:39:10 +00:00
Evan Lezar
9506bd9da0 Fix generation of management CDI spec in containers
Since we relied on finding libcuda.so in the LDCache to determine both the CUDA
version and the expected directory for the driver libraries, the generation of the
management CDI specifications fails in containers where the LDCache has not been updated.

This change falls back to searching a set of predefined paths instead when the lookup of
libcuda.so in the cache fails.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-03-23 15:59:01 +02:00
Evan Lezar
5e0684e99d Merge branch 'update-libnvidia-container' into 'main'
Update libnvidia-container

See merge request nvidia/container-toolkit/container-toolkit!353
2023-03-23 08:50:18 +00:00
Evan Lezar
09a0cb24cc Remove fedora make targets
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-03-23 10:35:57 +02:00
Evan Lezar
ff92f1d799 Update libnvidia-container
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-03-23 10:33:26 +02:00
Christopher Desiniotis
b87703c503 Merge branch 'fix-nil-logger-in-library-locator' into 'main'
Instantiate a logger when constructing a library Locator

See merge request nvidia/container-toolkit/container-toolkit!351
2023-03-21 21:54:14 +00:00
Christopher Desiniotis
b2aaa21b0a Instantiate a logger when constructing a library Locator
Signed-off-by: Christopher Desiniotis <cdesiniotis@nvidia.com>
2023-03-21 13:38:36 -07:00
Evan Lezar
310c15b046 Merge branch 'CNT-4026/only-init-nvml-when-required' into 'main'
Only init nvml as required when generating CDI specs

See merge request nvidia/container-toolkit/container-toolkit!344
2023-03-20 13:26:07 +00:00
Evan Lezar
685802b1ce Only init nvml as required when generating CDI specs
CDI generation modes such as management and wsl don't require
NVML. This change removes the top-level instantiation of nvmllib
and replaces it with an instanitation in the nvml CDI spec generation
code.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-03-20 14:24:08 +02:00
Evan Lezar
380eb8340a Merge branch 'blossom-ci' into 'main'
Add blossom-ci github action

See merge request nvidia/container-toolkit/container-toolkit!349
2023-03-20 09:56:23 +00:00
Evan Lezar
f98e1160f5 Update components with blossim-ci
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-03-20 11:06:44 +02:00
Evan Lezar
1962fd68df Merge branch 'locate-ipc-sockets-at-run' into 'main'
Locate persistenced and fabricmanager sockets at /run instead of /var/run

See merge request nvidia/container-toolkit/container-toolkit!347
2023-03-20 08:08:59 +00:00
Carlos Eduardo Arango Gutierrez
29813c1e14 Add blossom-ci github action
Signed-off-by: Carlos Eduardo Arango Gutierrez <eduardoa@nvidia.com>
2023-03-17 16:16:27 +01:00
Evan Lezar
df40fbe03e Locate persistenced and fabricmanager sockets at /run instead of /var/run
This chagne prefers (non-symlink) sockets at /run over /var/run for
nvidia-persistenced and nvidia-fabricmanager sockets.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-03-17 09:23:48 +02:00
Carlos Eduardo Arango Gutierrez
7000c6074e Merge branch 'ci_rules' into 'main'
Rework pipeline triggers for MRs

See merge request nvidia/container-toolkit/container-toolkit!346
2023-03-15 13:15:23 +00:00
Evan Lezar
ef1fe3ab41 Rework pipeline triggers for MRs
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-03-15 14:15:20 +02:00
Evan Lezar
fdd198b0e8 Merge branch 'bump-v1.13.0-rc.3' into 'main'
Bump version to v1.13.0-rc.3

See merge request nvidia/container-toolkit/container-toolkit!343
2023-03-15 07:50:50 +00:00
Evan Lezar
e37f77e02d Update libnvidia-container
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-03-15 09:49:49 +02:00
Evan Lezar
3fcfee88be Bump version to v1.13.0-rc.3
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-03-15 09:26:19 +02:00
Evan Lezar
a082413d09 Merge branch 'trigger-ci-on-mrs-only' into 'main'
Add workflow rule to only trigger on MRs

See merge request nvidia/container-toolkit/container-toolkit!342
2023-03-15 07:10:30 +00:00
Evan Lezar
280f40508e Make pipeline manual on MRs
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-03-15 08:51:18 +02:00
Evan Lezar
e2be0e2ff0 Add workflow rule to only trigger on MRs
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2023-03-15 08:45:26 +02:00
46 changed files with 2350 additions and 158 deletions

View File

@@ -23,6 +23,7 @@ variables:
BUILD_MULTI_ARCH_IMAGES: "true"
stages:
- trigger
- image
- lint
- go-checks
@@ -34,14 +35,44 @@ stages:
- scan
- release
.pipeline-trigger-rules:
rules:
# We trigger the pipeline if started manually
- if: $CI_PIPELINE_SOURCE == "web"
# We trigger the pipeline on the main branch
- if: $CI_COMMIT_BRANCH == "main"
# We trigger the pipeline on the release- branches
- if: $CI_COMMIT_BRANCH =~ /^release-.*$/
# We trigger the pipeline on tags
- if: $CI_COMMIT_TAG && $CI_COMMIT_TAG != ""
workflow:
rules:
# We trigger the pipeline on a merge request
- if: $CI_PIPELINE_SOURCE == 'merge_request_event'
# We then add all the regular triggers
- !reference [.pipeline-trigger-rules, rules]
# The main or manual job is used to filter out distributions or architectures that are not required on
# every build.
.main-or-manual:
rules:
- if: $CI_COMMIT_BRANCH == "main"
- if: $CI_COMMIT_BRANCH =~ /^release-.*$/
- if: $CI_COMMIT_TAG && $CI_COMMIT_TAG != ""
- !reference [.pipeline-trigger-rules, rules]
- if: $CI_PIPELINE_SOURCE == "schedule"
when: manual
# The trigger-pipeline job adds a manualy triggered job to the pipeline on merge requests.
trigger-pipeline:
stage: trigger
script:
- echo "starting pipeline"
rules:
- !reference [.main-or-manual, rules]
- if: $CI_PIPELINE_SOURCE == "merge_request_event"
when: manual
allow_failure: false
- when: always
# Define the distribution targets
.dist-amazonlinux2:
rules:

113
.github/workflows/blossom-ci.yaml vendored Normal file
View File

@@ -0,0 +1,113 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# A workflow to trigger ci on hybrid infra (github + self hosted runner)
name: Blossom-CI
on:
issue_comment:
types: [created]
workflow_dispatch:
inputs:
platform:
description: 'runs-on argument'
required: false
args:
description: 'argument'
required: false
jobs:
Authorization:
name: Authorization
runs-on: blossom
outputs:
args: ${{ env.args }}
# This job only runs for pull request comments
if: |
contains( '\
anstockatnv,\
rohitrajani2018,\
cdesiniotis,\
shivamerla,\
ArangoGutierrez,\
elezar,\
klueska,\
zvonkok,\
', format('{0},', github.actor)) &&
github.event.comment.body == '/blossom-ci'
steps:
- name: Check if comment is issued by authorized person
run: blossom-ci
env:
OPERATION: 'AUTH'
REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}
Vulnerability-scan:
name: Vulnerability scan
needs: [Authorization]
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v2
with:
repository: ${{ fromJson(needs.Authorization.outputs.args).repo }}
ref: ${{ fromJson(needs.Authorization.outputs.args).ref }}
lfs: 'true'
# repo specific steps
#- name: Setup java
# uses: actions/setup-java@v1
# with:
# java-version: 1.8
# add blackduck properties https://synopsys.atlassian.net/wiki/spaces/INTDOCS/pages/631308372/Methods+for+Configuring+Analysis#Using-a-configuration-file
#- name: Setup blackduck properties
# run: |
# PROJECTS=$(mvn -am dependency:tree | grep maven-dependency-plugin | awk '{ out="com.nvidia:"$(NF-1);print out }' | grep rapids | xargs | sed -e 's/ /,/g')
# echo detect.maven.build.command="-pl=$PROJECTS -am" >> application.properties
# echo detect.maven.included.scopes=compile >> application.properties
- name: Run blossom action
uses: NVIDIA/blossom-action@main
env:
REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}
with:
args1: ${{ fromJson(needs.Authorization.outputs.args).args1 }}
args2: ${{ fromJson(needs.Authorization.outputs.args).args2 }}
args3: ${{ fromJson(needs.Authorization.outputs.args).args3 }}
Job-trigger:
name: Start ci job
needs: [Vulnerability-scan]
runs-on: blossom
steps:
- name: Start ci job
run: blossom-ci
env:
OPERATION: 'START-CI-JOB'
CI_SERVER: ${{ secrets.CI_SERVER }}
REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
Upload-Log:
name: Upload log
runs-on: blossom
if : github.event_name == 'workflow_dispatch'
steps:
- name: Jenkins log for pull request ${{ fromJson(github.event.inputs.args).pr }} (click here)
run: blossom-ci
env:
OPERATION: 'POST-PROCESSING'
CI_SERVER: ${{ secrets.CI_SERVER }}
REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -1,5 +1,37 @@
# NVIDIA Container Toolkit Changelog
## v1.13.1
* Update `update-ldcache` hook to only update ldcache if it exists.
* Update `update-ldcache` hook to create `/etc/ld.so.conf.d` folder if it doesn't exist.
* Fix failure when libcuda cannot be located during XOrg library discovery.
* Fix CDI spec generation on systems that use `/etc/alternatives` (e.g. Debian)
## v1.13.0
* Promote 1.13.0-rc.3 to 1.13.0
## v1.13.0-rc.3
* Only initialize NVML for modes that require it when runing `nvidia-ctk cdi generate`.
* Prefer /run over /var/run when locating nvidia-persistenced and nvidia-fabricmanager sockets.
* Fix the generation of CDI specifications for management containers when the driver libraries are not in the LDCache.
* Add transformers to deduplicate and simplify CDI specifications.
* Generate a simplified CDI specification by default. This means that entities in the common edits in a spec are not included in device definitions.
* Also return an error from the nvcdi.New constructor instead of panicing.
* Detect XOrg libraries for injection and CDI spec generation.
* Add `nvidia-ctk system create-device-nodes` command to create control devices.
* Add `nvidia-ctk cdi transform` command to apply transforms to CDI specifications.
* Add `--vendor` and `--class` options to `nvidia-ctk cdi generate`
* [libnvidia-container] Fix segmentation fault when RPC initialization fails.
* [libnvidia-container] Build centos variants of the NVIDIA Container Library with static libtirpc v1.3.2.
* [libnvidia-container] Remove make targets for fedora35 as the centos8 packages are compatible.
* [toolkit-container] Add `nvidia-container-runtime.modes.cdi.annotation-prefixes` config option that allows the CDI annotation prefixes that are read to be overridden.
* [toolkit-container] Create device nodes when generating CDI specification for management containers.
* [toolkit-container] Add `nvidia-container-runtime.runtimes` config option to set the low-level runtime for the NVIDIA Container Runtime
## v1.13.0-rc.2
* Don't fail chmod hook if paths are not injected

View File

@@ -18,6 +18,7 @@ package cdi
import (
"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk/cdi/generate"
"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk/cdi/transform"
"github.com/sirupsen/logrus"
"github.com/urfave/cli/v2"
)
@@ -44,6 +45,7 @@ func (m command) build() *cli.Command {
hook.Subcommands = []*cli.Command{
generate.NewCommand(m.logger),
transform.NewCommand(m.logger),
}
return &hook

View File

@@ -30,8 +30,6 @@ import (
specs "github.com/container-orchestrated-devices/container-device-interface/specs-go"
"github.com/sirupsen/logrus"
"github.com/urfave/cli/v2"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
)
const (
@@ -49,6 +47,8 @@ type config struct {
driverRoot string
nvidiaCTKPath string
mode string
vendor string
class string
}
// NewCommand constructs a generate-cdi command with the specified logger
@@ -110,6 +110,20 @@ func (m command) build() *cli.Command {
Usage: "Specify the path to use for the nvidia-ctk in the generated CDI specification. If this is left empty, the path will be searched.",
Destination: &cfg.nvidiaCTKPath,
},
&cli.StringFlag{
Name: "vendor",
Aliases: []string{"cdi-vendor"},
Usage: "the vendor string to use for the generated CDI specification.",
Value: "nvidia.com",
Destination: &cfg.vendor,
},
&cli.StringFlag{
Name: "class",
Aliases: []string{"cdi-class"},
Usage: "the class string to use for the generated CDI specification.",
Value: "gpu",
Destination: &cfg.class,
},
}
return &c
@@ -151,6 +165,12 @@ func (m command) validateFlags(c *cli.Context, cfg *config) error {
}
}
if err := cdi.ValidateVendorName(cfg.vendor); err != nil {
return fmt.Errorf("invalid CDI vendor name: %v", err)
}
if err := cdi.ValidateClassName(cfg.class); err != nil {
return fmt.Errorf("invalid CDI class name: %v", err)
}
return nil
}
@@ -190,23 +210,16 @@ func (m command) generateSpec(cfg *config) (spec.Interface, error) {
return nil, fmt.Errorf("failed to create device namer: %v", err)
}
nvmllib := nvml.New()
if r := nvmllib.Init(); r != nvml.SUCCESS {
return nil, r
}
defer nvmllib.Shutdown()
devicelib := device.New(device.WithNvml(nvmllib))
cdilib := nvcdi.New(
cdilib, err := nvcdi.New(
nvcdi.WithLogger(m.logger),
nvcdi.WithDriverRoot(cfg.driverRoot),
nvcdi.WithNVIDIACTKPath(cfg.nvidiaCTKPath),
nvcdi.WithDeviceNamer(deviceNamer),
nvcdi.WithDeviceLib(devicelib),
nvcdi.WithNvmlLib(nvmllib),
nvcdi.WithMode(string(cfg.mode)),
)
if err != nil {
return nil, fmt.Errorf("failed to create CDI library: %v", err)
}
deviceSpecs, err := cdilib.GetAllDeviceSpecs()
if err != nil {
@@ -233,8 +246,8 @@ func (m command) generateSpec(cfg *config) (spec.Interface, error) {
}
return spec.New(
spec.WithVendor("nvidia.com"),
spec.WithClass("gpu"),
spec.WithVendor(cfg.vendor),
spec.WithClass(cfg.class),
spec.WithDeviceSpecs(deviceSpecs),
spec.WithEdits(*commonEdits.ContainerEdits),
spec.WithFormat(cfg.format),

View File

@@ -0,0 +1,159 @@
/**
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package root
import (
"fmt"
"io"
"os"
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/transform"
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
"github.com/sirupsen/logrus"
"github.com/urfave/cli/v2"
)
type loadSaver interface {
Load() (spec.Interface, error)
Save(spec.Interface) error
}
type command struct {
logger *logrus.Logger
}
type transformOptions struct {
input string
output string
}
type options struct {
transformOptions
from string
to string
}
// NewCommand constructs a generate-cdi command with the specified logger
func NewCommand(logger *logrus.Logger) *cli.Command {
c := command{
logger: logger,
}
return c.build()
}
// build creates the CLI command
func (m command) build() *cli.Command {
opts := options{}
c := cli.Command{
Name: "root",
Usage: "Apply a root transform to a CDI specification",
Before: func(c *cli.Context) error {
return m.validateFlags(c, &opts)
},
Action: func(c *cli.Context) error {
return m.run(c, &opts)
},
}
c.Flags = []cli.Flag{
&cli.StringFlag{
Name: "input",
Usage: "Specify the file to read the CDI specification from. If this is '-' the specification is read from STDIN",
Value: "-",
Destination: &opts.input,
},
&cli.StringFlag{
Name: "output",
Usage: "Specify the file to output the generated CDI specification to. If this is '' the specification is output to STDOUT",
Destination: &opts.output,
},
&cli.StringFlag{
Name: "from",
Usage: "specify the root to be transformed",
Destination: &opts.from,
},
&cli.StringFlag{
Name: "to",
Usage: "specify the replacement root. If this is the same as the from root, the transform is a no-op.",
Value: "",
Destination: &opts.to,
},
}
return &c
}
func (m command) validateFlags(c *cli.Context, opts *options) error {
return nil
}
func (m command) run(c *cli.Context, opts *options) error {
spec, err := opts.Load()
if err != nil {
return fmt.Errorf("failed to load CDI specification: %w", err)
}
err = transform.NewRootTransformer(
opts.from,
opts.to,
).Transform(spec.Raw())
if err != nil {
return fmt.Errorf("failed to transform CDI specification: %w", err)
}
return opts.Save(spec)
}
// Load lodas the input CDI specification
func (o transformOptions) Load() (spec.Interface, error) {
contents, err := o.getContents()
if err != nil {
return nil, fmt.Errorf("failed to read spec contents: %v", err)
}
raw, err := cdi.ParseSpec(contents)
if err != nil {
return nil, fmt.Errorf("failed to parse CDI spec: %v", err)
}
return spec.New(
spec.WithRawSpec(raw),
)
}
func (o transformOptions) getContents() ([]byte, error) {
if o.input == "-" {
return io.ReadAll(os.Stdin)
}
return os.ReadFile(o.input)
}
// Save saves the CDI specification to the output file
func (o transformOptions) Save(s spec.Interface) error {
if o.output == "" {
_, err := s.WriteTo(os.Stdout)
if err != nil {
return fmt.Errorf("failed to write CDI spec to STDOUT: %v", err)
}
return nil
}
return s.Save(o.output)
}

View File

@@ -0,0 +1,51 @@
/**
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package transform
import (
"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk/cdi/transform/root"
"github.com/sirupsen/logrus"
"github.com/urfave/cli/v2"
)
type command struct {
logger *logrus.Logger
}
// NewCommand constructs a command with the specified logger
func NewCommand(logger *logrus.Logger) *cli.Command {
c := command{
logger: logger,
}
return c.build()
}
// build creates the CLI command
func (m command) build() *cli.Command {
c := cli.Command{
Name: "transform",
Usage: "Apply a transform to a CDI specification",
}
c.Flags = []cli.Flag{}
c.Subcommands = []*cli.Command{
root.NewCommand(m.logger),
}
return &c
}

View File

@@ -84,6 +84,12 @@ func (m command) run(c *cli.Context, cfg *config) error {
return fmt.Errorf("failed to determined container root: %v", err)
}
_, err = os.Stat(filepath.Join(containerRoot, "/etc/ld.so.cache"))
if err != nil && os.IsNotExist(err) {
m.logger.Debugf("No ld.so.cache found, skipping update")
return nil
}
err = m.createConfig(containerRoot, cfg.folders.Value())
if err != nil {
return fmt.Errorf("failed to update ld.so.conf: %v", err)
@@ -105,6 +111,10 @@ func (m command) createConfig(root string, folders []string) error {
return nil
}
if err := os.MkdirAll(filepath.Join(root, "/etc/ld.so.conf.d"), 0755); err != nil {
return fmt.Errorf("failed to create ld.so.conf.d: %v", err)
}
configFile, err := os.CreateTemp(filepath.Join(root, "/etc/ld.so.conf.d"), "nvcr-*.conf")
if err != nil {
return fmt.Errorf("failed to create config file: %v", err)

View File

@@ -0,0 +1,107 @@
/**
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package createdevicenodes
import (
"fmt"
"github.com/NVIDIA/nvidia-container-toolkit/internal/system"
"github.com/sirupsen/logrus"
"github.com/urfave/cli/v2"
)
type command struct {
logger *logrus.Logger
}
type options struct {
driverRoot string
dryRun bool
control bool
}
// NewCommand constructs a command sub-command with the specified logger
func NewCommand(logger *logrus.Logger) *cli.Command {
c := command{
logger: logger,
}
return c.build()
}
// build
func (m command) build() *cli.Command {
opts := options{}
c := cli.Command{
Name: "create-device-nodes",
Usage: "A utility to create NVIDIA device ndoes",
Before: func(c *cli.Context) error {
return m.validateFlags(c, &opts)
},
Action: func(c *cli.Context) error {
return m.run(c, &opts)
},
}
c.Flags = []cli.Flag{
&cli.StringFlag{
Name: "driver-root",
Usage: "the path to the driver root. Device nodes will be created at `DRIVER_ROOT`/dev",
Value: "/",
Destination: &opts.driverRoot,
EnvVars: []string{"DRIVER_ROOT"},
},
&cli.BoolFlag{
Name: "control-devices",
Usage: "create all control device nodes: nvidiactl, nvidia-modeset, nvidia-uvm, nvidia-uvm-tools",
Destination: &opts.control,
},
&cli.BoolFlag{
Name: "dry-run",
Usage: "if set, the command will not create any symlinks.",
Value: false,
Destination: &opts.dryRun,
EnvVars: []string{"DRY_RUN"},
},
}
return &c
}
func (m command) validateFlags(r *cli.Context, opts *options) error {
return nil
}
func (m command) run(c *cli.Context, opts *options) error {
s, err := system.New(
system.WithLogger(m.logger),
system.WithDryRun(opts.dryRun),
)
if err != nil {
return fmt.Errorf("failed to create library: %v", err)
}
if opts.control {
m.logger.Infof("Creating control device nodes at %s", opts.driverRoot)
if err := s.CreateNVIDIAControlDeviceNodesAt(opts.driverRoot); err != nil {
return fmt.Errorf("failed to create control device nodes: %v", err)
}
}
return nil
}

View File

@@ -18,6 +18,7 @@ package system
import (
devchar "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk/system/create-dev-char-symlinks"
devicenodes "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk/system/create-device-nodes"
"github.com/sirupsen/logrus"
"github.com/urfave/cli/v2"
)
@@ -43,6 +44,7 @@ func (m command) build() *cli.Command {
system.Subcommands = []*cli.Command{
devchar.NewCommand(m.logger),
devicenodes.NewCommand(m.logger),
}
return &system

View File

@@ -14,10 +14,10 @@
# Supported OSs by architecture
AMD64_TARGETS := ubuntu20.04 ubuntu18.04 ubuntu16.04 debian10 debian9
X86_64_TARGETS := fedora35 centos7 centos8 rhel7 rhel8 amazonlinux2 opensuse-leap15.1
X86_64_TARGETS := centos7 centos8 rhel7 rhel8 amazonlinux2 opensuse-leap15.1
PPC64LE_TARGETS := ubuntu18.04 ubuntu16.04 centos7 centos8 rhel7 rhel8
ARM64_TARGETS := ubuntu20.04 ubuntu18.04
AARCH64_TARGETS := fedora35 centos8 rhel8 amazonlinux2
AARCH64_TARGETS := centos8 rhel8 amazonlinux2
# Define top-level build targets
docker%: SHELL:=/bin/bash
@@ -102,14 +102,6 @@ LIBNVIDIA_CONTAINER_TOOLS_VERSION := $(LIBNVIDIA_CONTAINER_VERSION)$(if $(LIBNVI
--centos%: CONFIG_TOML_SUFFIX := rpm-yum
--centos8%: BASEIMAGE = quay.io/centos/centos:stream8
# private fedora target
--fedora%: OS := fedora
--fedora%: DOCKERFILE = $(CURDIR)/docker/Dockerfile.rpm-yum
--fedora%: CONFIG_TOML_SUFFIX := rpm-yum
# The fedora(35) base image has very slow performance when building aarch64 packages.
# Since our primary concern here is glibc versions, we use the older glibc version available in centos8.
--fedora35%: BASEIMAGE = quay.io/centos/centos:stream8
# private amazonlinux target
--amazonlinux%: OS := amazonlinux
--amazonlinux%: DOCKERFILE = $(CURDIR)/docker/Dockerfile.rpm-yum

View File

@@ -71,7 +71,8 @@ func TestGetConfig(t *testing.T) {
MountSpecPath: "/etc/nvidia-container-runtime/host-files-for-container.d",
},
CDI: cdiModeConfig{
DefaultKind: "nvidia.com/gpu",
DefaultKind: "nvidia.com/gpu",
AnnotationPrefixes: []string{"cdi.k8s.io/"},
},
},
},
@@ -92,6 +93,7 @@ func TestGetConfig(t *testing.T) {
"nvidia-container-runtime.runtimes = [\"/some/runtime\",]",
"nvidia-container-runtime.mode = \"not-auto\"",
"nvidia-container-runtime.modes.cdi.default-kind = \"example.vendor.com/device\"",
"nvidia-container-runtime.modes.cdi.annotation-prefixes = [\"cdi.k8s.io/\", \"example.vendor.com/\",]",
"nvidia-container-runtime.modes.csv.mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
"nvidia-ctk.path = \"/foo/bar/nvidia-ctk\"",
},
@@ -111,6 +113,10 @@ func TestGetConfig(t *testing.T) {
},
CDI: cdiModeConfig{
DefaultKind: "example.vendor.com/device",
AnnotationPrefixes: []string{
"cdi.k8s.io/",
"example.vendor.com/",
},
},
},
},
@@ -134,6 +140,7 @@ func TestGetConfig(t *testing.T) {
"mode = \"not-auto\"",
"[nvidia-container-runtime.modes.cdi]",
"default-kind = \"example.vendor.com/device\"",
"annotation-prefixes = [\"cdi.k8s.io/\", \"example.vendor.com/\",]",
"[nvidia-container-runtime.modes.csv]",
"mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
"[nvidia-ctk]",
@@ -155,6 +162,10 @@ func TestGetConfig(t *testing.T) {
},
CDI: cdiModeConfig{
DefaultKind: "example.vendor.com/device",
AnnotationPrefixes: []string{
"cdi.k8s.io/",
"example.vendor.com/",
},
},
},
},

View File

@@ -50,12 +50,15 @@ func (c *ConfigV1) AddRuntime(name string, path string, setAsDefault bool) error
config.SetPath([]string{"plugins", "cri", "containerd", "runtimes", name, "runtime_engine"}, "")
config.SetPath([]string{"plugins", "cri", "containerd", "runtimes", name, "privileged_without_host_devices"}, false)
}
cdiAnnotations := []interface{}{"cdi.k8s.io/*"}
containerAnnotations, ok := config.GetPath([]string{"plugins", "cri", "containerd", "runtimes", name, "container_annotations"}).([]interface{})
if ok && containerAnnotations != nil {
cdiAnnotations = append(containerAnnotations, cdiAnnotations...)
if len(c.ContainerAnnotations) > 0 {
annotations, err := (*Config)(c).getRuntimeAnnotations([]string{"plugins", "cri", "containerd", "runtimes", name, "container_annotations"})
if err != nil {
return err
}
annotations = append(c.ContainerAnnotations, annotations...)
config.SetPath([]string{"plugins", "cri", "containerd", "runtimes", name, "container_annotations"}, annotations)
}
config.SetPath([]string{"plugins", "cri", "containerd", "runtimes", name, "container_annotations"}, cdiAnnotations)
config.SetPath([]string{"plugins", "cri", "containerd", "runtimes", name, "options", "BinaryName"}, path)
config.SetPath([]string{"plugins", "cri", "containerd", "runtimes", name, "options", "Runtime"}, path)

View File

@@ -45,12 +45,14 @@ func (c *Config) AddRuntime(name string, path string, setAsDefault bool) error {
config.SetPath([]string{"plugins", "io.containerd.grpc.v1.cri", "containerd", "runtimes", name, "privileged_without_host_devices"}, false)
}
cdiAnnotations := []interface{}{"cdi.k8s.io/*"}
containerAnnotations, ok := config.GetPath([]string{"plugins", "io.containerd.grpc.v1.cri", "containerd", "runtimes", name, "container_annotations"}).([]interface{})
if ok && containerAnnotations != nil {
cdiAnnotations = append(containerAnnotations, cdiAnnotations...)
if len(c.ContainerAnnotations) > 0 {
annotations, err := c.getRuntimeAnnotations([]string{"plugins", "io.containerd.grpc.v1.cri", "containerd", "runtimes", name, "container_annotations"})
if err != nil {
return err
}
annotations = append(c.ContainerAnnotations, annotations...)
config.SetPath([]string{"plugins", "io.containerd.grpc.v1.cri", "containerd", "runtimes", name, "container_annotations"}, annotations)
}
config.SetPath([]string{"plugins", "io.containerd.grpc.v1.cri", "containerd", "runtimes", name, "container_annotations"}, cdiAnnotations)
config.SetPath([]string{"plugins", "io.containerd.grpc.v1.cri", "containerd", "runtimes", name, "options", "BinaryName"}, path)
@@ -62,6 +64,32 @@ func (c *Config) AddRuntime(name string, path string, setAsDefault bool) error {
return nil
}
func (c *Config) getRuntimeAnnotations(path []string) ([]string, error) {
if c == nil || c.Tree == nil {
return nil, nil
}
config := *c.Tree
if !config.HasPath(path) {
return nil, nil
}
annotationsI, ok := config.GetPath(path).([]interface{})
if !ok {
return nil, fmt.Errorf("invalid annotations: %v", annotationsI)
}
var annotations []string
for _, annotation := range annotationsI {
a, ok := annotation.(string)
if !ok {
return nil, fmt.Errorf("invalid annotation: %v", annotation)
}
annotations = append(annotations, a)
}
return annotations, nil
}
// DefaultRuntime returns the default runtime for the cri-o config
func (c Config) DefaultRuntime() string {
if runtime, ok := c.GetPath([]string{"plugins", "io.containerd.grpc.v1.cri", "containerd", "default_runtime_name"}).(string); ok {

View File

@@ -26,6 +26,7 @@ type Config struct {
*toml.Tree
RuntimeType string
UseDefaultRuntimeName bool
ContainerAnnotations []string
}
// New creates a containerd config with the specified options

View File

@@ -30,9 +30,10 @@ const (
)
type builder struct {
path string
runtimeType string
useLegacyConfig bool
path string
runtimeType string
useLegacyConfig bool
containerAnnotations []string
}
// Option defines a function that can be used to configure the config builder
@@ -59,6 +60,13 @@ func WithUseLegacyConfig(useLegacyConfig bool) Option {
}
}
// WithContainerAnnotations sets the container annotations for the config builder
func WithContainerAnnotations(containerAnnotations ...string) Option {
return func(b *builder) {
b.containerAnnotations = containerAnnotations
}
}
func (b *builder) build() (engine.Interface, error) {
if b.path == "" {
return nil, fmt.Errorf("config path is empty")
@@ -74,6 +82,7 @@ func (b *builder) build() (engine.Interface, error) {
}
config.RuntimeType = b.runtimeType
config.UseDefaultRuntimeName = !b.useLegacyConfig
config.ContainerAnnotations = b.containerAnnotations
version, err := config.parseVersion(b.useLegacyConfig)
if err != nil {

View File

@@ -19,6 +19,7 @@ package config
import (
"fmt"
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
"github.com/pelletier/go-toml"
"github.com/sirupsen/logrus"
)
@@ -52,6 +53,8 @@ type cdiModeConfig struct {
SpecDirs []string `toml:"spec-dirs"`
// DefaultKind sets the default kind to be used when constructing fully-qualified CDI device names
DefaultKind string `toml:"default-kind"`
// AnnotationPrefixes sets the allowed prefixes for CDI annotation-based device injection
AnnotationPrefixes []string `toml:"annotation-prefixes"`
}
type csvModeConfig struct {
@@ -98,6 +101,9 @@ func GetDefaultRuntimeConfig() *RuntimeConfig {
},
CDI: cdiModeConfig{
DefaultKind: "nvidia.com/gpu",
AnnotationPrefixes: []string{
cdi.AnnotationPrefix,
},
},
},
}

View File

@@ -20,11 +20,13 @@ import (
"fmt"
"os"
"path/filepath"
"strings"
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
"github.com/NVIDIA/nvidia-container-toolkit/internal/info/drm"
"github.com/NVIDIA/nvidia-container-toolkit/internal/info/proc"
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/cuda"
"github.com/sirupsen/logrus"
)
@@ -44,9 +46,12 @@ func NewGraphicsDiscoverer(logger *logrus.Logger, devices image.VisibleDevices,
drmByPathSymlinks := newCreateDRMByPathSymlinks(logger, drmDeviceNodes, cfg)
xorg := optionalXorgDiscoverer(logger, driverRoot, cfg.NvidiaCTKPath)
discover := Merge(
Merge(drmDeviceNodes, drmByPathSymlinks),
mounts,
xorg,
)
return discover, nil
@@ -243,6 +248,123 @@ func newDRMDeviceFilter(logger *logrus.Logger, devices image.VisibleDevices, dri
return filter, nil
}
type xorgHooks struct {
libraries Discover
driverVersion string
nvidiaCTKPath string
}
var _ Discover = (*xorgHooks)(nil)
// optionalXorgDiscoverer creates a discoverer for Xorg libraries.
// If the creation of the discoverer fails, a None discoverer is returned.
func optionalXorgDiscoverer(logger *logrus.Logger, driverRoot string, nvidiaCTKPath string) Discover {
xorg, err := newXorgDiscoverer(logger, driverRoot, nvidiaCTKPath)
if err != nil {
logger.Warnf("Failed to create Xorg discoverer: %v; skipping xorg libraries", err)
return None{}
}
return xorg
}
func newXorgDiscoverer(logger *logrus.Logger, driverRoot string, nvidiaCTKPath string) (Discover, error) {
libCudaPaths, err := cuda.New(
cuda.WithLogger(logger),
cuda.WithDriverRoot(driverRoot),
).Locate(".*.*.*")
if err != nil {
return nil, fmt.Errorf("failed to locate libcuda.so: %v", err)
}
libcudaPath := libCudaPaths[0]
version := strings.TrimPrefix(filepath.Base(libcudaPath), "libcuda.so.")
if version == "" {
return nil, fmt.Errorf("failed to determine libcuda.so version from path: %q", libcudaPath)
}
libRoot := filepath.Dir(libcudaPath)
xorgLibs := NewMounts(
logger,
lookup.NewFileLocator(
lookup.WithLogger(logger),
lookup.WithRoot(driverRoot),
lookup.WithSearchPaths(libRoot, "/usr/lib/x86_64-linux-gnu"),
lookup.WithCount(1),
),
driverRoot,
[]string{
"nvidia/xorg/nvidia_drv.so",
fmt.Sprintf("nvidia/xorg/libglxserver_nvidia.so.%s", version),
},
)
xorgHooks := xorgHooks{
libraries: xorgLibs,
driverVersion: version,
nvidiaCTKPath: FindNvidiaCTK(logger, nvidiaCTKPath),
}
xorgConfg := NewMounts(
logger,
lookup.NewFileLocator(
lookup.WithLogger(logger),
lookup.WithRoot(driverRoot),
lookup.WithSearchPaths("/usr/share"),
),
driverRoot,
[]string{"X11/xorg.conf.d/10-nvidia.conf"},
)
d := Merge(
xorgLibs,
xorgConfg,
xorgHooks,
)
return d, nil
}
// Devices returns no devices for Xorg
func (m xorgHooks) Devices() ([]Device, error) {
return nil, nil
}
// Hooks returns a hook to create symlinks for Xorg libraries
func (m xorgHooks) Hooks() ([]Hook, error) {
mounts, err := m.libraries.Mounts()
if err != nil {
return nil, fmt.Errorf("failed to get mounts: %v", err)
}
if len(mounts) == 0 {
return nil, nil
}
var target string
for _, mount := range mounts {
filename := filepath.Base(mount.HostPath)
if filename == "libglxserver_nvidia.so."+m.driverVersion {
target = mount.Path
}
}
if target == "" {
return nil, nil
}
link := strings.TrimSuffix(target, "."+m.driverVersion)
links := []string{fmt.Sprintf("%s::%s", filepath.Base(target), link)}
symlinkHook := CreateCreateSymlinkHook(
m.nvidiaCTKPath,
links,
)
return symlinkHook.Hooks()
}
// Mounts returns the libraries required for Xorg
func (m xorgHooks) Mounts() ([]Mount, error) {
return nil, nil
}
// selectDeviceByPath is a filter that allows devices to be selected by the path
type selectDeviceByPath map[string]bool

View File

@@ -25,21 +25,39 @@ type ipcMounts mounts
// NewIPCDiscoverer creats a discoverer for NVIDIA IPC sockets.
func NewIPCDiscoverer(logger *logrus.Logger, driverRoot string) (Discover, error) {
d := newMounts(
sockets := newMounts(
logger,
lookup.NewFileLocator(
lookup.WithLogger(logger),
lookup.WithRoot(driverRoot),
lookup.WithSearchPaths("/run", "/var/run"),
lookup.WithCount(1),
),
driverRoot,
[]string{
"/nvidia-persistenced/socket",
"/nvidia-fabricmanager/socket",
},
)
mps := newMounts(
logger,
lookup.NewFileLocator(
lookup.WithLogger(logger),
lookup.WithRoot(driverRoot),
lookup.WithCount(1),
),
driverRoot,
[]string{
"/var/run/nvidia-persistenced/socket",
"/var/run/nvidia-fabricmanager/socket",
"/tmp/nvidia-mps",
},
)
return (*ipcMounts)(d), nil
d := Merge(
(*ipcMounts)(sockets),
(*ipcMounts)(mps),
)
return d, nil
}
// Mounts returns the discovered mounts with "noexec" added to the mount options.

View File

@@ -307,11 +307,7 @@ func (c *ldcache) resolve(target string) (string, error) {
link = filepath.Join(filepath.Dir(target), link)
}
// Ensure that the returned path is relative to the root.
link = filepath.Join(c.root, link)
c.logger.Debugf("Resolved link: '%v' => '%v'", name, link)
return link, nil
return c.resolve(link)
}
// bytesToString converts a byte slice to a string.

View File

@@ -0,0 +1,104 @@
/**
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package cuda
import (
"path/filepath"
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
"github.com/sirupsen/logrus"
)
type cudaLocator struct {
logger *logrus.Logger
driverRoot string
}
// Options is a function that configures a cudaLocator.
type Options func(*cudaLocator)
// WithLogger is an option that configures the logger used by the locator.
func WithLogger(logger *logrus.Logger) Options {
return func(c *cudaLocator) {
c.logger = logger
}
}
// WithDriverRoot is an option that configures the driver root used by the locator.
func WithDriverRoot(driverRoot string) Options {
return func(c *cudaLocator) {
c.driverRoot = driverRoot
}
}
// New creates a new CUDA library locator.
func New(opts ...Options) lookup.Locator {
c := &cudaLocator{}
for _, opt := range opts {
opt(c)
}
if c.logger == nil {
c.logger = logrus.StandardLogger()
}
if c.driverRoot == "" {
c.driverRoot = "/"
}
return c
}
// Locate returns the path to the libcuda.so.RMVERSION file.
// libcuda.so is prefixed to the specified pattern.
func (l *cudaLocator) Locate(pattern string) ([]string, error) {
ldcacheLocator, err := lookup.NewLibraryLocator(
l.logger,
l.driverRoot,
)
if err != nil {
l.logger.Debugf("Failed to create LDCache locator: %v", err)
}
fullPattern := "libcuda.so" + pattern
candidates, err := ldcacheLocator.Locate("libcuda.so")
if err == nil {
for _, c := range candidates {
if match, err := filepath.Match(fullPattern, filepath.Base(c)); err != nil || !match {
l.logger.Debugf("Skipping non-matching candidate %v: %v", c, err)
continue
}
return []string{c}, nil
}
}
l.logger.Debugf("Could not locate %q in LDCache: Checking predefined library paths.", pattern)
pathLocator := lookup.NewFileLocator(
lookup.WithLogger(l.logger),
lookup.WithRoot(l.driverRoot),
lookup.WithSearchPaths(
"/usr/lib64",
"/usr/lib/x86_64-linux-gnu",
"/usr/lib/aarch64-linux-gnu",
"/usr/lib/x86_64-linux-gnu/nvidia/current",
"/usr/lib/aarch64-linux-gnu/nvidia/current",
),
lookup.WithCount(1),
)
return pathLocator.Locate(fullPattern)
}

View File

@@ -40,6 +40,7 @@ func NewLibraryLocator(logger *log.Logger, root string) (Locator, error) {
}
l := library{
logger: logger,
symlink: NewSymlinkLocator(logger, root),
cache: cache,
}

View File

@@ -18,6 +18,7 @@ package modifier
import (
"fmt"
"strings"
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
@@ -67,7 +68,7 @@ func getDevicesFromSpec(logger *logrus.Logger, ociSpec oci.Spec, cfg *config.Con
return nil, fmt.Errorf("failed to load OCI spec: %v", err)
}
_, annotationDevices, err := cdi.ParseAnnotations(rawSpec.Annotations)
annotationDevices, err := getAnnotationDevices(cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.AnnotationPrefixes, rawSpec.Annotations)
if err != nil {
return nil, fmt.Errorf("failed to parse container annotations: %v", err)
}
@@ -107,6 +108,38 @@ func getDevicesFromSpec(logger *logrus.Logger, ociSpec oci.Spec, cfg *config.Con
return nil, nil
}
// getAnnotationDevices returns a list of devices specified in the annotations.
// Keys starting with the specified prefixes are considered and expected to contain a comma-separated list of
// fully-qualified CDI devices names. If any device name is not fully-quality an error is returned.
// The list of returned devices is deduplicated.
func getAnnotationDevices(prefixes []string, annotations map[string]string) ([]string, error) {
devicesByKey := make(map[string][]string)
for key, value := range annotations {
for _, prefix := range prefixes {
if strings.HasPrefix(key, prefix) {
devicesByKey[key] = strings.Split(value, ",")
}
}
}
seen := make(map[string]bool)
var annotationDevices []string
for key, devices := range devicesByKey {
for _, device := range devices {
if !cdi.IsQualifiedName(device) {
return nil, fmt.Errorf("invalid device name %q in annotation %q", device, key)
}
if seen[device] {
continue
}
annotationDevices = append(annotationDevices, device)
seen[device] = true
}
}
return annotationDevices, nil
}
// Modify loads the CDI registry and injects the specified CDI devices into the OCI runtime specification.
func (m cdiModifier) Modify(spec *specs.Spec) error {
registry := cdi.GetRegistry(

View File

@@ -0,0 +1,92 @@
/**
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package modifier
import (
"fmt"
"testing"
"github.com/stretchr/testify/require"
)
func TestGetAnnotationDevices(t *testing.T) {
testCases := []struct {
description string
prefixes []string
annotations map[string]string
expectedDevices []string
expectedError error
}{
{
description: "no annotations",
},
{
description: "no matching annotations",
prefixes: []string{"not-prefix/"},
annotations: map[string]string{
"prefix/foo": "example.com/device=bar",
},
},
{
description: "single matching annotation",
prefixes: []string{"prefix/"},
annotations: map[string]string{
"prefix/foo": "example.com/device=bar",
},
expectedDevices: []string{"example.com/device=bar"},
},
{
description: "multiple matching annotations",
prefixes: []string{"prefix/", "another-prefix/"},
annotations: map[string]string{
"prefix/foo": "example.com/device=bar",
"another-prefix/bar": "example.com/device=baz",
},
expectedDevices: []string{"example.com/device=bar", "example.com/device=baz"},
},
{
description: "multiple matching annotations with duplicate devices",
prefixes: []string{"prefix/", "another-prefix/"},
annotations: map[string]string{
"prefix/foo": "example.com/device=bar",
"another-prefix/bar": "example.com/device=bar",
},
expectedDevices: []string{"example.com/device=bar"},
},
{
description: "invalid devices",
prefixes: []string{"prefix/"},
annotations: map[string]string{
"prefix/foo": "example.com/device",
},
expectedError: fmt.Errorf("invalid device %q", "example.com/device"),
},
}
for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
devices, err := getAnnotationDevices(tc.prefixes, tc.annotations)
if tc.expectedError != nil {
require.Error(t, err)
return
}
require.NoError(t, err)
require.ElementsMatch(t, tc.expectedDevices, devices)
})
}
}

View File

@@ -0,0 +1,36 @@
/**
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package system
import "github.com/sirupsen/logrus"
// Option is a functional option for the system command
type Option func(*Interface)
// WithLogger sets the logger for the system command
func WithLogger(logger *logrus.Logger) Option {
return func(i *Interface) {
i.logger = logger
}
}
// WithDryRun sets the dry run flag
func WithDryRun(dryRun bool) Option {
return func(i *Interface) {
i.dryRun = dryRun
}
}

149
internal/system/system.go Normal file
View File

@@ -0,0 +1,149 @@
/**
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package system
import (
"fmt"
"os"
"path/filepath"
"strings"
"github.com/NVIDIA/nvidia-container-toolkit/internal/info/proc/devices"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
// Interface is the interface for the system command
type Interface struct {
logger *logrus.Logger
dryRun bool
nvidiaDevices nvidiaDevices
}
// New constructs a system command with the specified options
func New(opts ...Option) (*Interface, error) {
i := &Interface{
logger: logrus.StandardLogger(),
}
for _, opt := range opts {
opt(i)
}
devices, err := devices.GetNVIDIADevices()
if err != nil {
return nil, fmt.Errorf("failed to create devices info: %v", err)
}
i.nvidiaDevices = nvidiaDevices{devices}
return i, nil
}
// CreateNVIDIAControlDeviceNodesAt creates the NVIDIA control device nodes associated with the NVIDIA driver at the specified root.
func (m *Interface) CreateNVIDIAControlDeviceNodesAt(root string) error {
controlNodes := []string{"/dev/nvidiactl", "/dev/nvidia-modeset", "/dev/nvidia-uvm", "/dev/nvidia-uvm-tools"}
for _, node := range controlNodes {
path := filepath.Join(root, node)
err := m.CreateNVIDIADeviceNode(path)
if err != nil {
return fmt.Errorf("failed to create device node %s: %v", path, err)
}
}
return nil
}
// CreateNVIDIADeviceNode creates a specified device node associated with the NVIDIA driver.
func (m *Interface) CreateNVIDIADeviceNode(path string) error {
node := filepath.Base(path)
if !strings.HasPrefix(node, "nvidia") {
return fmt.Errorf("invalid device node %q", node)
}
major, err := m.nvidiaDevices.Major(node)
if err != nil {
return fmt.Errorf("failed to determine major: %v", err)
}
minor, err := m.nvidiaDevices.Minor(node)
if err != nil {
return fmt.Errorf("failed to determine minor: %v", err)
}
return m.createDeviceNode(path, int(major), int(minor))
}
func (m *Interface) createDeviceNode(path string, major int, minor int) error {
if m.dryRun {
m.logger.Infof("Running: mknod --mode=0666 %s c %d %d", path, major, minor)
return nil
}
if _, err := os.Stat(path); err == nil {
m.logger.Infof("Skipping: %s already exists", path)
return nil
} else if !os.IsNotExist(err) {
return fmt.Errorf("failed to stat %s: %v", path, err)
}
err := unix.Mknod(path, unix.S_IFCHR, int(unix.Mkdev(uint32(major), uint32(minor))))
if err != nil {
return err
}
return unix.Chmod(path, 0666)
}
type nvidiaDevices struct {
devices.Devices
}
// Major returns the major number for the specified NVIDIA device node.
// If the device node is not supported, an error is returned.
func (n *nvidiaDevices) Major(node string) (int64, error) {
var valid bool
var major devices.Major
switch node {
case "nvidia-uvm", "nvidia-uvm-tools":
major, valid = n.Get(devices.NVIDIAUVM)
case "nvidia-modeset", "nvidiactl":
major, valid = n.Get(devices.NVIDIAGPU)
}
if !valid {
return 0, fmt.Errorf("invalid device node %q", node)
}
return int64(major), nil
}
// Minor returns the minor number for the specified NVIDIA device node.
// If the device node is not supported, an error is returned.
func (n *nvidiaDevices) Minor(node string) (int64, error) {
switch node {
case "nvidia-modeset":
return devices.NVIDIAModesetMinor, nil
case "nvidia-uvm-tools":
return devices.NVIDIAUVMToolsMinor, nil
case "nvidia-uvm":
return devices.NVIDIAUVMMinor, nil
case "nvidiactl":
return devices.NVIDIACTLMinor, nil
}
return 0, fmt.Errorf("invalid device node %q", node)
}

View File

@@ -22,8 +22,8 @@ import (
"strings"
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
"github.com/NVIDIA/nvidia-container-toolkit/internal/ldcache"
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/cuda"
"github.com/sirupsen/logrus"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
)
@@ -31,6 +31,11 @@ import (
// NewDriverDiscoverer creates a discoverer for the libraries and binaries associated with a driver installation.
// The supplied NVML Library is used to query the expected driver version.
func NewDriverDiscoverer(logger *logrus.Logger, driverRoot string, nvidiaCTKPath string, nvmllib nvml.Interface) (discover.Discover, error) {
if r := nvmllib.Init(); r != nvml.SUCCESS {
return nil, fmt.Errorf("failed to initalize NVML: %v", r)
}
defer nvmllib.Shutdown()
version, r := nvmllib.SystemGetDriverVersion()
if r != nvml.SUCCESS {
return nil, fmt.Errorf("failed to determine driver version: %v", r)
@@ -131,26 +136,24 @@ func NewDriverBinariesDiscoverer(logger *logrus.Logger, driverRoot string) disco
func getVersionLibs(logger *logrus.Logger, driverRoot string, version string) ([]string, error) {
logger.Infof("Using driver version %v", version)
cache, err := ldcache.New(logger, driverRoot)
libCudaPaths, err := cuda.New(
cuda.WithLogger(logger),
cuda.WithDriverRoot(driverRoot),
).Locate("." + version)
if err != nil {
return nil, fmt.Errorf("failed to load ldcache: %v", err)
return nil, fmt.Errorf("failed to locate libcuda.so.%v: %v", version, err)
}
libRoot := filepath.Dir(libCudaPaths[0])
libs32, libs64 := cache.List()
libraries := lookup.NewFileLocator(
lookup.WithLogger(logger),
lookup.WithSearchPaths(libRoot),
lookup.WithOptional(true),
)
var libs []string
for _, l := range libs64 {
if strings.HasSuffix(l, version) {
logger.Infof("found 64-bit driver lib: %v", l)
libs = append(libs, l)
}
}
for _, l := range libs32 {
if strings.HasSuffix(l, version) {
logger.Infof("found 32-bit driver lib: %v", l)
libs = append(libs, l)
}
libs, err := libraries.Locate("*.so." + version)
if err != nil {
return nil, fmt.Errorf("failed to locate libraries for driver version %v: %v", version, err)
}
if driverRoot == "/" || driverRoot == "" {

View File

@@ -24,6 +24,7 @@ import (
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
"github.com/container-orchestrated-devices/container-device-interface/specs-go"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
)
type nvmllib nvcdilib
@@ -39,6 +40,11 @@ func (l *nvmllib) GetSpec() (spec.Interface, error) {
func (l *nvmllib) GetAllDeviceSpecs() ([]specs.Device, error) {
var deviceSpecs []specs.Device
if r := l.nvmllib.Init(); r != nvml.SUCCESS {
return nil, fmt.Errorf("failed to initalize NVML: %v", r)
}
defer l.nvmllib.Shutdown()
gpuDeviceSpecs, err := l.getGPUDeviceSpecs()
if err != nil {
return nil, err

View File

@@ -17,6 +17,8 @@
package nvcdi
import (
"fmt"
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
"github.com/sirupsen/logrus"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
@@ -47,7 +49,7 @@ type nvcdilib struct {
}
// New creates a new nvcdi library
func New(opts ...Option) Interface {
func New(opts ...Option) (Interface, error) {
l := &nvcdilib{}
for _, opt := range opts {
opt(l)
@@ -100,8 +102,7 @@ func New(opts ...Option) Interface {
}
lib = (*mofedlib)(l)
default:
// TODO: We would like to return an error here instead of panicking
panic("Unknown mode")
return nil, fmt.Errorf("unknown mode %q", l.mode)
}
w := wrapper{
@@ -109,7 +110,7 @@ func New(opts ...Option) Interface {
vendor: l.vendor,
class: l.class,
}
return &w
return &w, nil
}
// GetSpec combines the device specs and common edits from the wrapped Interface to a single spec.Interface.
@@ -151,3 +152,24 @@ func (l *nvcdilib) resolveMode() (rmode string) {
return ModeNvml
}
// getCudaVersion returns the CUDA version of the current system.
func (l *nvcdilib) getCudaVersion() (string, error) {
if hasNVML, reason := l.infolib.HasNvml(); !hasNVML {
return "", fmt.Errorf("nvml not detected: %v", reason)
}
if l.nvmllib == nil {
return "", fmt.Errorf("nvml library not initialized")
}
r := l.nvmllib.Init()
if r != nvml.SUCCESS {
return "", fmt.Errorf("failed to initialize nvml: %v", r)
}
defer l.nvmllib.Shutdown()
version, r := l.nvmllib.SystemGetDriverVersion()
if r != nvml.SUCCESS {
return "", fmt.Errorf("failed to get driver version: %v", r)
}
return version, nil
}

View File

@@ -23,7 +23,7 @@ import (
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
"github.com/NVIDIA/nvidia-container-toolkit/internal/edits"
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/cuda"
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
"github.com/container-orchestrated-devices/container-device-interface/specs-go"
@@ -60,23 +60,9 @@ func (m *managementlib) GetAllDeviceSpecs() ([]specs.Device, error) {
// GetCommonEdits returns the common edits for use in managementlib containers.
func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) {
locator, err := lookup.NewLibraryLocator(
m.logger,
m.driverRoot,
)
version, err := m.getCudaVersion()
if err != nil {
return nil, fmt.Errorf("failed to create library locator: %v", err)
}
candidates, err := locator.Locate("libcuda.so")
if err != nil {
return nil, fmt.Errorf("failed to locate libcuda.so: %v", err)
}
libcudaPath := candidates[0]
version := strings.TrimPrefix(filepath.Base(libcudaPath), "libcuda.so.")
if version == "" {
return nil, fmt.Errorf("failed to determine libcuda.so version from path: %q", libcudaPath)
return nil, fmt.Errorf("failed to get CUDA version: %v", err)
}
driver, err := newDriverVersionDiscoverer(m.logger, m.driverRoot, m.nvidiaCTKPath, version)
@@ -92,6 +78,28 @@ func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) {
return edits, nil
}
// getCudaVersion returns the CUDA version for use in managementlib containers.
func (m *managementlib) getCudaVersion() (string, error) {
version, err := (*nvcdilib)(m).getCudaVersion()
if err == nil {
return version, nil
}
libCudaPaths, err := cuda.New(
cuda.WithLogger(m.logger),
cuda.WithDriverRoot(m.driverRoot),
).Locate(".*.*.*")
if err != nil {
return "", fmt.Errorf("failed to locate libcuda.so: %v", err)
}
libCudaPath := libCudaPaths[0]
version = strings.TrimPrefix(filepath.Base(libCudaPath), "libcuda.so.")
return version, nil
}
type managementDiscoverer struct {
discover.Discover
}

View File

@@ -19,6 +19,7 @@ package spec
import (
"fmt"
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/transform"
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
"github.com/container-orchestrated-devices/container-device-interface/specs-go"
)
@@ -31,6 +32,7 @@ type builder struct {
deviceSpecs []specs.Device
edits specs.ContainerEdits
format string
noSimplify bool
}
// newBuilder creates a new spec builder with the supplied options
@@ -39,6 +41,13 @@ func newBuilder(opts ...Option) *builder {
for _, opt := range opts {
opt(s)
}
if s.raw != nil {
s.noSimplify = true
vendor, class := cdi.ParseQualifier(s.raw.Kind)
s.vendor = vendor
s.class = class
}
if s.version == "" {
s.version = DetectMinimumVersion
}
@@ -58,7 +67,6 @@ func newBuilder(opts ...Option) *builder {
// Build builds a CDI spec form the spec builder.
func (o *builder) Build() (*spec, error) {
raw := o.raw
if raw == nil {
raw = &specs.Spec{
Version: o.version,
@@ -76,6 +84,13 @@ func (o *builder) Build() (*spec, error) {
raw.Version = minVersion
}
if !o.noSimplify {
err := transform.NewSimplifier().Transform(raw)
if err != nil {
return nil, fmt.Errorf("failed to simplify spec: %v", err)
}
}
s := spec{
Spec: raw,
format: o.format,
@@ -128,3 +143,17 @@ func WithFormat(format string) Option {
o.format = format
}
}
// WithNoSimplify sets whether the spec must be simplified
func WithNoSimplify(noSimplify bool) Option {
return func(o *builder) {
o.noSimplify = noSimplify
}
}
// WithRawSpec sets the raw spec for the spec builder
func WithRawSpec(raw *specs.Spec) Option {
return func(o *builder) {
o.raw = raw
}
}

View File

@@ -0,0 +1,151 @@
/**
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package transform
import (
"github.com/container-orchestrated-devices/container-device-interface/specs-go"
)
type dedupe struct{}
var _ Transformer = (*dedupe)(nil)
// NewDedupe creates a transformer that deduplicates container edits.
func NewDedupe() (Transformer, error) {
return &dedupe{}, nil
}
// Transform removes duplicate entris from devices and common container edits.
func (d dedupe) Transform(spec *specs.Spec) error {
if spec == nil {
return nil
}
if err := d.transformEdits(&spec.ContainerEdits); err != nil {
return err
}
var updatedDevices []specs.Device
for _, device := range spec.Devices {
if err := d.transformEdits(&device.ContainerEdits); err != nil {
return err
}
updatedDevices = append(updatedDevices, device)
}
spec.Devices = updatedDevices
return nil
}
func (d dedupe) transformEdits(edits *specs.ContainerEdits) error {
deviceNodes, err := d.deduplicateDeviceNodes(edits.DeviceNodes)
if err != nil {
return err
}
edits.DeviceNodes = deviceNodes
envs, err := d.deduplicateEnvs(edits.Env)
if err != nil {
return err
}
edits.Env = envs
hooks, err := d.deduplicateHooks(edits.Hooks)
if err != nil {
return err
}
edits.Hooks = hooks
mounts, err := d.deduplicateMounts(edits.Mounts)
if err != nil {
return err
}
edits.Mounts = mounts
return nil
}
func (d dedupe) deduplicateDeviceNodes(entities []*specs.DeviceNode) ([]*specs.DeviceNode, error) {
seen := make(map[string]bool)
var deviceNodes []*specs.DeviceNode
for _, e := range entities {
if e == nil {
continue
}
id, err := deviceNode(*e).id()
if err != nil {
return nil, err
}
if seen[id] {
continue
}
seen[id] = true
deviceNodes = append(deviceNodes, e)
}
return deviceNodes, nil
}
func (d dedupe) deduplicateEnvs(entities []string) ([]string, error) {
seen := make(map[string]bool)
var envs []string
for _, e := range entities {
id := e
if seen[id] {
continue
}
seen[id] = true
envs = append(envs, e)
}
return envs, nil
}
func (d dedupe) deduplicateHooks(entities []*specs.Hook) ([]*specs.Hook, error) {
seen := make(map[string]bool)
var hooks []*specs.Hook
for _, e := range entities {
if e == nil {
continue
}
id, err := hook(*e).id()
if err != nil {
return nil, err
}
if seen[id] {
continue
}
seen[id] = true
hooks = append(hooks, e)
}
return hooks, nil
}
func (d dedupe) deduplicateMounts(entities []*specs.Mount) ([]*specs.Mount, error) {
seen := make(map[string]bool)
var mounts []*specs.Mount
for _, e := range entities {
if e == nil {
continue
}
id, err := mount(*e).id()
if err != nil {
return nil, err
}
if seen[id] {
continue
}
seen[id] = true
mounts = append(mounts, e)
}
return mounts, nil
}

View File

@@ -0,0 +1,250 @@
/**
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package transform
import (
"testing"
"github.com/container-orchestrated-devices/container-device-interface/specs-go"
"github.com/stretchr/testify/require"
)
func TestDeduplicate(t *testing.T) {
testCases := []struct {
description string
spec *specs.Spec
expectedError error
expectedSpec *specs.Spec
}{
{
description: "nil spec",
},
{
description: "duplicate deviceNode is removed",
spec: &specs.Spec{
ContainerEdits: specs.ContainerEdits{
DeviceNodes: []*specs.DeviceNode{
{
Path: "/dev/gpu0",
},
{
Path: "/dev/gpu0",
},
},
},
},
expectedSpec: &specs.Spec{
ContainerEdits: specs.ContainerEdits{
DeviceNodes: []*specs.DeviceNode{
{
Path: "/dev/gpu0",
},
},
},
},
},
{
description: "duplicate deviceNode is remved from device edits",
spec: &specs.Spec{
Devices: []specs.Device{
{
Name: "device0",
ContainerEdits: specs.ContainerEdits{
DeviceNodes: []*specs.DeviceNode{
{
Path: "/dev/gpu0",
},
{
Path: "/dev/gpu0",
},
},
},
},
},
},
expectedSpec: &specs.Spec{
Devices: []specs.Device{
{
Name: "device0",
ContainerEdits: specs.ContainerEdits{
DeviceNodes: []*specs.DeviceNode{
{
Path: "/dev/gpu0",
},
},
},
},
},
},
},
{
description: "duplicate hook is removed",
spec: &specs.Spec{
ContainerEdits: specs.ContainerEdits{
Hooks: []*specs.Hook{
{
HookName: "createContainer",
Path: "/usr/bin/nvidia-ctk",
Args: []string{"nvidia-ctk", "hook", "chmod", "--mode", "755", "--path", "/dev/dri"},
},
{
HookName: "createContainer",
Path: "/usr/bin/nvidia-ctk",
Args: []string{"nvidia-ctk", "hook", "chmod", "--mode", "755", "--path", "/dev/dri"},
},
},
},
},
expectedSpec: &specs.Spec{
ContainerEdits: specs.ContainerEdits{
Hooks: []*specs.Hook{
{
HookName: "createContainer",
Path: "/usr/bin/nvidia-ctk",
Args: []string{"nvidia-ctk", "hook", "chmod", "--mode", "755", "--path", "/dev/dri"},
},
},
},
},
},
{
description: "duplicate mount is removed",
spec: &specs.Spec{
Devices: []specs.Device{
{
Name: "device0",
ContainerEdits: specs.ContainerEdits{
Mounts: []*specs.Mount{
{
HostPath: "/host/mount2",
ContainerPath: "/mount2",
},
{
HostPath: "/host/mount2",
ContainerPath: "/mount2",
},
{
HostPath: "/host/mount1",
ContainerPath: "/mount1",
},
},
},
},
},
ContainerEdits: specs.ContainerEdits{
Mounts: []*specs.Mount{
{
HostPath: "/host/mount1",
ContainerPath: "/mount1",
Options: []string{"bind", "ro"},
Type: "tmpfs",
},
{
HostPath: "/host/mount1",
ContainerPath: "/mount1",
Options: []string{"bind", "ro"},
Type: "tmpfs",
},
{
HostPath: "/host/mount1",
ContainerPath: "/mount1",
Options: []string{"bind", "ro"},
},
},
},
},
expectedSpec: &specs.Spec{
Devices: []specs.Device{
{
Name: "device0",
ContainerEdits: specs.ContainerEdits{
Mounts: []*specs.Mount{
{
HostPath: "/host/mount2",
ContainerPath: "/mount2",
},
{
HostPath: "/host/mount1",
ContainerPath: "/mount1",
},
},
},
},
},
ContainerEdits: specs.ContainerEdits{
Mounts: []*specs.Mount{
{
HostPath: "/host/mount1",
ContainerPath: "/mount1",
Options: []string{"bind", "ro"},
Type: "tmpfs",
},
{
HostPath: "/host/mount1",
ContainerPath: "/mount1",
Options: []string{"bind", "ro"},
},
},
},
},
},
{
description: "duplicate env is removed",
spec: &specs.Spec{
Devices: []specs.Device{
{
Name: "device0",
ContainerEdits: specs.ContainerEdits{
Env: []string{"ENV1=VAL1", "ENV1=VAL1", "ENV2=ONE_VALUE", "ENV2=ANOTHER_VALUE"},
},
},
},
ContainerEdits: specs.ContainerEdits{
Env: []string{"ENV1=VAL1", "ENV1=VAL1", "ENV2=ONE_VALUE", "ENV2=ANOTHER_VALUE"},
},
},
expectedSpec: &specs.Spec{
Devices: []specs.Device{
{
Name: "device0",
ContainerEdits: specs.ContainerEdits{
Env: []string{"ENV1=VAL1", "ENV2=ONE_VALUE", "ENV2=ANOTHER_VALUE"},
},
},
},
ContainerEdits: specs.ContainerEdits{
Env: []string{"ENV1=VAL1", "ENV2=ONE_VALUE", "ENV2=ANOTHER_VALUE"},
},
},
},
}
for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
d := dedupe{}
err := d.Transform(tc.spec)
if tc.expectedError != nil {
require.Error(t, err)
return
}
require.NoError(t, err)
require.EqualValues(t, tc.expectedSpec, tc.spec)
})
}
}

View File

@@ -0,0 +1,166 @@
/*
*
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package transform
import (
"encoding/json"
"github.com/container-orchestrated-devices/container-device-interface/specs-go"
)
type containerEdits specs.ContainerEdits
// IsEmpty returns true if the edits are empty.
func (e containerEdits) IsEmpty() bool {
// Devices with empty edits are invalid
if len(e.DeviceNodes) > 0 {
return false
}
if len(e.Env) > 0 {
return false
}
if len(e.Hooks) > 0 {
return false
}
if len(e.Mounts) > 0 {
return false
}
return true
}
func (e *containerEdits) getEntityIds() ([]string, error) {
if e == nil {
return nil, nil
}
uniqueIDs := make(map[string]bool)
deviceNodes, err := e.getDeviceNodeIDs()
if err != nil {
return nil, err
}
for k := range deviceNodes {
uniqueIDs[k] = true
}
envs, err := e.getEnvIDs()
if err != nil {
return nil, err
}
for k := range envs {
uniqueIDs[k] = true
}
hooks, err := e.getHookIDs()
if err != nil {
return nil, err
}
for k := range hooks {
uniqueIDs[k] = true
}
mounts, err := e.getMountIDs()
if err != nil {
return nil, err
}
for k := range mounts {
uniqueIDs[k] = true
}
var ids []string
for k := range uniqueIDs {
ids = append(ids, k)
}
return ids, nil
}
func (e *containerEdits) getDeviceNodeIDs() (map[string]bool, error) {
deviceIDs := make(map[string]bool)
for _, entity := range e.DeviceNodes {
id, err := deviceNode(*entity).id()
if err != nil {
return nil, err
}
deviceIDs[id] = true
}
return deviceIDs, nil
}
func (e *containerEdits) getEnvIDs() (map[string]bool, error) {
envIDs := make(map[string]bool)
for _, entity := range e.Env {
id, err := env(entity).id()
if err != nil {
return nil, err
}
envIDs[id] = true
}
return envIDs, nil
}
func (e *containerEdits) getHookIDs() (map[string]bool, error) {
hookIDs := make(map[string]bool)
for _, entity := range e.Hooks {
id, err := hook(*entity).id()
if err != nil {
return nil, err
}
hookIDs[id] = true
}
return hookIDs, nil
}
func (e *containerEdits) getMountIDs() (map[string]bool, error) {
mountIDs := make(map[string]bool)
for _, entity := range e.Mounts {
id, err := mount(*entity).id()
if err != nil {
return nil, err
}
mountIDs[id] = true
}
return mountIDs, nil
}
type deviceNode specs.DeviceNode
func (dn deviceNode) id() (string, error) {
b, err := json.Marshal(dn)
return string(b), err
}
type env string
func (e env) id() (string, error) {
return string(e), nil
}
type mount specs.Mount
func (m mount) id() (string, error) {
b, err := json.Marshal(m)
return string(b), err
}
type hook specs.Hook
func (m hook) id() (string, error) {
b, err := json.Marshal(m)
return string(b), err
}

View File

@@ -0,0 +1,105 @@
/**
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package transform
import (
"fmt"
"github.com/container-orchestrated-devices/container-device-interface/specs-go"
)
type remove map[string]bool
func newRemover(ids ...string) Transformer {
r := make(remove)
for _, id := range ids {
r[id] = true
}
return r
}
// Transform remove the specified entities from the spec.
func (r remove) Transform(spec *specs.Spec) error {
if spec == nil {
return nil
}
for _, device := range spec.Devices {
if err := r.transformEdits(&device.ContainerEdits); err != nil {
return fmt.Errorf("failed to remove edits from device %q: %w", device.Name, err)
}
}
return r.transformEdits(&spec.ContainerEdits)
}
func (r remove) transformEdits(edits *specs.ContainerEdits) error {
if edits == nil {
return nil
}
var deviceNodes []*specs.DeviceNode
for _, entity := range edits.DeviceNodes {
id, err := deviceNode(*entity).id()
if err != nil {
return err
}
if r[id] {
continue
}
deviceNodes = append(deviceNodes, entity)
}
edits.DeviceNodes = deviceNodes
var envs []string
for _, entity := range edits.Env {
id := entity
if r[id] {
continue
}
envs = append(envs, entity)
}
edits.Env = envs
var hooks []*specs.Hook
for _, entity := range edits.Hooks {
id, err := hook(*entity).id()
if err != nil {
return err
}
if r[id] {
continue
}
hooks = append(hooks, entity)
}
edits.Hooks = hooks
var mounts []*specs.Mount
for _, entity := range edits.Mounts {
id, err := mount(*entity).id()
if err != nil {
return err
}
if r[id] {
continue
}
mounts = append(mounts, entity)
}
edits.Mounts = mounts
return nil
}

View File

@@ -0,0 +1,74 @@
/**
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package transform
import (
"fmt"
"github.com/container-orchestrated-devices/container-device-interface/specs-go"
)
type simplify struct{}
var _ Transformer = (*simplify)(nil)
// NewSimplifier creates a simplifier transformer.
// This transoformer ensures that entities in the spec are deduplicated and that common edits are removed from device-specific edits.
func NewSimplifier() Transformer {
return &simplify{}
}
// Transform simplifies the supplied spec.
// Edits that are present in the common edits are removed from device-specific edits.
func (s simplify) Transform(spec *specs.Spec) error {
if spec == nil {
return nil
}
dedupe := dedupe{}
if err := dedupe.Transform(spec); err != nil {
return err
}
commonEntityIDs, err := (*containerEdits)(&spec.ContainerEdits).getEntityIds()
if err != nil {
return err
}
toRemove := newRemover(commonEntityIDs...)
var updatedDevices []specs.Device
for _, device := range spec.Devices {
deviceAsSpec := specs.Spec{
ContainerEdits: device.ContainerEdits,
}
err := toRemove.Transform(&deviceAsSpec)
if err != nil {
return fmt.Errorf("failed to transform device edits: %w", err)
}
if !(containerEdits)(deviceAsSpec.ContainerEdits).IsEmpty() {
// Devices with empty edits are invalid.
// We only update the container edits for the device if this would
// result in a valid device.
device.ContainerEdits = deviceAsSpec.ContainerEdits
}
updatedDevices = append(updatedDevices, device)
}
spec.Devices = updatedDevices
return nil
}

View File

@@ -0,0 +1,125 @@
/**
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package transform
import (
"testing"
"github.com/container-orchestrated-devices/container-device-interface/specs-go"
"github.com/stretchr/testify/require"
)
func TestSimplify(t *testing.T) {
testCases := []struct {
description string
spec *specs.Spec
expectedError error
expectedSpec *specs.Spec
}{
{
description: "nil spec is a no-op",
},
{
description: "empty spec is simplified",
spec: &specs.Spec{},
expectedSpec: &specs.Spec{},
},
{
description: "simplify does not allow empty device",
spec: &specs.Spec{
Devices: []specs.Device{
{
Name: "device0",
ContainerEdits: specs.ContainerEdits{
Env: []string{"FOO=BAR"},
},
},
},
ContainerEdits: specs.ContainerEdits{
Env: []string{"FOO=BAR"},
},
},
expectedSpec: &specs.Spec{
Devices: []specs.Device{
{
Name: "device0",
ContainerEdits: specs.ContainerEdits{
Env: []string{"FOO=BAR"},
},
},
},
ContainerEdits: specs.ContainerEdits{
Env: []string{"FOO=BAR"},
},
},
},
{
description: "simplify removes common entities",
spec: &specs.Spec{
Devices: []specs.Device{
{
Name: "device0",
ContainerEdits: specs.ContainerEdits{
Env: []string{"FOO=BAR"},
DeviceNodes: []*specs.DeviceNode{
{
Path: "/dev/gpu0",
},
},
},
},
},
ContainerEdits: specs.ContainerEdits{
Env: []string{"FOO=BAR"},
},
},
expectedSpec: &specs.Spec{
Devices: []specs.Device{
{
Name: "device0",
ContainerEdits: specs.ContainerEdits{
DeviceNodes: []*specs.DeviceNode{
{
Path: "/dev/gpu0",
},
},
},
},
},
ContainerEdits: specs.ContainerEdits{
Env: []string{"FOO=BAR"},
},
},
},
}
for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
s := simplify{}
err := s.Transform(tc.spec)
if tc.expectedError != nil {
require.Error(t, err)
return
}
require.NoError(t, err)
require.EqualValues(t, tc.expectedSpec, tc.spec)
})
}
}

View File

@@ -62,7 +62,12 @@ make -C "${NVIDIA_CONTAINER_TOOLKIT_ROOT}" \
LIBNVIDIA_CONTAINER_TAG="${LIBNVIDIA_CONTAINER_TAG}" \
"${TARGET}"
if [[ -z ${NVIDIA_CONTAINER_TOOLKIT_TAG} ]]; then
# If required we also build the nvidia-container-runtime and nvidia-docker packages.
# Since these are essentially meta packages intended to allow for users to
# transition from older installation workflows, we skip these for rc builds
# (NVIDIA_CONTAINER_TOOLKIT_TAG != "") and releases with a non-zero patch
# version of 0.
if [[ -z ${NVIDIA_CONTAINER_TOOLKIT_TAG} && "${NVIDIA_CONTAINER_TOOLKIT_VERSION%.0}" != "${NVIDIA_CONTAINER_TOOLKIT_VERSION}" ]]; then
# We set the TOOLKIT_VERSION, TOOLKIT_TAG for the nvidia-container-runtime and nvidia-docker targets
# The LIB_TAG is also overridden to match the TOOLKIT_TAG.
# Build nvidia-container-runtime
@@ -82,5 +87,5 @@ if [[ -z ${NVIDIA_CONTAINER_TOOLKIT_TAG} ]]; then
${TARGET}
else
echo "Skipping nvidia-container-runtime and nvidia-docker builds for release candidate"
echo "Skipping nvidia-container-runtime and nvidia-docker builds."
fi

View File

@@ -45,17 +45,21 @@ function skip-for-release-candidate() {
return 0
fi
# We allow all other packages for non-rc versions.
if [[ "${VERSION/rc./}" == "${VERSION}" ]]; then
return 1
local is_non_patch_full_release=1
# We allow all other packages for non-rc and non-patch release versions.
if [[ "${VERSION/rc./}" != "${VERSION}" ]]; then
is_non_patch_full_release=0
fi
if [[ "${VERSION%.0}" == "${VERSION}" ]]; then
is_non_patch_full_release=0
fi
local package_name=$1
if [[ "${package_name/"nvidia-docker2"/}" != "${package_name}" ]]; then
return 0
return ${is_non_patch_full_release}
fi
if [[ "${package_name/"nvidia-container-runtime"/}" != "${package_name}" ]]; then
return 0
return ${is_non_patch_full_release}
fi
return 1
}
@@ -91,6 +95,18 @@ function extract-all() {
echo "Extracting packages for ${dist} from ${PACKAGE_IMAGE}"
if [ $dist == "ubuntu18.04" ]; then
set -x
# We need to publish the libnvidia-container0 packages to the kitmaker repository as a once off operation.
# We include the packages here so that these will be added to the archive for the ubuntu18.04 arm64 packages.
mkdir -p "${ARTIFACTS_DIR}/packages/ubuntu18.04/arm64/"
curl -L "https://nvidia.github.io/libnvidia-container/ubuntu18.04/arm64/libnvidia-container0_0.10.0+jetpack_arm64.deb" \
--output "${ARTIFACTS_DIR}/packages/ubuntu18.04/arm64/libnvidia-container0_0.10.0+jetpack_arm64.deb"
curl -L "https://nvidia.github.io/libnvidia-container/ubuntu18.04/arm64/libnvidia-container0_0.11.0+jetpack_arm64.deb" \
--output "${ARTIFACTS_DIR}/packages/ubuntu18.04/arm64/libnvidia-container0_0.11.0+jetpack_arm64.deb"
set +x
fi
# Extract every file for the specified dist-arch combiniation in MANIFEST.txt
grep "/${dist}/" "${ARTIFACTS_DIR}/manifest.txt" | while read -r f ; do
package_name="$(basename "$f")"

View File

@@ -62,8 +62,10 @@ echo "LIBNVIDIA_CONTAINER_PACKAGE_VERSION=${libnvidia_container_version_tag//\~/
echo "NVIDIA_CONTAINER_TOOLKIT_VERSION=${nvidia_container_toolkit_version}"
echo "NVIDIA_CONTAINER_TOOLKIT_TAG=${nvidia_container_toolkit_tag}"
echo "NVIDIA_CONTAINER_TOOLKIT_PACKAGE_VERSION=${nvidia_container_toolkit_version_tag//\~/-}"
if [[ "${libnvidia_container_version_tag}" != "${nvidia_container_toolkit_version_tag}" ]]; then
if [[ "${LIBNVIDIA_CONTAINER_PACKAGE_VERSION}" != "${NVIDIA_CONTAINER_TOOLKIT_PACKAGE_VERSION}" ]]; then
>&2 echo "WARNING: The libnvidia-container and nvidia-container-toolkit versions do not match"
>&2 echo "WARNING: lib: ${LIBNVIDIA_CONTAINER_PACKAGE_VERSION}"
>&2 echo "WARNING: toolkit: ${NVIDIA_CONTAINER_TOOLKIT_PACKAGE_VERSION}"
fi
echo "NVIDIA_CONTAINER_RUNTIME_VERSION=${nvidia_container_runtime_version}"
echo "NVIDIA_CONTAINER_RUNTIME_TAG=${nvidia_container_runtime_tag}"

View File

@@ -332,6 +332,7 @@ func TestUpdateV1Config(t *testing.T) {
Tree: config,
UseDefaultRuntimeName: true,
RuntimeType: runtimeType,
ContainerAnnotations: []string{"cdi.k8s.io/*"},
}
err = UpdateConfig(v1, o)
@@ -585,6 +586,7 @@ func TestUpdateV1ConfigWithRuncPresent(t *testing.T) {
Tree: config,
UseDefaultRuntimeName: true,
RuntimeType: runtimeType,
ContainerAnnotations: []string{"cdi.k8s.io/*"},
}
err = UpdateConfig(v1, o)

View File

@@ -279,8 +279,9 @@ func TestUpdateV2Config(t *testing.T) {
require.NoError(t, err)
v2 := &containerd.Config{
Tree: config,
RuntimeType: runtimeType,
Tree: config,
RuntimeType: runtimeType,
ContainerAnnotations: []string{"cdi.k8s.io/*"},
}
err = UpdateConfig(v2, o)
@@ -520,8 +521,9 @@ func TestUpdateV2ConfigWithRuncPresent(t *testing.T) {
require.NoError(t, err)
v2 := &containerd.Config{
Tree: config,
RuntimeType: runtimeType,
Tree: config,
RuntimeType: runtimeType,
ContainerAnnotations: []string{"cdi.k8s.io/*"},
}
err = UpdateConfig(v2, o)

View File

@@ -72,6 +72,8 @@ type options struct {
hostRootMount string
runtimeDir string
useLegacyConfig bool
ContainerRuntimeModesCDIAnnotationPrefixes cli.StringSlice
}
func main() {
@@ -173,6 +175,11 @@ func main() {
Destination: &options.useLegacyConfig,
EnvVars: []string{"CONTAINERD_USE_LEGACY_CONFIG"},
},
&cli.StringSliceFlag{
Name: "nvidia-container-runtime-modes.cdi.annotation-prefixes",
Destination: &options.ContainerRuntimeModesCDIAnnotationPrefixes,
EnvVars: []string{"NVIDIA_CONTAINER_RUNTIME_MODES_CDI_ANNOTATION_PREFIXES"},
},
}
// Update the subcommand flags with the common subcommand flags
@@ -199,6 +206,7 @@ func Setup(c *cli.Context, o *options) error {
containerd.WithPath(o.config),
containerd.WithRuntimeType(o.runtimeType),
containerd.WithUseLegacyConfig(o.useLegacyConfig),
containerd.WithContainerAnnotations(o.containerAnnotationsFromCDIPrefixes()...),
)
if err != nil {
return fmt.Errorf("unable to load config: %v", err)
@@ -241,6 +249,7 @@ func Cleanup(c *cli.Context, o *options) error {
containerd.WithPath(o.config),
containerd.WithRuntimeType(o.runtimeType),
containerd.WithUseLegacyConfig(o.useLegacyConfig),
containerd.WithContainerAnnotations(o.containerAnnotationsFromCDIPrefixes()...),
)
if err != nil {
return fmt.Errorf("unable to load config: %v", err)
@@ -434,3 +443,13 @@ func RestartContainerdSystemd(hostRootMount string) error {
return nil
}
// containerAnnotationsFromCDIPrefixes returns the container annotations to set for the given CDI prefixes.
func (o *options) containerAnnotationsFromCDIPrefixes() []string {
var annotations []string
for _, prefix := range o.ContainerRuntimeModesCDIAnnotationPrefixes.Value() {
annotations = append(annotations, prefix+"*")
}
return annotations
}

View File

@@ -23,6 +23,7 @@ import (
"path/filepath"
"strings"
"github.com/NVIDIA/nvidia-container-toolkit/internal/system"
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi"
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/transform"
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
@@ -46,10 +47,14 @@ type options struct {
DriverRoot string
DriverRootCtrPath string
ContainerRuntimeMode string
ContainerRuntimeModesCdiDefaultKind string
ContainerRuntimeDebug string
ContainerRuntimeLogLevel string
ContainerRuntimeMode string
ContainerRuntimeDebug string
ContainerRuntimeLogLevel string
ContainerRuntimeModesCdiDefaultKind string
ContainerRuntimeModesCDIAnnotationPrefixes cli.StringSlice
ContainerRuntimeRuntimes cli.StringSlice
ContainerRuntimeHookSkipModeDetection bool
@@ -64,6 +69,8 @@ type options struct {
acceptNVIDIAVisibleDevicesWhenUnprivileged bool
acceptNVIDIAVisibleDevicesAsVolumeMounts bool
ignoreErrors bool
}
func main() {
@@ -120,26 +127,39 @@ func main() {
EnvVars: []string{"DRIVER_ROOT_CTR_PATH"},
},
&cli.StringFlag{
Name: "nvidia-container-runtime-debug",
Name: "nvidia-container-runtime.debug",
Aliases: []string{"nvidia-container-runtime-debug"},
Usage: "Specify the location of the debug log file for the NVIDIA Container Runtime",
Destination: &opts.ContainerRuntimeDebug,
EnvVars: []string{"NVIDIA_CONTAINER_RUNTIME_DEBUG"},
},
&cli.StringFlag{
Name: "nvidia-container-runtime-debug-log-level",
Name: "nvidia-container-runtime.log-level",
Aliases: []string{"nvidia-container-runtime-debug-log-level"},
Destination: &opts.ContainerRuntimeLogLevel,
EnvVars: []string{"NVIDIA_CONTAINER_RUNTIME_LOG_LEVEL"},
},
&cli.StringFlag{
Name: "nvidia-container-runtime-mode",
Name: "nvidia-container-runtime.mode",
Aliases: []string{"nvidia-container-runtime-mode"},
Destination: &opts.ContainerRuntimeMode,
EnvVars: []string{"NVIDIA_CONTAINER_RUNTIME_MODE"},
},
&cli.StringFlag{
Name: "nvidia-container-runtime-modes.cdi.default-kind",
Name: "nvidia-container-runtime.modes.cdi.default-kind",
Destination: &opts.ContainerRuntimeModesCdiDefaultKind,
EnvVars: []string{"NVIDIA_CONTAINER_RUNTIME_MODES_CDI_DEFAULT_KIND"},
},
&cli.StringSliceFlag{
Name: "nvidia-container-runtime.modes.cdi.annotation-prefixes",
Destination: &opts.ContainerRuntimeModesCDIAnnotationPrefixes,
EnvVars: []string{"NVIDIA_CONTAINER_RUNTIME_MODES_CDI_ANNOTATION_PREFIXES"},
},
&cli.StringSliceFlag{
Name: "nvidia-container-runtime.runtimes",
Destination: &opts.ContainerRuntimeRuntimes,
EnvVars: []string{"NVIDIA_CONTAINER_RUNTIME_RUNTIMES"},
},
&cli.BoolFlag{
Name: "nvidia-container-runtime-hook.skip-mode-detection",
Value: true,
@@ -147,7 +167,8 @@ func main() {
EnvVars: []string{"NVIDIA_CONTAINER_RUNTIME_HOOK_SKIP_MODE_DETECTION"},
},
&cli.StringFlag{
Name: "nvidia-container-cli-debug",
Name: "nvidia-container-cli.debug",
Aliases: []string{"nvidia-container-cli-debug"},
Usage: "Specify the location of the debug log file for the NVIDIA Container CLI",
Destination: &opts.ContainerCLIDebug,
EnvVars: []string{"NVIDIA_CONTAINER_CLI_DEBUG"},
@@ -193,6 +214,12 @@ func main() {
Destination: &opts.cdiKind,
EnvVars: []string{"CDI_KIND"},
},
&cli.BoolFlag{
Name: "ignore-errors",
Usage: "ignore errors when installing the NVIDIA Container toolkit. This is used for testing purposes only.",
Hidden: true,
Destination: &opts.ignoreErrors,
},
}
// Update the subcommand flags with the common subcommand flags
@@ -241,46 +268,62 @@ func Install(cli *cli.Context, opts *options) error {
log.Infof("Removing existing NVIDIA container toolkit installation")
err := os.RemoveAll(opts.toolkitRoot)
if err != nil {
if err != nil && !opts.ignoreErrors {
return fmt.Errorf("error removing toolkit directory: %v", err)
} else if err != nil {
log.Errorf("Ignoring error: %v", fmt.Errorf("error removing toolkit directory: %v", err))
}
toolkitConfigDir := filepath.Join(opts.toolkitRoot, ".config", "nvidia-container-runtime")
toolkitConfigPath := filepath.Join(toolkitConfigDir, configFilename)
err = createDirectories(opts.toolkitRoot, toolkitConfigDir)
if err != nil {
if err != nil && !opts.ignoreErrors {
return fmt.Errorf("could not create required directories: %v", err)
} else if err != nil {
log.Errorf("Ignoring error: %v", fmt.Errorf("could not create required directories: %v", err))
}
err = installContainerLibraries(opts.toolkitRoot)
if err != nil {
if err != nil && !opts.ignoreErrors {
return fmt.Errorf("error installing NVIDIA container library: %v", err)
} else if err != nil {
log.Errorf("Ignoring error: %v", fmt.Errorf("error installing NVIDIA container library: %v", err))
}
err = installContainerRuntimes(opts.toolkitRoot, opts.DriverRoot)
if err != nil {
if err != nil && !opts.ignoreErrors {
return fmt.Errorf("error installing NVIDIA container runtime: %v", err)
} else if err != nil {
log.Errorf("Ignoring error: %v", fmt.Errorf("error installing NVIDIA container runtime: %v", err))
}
nvidiaContainerCliExecutable, err := installContainerCLI(opts.toolkitRoot)
if err != nil {
if err != nil && !opts.ignoreErrors {
return fmt.Errorf("error installing NVIDIA container CLI: %v", err)
} else if err != nil {
log.Errorf("Ignoring error: %v", fmt.Errorf("error installing NVIDIA container CLI: %v", err))
}
_, err = installRuntimeHook(opts.toolkitRoot, toolkitConfigPath)
if err != nil {
if err != nil && !opts.ignoreErrors {
return fmt.Errorf("error installing NVIDIA container runtime hook: %v", err)
} else if err != nil {
log.Errorf("Ignoring error: %v", fmt.Errorf("error installing NVIDIA container runtime hook: %v", err))
}
nvidiaCTKPath, err := installContainerToolkitCLI(opts.toolkitRoot)
if err != nil {
if err != nil && !opts.ignoreErrors {
return fmt.Errorf("error installing NVIDIA Container Toolkit CLI: %v", err)
} else if err != nil {
log.Errorf("Ignoring error: %v", fmt.Errorf("error installing NVIDIA Container Toolkit CLI: %v", err))
}
err = installToolkitConfig(toolkitConfigPath, nvidiaContainerCliExecutable, nvidiaCTKPath, opts)
if err != nil {
err = installToolkitConfig(cli, toolkitConfigPath, nvidiaContainerCliExecutable, nvidiaCTKPath, opts)
if err != nil && !opts.ignoreErrors {
return fmt.Errorf("error installing NVIDIA container toolkit config: %v", err)
} else if err != nil {
log.Errorf("Ignoring error: %v", fmt.Errorf("error installing NVIDIA container toolkit config: %v", err))
}
return generateCDISpec(opts, nvidiaCTKPath)
@@ -336,10 +379,10 @@ func installLibrary(libName string, toolkitRoot string) error {
// installToolkitConfig installs the config file for the NVIDIA container toolkit ensuring
// that the settings are updated to match the desired install and nvidia driver directories.
func installToolkitConfig(toolkitConfigPath string, nvidiaContainerCliExecutablePath string, nvidiaCTKPath string, opts *options) error {
func installToolkitConfig(c *cli.Context, toolkitConfigPath string, nvidiaContainerCliExecutablePath string, nvidiaCTKPath string, opts *options) error {
log.Infof("Installing NVIDIA container toolkit config '%v'", toolkitConfigPath)
config, err := toml.LoadFile(nvidiaContainerToolkitConfigSource)
config, err := loadConfig(nvidiaContainerToolkitConfigSource)
if err != nil {
return fmt.Errorf("could not open source config file: %v", err)
}
@@ -350,45 +393,65 @@ func installToolkitConfig(toolkitConfigPath string, nvidiaContainerCliExecutable
}
defer targetConfig.Close()
// Set the options in the root toml table
config.Set("accept-nvidia-visible-devices-envvar-when-unprivileged", opts.acceptNVIDIAVisibleDevicesWhenUnprivileged)
config.Set("accept-nvidia-visible-devices-as-volume-mounts", opts.acceptNVIDIAVisibleDevicesAsVolumeMounts)
nvidiaContainerCliKey := func(p string) []string {
return []string{"nvidia-container-cli", p}
}
// Read the ldconfig path from the config as this may differ per platform
// On ubuntu-based systems this ends in `.real`
ldconfigPath := fmt.Sprintf("%s", config.GetPath(nvidiaContainerCliKey("ldconfig")))
ldconfigPath := fmt.Sprintf("%s", config.GetDefault("nvidia-container-cli.ldconfig", "/sbin/ldconfig"))
// Use the driver run root as the root:
driverLdconfigPath := "@" + filepath.Join(opts.DriverRoot, strings.TrimPrefix(ldconfigPath, "@/"))
config.SetPath(nvidiaContainerCliKey("root"), opts.DriverRoot)
config.SetPath(nvidiaContainerCliKey("path"), nvidiaContainerCliExecutablePath)
config.SetPath(nvidiaContainerCliKey("ldconfig"), driverLdconfigPath)
// Set the debug options if selected
debugOptions := map[string]string{
"nvidia-container-runtime.debug": opts.ContainerRuntimeDebug,
"nvidia-container-runtime.log-level": opts.ContainerRuntimeLogLevel,
"nvidia-container-runtime.mode": opts.ContainerRuntimeMode,
"nvidia-container-runtime.modes.cdi.default-kind": opts.ContainerRuntimeModesCdiDefaultKind,
"nvidia-container-cli.debug": opts.ContainerCLIDebug,
configValues := map[string]interface{}{
// Set the options in the root toml table
"accept-nvidia-visible-devices-envvar-when-unprivileged": opts.acceptNVIDIAVisibleDevicesWhenUnprivileged,
"accept-nvidia-visible-devices-as-volume-mounts": opts.acceptNVIDIAVisibleDevicesAsVolumeMounts,
// Set the nvidia-container-cli options
"nvidia-container-cli.root": opts.DriverRoot,
"nvidia-container-cli.path": nvidiaContainerCliExecutablePath,
"nvidia-container-cli.ldconfig": driverLdconfigPath,
// Set nvidia-ctk options
"nvidia-ctk.path": nvidiaCTKPath,
// Set the nvidia-container-runtime-hook options
"nvidia-container-runtime-hook.skip-mode-detection": opts.ContainerRuntimeHookSkipModeDetection,
}
for key, value := range debugOptions {
if value == "" {
continue
}
for key, value := range configValues {
config.Set(key, value)
}
// Set nvidia-ctk options
config.Set("nvidia-ctk.path", nvidiaCTKPath)
// Set the optional config options
optionalConfigValues := map[string]interface{}{
"nvidia-container-runtime.debug": opts.ContainerRuntimeDebug,
"nvidia-container-runtime.log-level": opts.ContainerRuntimeLogLevel,
"nvidia-container-runtime.mode": opts.ContainerRuntimeMode,
"nvidia-container-runtime.modes.cdi.annotation-prefixes": opts.ContainerRuntimeModesCDIAnnotationPrefixes,
"nvidia-container-runtime.modes.cdi.default-kind": opts.ContainerRuntimeModesCdiDefaultKind,
"nvidia-container-runtime.runtimes": opts.ContainerRuntimeRuntimes,
"nvidia-container-cli.debug": opts.ContainerCLIDebug,
}
for key, value := range optionalConfigValues {
if !c.IsSet(key) {
log.Infof("Skipping unset option: %v", key)
continue
}
if value == nil {
log.Infof("Skipping option with nil value: %v", key)
continue
}
// Set the nvidia-container-runtime-hook options
config.Set("nvidia-container-runtime-hook.skip-mode-detection", opts.ContainerRuntimeHookSkipModeDetection)
switch v := value.(type) {
case string:
if v == "" {
continue
}
case cli.StringSlice:
if len(v.Value()) == 0 {
continue
}
value = v.Value()
default:
log.Warnf("Unexpected type for option %v=%v: %T", key, value, v)
}
config.Set(key, value)
}
_, err = config.WriteTo(targetConfig)
if err != nil {
@@ -401,6 +464,16 @@ func installToolkitConfig(toolkitConfigPath string, nvidiaContainerCliExecutable
return nil
}
func loadConfig(path string) (*toml.Tree, error) {
_, err := os.Stat(path)
if err == nil {
return toml.LoadFile(path)
} else if os.IsNotExist(err) {
return toml.TreeFromMap(nil)
}
return nil, err
}
// installContainerToolkitCLI installs the nvidia-ctk CLI executable and wrapper.
func installContainerToolkitCLI(toolkitDir string) (string, error) {
e := executable{
@@ -608,13 +681,26 @@ func generateCDISpec(opts *options, nvidiaCTKPath string) error {
return nil
}
cdilib := nvcdi.New(
log.Infof("Creating control device nodes at %v", opts.DriverRootCtrPath)
s, err := system.New()
if err != nil {
return fmt.Errorf("failed to create library: %v", err)
}
if err := s.CreateNVIDIAControlDeviceNodesAt(opts.DriverRootCtrPath); err != nil {
return fmt.Errorf("failed to create control device nodes: %v", err)
}
log.Info("Generating CDI spec for management containers")
cdilib, err := nvcdi.New(
nvcdi.WithMode(nvcdi.ModeManagement),
nvcdi.WithDriverRoot(opts.DriverRootCtrPath),
nvcdi.WithNVIDIACTKPath(nvidiaCTKPath),
nvcdi.WithVendor(opts.cdiVendor),
nvcdi.WithClass(opts.cdiClass),
)
if err != nil {
return fmt.Errorf("failed to create CDI library for management containers: %v", err)
}
spec, err := cdilib.GetSpec()
if err != nil {

View File

@@ -13,8 +13,8 @@
# limitations under the License.
LIB_NAME := nvidia-container-toolkit
LIB_VERSION := 1.13.0
LIB_TAG := rc.2
LIB_VERSION := 1.13.1
LIB_TAG :=
# The package version is the combination of the library version and tag.
# If the tag is specified the two components are joined with a tilde (~).
@@ -24,8 +24,8 @@ PACKAGE_REVISION := 1
# Specify the nvidia-docker2 and nvidia-container-runtime package versions.
# Note: The build tooling uses `LIB_TAG` above as the version tag.
# This is appended to the versions below if specified.
NVIDIA_DOCKER_VERSION := 2.12.0
NVIDIA_CONTAINER_RUNTIME_VERSION := 3.12.0
NVIDIA_DOCKER_VERSION := 2.13.0
NVIDIA_CONTAINER_RUNTIME_VERSION := 3.13.0
# Specify the expected libnvidia-container0 version for arm64-based ubuntu builds.
LIBNVIDIA_CONTAINER0_VERSION := 0.10.0+jetpack