From 3df0969349140b5becf1e4c2b3621d337b873a22 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Thu, 1 Jul 2021 16:16:33 +0200 Subject: [PATCH 1/6] Improve CI for container toolkit This change improves the CI for the container toolkit. The go targets are executed in a docker container which allows for reproducible behaviour on local systems as well as CI. The Makefile is updated to facilitate this. Signed-off-by: Evan Lezar --- .common-ci.yml | 91 +++++++++++++++++++++++++++++++++++++++++ .gitlab-ci.yml | 77 ++++------------------------------ Makefile | 80 ++++++++++++++++++++++++++++++++---- docker/Dockerfile.devel | 20 +++++++++ 4 files changed, 192 insertions(+), 76 deletions(-) create mode 100644 .common-ci.yml create mode 100644 docker/Dockerfile.devel diff --git a/.common-ci.yml b/.common-ci.yml new file mode 100644 index 00000000..40fc68e0 --- /dev/null +++ b/.common-ci.yml @@ -0,0 +1,91 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +default: + image: docker:stable + services: + - name: docker:stable-dind + command: ["--experimental"] + +variables: + IMAGE: "${CI_REGISTRY_IMAGE}" + IMAGE_TAG: "${CI_COMMIT_REF_SLUG}" + +build-dev-image: + stage: image + before_script: + - docker login -u "${CI_REGISTRY_USER}" -p "${CI_REGISTRY_PASSWORD}" "${CI_REGISTRY}" + script: + - apk --no-cache add make bash + - make .build-image + - make .push-build-image + +.requires-build-image: + variables: + SKIP_IMAGE_BUILD: "yes" + before_script: + - apk --no-cache add make bash + - docker login -u "${CI_REGISTRY_USER}" -p "${CI_REGISTRY_PASSWORD}" "${CI_REGISTRY}" + - make .pull-build-image + +.go-check: + extends: + - .requires-build-image + stage: go-checks + +fmt: + extends: + - .go-check + script: + - make docker-assert-fmt + +vet: + extends: + - .go-check + script: + - make docker-vet + +lint: + extends: + - .go-check + script: + - make docker-lint + allow_failure: true + +ineffassign: + extends: + - .go-check + script: + - make docker-ineffassign + allow_failure: true + +misspell: + extends: + - .go-check + script: + - make docker-misspell + +go-build: + extends: + - .requires-build-image + stage: go-build + script: + - make docker-build + +unit-tests: + extends: + - .requires-build-image + stage: unit-tests + script: + - make docker-coverage + diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 400f6a5e..619c7e66 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -12,86 +12,27 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Build packages for all supported OS / ARCH combinations +include: + - .common-ci.yml stages: - - tests + - image + - go-checks + - go-build + - unit-tests + - test + - scan + - release - build-one - build-all -.tests-setup: &tests-setup - image: golang:1.14.4 - - rules: - - when: always - - variables: - GITHUB_ROOT: "github.com/NVIDIA" - PROJECT_GOPATH: "${GITHUB_ROOT}/nvidia-container-toolkit" - - before_script: - - mkdir -p ${GOPATH}/src/${GITHUB_ROOT} - - ln -s ${CI_PROJECT_DIR} ${GOPATH}/src/${PROJECT_GOPATH} - .build-setup: &build-setup - image: docker:19.03.8 - - services: - - name: docker:19.03.8-dind - command: ["--experimental"] - before_script: - apk update - apk upgrade - apk add coreutils build-base sed git bash make - docker run --rm --privileged multiarch/qemu-user-static --reset -p yes -c yes -# Run a series of sanity-check tests over the code -lint: - <<: *tests-setup - stage: tests - script: - - GO111MODULE=off go get -u golang.org/x/lint/golint - - make lint - -vet: - <<: *tests-setup - stage: tests - script: - - make vet - -unit_test: - <<: *tests-setup - stage: tests - script: - - make test - -coverage: - <<: *tests-setup - stage: tests - script: - - make coverage - -fmt: - <<: *tests-setup - stage: tests - script: - - make assert-fmt - -ineffassign: - <<: *tests-setup - stage: tests - script: - - GO111MODULE=off go get -u github.com/gordonklaus/ineffassign - - make ineffassign - -misspell: - <<: *tests-setup - stage: tests - script: - - GO111MODULE=off go get -u github.com/client9/misspell/cmd/misspell - - make misspell - # build-one jobs build packages for a single OS / ARCH combination. # # They are run during the first stage of the pipeline as a smoke test to ensure diff --git a/Makefile b/Makefile index 7ff8edba..d262f2e4 100644 --- a/Makefile +++ b/Makefile @@ -27,14 +27,40 @@ MODULE := github.com/NVIDIA/nvidia-container-toolkit docker-native: include $(CURDIR)/docker/docker.mk +ifeq ($(IMAGE),) +REGISTRY ?= nvidia +IMAGE=$(REGISTRY)/container-toolkit +endif +IMAGE_TAG ?= $(GOLANG_VERSION) +BUILDIMAGE ?= $(IMAGE):$(IMAGE_TAG)-devel + +EXAMPLES := $(patsubst ./examples/%/,%,$(sort $(dir $(wildcard ./examples/*/)))) +EXAMPLE_TARGETS := $(patsubst %,example-%, $(EXAMPLES)) + +CHECK_TARGETS := assert-fmt vet lint ineffassign misspell +MAKE_TARGETS := binary build all check fmt lint-internal test examples coverage generate $(CHECK_TARGETS) + +TARGETS := $(MAKE_TARGETS) $(EXAMPLE_TARGETS) + +DOCKER_TARGETS := $(patsubst %,docker-%, $(TARGETS)) +.PHONY: $(TARGETS) $(DOCKER_TARGETS) + GOOS ?= linux binary: GOOS=$(GOOS) go build -ldflags "-s -w" -o "$(LIB_NAME)" $(MODULE)/cmd/$(LIB_NAME) -# Define the check targets for the Golang codebase -.PHONY: check fmt assert-fmt ineffassign lint misspell vet -check: assert-fmt lint misspell vet +build: + GOOS=$(GOOS) go build ./... + +examples: $(EXAMPLE_TARGETS) +$(EXAMPLE_TARGETS): example-%: + GOOS=$(GOOS) go build ./examples/$(*) + +all: check test build binary +check: $(CHECK_TARGETS) + +# Apply go fmt to the codebase fmt: go list -f '{{.Dir}}' $(MODULE)/... \ | xargs gofmt -s -l -w @@ -55,8 +81,12 @@ ineffassign: ineffassign $(MODULE)/... lint: - # We use `go list -f '{{.Dir}}' $(GOLANG_PKG_PATH)/...` to skip the `vendor` folder. - go list -f '{{.Dir}}' $(MODULE)/... | xargs golint -set_exit_status +# We use `go list -f '{{.Dir}}' $(MODULE)/...` to skip the `vendor` folder. + go list -f '{{.Dir}}' $(MODULE)/... | grep -v /internal/ | xargs golint -set_exit_status + +lint-internal: +# We use `go list -f '{{.Dir}}' $(MODULE)/...` to skip the `vendor` folder. + go list -f '{{.Dir}}' $(MODULE)/internal/... | xargs golint -set_exit_status misspell: misspell $(MODULE)/... @@ -65,8 +95,42 @@ vet: go vet $(MODULE)/... COVERAGE_FILE := coverage.out -test: - go test -coverprofile=$(COVERAGE_FILE) $(MODULE)/... +test: build + go test -v -coverprofile=$(COVERAGE_FILE) $(MODULE)/... coverage: test - go tool cover -func=$(COVERAGE_FILE) + cat $(COVERAGE_FILE) | grep -v "_mock.go" > $(COVERAGE_FILE).no-mocks + go tool cover -func=$(COVERAGE_FILE).no-mocks + +generate: + go generate $(MODULE)/... + +# Generate an image for containerized builds +# Note: This image is local only +.PHONY: .build-image .pull-build-image .push-build-image +.build-image: docker/Dockerfile.devel + if [ x"$(SKIP_IMAGE_BUILD)" = x"" ]; then \ + $(DOCKER) build \ + --progress=plain \ + --build-arg GOLANG_VERSION="$(GOLANG_VERSION)" \ + --tag $(BUILDIMAGE) \ + -f $(^) \ + docker; \ + fi + +.pull-build-image: + $(DOCKER) pull $(BUILDIMAGE) + +.push-build-image: + $(DOCKER) push $(BUILDIMAGE) + +$(DOCKER_TARGETS): docker-%: .build-image + @echo "Running 'make $(*)' in docker container $(BUILDIMAGE)" + $(DOCKER) run \ + --rm \ + -e GOCACHE=/tmp/.cache \ + -v $(PWD):$(PWD) \ + -w $(PWD) \ + --user $$(id -u):$$(id -g) \ + $(BUILDIMAGE) \ + make $(*) diff --git a/docker/Dockerfile.devel b/docker/Dockerfile.devel new file mode 100644 index 00000000..20e0447f --- /dev/null +++ b/docker/Dockerfile.devel @@ -0,0 +1,20 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +ARG GOLANG_VERSION=x.x.x +FROM golang:${GOLANG_VERSION} + +RUN go get -u golang.org/x/lint/golint +RUN go get -u github.com/matryer/moq +RUN go get -u github.com/gordonklaus/ineffassign +RUN go get -u github.com/client9/misspell/cmd/misspell \ No newline at end of file From 1ef4b1a14ac01e3e34ea0544e736ab3f156eee28 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Thu, 1 Jul 2021 16:24:10 +0200 Subject: [PATCH 2/6] Use extends keyword for build-one and build-all Signed-off-by: Evan Lezar --- .gitlab-ci.yml | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 619c7e66..ac26c474 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -26,7 +26,7 @@ stages: - build-one - build-all -.build-setup: &build-setup +.build-setup: before_script: - apk update - apk upgrade @@ -39,11 +39,12 @@ stages: # that we can successfully build packages on all of our architectures for a # single OS. They are triggered on any change to an MR. No artifacts are # produced as part of build-one jobs. -.build-one-setup: &build-one-setup - <<: *build-setup +.build-one-setup: + extends: + - .build-setup stage: build-one - only: - - merge_requests + rules: + - if: $CI_MERGE_REQUEST_ID # build-all jobs build packages for every OS / ARCH combination we support. # @@ -55,8 +56,9 @@ stages: # OS / ARCH combinations, so this is optimized to only run once per MR # (assuming it all passes). A full set of artifacts including the packages # built for each OS / ARCH are produced as a result of these jobs. -.build-all-setup: &build-all-setup - <<: *build-setup +.build-all-setup: + extends: + - .build-setup stage: build-all timeout: 2h 30m rules: @@ -78,43 +80,53 @@ stages: # The full set of build-one jobs organizes to build # ubuntu18.04 in parallel on each of our supported ARCHs. build-one-amd64: - <<: *build-one-setup + extends: + - .build-one-setup script: - make ubuntu18.04-amd64 + rules: + - when: always build-one-ppc64le: - <<: *build-one-setup + extends: + - .build-one-setup script: - make ubuntu18.04-ppc64le build-one-arm64: - <<: *build-one-setup + extends: + - .build-one-setup script: - make ubuntu18.04-arm64 # The full set of build-all jobs organized to # have builds for each ARCH run in parallel. build-all-amd64: - <<: *build-all-setup + extends: + - .build-all-setup script: - make docker-amd64 build-all-x86_64: - <<: *build-all-setup + extends: + - .build-all-setup script: - make docker-x86_64 build-all-ppc64le: - <<: *build-all-setup + extends: + - .build-all-setup script: - make docker-ppc64le build-all-arm64: - <<: *build-all-setup + extends: + - .build-all-setup script: - make docker-arm64 build-all-aarch64: - <<: *build-all-setup + extends: + - .build-all-setup script: - make docker-aarch64 From f828efcf64e6e38bcb47cf161fd3ccfe0e6edcd9 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Thu, 22 Jul 2021 17:44:59 +0200 Subject: [PATCH 3/6] Add support for NVIDIA_FABRIC_DEVICES This change adds support for the NVIDIA_FABRIC_DEVICES envvar. The (non-empty) value of this envvar is passed to the NVIDIA Container CLI using the --fabric-device command line flag and allows for nvswitch and nvlink devices to be mounted into the container. Signed-off-by: Evan Lezar --- .../container_config.go | 15 ++++++++++++ .../container_test.go | 24 +++++++++++++++++++ cmd/nvidia-container-toolkit/main.go | 3 +++ 3 files changed, 42 insertions(+) diff --git a/cmd/nvidia-container-toolkit/container_config.go b/cmd/nvidia-container-toolkit/container_config.go index dae4cc7b..b260e70f 100644 --- a/cmd/nvidia-container-toolkit/container_config.go +++ b/cmd/nvidia-container-toolkit/container_config.go @@ -23,6 +23,7 @@ const ( envNVVisibleDevices = "NVIDIA_VISIBLE_DEVICES" envNVMigConfigDevices = "NVIDIA_MIG_CONFIG_DEVICES" envNVMigMonitorDevices = "NVIDIA_MIG_MONITOR_DEVICES" + envNVFabricDevices = "NVIDIA_FABRIC_DEVICES" envNVDriverCapabilities = "NVIDIA_DRIVER_CAPABILITIES" ) @@ -43,6 +44,7 @@ type nvidiaConfig struct { Devices string MigConfigDevices string MigMonitorDevices string + FabricDevices string DriverCapabilities string Requirements []string DisableRequire bool @@ -316,6 +318,13 @@ func getMigMonitorDevices(env map[string]string) *string { return nil } +func getFabricDevices(env map[string]string) *string { + if devices, ok := env[envNVFabricDevices]; ok { + return &devices + } + return nil +} + func getDriverCapabilities(env map[string]string, legacyImage bool) *string { // Grab a reference to the capabilities from the envvar // if it actually exists in the environment. @@ -394,6 +403,11 @@ func getNvidiaConfig(hookConfig *HookConfig, env map[string]string, mounts []Mou driverCapabilities = *c } + var nvFabricDevices string + if d := getFabricDevices(env); d != nil { + nvFabricDevices = *d + } + requirements := getRequirements(env, legacyImage) // Don't fail on invalid values. @@ -403,6 +417,7 @@ func getNvidiaConfig(hookConfig *HookConfig, env map[string]string, mounts []Mou Devices: devices, MigConfigDevices: migConfigDevices, MigMonitorDevices: migMonitorDevices, + FabricDevices: nvFabricDevices, DriverCapabilities: driverCapabilities, Requirements: requirements, DisableRequire: disableRequire, diff --git a/cmd/nvidia-container-toolkit/container_test.go b/cmd/nvidia-container-toolkit/container_test.go index 3cda6280..ff9e4905 100644 --- a/cmd/nvidia-container-toolkit/container_test.go +++ b/cmd/nvidia-container-toolkit/container_test.go @@ -403,6 +403,30 @@ func TestGetNvidiaConfig(t *testing.T) { privileged: false, expectedPanic: true, }, + { + description: "fabric devices selected", + env: map[string]string{ + envNVVisibleDevices: "all", + envNVFabricDevices: "all", + }, + expectedConfig: &nvidiaConfig{ + Devices: "all", + FabricDevices: "all", + DriverCapabilities: defaultDriverCapabilities, + }, + }, + { + description: "fabric devices selected empty", + env: map[string]string{ + envNVVisibleDevices: "all", + envNVFabricDevices: "", + }, + expectedConfig: &nvidiaConfig{ + Devices: "all", + FabricDevices: "", + DriverCapabilities: defaultDriverCapabilities, + }, + }, } for _, tc := range tests { t.Run(tc.description, func(t *testing.T) { diff --git a/cmd/nvidia-container-toolkit/main.go b/cmd/nvidia-container-toolkit/main.go index 13f8197c..e520039e 100644 --- a/cmd/nvidia-container-toolkit/main.go +++ b/cmd/nvidia-container-toolkit/main.go @@ -132,6 +132,9 @@ func doPrestart() { if len(nvidia.MigMonitorDevices) > 0 { args = append(args, fmt.Sprintf("--mig-monitor=%s", nvidia.MigMonitorDevices)) } + if len(nvidia.FabricDevices) > 0 { + args = append(args, fmt.Sprintf("--fabric-device=%s", nvidia.FabricDevices)) + } for _, cap := range strings.Split(nvidia.DriverCapabilities, ",") { if len(cap) == 0 { From 2001d66f9ba18e1a3dfedd6146bfeab4641c8f46 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Fri, 13 Aug 2021 13:45:32 +0200 Subject: [PATCH 4/6] Bump version to 1.6.0~rc.1 Signed-off-by: Evan Lezar --- Makefile | 4 ++-- packaging/debian/changelog | 6 ++++++ packaging/debian/control | 2 +- packaging/rpm/SPECS/nvidia-container-toolkit.spec | 6 ++++-- 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index d262f2e4..8adca334 100644 --- a/Makefile +++ b/Makefile @@ -17,8 +17,8 @@ MKDIR ?= mkdir DIST_DIR ?= $(CURDIR)/dist LIB_NAME := nvidia-container-toolkit -LIB_VERSION := 1.5.1 -LIB_TAG ?= +LIB_VERSION := 1.6.0 +LIB_TAG ?= rc.1 GOLANG_VERSION := 1.16.3 MODULE := github.com/NVIDIA/nvidia-container-toolkit diff --git a/packaging/debian/changelog b/packaging/debian/changelog index 12b90f99..1c38d5fb 100644 --- a/packaging/debian/changelog +++ b/packaging/debian/changelog @@ -1,3 +1,9 @@ +nvidia-container-toolkit (1.6.0~rc.1-1) UNRELEASED; urgency=medium + + * Add support for NVIDIA_FABRIC_DEVICES to select nvlink and nvswitch devices + + -- NVIDIA CORPORATION Fri, 13 Aug 2021 09:00:00 -0700 + nvidia-container-toolkit (1.5.1-1) UNRELEASED; urgency=medium * Fix bug where Docker Swarm device selection is ignored if diff --git a/packaging/debian/control b/packaging/debian/control index 7dbec179..1aa40949 100644 --- a/packaging/debian/control +++ b/packaging/debian/control @@ -10,7 +10,7 @@ Build-Depends: debhelper (>= 9) Package: nvidia-container-toolkit Architecture: any -Depends: ${misc:Depends}, libnvidia-container-tools (>= 1.4.0), libnvidia-container-tools (<< 2.0.0) +Depends: ${misc:Depends}, libnvidia-container-tools (>= 1.5.0~rc.1), libnvidia-container-tools (<< 2.0.0) Breaks: nvidia-container-runtime (<< 2.0.0), nvidia-container-runtime-hook Replaces: nvidia-container-runtime (<< 2.0.0), nvidia-container-runtime-hook Description: NVIDIA container runtime hook diff --git a/packaging/rpm/SPECS/nvidia-container-toolkit.spec b/packaging/rpm/SPECS/nvidia-container-toolkit.spec index 09a6591c..70fe8ece 100644 --- a/packaging/rpm/SPECS/nvidia-container-toolkit.spec +++ b/packaging/rpm/SPECS/nvidia-container-toolkit.spec @@ -18,7 +18,7 @@ Source4: LICENSE Obsoletes: nvidia-container-runtime < 2.0.0, nvidia-container-runtime-hook Provides: nvidia-container-runtime-hook -Requires: libnvidia-container-tools >= 1.4.0, libnvidia-container-tools < 2.0.0 +Requires: libnvidia-container-tools >= 1.5.0-0.1.rc.1, libnvidia-container-tools < 2.0.0 %description Provides a OCI hook to enable GPU support in containers. @@ -53,8 +53,10 @@ rm -f %{_bindir}/nvidia-container-runtime-hook /usr/share/containers/oci/hooks.d/oci-nvidia-hook.json %changelog -* Mon Jun 14 2021 NVIDIA CORPORATION 1.5.1-1 +* Fri Aug 13 2021 NVIDIA CORPORATION 1.6.0-0.1.rc.1 +- Add support for NVIDIA_FABRIC_DEVICES to select nvlink and nvswitch devices +* Mon Jun 14 2021 NVIDIA CORPORATION 1.5.1-1 - Fix bug where Docker Swarm device selection is ignored if NVIDIA_VISIBLE_DEVICES is also set - Improve unit testing by using require package and adding coverage reports - Remove unneeded go dependencies by running go mod tidy From 620bd806e8840e255edaca807654988bc46b9988 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 18 Aug 2021 15:17:37 +0200 Subject: [PATCH 5/6] Revert "Bump version to 1.6.0~rc.1" This reverts commit 2001d66f9ba18e1a3dfedd6146bfeab4641c8f46. --- Makefile | 4 ++-- packaging/debian/changelog | 6 ------ packaging/debian/control | 2 +- packaging/rpm/SPECS/nvidia-container-toolkit.spec | 6 ++---- 4 files changed, 5 insertions(+), 13 deletions(-) diff --git a/Makefile b/Makefile index 8adca334..d262f2e4 100644 --- a/Makefile +++ b/Makefile @@ -17,8 +17,8 @@ MKDIR ?= mkdir DIST_DIR ?= $(CURDIR)/dist LIB_NAME := nvidia-container-toolkit -LIB_VERSION := 1.6.0 -LIB_TAG ?= rc.1 +LIB_VERSION := 1.5.1 +LIB_TAG ?= GOLANG_VERSION := 1.16.3 MODULE := github.com/NVIDIA/nvidia-container-toolkit diff --git a/packaging/debian/changelog b/packaging/debian/changelog index 1c38d5fb..12b90f99 100644 --- a/packaging/debian/changelog +++ b/packaging/debian/changelog @@ -1,9 +1,3 @@ -nvidia-container-toolkit (1.6.0~rc.1-1) UNRELEASED; urgency=medium - - * Add support for NVIDIA_FABRIC_DEVICES to select nvlink and nvswitch devices - - -- NVIDIA CORPORATION Fri, 13 Aug 2021 09:00:00 -0700 - nvidia-container-toolkit (1.5.1-1) UNRELEASED; urgency=medium * Fix bug where Docker Swarm device selection is ignored if diff --git a/packaging/debian/control b/packaging/debian/control index 1aa40949..7dbec179 100644 --- a/packaging/debian/control +++ b/packaging/debian/control @@ -10,7 +10,7 @@ Build-Depends: debhelper (>= 9) Package: nvidia-container-toolkit Architecture: any -Depends: ${misc:Depends}, libnvidia-container-tools (>= 1.5.0~rc.1), libnvidia-container-tools (<< 2.0.0) +Depends: ${misc:Depends}, libnvidia-container-tools (>= 1.4.0), libnvidia-container-tools (<< 2.0.0) Breaks: nvidia-container-runtime (<< 2.0.0), nvidia-container-runtime-hook Replaces: nvidia-container-runtime (<< 2.0.0), nvidia-container-runtime-hook Description: NVIDIA container runtime hook diff --git a/packaging/rpm/SPECS/nvidia-container-toolkit.spec b/packaging/rpm/SPECS/nvidia-container-toolkit.spec index 70fe8ece..09a6591c 100644 --- a/packaging/rpm/SPECS/nvidia-container-toolkit.spec +++ b/packaging/rpm/SPECS/nvidia-container-toolkit.spec @@ -18,7 +18,7 @@ Source4: LICENSE Obsoletes: nvidia-container-runtime < 2.0.0, nvidia-container-runtime-hook Provides: nvidia-container-runtime-hook -Requires: libnvidia-container-tools >= 1.5.0-0.1.rc.1, libnvidia-container-tools < 2.0.0 +Requires: libnvidia-container-tools >= 1.4.0, libnvidia-container-tools < 2.0.0 %description Provides a OCI hook to enable GPU support in containers. @@ -53,10 +53,8 @@ rm -f %{_bindir}/nvidia-container-runtime-hook /usr/share/containers/oci/hooks.d/oci-nvidia-hook.json %changelog -* Fri Aug 13 2021 NVIDIA CORPORATION 1.6.0-0.1.rc.1 -- Add support for NVIDIA_FABRIC_DEVICES to select nvlink and nvswitch devices - * Mon Jun 14 2021 NVIDIA CORPORATION 1.5.1-1 + - Fix bug where Docker Swarm device selection is ignored if NVIDIA_VISIBLE_DEVICES is also set - Improve unit testing by using require package and adding coverage reports - Remove unneeded go dependencies by running go mod tidy From c2ac6db43b1954e30c8e251c9edcc96f3ded2019 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 18 Aug 2021 15:17:59 +0200 Subject: [PATCH 6/6] Revert "Add support for NVIDIA_FABRIC_DEVICES" This reverts commit f828efcf64e6e38bcb47cf161fd3ccfe0e6edcd9. --- .../container_config.go | 15 ------------ .../container_test.go | 24 ------------------- cmd/nvidia-container-toolkit/main.go | 3 --- 3 files changed, 42 deletions(-) diff --git a/cmd/nvidia-container-toolkit/container_config.go b/cmd/nvidia-container-toolkit/container_config.go index b260e70f..dae4cc7b 100644 --- a/cmd/nvidia-container-toolkit/container_config.go +++ b/cmd/nvidia-container-toolkit/container_config.go @@ -23,7 +23,6 @@ const ( envNVVisibleDevices = "NVIDIA_VISIBLE_DEVICES" envNVMigConfigDevices = "NVIDIA_MIG_CONFIG_DEVICES" envNVMigMonitorDevices = "NVIDIA_MIG_MONITOR_DEVICES" - envNVFabricDevices = "NVIDIA_FABRIC_DEVICES" envNVDriverCapabilities = "NVIDIA_DRIVER_CAPABILITIES" ) @@ -44,7 +43,6 @@ type nvidiaConfig struct { Devices string MigConfigDevices string MigMonitorDevices string - FabricDevices string DriverCapabilities string Requirements []string DisableRequire bool @@ -318,13 +316,6 @@ func getMigMonitorDevices(env map[string]string) *string { return nil } -func getFabricDevices(env map[string]string) *string { - if devices, ok := env[envNVFabricDevices]; ok { - return &devices - } - return nil -} - func getDriverCapabilities(env map[string]string, legacyImage bool) *string { // Grab a reference to the capabilities from the envvar // if it actually exists in the environment. @@ -403,11 +394,6 @@ func getNvidiaConfig(hookConfig *HookConfig, env map[string]string, mounts []Mou driverCapabilities = *c } - var nvFabricDevices string - if d := getFabricDevices(env); d != nil { - nvFabricDevices = *d - } - requirements := getRequirements(env, legacyImage) // Don't fail on invalid values. @@ -417,7 +403,6 @@ func getNvidiaConfig(hookConfig *HookConfig, env map[string]string, mounts []Mou Devices: devices, MigConfigDevices: migConfigDevices, MigMonitorDevices: migMonitorDevices, - FabricDevices: nvFabricDevices, DriverCapabilities: driverCapabilities, Requirements: requirements, DisableRequire: disableRequire, diff --git a/cmd/nvidia-container-toolkit/container_test.go b/cmd/nvidia-container-toolkit/container_test.go index ff9e4905..3cda6280 100644 --- a/cmd/nvidia-container-toolkit/container_test.go +++ b/cmd/nvidia-container-toolkit/container_test.go @@ -403,30 +403,6 @@ func TestGetNvidiaConfig(t *testing.T) { privileged: false, expectedPanic: true, }, - { - description: "fabric devices selected", - env: map[string]string{ - envNVVisibleDevices: "all", - envNVFabricDevices: "all", - }, - expectedConfig: &nvidiaConfig{ - Devices: "all", - FabricDevices: "all", - DriverCapabilities: defaultDriverCapabilities, - }, - }, - { - description: "fabric devices selected empty", - env: map[string]string{ - envNVVisibleDevices: "all", - envNVFabricDevices: "", - }, - expectedConfig: &nvidiaConfig{ - Devices: "all", - FabricDevices: "", - DriverCapabilities: defaultDriverCapabilities, - }, - }, } for _, tc := range tests { t.Run(tc.description, func(t *testing.T) { diff --git a/cmd/nvidia-container-toolkit/main.go b/cmd/nvidia-container-toolkit/main.go index e520039e..13f8197c 100644 --- a/cmd/nvidia-container-toolkit/main.go +++ b/cmd/nvidia-container-toolkit/main.go @@ -132,9 +132,6 @@ func doPrestart() { if len(nvidia.MigMonitorDevices) > 0 { args = append(args, fmt.Sprintf("--mig-monitor=%s", nvidia.MigMonitorDevices)) } - if len(nvidia.FabricDevices) > 0 { - args = append(args, fmt.Sprintf("--fabric-device=%s", nvidia.FabricDevices)) - } for _, cap := range strings.Split(nvidia.DriverCapabilities, ",") { if len(cap) == 0 {