From db47b5827571511727ce3bca05804b078f7744c1 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Thu, 29 Sep 2022 14:39:56 +0200 Subject: [PATCH 1/9] Add utilities for driver capabilities to image packages Signed-off-by: Evan Lezar --- internal/config/image/capabilities.go | 54 +++++++++++++++++++++++++++ internal/config/image/cuda_image.go | 23 +++++++++--- 2 files changed, 72 insertions(+), 5 deletions(-) create mode 100644 internal/config/image/capabilities.go diff --git a/internal/config/image/capabilities.go b/internal/config/image/capabilities.go new file mode 100644 index 00000000..2ff1f40b --- /dev/null +++ b/internal/config/image/capabilities.go @@ -0,0 +1,54 @@ +/** +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package image + +// DriverCapability represents the possible values of NVIDIA_DRIVER_CAPABILITIES +type DriverCapability string + +// Constants for the supported driver capabilities +const ( + DriverCapabilityAll DriverCapability = "all" + DriverCapabilityCompat32 DriverCapability = "compat32" + DriverCapabilityCompute DriverCapability = "compute" + DriverCapabilityDisplay DriverCapability = "display" + DriverCapabilityGraphics DriverCapability = "graphics" + DriverCapabilityNgx DriverCapability = "ngx" + DriverCapabilityUtility DriverCapability = "utility" + DriverCapabilityVideo DriverCapability = "video" +) + +// DriverCapabilities represents the NVIDIA_DRIVER_CAPABILITIES set for the specified image. +type DriverCapabilities map[DriverCapability]bool + +// Has check whether the specified capability is selected. +func (c DriverCapabilities) Has(capability DriverCapability) bool { + if c[DriverCapabilityAll] { + return true + } + return c[capability] +} + +// Any checks whether any of the specified capabilites are set +func (c DriverCapabilities) Any(capabilities ...DriverCapability) bool { + for _, cap := range capabilities { + if c.Has(cap) { + return true + } + } + + return false +} diff --git a/internal/config/image/cuda_image.go b/internal/config/image/cuda_image.go index b9ad4c94..097f5d9e 100644 --- a/internal/config/image/cuda_image.go +++ b/internal/config/image/cuda_image.go @@ -26,11 +26,12 @@ import ( ) const ( - envCUDAVersion = "CUDA_VERSION" - envNVRequirePrefix = "NVIDIA_REQUIRE_" - envNVRequireCUDA = envNVRequirePrefix + "CUDA" - envNVRequireJetpack = envNVRequirePrefix + "JETPACK" - envNVDisableRequire = "NVIDIA_DISABLE_REQUIRE" + envCUDAVersion = "CUDA_VERSION" + envNVRequirePrefix = "NVIDIA_REQUIRE_" + envNVRequireCUDA = envNVRequirePrefix + "CUDA" + envNVRequireJetpack = envNVRequirePrefix + "JETPACK" + envNVDisableRequire = "NVIDIA_DISABLE_REQUIRE" + envNVDriverCapabilities = "NVIDIA_DRIVER_CAPABILITIES" ) // CUDA represents a CUDA image that can be used for GPU computing. This wraps @@ -142,6 +143,18 @@ func (i CUDA) DevicesFromEnvvars(envVars ...string) []string { return strings.Split(*devices, ",") } +// GetDriverCapabilities returns the requested driver capabilities. +func (i CUDA) GetDriverCapabilities() DriverCapabilities { + env := i[envNVDriverCapabilities] + + capabilites := make(DriverCapabilities) + for _, c := range strings.Split(env, ",") { + capabilites[DriverCapability(c)] = true + } + + return capabilites +} + func (i CUDA) legacyVersion() (string, error) { majorMinor, err := parseMajorMinorVersion(i[envCUDAVersion]) if err != nil { From aca0c7bc5a3f76da4e751963ccb52b4edd3a0ca4 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 26 Oct 2022 12:37:23 +0200 Subject: [PATCH 2/9] Add Devices abstraction to CUDA image This change adds a Devices abstraction to the CUDA image utilities. This allows for checking whether a devices is selected, for example. Signed-off-by: Evan Lezar --- .../container_config.go | 2 +- internal/config/image/cuda_image.go | 12 +- internal/config/image/devices.go | 125 ++++++++++++++++++ internal/modifier/cdi.go | 2 +- internal/modifier/csv.go | 2 +- internal/modifier/gds.go | 2 +- internal/modifier/graphics.go | 2 +- internal/modifier/mofed.go | 2 +- 8 files changed, 135 insertions(+), 14 deletions(-) create mode 100644 internal/config/image/devices.go diff --git a/cmd/nvidia-container-runtime-hook/container_config.go b/cmd/nvidia-container-runtime-hook/container_config.go index 2168c5e2..1956ba54 100644 --- a/cmd/nvidia-container-runtime-hook/container_config.go +++ b/cmd/nvidia-container-runtime-hook/container_config.go @@ -167,7 +167,7 @@ func getDevicesFromEnvvar(image image.CUDA, swarmResourceEnvvars []string) *stri // Build a list of envvars to consider. Note that the Swarm Resource envvars have a higher precedence. envVars := append(swarmResourceEnvvars, envNVVisibleDevices) - devices := image.DevicesFromEnvvars(envVars...) + devices := image.DevicesFromEnvvars(envVars...).List() if len(devices) == 0 { return nil } diff --git a/internal/config/image/cuda_image.go b/internal/config/image/cuda_image.go index 097f5d9e..ebd3fdd2 100644 --- a/internal/config/image/cuda_image.go +++ b/internal/config/image/cuda_image.go @@ -114,7 +114,7 @@ func (i CUDA) HasDisableRequire() bool { } // DevicesFromEnvvars returns the devices requested by the image through environment variables -func (i CUDA) DevicesFromEnvvars(envVars ...string) []string { +func (i CUDA) DevicesFromEnvvars(envVars ...string) VisibleDevices { // Grab a reference to devices from the first envvar // in the list that actually exists in the environment. var devices *string @@ -127,20 +127,16 @@ func (i CUDA) DevicesFromEnvvars(envVars ...string) []string { // Environment variable unset with legacy image: default to "all". if devices == nil && i.IsLegacy() { - return []string{"all"} + return newVisibleDevices("all") } // Environment variable unset or empty or "void": return nil if devices == nil || len(*devices) == 0 || *devices == "void" { - return nil + return newVisibleDevices("void") } // Environment variable set to "none": reset to "". - if *devices == "none" { - return []string{""} - } - - return strings.Split(*devices, ",") + return newVisibleDevices(*devices) } // GetDriverCapabilities returns the requested driver capabilities. diff --git a/internal/config/image/devices.go b/internal/config/image/devices.go new file mode 100644 index 00000000..6f3d00b6 --- /dev/null +++ b/internal/config/image/devices.go @@ -0,0 +1,125 @@ +/** +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package image + +import ( + "strings" +) + +// VisibleDevices represents the devices selected in a container image +// through the NVIDIA_VISIBLE_DEVICES or other environment variables +type VisibleDevices interface { + List() []string + Has(string) bool +} + +var _ VisibleDevices = (*all)(nil) +var _ VisibleDevices = (*none)(nil) +var _ VisibleDevices = (*void)(nil) +var _ VisibleDevices = (*devices)(nil) + +// newVisibleDevices creates a VisibleDevices based on the value of the specified envvar. +func newVisibleDevices(envvar string) VisibleDevices { + if envvar == "all" { + return all{} + } + if envvar == "none" { + return none{} + } + if envvar == "" || envvar == "void" { + return void{} + } + + return newDevices(envvar) +} + +type all struct{} + +// List returns ["all"] for all devices +func (a all) List() []string { + return []string{"all"} +} + +// Has for all devices is true for any id except the empty ID +func (a all) Has(id string) bool { + return id != "" +} + +type none struct{} + +// List returns [""] for the none devices +func (n none) List() []string { + return []string{""} +} + +// Has for none devices is false for any id +func (n none) Has(id string) bool { + return false +} + +type void struct { + none +} + +// List returns nil for the void devices +func (v void) List() []string { + return nil +} + +type devices struct { + len int + lookup map[string]int +} + +func newDevices(idOrCommaSeparated ...string) devices { + lookup := make(map[string]int) + + i := 0 + for _, commaSeparated := range idOrCommaSeparated { + for _, id := range strings.Split(commaSeparated, ",") { + lookup[id] = i + i++ + } + } + + d := devices{ + len: i, + lookup: lookup, + } + return d +} + +// List returns the list of requested devices +func (d devices) List() []string { + list := make([]string, d.len) + + for id, i := range d.lookup { + list[i] = id + } + + return list +} + +// Has checks whether the specified ID is in the set of requested devices +func (d devices) Has(id string) bool { + if id == "" { + return false + } + + _, exist := d.lookup[id] + return exist +} diff --git a/internal/modifier/cdi.go b/internal/modifier/cdi.go index 96d0c665..cffe2967 100644 --- a/internal/modifier/cdi.go +++ b/internal/modifier/cdi.go @@ -80,7 +80,7 @@ func getDevicesFromSpec(ociSpec oci.Spec) ([]string, error) { } uniqueDevices := make(map[string]struct{}) - for _, name := range append(envDevices, annotationDevices...) { + for _, name := range append(envDevices.List(), annotationDevices...) { if !cdi.IsQualifiedName(name) { name = cdi.QualifiedName("nvidia.com", "gpu", name) } diff --git a/internal/modifier/csv.go b/internal/modifier/csv.go index d2a2f7ad..57f8deea 100644 --- a/internal/modifier/csv.go +++ b/internal/modifier/csv.go @@ -55,7 +55,7 @@ func NewCSVModifier(logger *logrus.Logger, cfg *config.Config, ociSpec oci.Spec) return nil, err } - if devices := image.DevicesFromEnvvars(visibleDevicesEnvvar); len(devices) == 0 { + if devices := image.DevicesFromEnvvars(visibleDevicesEnvvar); len(devices.List()) == 0 { logger.Infof("No modification required; no devices requested") return nil, nil } diff --git a/internal/modifier/gds.go b/internal/modifier/gds.go index a55d2bef..78c0e38a 100644 --- a/internal/modifier/gds.go +++ b/internal/modifier/gds.go @@ -43,7 +43,7 @@ func NewGDSModifier(logger *logrus.Logger, cfg *config.Config, ociSpec oci.Spec) return nil, err } - if devices := image.DevicesFromEnvvars(visibleDevicesEnvvar); len(devices) == 0 { + if devices := image.DevicesFromEnvvars(visibleDevicesEnvvar); len(devices.List()) == 0 { logger.Infof("No modification required; no devices requested") return nil, nil } diff --git a/internal/modifier/graphics.go b/internal/modifier/graphics.go index 722c2ae0..f094d78f 100644 --- a/internal/modifier/graphics.go +++ b/internal/modifier/graphics.go @@ -40,7 +40,7 @@ func NewGraphicsModifier(logger *logrus.Logger, cfg *config.Config, ociSpec oci. return nil, err } - if devices := image.DevicesFromEnvvars(visibleDevicesEnvvar); len(devices) == 0 { + if devices := image.DevicesFromEnvvars(visibleDevicesEnvvar); len(devices.List()) == 0 { logger.Infof("No modification required; no devices requested") return nil, nil } diff --git a/internal/modifier/mofed.go b/internal/modifier/mofed.go index 62235e7a..abdf8baf 100644 --- a/internal/modifier/mofed.go +++ b/internal/modifier/mofed.go @@ -43,7 +43,7 @@ func NewMOFEDModifier(logger *logrus.Logger, cfg *config.Config, ociSpec oci.Spe return nil, err } - if devices := image.DevicesFromEnvvars(visibleDevicesEnvvar); len(devices) == 0 { + if devices := image.DevicesFromEnvvars(visibleDevicesEnvvar); len(devices.List()) == 0 { logger.Infof("No modification required; no devices requested") return nil, nil } From 9d6e2ff1b08d121f31dc2af87c0f7688ee9bdfe6 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 26 Oct 2022 12:35:24 +0200 Subject: [PATCH 3/9] Add internal proc package for processing GPU information files Signed-off-by: Evan Lezar --- internal/info/proc/information_files.go | 89 +++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 internal/info/proc/information_files.go diff --git a/internal/info/proc/information_files.go b/internal/info/proc/information_files.go new file mode 100644 index 00000000..dde77416 --- /dev/null +++ b/internal/info/proc/information_files.go @@ -0,0 +1,89 @@ +/** +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package proc + +import ( + "bufio" + "fmt" + "io" + "os" + "path/filepath" + "strings" +) + +// GPUInfoField represents the field name for information specified in a GPU's information file +type GPUInfoField string + +// The following constants define the fields of interest from the GPU information file +const ( + GPUInfoModel = GPUInfoField("Model") + GPUInfoGPUUUID = GPUInfoField("GPU UUID") + GPUInfoBusLocation = GPUInfoField("Bus Location") + GPUInfoDeviceMinor = GPUInfoField("Device Minor") +) + +// GPUInfo stores the information for a GPU as determined from its associated information file +type GPUInfo map[GPUInfoField]string + +// GetInformationFilePaths returns the list of information files associated with NVIDIA GPUs. +func GetInformationFilePaths(root string) ([]string, error) { + return filepath.Glob(filepath.Join(root, "/proc/driver/nvidia/gpus/*/information")) +} + +// ParseGPUInformationFile parses the specified GPU information file and constructs a GPUInfo structure +func ParseGPUInformationFile(path string) (GPUInfo, error) { + infoFile, err := os.Open(path) + if err != nil { + return nil, fmt.Errorf("failed to open %v: %v", path, err) + } + defer infoFile.Close() + + return gpuInfoFrom(infoFile), nil +} + +// gpuInfoFrom parses a GPUInfo struct from the specified reader +// An information file has the following strucutre: +// $ cat /proc/driver/nvidia/gpus/0000\:06\:00.0/information +// Model: Tesla V100-SXM2-16GB +// IRQ: 408 +// GPU UUID: GPU-edfee158-11c1-52b8-0517-92f30e7fac88 +// Video BIOS: 88.00.41.00.01 +// Bus Type: PCIe +// DMA Size: 47 bits +// DMA Mask: 0x7fffffffffff +// Bus Location: 0000:06:00.0 +// Device Minor: 0 +// GPU Excluded: No +func gpuInfoFrom(reader io.Reader) GPUInfo { + info := make(GPUInfo) + scanner := bufio.NewScanner(reader) + for scanner.Scan() { + line := scanner.Text() + + parts := strings.SplitN(line, ":", 2) + if len(parts) != 2 { + continue + } + + field := GPUInfoField(parts[0]) + value := strings.TrimSpace(parts[1]) + + info[field] = value + } + + return info +} From 624b9d8ee609584f96c7471dbebffbad52a3dcdb Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 26 Oct 2022 12:35:58 +0200 Subject: [PATCH 4/9] Add internal drm package for determining DRM devices Signed-off-by: Evan Lezar --- internal/info/drm/drm_devices.go | 39 ++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 internal/info/drm/drm_devices.go diff --git a/internal/info/drm/drm_devices.go b/internal/info/drm/drm_devices.go new file mode 100644 index 00000000..3b8204bc --- /dev/null +++ b/internal/info/drm/drm_devices.go @@ -0,0 +1,39 @@ +/** +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package drm + +import ( + "fmt" + "path/filepath" +) + +// GetDeviceNodesByBusID returns the DRM devices associated with the specified PCI bus ID +func GetDeviceNodesByBusID(busID string) ([]string, error) { + drmRoot := filepath.Join("/sys/bus/pci/devices", busID, "drm") + matches, err := filepath.Glob(fmt.Sprintf("%s/*", drmRoot)) + if err != nil { + return nil, err + } + + var drmDeviceNodes []string + for _, m := range matches { + drmDeviceNode := filepath.Join("/dev/dri", filepath.Base(m)) + drmDeviceNodes = append(drmDeviceNodes, drmDeviceNode) + } + + return drmDeviceNodes, nil +} From bc8a73dde4064109b991cc28b9e27feec994b54a Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 26 Oct 2022 12:38:37 +0200 Subject: [PATCH 5/9] Add a Filter interface to the discover package This change adds support for filtering entities by specifying a filter. This can be used, for example, to check whether a mount or device has a particular property and removing it from the set of discovered entities if it does not. Signed-off-by: Evan Lezar --- internal/discover/filter.go | 53 +++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 internal/discover/filter.go diff --git a/internal/discover/filter.go b/internal/discover/filter.go new file mode 100644 index 00000000..853c73fb --- /dev/null +++ b/internal/discover/filter.go @@ -0,0 +1,53 @@ +/** +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package discover + +import "github.com/sirupsen/logrus" + +// Filter defines an interface for filtering discovered entities +type Filter interface { + DeviceIsSelected(device Device) bool +} + +// filtered represents a filtered discoverer +type filtered struct { + Discover + logger *logrus.Logger + filter Filter +} + +// Devices returns a filtered list of devices based on the specified filter. +func (d filtered) Devices() ([]Device, error) { + devices, err := d.Discover.Devices() + if err != nil { + return nil, err + } + + if d.filter == nil { + return devices, nil + } + + var selected []Device + for _, device := range devices { + if d.filter.DeviceIsSelected(device) { + selected = append(selected, device) + } + d.logger.Debugf("skipping device %v", device) + } + + return selected, nil +} From eac4faddc61ea262c333434edf9902085a4273d4 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Mon, 24 Oct 2022 16:10:23 +0200 Subject: [PATCH 6/9] Use :: as link separator Signed-off-by: Evan Lezar --- cmd/nvidia-ctk/hook/create-symlinks/create-symlinks.go | 4 ++-- internal/discover/symlinks.go | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cmd/nvidia-ctk/hook/create-symlinks/create-symlinks.go b/cmd/nvidia-ctk/hook/create-symlinks/create-symlinks.go index 9eed0976..8fe92e32 100644 --- a/cmd/nvidia-ctk/hook/create-symlinks/create-symlinks.go +++ b/cmd/nvidia-ctk/hook/create-symlinks/create-symlinks.go @@ -74,7 +74,7 @@ func (m command) build() *cli.Command { }, &cli.StringSliceFlag{ Name: "link", - Usage: "Specify a specific link to create. The link is specified as source:target", + Usage: "Specify a specific link to create. The link is specified as target::link", Destination: &cfg.links, }, &cli.StringFlag{ @@ -145,7 +145,7 @@ func (m command) run(c *cli.Context, cfg *config) error { links := cfg.links.Value() for _, l := range links { - parts := strings.Split(l, ":") + parts := strings.Split(l, "::") if len(parts) != 2 { m.logger.Warnf("Invalid link specification %v", l) continue diff --git a/internal/discover/symlinks.go b/internal/discover/symlinks.go index 6430fc15..b758100b 100644 --- a/internal/discover/symlinks.go +++ b/internal/discover/symlinks.go @@ -117,7 +117,7 @@ func (d symlinks) getSpecificLinkArgs() ([]string, error) { } linkPath := filepath.Join(filepath.Dir(m.Path), link) - links = append(links, "--link", fmt.Sprintf("%v:%v", target, linkPath)) + links = append(links, "--link", fmt.Sprintf("%v::%v", target, linkPath)) linkProcessed[link] = true } From cd7ee5a4357aebdfc427d561bfa61ede194a5178 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Thu, 29 Sep 2022 14:42:34 +0200 Subject: [PATCH 7/9] Add test for graphics modifier Signed-off-by: Evan Lezar --- internal/modifier/graphics.go | 38 ++++++++----- internal/modifier/graphics_test.go | 88 ++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+), 15 deletions(-) create mode 100644 internal/modifier/graphics_test.go diff --git a/internal/modifier/graphics.go b/internal/modifier/graphics.go index f094d78f..c604dff1 100644 --- a/internal/modifier/graphics.go +++ b/internal/modifier/graphics.go @@ -40,21 +40,8 @@ func NewGraphicsModifier(logger *logrus.Logger, cfg *config.Config, ociSpec oci. return nil, err } - if devices := image.DevicesFromEnvvars(visibleDevicesEnvvar); len(devices.List()) == 0 { - logger.Infof("No modification required; no devices requested") - return nil, nil - } - - var hasGraphics bool - for _, c := range strings.Split(image["NVIDIA_DRIVER_CAPABILITIES"], ",") { - if c == "graphics" || c == "all" { - hasGraphics = true - break - } - } - - if !hasGraphics { - logger.Debugf("Capability %q not selected", "graphics") + if required, reason := requiresGraphicsModifier(image); !required { + logger.Infof("No graphics modifier required: %v", reason) return nil, nil } @@ -65,3 +52,24 @@ func NewGraphicsModifier(logger *logrus.Logger, cfg *config.Config, ociSpec oci. return NewModifierFromDiscoverer(logger, d) } + +// requiresGraphicsModifier determines whether a graphics modifier is required. +func requiresGraphicsModifier(cudaImage image.CUDA) (bool, string) { + if devices := cudaImage.DevicesFromEnvvars(visibleDevicesEnvvar); len(devices.List()) == 0 { + return false, "no devices requested" + } + + var hasGraphics bool + for _, c := range strings.Split(cudaImage["NVIDIA_DRIVER_CAPABILITIES"], ",") { + if c == "graphics" || c == "all" { + hasGraphics = true + break + } + } + + if !hasGraphics { + return false, fmt.Sprintf("Capability %q not selected", "graphics") + } + + return true, "" +} diff --git a/internal/modifier/graphics_test.go b/internal/modifier/graphics_test.go new file mode 100644 index 00000000..bb2b5cfb --- /dev/null +++ b/internal/modifier/graphics_test.go @@ -0,0 +1,88 @@ +/** +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package modifier + +import ( + "testing" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/config/image" + "github.com/stretchr/testify/require" +) + +func TestGraphicsModifier(t *testing.T) { + testCases := []struct { + description string + cudaImage image.CUDA + expectedRequired bool + }{ + { + description: "empty image does not create modifier", + }, + { + description: "devices with no capabilities does not create modifier", + cudaImage: image.CUDA{ + "NVIDIA_VISIBLE_DEVICES": "all", + }, + }, + { + description: "devices with no non-graphics does not create modifier", + cudaImage: image.CUDA{ + "NVIDIA_VISIBLE_DEVICES": "all", + "NVIDIA_DRIVER_CAPABILITIES": "compute", + }, + }, + { + description: "devices with all capabilities creates modifier", + cudaImage: image.CUDA{ + "NVIDIA_VISIBLE_DEVICES": "all", + "NVIDIA_DRIVER_CAPABILITIES": "all", + }, + expectedRequired: true, + }, + { + description: "devices with graphics capability creates modifier", + cudaImage: image.CUDA{ + "NVIDIA_VISIBLE_DEVICES": "all", + "NVIDIA_DRIVER_CAPABILITIES": "graphics", + }, + expectedRequired: true, + }, + { + description: "devices with compute,graphics capability creates modifier", + cudaImage: image.CUDA{ + "NVIDIA_VISIBLE_DEVICES": "all", + "NVIDIA_DRIVER_CAPABILITIES": "compute,graphics", + }, + expectedRequired: true, + }, + { + description: "devices with display,graphics capability creates modifier", + cudaImage: image.CUDA{ + "NVIDIA_VISIBLE_DEVICES": "all", + "NVIDIA_DRIVER_CAPABILITIES": "display,graphics", + }, + expectedRequired: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + required, _ := requiresGraphicsModifier(tc.cudaImage) + require.EqualValues(t, tc.expectedRequired, required) + }) + } +} From 73e65edaa9c6366e1ac369786cfb397f60797066 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Thu, 29 Sep 2022 14:45:22 +0200 Subject: [PATCH 8/9] Also trigger graphics modifier for display capability Signed-off-by: Evan Lezar --- internal/modifier/graphics.go | 13 ++----------- internal/modifier/graphics_test.go | 8 ++++++++ 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/internal/modifier/graphics.go b/internal/modifier/graphics.go index c604dff1..cb8daa71 100644 --- a/internal/modifier/graphics.go +++ b/internal/modifier/graphics.go @@ -18,7 +18,6 @@ package modifier import ( "fmt" - "strings" "github.com/NVIDIA/nvidia-container-toolkit/internal/config" "github.com/NVIDIA/nvidia-container-toolkit/internal/config/image" @@ -59,16 +58,8 @@ func requiresGraphicsModifier(cudaImage image.CUDA) (bool, string) { return false, "no devices requested" } - var hasGraphics bool - for _, c := range strings.Split(cudaImage["NVIDIA_DRIVER_CAPABILITIES"], ",") { - if c == "graphics" || c == "all" { - hasGraphics = true - break - } - } - - if !hasGraphics { - return false, fmt.Sprintf("Capability %q not selected", "graphics") + if !cudaImage.GetDriverCapabilities().Any(image.DriverCapabilityGraphics, image.DriverCapabilityDisplay) { + return false, "no required capabilities requested" } return true, "" diff --git a/internal/modifier/graphics_test.go b/internal/modifier/graphics_test.go index bb2b5cfb..e062763d 100644 --- a/internal/modifier/graphics_test.go +++ b/internal/modifier/graphics_test.go @@ -69,6 +69,14 @@ func TestGraphicsModifier(t *testing.T) { }, expectedRequired: true, }, + { + description: "devices with display capability creates modifier", + cudaImage: image.CUDA{ + "NVIDIA_VISIBLE_DEVICES": "all", + "NVIDIA_DRIVER_CAPABILITIES": "display", + }, + expectedRequired: true, + }, { description: "devices with display,graphics capability creates modifier", cudaImage: image.CUDA{ From 76b69f45de187e2b73933d937c61ea3f02567c61 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Thu, 29 Sep 2022 14:47:04 +0200 Subject: [PATCH 9/9] Add discovery of DRM devices This change adds the discovery of DRM devices associated with requested devices. This means that the /dev/dri/card* and /dev/dri/renderD* devices associated with each requested NVIDIA GPU are injected into the container and that the /dev/dri/by-path symlinks associated with these devices are created in the container. Signed-off-by: Evan Lezar --- internal/discover/filter.go | 9 ++ internal/discover/graphics.go | 193 +++++++++++++++++++++++++++++++++- internal/modifier/graphics.go | 10 +- 3 files changed, 210 insertions(+), 2 deletions(-) diff --git a/internal/discover/filter.go b/internal/discover/filter.go index 853c73fb..ee4b78ae 100644 --- a/internal/discover/filter.go +++ b/internal/discover/filter.go @@ -30,6 +30,15 @@ type filtered struct { filter Filter } +// newFilteredDisoverer creates a discoverer that applies the specified filter to the returned entities of the discoverer +func newFilteredDisoverer(logger *logrus.Logger, applyTo Discover, filter Filter) Discover { + return filtered{ + Discover: applyTo, + logger: logger, + filter: filter, + } +} + // Devices returns a filtered list of devices based on the specified filter. func (d filtered) Devices() ([]Device, error) { devices, err := d.Discover.Devices() diff --git a/internal/discover/graphics.go b/internal/discover/graphics.go index 47d95a0d..a6659a3a 100644 --- a/internal/discover/graphics.go +++ b/internal/discover/graphics.go @@ -18,13 +18,21 @@ package discover import ( "fmt" + "os" + "path/filepath" + "github.com/NVIDIA/nvidia-container-toolkit/internal/config/image" + "github.com/NVIDIA/nvidia-container-toolkit/internal/info/drm" + "github.com/NVIDIA/nvidia-container-toolkit/internal/info/proc" "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup" + "github.com/container-orchestrated-devices/container-device-interface/pkg/cdi" "github.com/sirupsen/logrus" ) // NewGraphicsDiscoverer returns the discoverer for graphics tools such as Vulkan. -func NewGraphicsDiscoverer(logger *logrus.Logger, root string) (Discover, error) { +func NewGraphicsDiscoverer(logger *logrus.Logger, devices image.VisibleDevices, cfg *Config) (Discover, error) { + root := cfg.Root + locator, err := lookup.NewLibraryLocator(logger, root) if err != nil { return nil, fmt.Errorf("failed to construct library locator: %v", err) @@ -54,10 +62,193 @@ func NewGraphicsDiscoverer(logger *logrus.Logger, root string) (Discover, error) }, ) + drmDeviceNodes, err := newDRMDeviceDiscoverer(logger, devices, root) + if err != nil { + return nil, fmt.Errorf("failed to create DRM device discoverer: %v", err) + } + + drmByPathSymlinks := newCreateDRMByPathSymlinks(logger, drmDeviceNodes, cfg) + discover := Merge( + Merge(drmDeviceNodes, drmByPathSymlinks), libraries, jsonMounts, ) return discover, nil } + +type drmDevicesByPath struct { + None + logger *logrus.Logger + lookup lookup.Locator + nvidiaCTKExecutablePath string + root string + devicesFrom Discover +} + +// newCreateDRMByPathSymlinks creates a discoverer for a hook to create the by-path symlinks for DRM devices discovered by the specified devices discoverer +func newCreateDRMByPathSymlinks(logger *logrus.Logger, devices Discover, cfg *Config) Discover { + d := drmDevicesByPath{ + logger: logger, + lookup: lookup.NewExecutableLocator(logger, cfg.Root), + nvidiaCTKExecutablePath: cfg.NVIDIAContainerToolkitCLIExecutablePath, + root: cfg.Root, + devicesFrom: devices, + } + + return &d +} + +// Hooks returns a hook to create the symlinks from the required CSV files +func (d drmDevicesByPath) Hooks() ([]Hook, error) { + devices, err := d.devicesFrom.Devices() + if err != nil { + return nil, fmt.Errorf("failed to discover devices for by-path symlinks: %v", err) + } + if len(devices) == 0 { + return nil, nil + } + + hookPath := nvidiaCTKDefaultFilePath + targets, err := d.lookup.Locate(d.nvidiaCTKExecutablePath) + if err != nil { + d.logger.Warnf("Failed to locate %v: %v", d.nvidiaCTKExecutablePath, err) + } else if len(targets) == 0 { + d.logger.Warnf("%v not found", d.nvidiaCTKExecutablePath) + } else { + d.logger.Debugf("Found %v candidates: %v", d.nvidiaCTKExecutablePath, targets) + hookPath = targets[0] + } + d.logger.Debugf("Using NVIDIA Container Toolkit CLI path %v", hookPath) + + args := []string{hookPath, "hook", "create-symlinks"} + links, err := d.getSpecificLinkArgs(devices) + if err != nil { + return nil, fmt.Errorf("failed to determine specific links: %v", err) + } + for _, l := range links { + args = append(args, "--link", l) + } + + h := Hook{ + Lifecycle: cdi.CreateContainerHook, + Path: hookPath, + Args: args, + } + + return []Hook{h}, nil +} + +// getSpecificLinkArgs returns the required specic links that need to be created +func (d drmDevicesByPath) getSpecificLinkArgs(devices []Device) ([]string, error) { + selectedDevices := make(map[string]bool) + for _, d := range devices { + selectedDevices[filepath.Base(d.HostPath)] = true + } + + linkLocator := lookup.NewFileLocator(d.logger, d.root) + candidates, err := linkLocator.Locate("/dev/dri/by-path/pci-*-*") + if err != nil { + return nil, fmt.Errorf("failed to locate devices by path: %v", err) + } + + var links []string + for _, c := range candidates { + device, err := os.Readlink(c) + if err != nil { + d.logger.Warningf("Failed to evaluate symlink %v; ignoring", c) + continue + } + + if selectedDevices[filepath.Base(device)] { + d.logger.Debugf("adding device symlink %v -> %v", c, device) + links = append(links, fmt.Sprintf("%v::%v", device, c)) + } + } + + return links, nil +} + +// newDRMDeviceDiscoverer creates a discoverer for the DRM devices associated with the requested devices. +func newDRMDeviceDiscoverer(logger *logrus.Logger, devices image.VisibleDevices, root string) (Discover, error) { + allDevices := NewDeviceDiscoverer( + logger, + lookup.NewCharDeviceLocator(logger, root), + root, + []string{ + "/dev/dri/card*", + "/dev/dri/renderD*", + }, + ) + + filter, err := newDRMDeviceFilter(logger, devices, root) + if err != nil { + return nil, fmt.Errorf("failed to construct DRM device filter: %v", err) + } + + // We return a discoverer that applies the DRM device filter created above to all discovered DRM device nodes. + d := newFilteredDisoverer( + logger, + allDevices, + filter, + ) + + return d, err +} + +// newDRMDeviceFilter creates a filter that matches DRM devices nodes for the visible devices. +func newDRMDeviceFilter(logger *logrus.Logger, devices image.VisibleDevices, root string) (Filter, error) { + gpuInformationPaths, err := proc.GetInformationFilePaths(root) + if err != nil { + return nil, fmt.Errorf("failed to read GPU information: %v", err) + } + + var selectedBusIds []string + for _, f := range gpuInformationPaths { + info, err := proc.ParseGPUInformationFile(f) + if err != nil { + return nil, fmt.Errorf("failed to parse %v: %v", f, err) + } + uuid := info[proc.GPUInfoGPUUUID] + busID := info[proc.GPUInfoBusLocation] + minor := info[proc.GPUInfoDeviceMinor] + + if devices.Has(minor) || devices.Has(uuid) || devices.Has(busID) { + selectedBusIds = append(selectedBusIds, busID) + } + } + + filter := make(selectDeviceByPath) + for _, busID := range selectedBusIds { + drmDeviceNodes, err := drm.GetDeviceNodesByBusID(busID) + if err != nil { + return nil, fmt.Errorf("failed to determine DRM devices for %v: %v", busID, err) + } + for _, drmDeviceNode := range drmDeviceNodes { + filter[filepath.Join(drmDeviceNode)] = true + } + } + + return filter, nil +} + +// selectDeviceByPath is a filter that allows devices to be selected by the path +type selectDeviceByPath map[string]bool + +var _ Filter = (*selectDeviceByPath)(nil) + +// DeviceIsSelected determines whether the device's path has been selected +func (s selectDeviceByPath) DeviceIsSelected(device Device) bool { + return s[device.Path] +} + +// MountIsSelected is always true +func (s selectDeviceByPath) MountIsSelected(Mount) bool { + return true +} + +// HookIsSelected is always true +func (s selectDeviceByPath) HookIsSelected(Hook) bool { + return true +} diff --git a/internal/modifier/graphics.go b/internal/modifier/graphics.go index cb8daa71..76fec386 100644 --- a/internal/modifier/graphics.go +++ b/internal/modifier/graphics.go @@ -44,7 +44,15 @@ func NewGraphicsModifier(logger *logrus.Logger, cfg *config.Config, ociSpec oci. return nil, nil } - d, err := discover.NewGraphicsDiscoverer(logger, cfg.NVIDIAContainerCLIConfig.Root) + config := &discover.Config{ + Root: cfg.NVIDIAContainerCLIConfig.Root, + NVIDIAContainerToolkitCLIExecutablePath: cfg.NVIDIACTKConfig.Path, + } + d, err := discover.NewGraphicsDiscoverer( + logger, + image.DevicesFromEnvvars(visibleDevicesEnvvar), + config, + ) if err != nil { return nil, fmt.Errorf("failed to construct discoverer: %v", err) }