From 60c1df4e9c5b5fd5cda1bbb3bb2930c931cacd43 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 23 Nov 2022 16:31:35 +0100 Subject: [PATCH 1/7] Remove unneeded workaround for CDI edit generation Signed-off-by: Evan Lezar --- internal/edits/device.go | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/internal/edits/device.go b/internal/edits/device.go index aca095e3..8f5418de 100644 --- a/internal/edits/device.go +++ b/internal/edits/device.go @@ -17,13 +17,9 @@ package edits import ( - "fmt" - "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" "github.com/container-orchestrated-devices/container-device-interface/pkg/cdi" "github.com/container-orchestrated-devices/container-device-interface/specs-go" - - "github.com/opencontainers/runc/libcontainer/devices" ) type device discover.Device @@ -46,21 +42,9 @@ func (d device) toEdits() (*cdi.ContainerEdits, error) { // toSpec converts a discovered Device to a CDI Spec Device. Note // that missing info is filled in when edits are applied by querying the Device node. func (d device) toSpec() (*specs.DeviceNode, error) { - // NOTE: This mirrors what cri-o does. 
- // https://github.com/cri-o/cri-o/blob/ca3bb80a3dda0440659fcf8da8ed6f23211de94e/internal/config/device/device.go#L93 - // This can be removed once https://github.com/container-orchestrated-devices/container-device-interface/issues/72 is addressed - dev, err := devices.DeviceFromPath(d.HostPath, "rwm") - if err != nil { - return nil, fmt.Errorf("failed to query device node %v: %v", d.HostPath, err) - } s := specs.DeviceNode{ + HostPath: d.HostPath, Path: d.Path, - Type: string(dev.Type), - Major: dev.Major, - Minor: dev.Minor, - FileMode: &dev.FileMode, - UID: &dev.Uid, - GID: &dev.Gid, } return &s, nil From 029a04c37d11126d182a46ba5d020de118b5442a Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Thu, 24 Nov 2022 14:29:18 +0100 Subject: [PATCH 2/7] Use blank device hostPath if same as Path The HostPath field was added in the v0.5.0 CDI specification. The cdi package uses strict unmarshalling when loading specs from file causing failures for unexpected fields. Since the behaviour for HostPath == "" and HostPath == Path are equivalent, we clear HostPath if it is equal to Path to ensure compatibility with the widest range of specs. This allows, for example, a v0.4.0 spec to be generated as required. Signed-off-by: Evan Lezar --- internal/edits/device.go | 11 +++++- internal/edits/device_test.go | 69 +++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 internal/edits/device_test.go diff --git a/internal/edits/device.go b/internal/edits/device.go index 8f5418de..72e6b06e 100644 --- a/internal/edits/device.go +++ b/internal/edits/device.go @@ -42,8 +42,17 @@ func (d device) toEdits() (*cdi.ContainerEdits, error) { // toSpec converts a discovered Device to a CDI Spec Device. Note // that missing info is filled in when edits are applied by querying the Device node. func (d device) toSpec() (*specs.DeviceNode, error) { + // The HostPath field was added in the v0.5.0 CDI specification. 
+ // The cdi package uses strict unmarshalling when loading specs from file causing failures for + // unexpected fields. + // Since the behaviour for HostPath == "" and HostPath == Path are equivalent, we clear HostPath + // if it is equal to Path to ensure compatibility with the widest range of specs. + hostPath := d.HostPath + if hostPath == d.Path { + hostPath = "" + } s := specs.DeviceNode{ - HostPath: d.HostPath, + HostPath: hostPath, Path: d.Path, } diff --git a/internal/edits/device_test.go b/internal/edits/device_test.go new file mode 100644 index 00000000..44c35098 --- /dev/null +++ b/internal/edits/device_test.go @@ -0,0 +1,69 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ + +package edits + +import ( + "fmt" + "testing" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" + "github.com/container-orchestrated-devices/container-device-interface/specs-go" + "github.com/stretchr/testify/require" +) + +func TestDeviceToSpec(t *testing.T) { + testCases := []struct { + device discover.Device + expected *specs.DeviceNode + }{ + { + device: discover.Device{ + Path: "/foo", + }, + expected: &specs.DeviceNode{ + Path: "/foo", + }, + }, + { + device: discover.Device{ + Path: "/foo", + HostPath: "/foo", + }, + expected: &specs.DeviceNode{ + Path: "/foo", + }, + }, + { + device: discover.Device{ + Path: "/foo", + HostPath: "/not/foo", + }, + expected: &specs.DeviceNode{ + Path: "/foo", + HostPath: "/not/foo", + }, + }, + } + + for i, tc := range testCases { + t.Run(fmt.Sprintf("%d", i), func(t *testing.T) { + spec, err := device(tc.device).toSpec() + require.NoError(t, err) + require.EqualValues(t, tc.expected, spec) + }) + } +} From 0b6cd7e90e09ce0e2cbdf17fb873f8f827e370f5 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 23 Nov 2022 16:25:11 +0100 Subject: [PATCH 3/7] Add FromDiscoverer function to generate container edits Signed-off-by: Evan Lezar --- internal/edits/edits.go | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/internal/edits/edits.go b/internal/edits/edits.go index 14e79b7b..3d7560d5 100644 --- a/internal/edits/edits.go +++ b/internal/edits/edits.go @@ -34,6 +34,20 @@ type edits struct { // NewSpecEdits creates a SpecModifier that defines the required OCI spec edits (as CDI ContainerEdits) from the specified // discoverer. 
func NewSpecEdits(logger *logrus.Logger, d discover.Discover) (oci.SpecModifier, error) { + c, err := FromDiscoverer(d) + if err != nil { + return nil, fmt.Errorf("error constructing container edits: %v", err) + } + e := edits{ + ContainerEdits: *c, + logger: logger, + } + + return &e, nil +} + +// FromDiscoverer creates CDI container edits for the specified discoverer. +func FromDiscoverer(d discover.Discover) (*cdi.ContainerEdits, error) { devices, err := d.Devices() if err != nil { return nil, fmt.Errorf("failed to discover devices: %v", err) @@ -66,12 +80,7 @@ func NewSpecEdits(logger *logrus.Logger, d discover.Discover) (oci.SpecModifier, c.Append(hook(h).toEdits()) } - e := edits{ - ContainerEdits: c, - logger: logger, - } - - return &e, nil + return &c, nil } // Modify applies the defined edits to the incoming OCI spec From 9b33c34a57e769d7580b704445a41a6494c4b41f Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 23 Nov 2022 16:26:30 +0100 Subject: [PATCH 4/7] Allow graphics mount discoverer to be instantiated independently Signed-off-by: Evan Lezar --- internal/discover/graphics.go | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/internal/discover/graphics.go b/internal/discover/graphics.go index 99b2a70b..c24cb4ad 100644 --- a/internal/discover/graphics.go +++ b/internal/discover/graphics.go @@ -33,6 +33,28 @@ import ( func NewGraphicsDiscoverer(logger *logrus.Logger, devices image.VisibleDevices, cfg *Config) (Discover, error) { root := cfg.Root + mounts, err := NewGraphicsMountsDiscoverer(logger, root) + if err != nil { + return nil, fmt.Errorf("failed to create mounts discoverer: %v", err) + } + + drmDeviceNodes, err := newDRMDeviceDiscoverer(logger, devices, root) + if err != nil { + return nil, fmt.Errorf("failed to create DRM device discoverer: %v", err) + } + + drmByPathSymlinks := newCreateDRMByPathSymlinks(logger, drmDeviceNodes, cfg) + + discover := Merge( + Merge(drmDeviceNodes, 
drmByPathSymlinks), + mounts, + ) + + return discover, nil +} + +// NewGraphicsMountsDiscoverer creates a discoverer for the mounts required by graphics tools such as vulkan. +func NewGraphicsMountsDiscoverer(logger *logrus.Logger, root string) (Discover, error) { locator, err := lookup.NewLibraryLocator(logger, root) if err != nil { return nil, fmt.Errorf("failed to construct library locator: %v", err) @@ -62,15 +84,7 @@ func NewGraphicsDiscoverer(logger *logrus.Logger, devices image.VisibleDevices, }, ) - drmDeviceNodes, err := newDRMDeviceDiscoverer(logger, devices, root) - if err != nil { - return nil, fmt.Errorf("failed to create DRM device discoverer: %v", err) - } - - drmByPathSymlinks := newCreateDRMByPathSymlinks(logger, drmDeviceNodes, cfg) - discover := Merge( - Merge(drmDeviceNodes, drmByPathSymlinks), libraries, jsonMounts, ) From d51c8fcfa7a9cabb1dc296d6bbc51caf63c8ecb2 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 23 Nov 2022 16:28:28 +0100 Subject: [PATCH 5/7] Add utility function to generatee nvidia-ctk OCI hook Signed-off-by: Evan Lezar --- internal/discover/ldconfig.go | 36 +++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/internal/discover/ldconfig.go b/internal/discover/ldconfig.go index af74645d..d42fa4bf 100644 --- a/internal/discover/ldconfig.go +++ b/internal/discover/ldconfig.go @@ -67,27 +67,43 @@ func (d ldconfig) Hooks() ([]Hook, error) { } // CreateLDCacheUpdateHook locates the NVIDIA Container Toolkit CLI and creates a hook for updating the LD Cache -func CreateLDCacheUpdateHook(logger *logrus.Logger, lookup lookup.Locator, execuable string, defaultPath string, libraries []string) Hook { +func CreateLDCacheUpdateHook(logger *logrus.Logger, lookup lookup.Locator, executable string, defaultPath string, libraries []string) Hook { + var args []string + for _, f := range uniqueFolders(libraries) { + args = append(args, "--folder", f) + } + + hook := CreateNvidiaCTKHook( + logger, + 
lookup, + executable, + defaultPath, + "update-ldcache", + args..., + ) + + return hook + +} + +// CreateNvidiaCTKHook creates a hook which invokes the NVIDIA Container CLI hook subcommand. +func CreateNvidiaCTKHook(logger *logrus.Logger, lookup lookup.Locator, executable string, defaultPath string, hookName string, additionalArgs ...string) Hook { hookPath := defaultPath - targets, err := lookup.Locate(execuable) + targets, err := lookup.Locate(executable) if err != nil { - logger.Warnf("Failed to locate %v: %v", execuable, err) + logger.Warnf("Failed to locate %v: %v", executable, err) } else if len(targets) == 0 { - logger.Warnf("%v not found", execuable) + logger.Warnf("%v not found", executable) } else { - logger.Debugf("Found %v candidates: %v", execuable, targets) + logger.Debugf("Found %v candidates: %v", executable, targets) hookPath = targets[0] } logger.Debugf("Using NVIDIA Container Toolkit CLI path %v", hookPath) - args := []string{filepath.Base(hookPath), "hook", "update-ldcache"} - for _, f := range uniqueFolders(libraries) { - args = append(args, "--folder", f) - } return Hook{ Lifecycle: cdi.CreateContainerHook, Path: hookPath, - Args: args, + Args: append([]string{filepath.Base(hookPath), "hook", hookName}, additionalArgs...), } } From e4e1de82ecc1ea50d0a1d652c05868be85ebc64c Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 23 Nov 2022 16:29:18 +0100 Subject: [PATCH 6/7] Refactor nvidia-ctk cdi generate command This change refactors the generation of CDI specifications to use discoverers and generate the CDI specifications from these discoverers. This allows for better reuse. 
Signed-off-by: Evan Lezar --- cmd/nvidia-ctk/cdi/generate/common.go | 60 +++ .../cdi/generate/device-folder-permissions.go | 117 +++++ cmd/nvidia-ctk/cdi/generate/device.go | 57 +++ cmd/nvidia-ctk/cdi/generate/driver.go | 148 ++++++ cmd/nvidia-ctk/cdi/generate/full-gpu.go | 148 ++++++ cmd/nvidia-ctk/cdi/generate/generate.go | 461 ++++-------------- cmd/nvidia-ctk/cdi/generate/ipc.go | 39 ++ cmd/nvidia-ctk/cdi/generate/mig-device.go | 84 ++++ 8 files changed, 748 insertions(+), 366 deletions(-) create mode 100644 cmd/nvidia-ctk/cdi/generate/common.go create mode 100644 cmd/nvidia-ctk/cdi/generate/device-folder-permissions.go create mode 100644 cmd/nvidia-ctk/cdi/generate/device.go create mode 100644 cmd/nvidia-ctk/cdi/generate/driver.go create mode 100644 cmd/nvidia-ctk/cdi/generate/full-gpu.go create mode 100644 cmd/nvidia-ctk/cdi/generate/ipc.go create mode 100644 cmd/nvidia-ctk/cdi/generate/mig-device.go diff --git a/cmd/nvidia-ctk/cdi/generate/common.go b/cmd/nvidia-ctk/cdi/generate/common.go new file mode 100644 index 00000000..3e6cf944 --- /dev/null +++ b/cmd/nvidia-ctk/cdi/generate/common.go @@ -0,0 +1,60 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ + +package generate + +import ( + "fmt" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" + "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup" + "github.com/sirupsen/logrus" + "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml" +) + +// NewCommonDiscoverer returns a discoverer for entities that are not associated with a specific CDI device. +// This includes driver libraries and meta devices, for example. +func NewCommonDiscoverer(logger *logrus.Logger, root string, nvmllib nvml.Interface) (discover.Discover, error) { + metaDevices := discover.NewDeviceDiscoverer( + logger, + lookup.NewCharDeviceLocator(logger, root), + root, + []string{ + "/dev/nvidia-modeset", + "/dev/nvidia-uvm-tools", + "/dev/nvidia-uvm", + "/dev/nvidiactl", + }, + ) + + graphicsMounts, err := discover.NewGraphicsMountsDiscoverer(logger, root) + if err != nil { + return nil, fmt.Errorf("error constructing discoverer for graphics mounts: %v", err) + } + + driverFiles, err := NewDriverDiscoverer(logger, root, nvmllib) + if err != nil { + return nil, fmt.Errorf("failed to create discoverer for driver files: %v", err) + } + + d := discover.Merge( + metaDevices, + graphicsMounts, + driverFiles, + ) + + return d, nil +} diff --git a/cmd/nvidia-ctk/cdi/generate/device-folder-permissions.go b/cmd/nvidia-ctk/cdi/generate/device-folder-permissions.go new file mode 100644 index 00000000..9359d116 --- /dev/null +++ b/cmd/nvidia-ctk/cdi/generate/device-folder-permissions.go @@ -0,0 +1,117 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package generate + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" + "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup" + "github.com/container-orchestrated-devices/container-device-interface/specs-go" + "github.com/sirupsen/logrus" +) + +type deviceFolderPermissions struct { + logger *logrus.Logger + root string + foldersByMode map[string][]string +} + +var _ discover.Discover = (*deviceFolderPermissions)(nil) + +// NewDeviceFolderPermissionHookDiscoverer creates a discoverer that can be used to update the permissions for the parent folders of nested device nodes from the specified set of device specs. +// This works around an issue with rootless podman when using crun as a low-level runtime. 
+// See https://github.com/containers/crun/issues/1047
+// TODO: This currently assumes `root == ""`
+func NewDeviceFolderPermissionHookDiscoverer(logger *logrus.Logger, root string, deviceSpecs []specs.Device) (discover.Discover, error) {
+	var paths []string // unique parent folders in first-seen order
+	seen := make(map[string]bool)
+
+	for _, device := range deviceSpecs {
+		for _, dn := range device.ContainerEdits.DeviceNodes {
+			if !strings.HasPrefix(filepath.Clean(dn.Path), "/dev/") { // must be strictly below /dev, otherwise the upward walk below never reaches "/dev"
+				logger.Warningf("Skipping unexpected device folder path for device %v", dn)
+				continue
+			}
+			for df := filepath.Dir(dn.Path); df != "/dev"; df = filepath.Dir(df) { // walk up from the node's folder, stopping at /dev
+				if seen[df] {
+					continue
+				}
+				paths = append(paths, df)
+				seen[df] = true
+			}
+		}
+	}
+
+	foldersByMode := make(map[string][]string) // octal permission string -> folders that currently have it
+	for _, p := range paths {
+		info, err := os.Stat(p)
+		if err != nil {
+			return nil, fmt.Errorf("failed to get info for path %v: %v", p, err)
+		}
+		mode := fmt.Sprintf("%o", info.Mode().Perm())
+		foldersByMode[mode] = append(foldersByMode[mode], p)
+	}
+
+	d := &deviceFolderPermissions{
+		logger:        logger,
+		root:          root,
+		foldersByMode: foldersByMode,
+	}
+
+	return d, nil
+}
+
+// Devices returns no devices; this discoverer only contributes hooks.
+func (d *deviceFolderPermissions) Devices() ([]discover.Device, error) {
+	return nil, nil
+}
+
+// Hooks returns a set of hooks that sets the file modes of parent folders for device nodes.
+// One hook is returned per mode.
+func (d *deviceFolderPermissions) Hooks() ([]discover.Hook, error) { + locator := lookup.NewExecutableLocator(d.logger, d.root) + + var hooks []discover.Hook + for mode, folders := range d.foldersByMode { + args := []string{"--mode", mode} + for _, folder := range folders { + args = append(args, "--path", folder) + } + + hook := discover.CreateNvidiaCTKHook( + d.logger, + locator, + nvidiaCTKExecutable, + nvidiaCTKDefaultFilePath, + "chmod", + args..., + ) + + hooks = append(hooks, hook) + } + + return hooks, nil +} + +// Mounts are empty for this discoverer +func (d *deviceFolderPermissions) Mounts() ([]discover.Mount, error) { + return nil, nil +} diff --git a/cmd/nvidia-ctk/cdi/generate/device.go b/cmd/nvidia-ctk/cdi/generate/device.go new file mode 100644 index 00000000..3d908ace --- /dev/null +++ b/cmd/nvidia-ctk/cdi/generate/device.go @@ -0,0 +1,57 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package generate + +import ( + "path/filepath" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" + "github.com/sirupsen/logrus" +) + +// deviceDiscoverer defines a discoverer for device nodes +type deviceDiscoverer struct { + logger *logrus.Logger + root string + deviceNodePaths []string +} + +var _ discover.Discover = (*deviceDiscoverer)(nil) + +// Devices returns the device nodes for the full GPU. 
+func (d *deviceDiscoverer) Devices() ([]discover.Device, error) { + var deviceNodes []discover.Device + for _, dn := range d.deviceNodePaths { + deviceNode := discover.Device{ + HostPath: filepath.Join(d.root, dn), + Path: dn, + } + deviceNodes = append(deviceNodes, deviceNode) + } + + return deviceNodes, nil +} + +// Hooks returns no hooks for a device discoverer +func (d *deviceDiscoverer) Hooks() ([]discover.Hook, error) { + return nil, nil +} + +// Mounts returns no mounts for a device discoverer +func (d *deviceDiscoverer) Mounts() ([]discover.Mount, error) { + return nil, nil +} diff --git a/cmd/nvidia-ctk/cdi/generate/driver.go b/cmd/nvidia-ctk/cdi/generate/driver.go new file mode 100644 index 00000000..c5aec328 --- /dev/null +++ b/cmd/nvidia-ctk/cdi/generate/driver.go @@ -0,0 +1,148 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ + +package generate + +import ( + "fmt" + "strings" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" + "github.com/NVIDIA/nvidia-container-toolkit/internal/ldcache" + "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup" + "github.com/sirupsen/logrus" + "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml" +) + +type driverLibraries struct { + logger *logrus.Logger + root string + libraries []string +} + +var _ discover.Discover = (*driverLibraries)(nil) + +// NewDriverDiscoverer creates a discoverer for the libraries and binaries associated with a driver installation. +// The supplied NVML Library is used to query the expected driver version. +func NewDriverDiscoverer(logger *logrus.Logger, root string, nvmllib nvml.Interface) (discover.Discover, error) { + libraries, err := NewDriverLibraryDiscoverer(logger, root, nvmllib) + if err != nil { + return nil, fmt.Errorf("failed to create discoverer for driver libraries: %v", err) + } + + binaries := discover.NewMounts( + logger, + lookup.NewExecutableLocator(logger, root), + root, + []string{ + "nvidia-smi", /* System management interface */ + "nvidia-debugdump", /* GPU coredump utility */ + "nvidia-persistenced", /* Persistence mode utility */ + "nvidia-cuda-mps-control", /* Multi process service CLI */ + "nvidia-cuda-mps-server", /* Multi process service server */ + }, + ) + + d := discover.Merge( + libraries, + binaries, + ) + + return d, nil +} + +// NewDriverLibraryDiscoverer creates a discoverer for the libraries associated with the specified driver version. 
+func NewDriverLibraryDiscoverer(logger *logrus.Logger, root string, nvmllib nvml.Interface) (discover.Discover, error) {
+	version, r := nvmllib.SystemGetDriverVersion()
+	if r != nvml.SUCCESS {
+		return nil, fmt.Errorf("failed to determine driver version: %v", r)
+	}
+
+	libraries, err := findVersionLibs(logger, root, version)
+	if err != nil {
+		return nil, fmt.Errorf("failed to get libraries for driver version: %v", err) // report err, not the (successful) NVML return code
+	}
+
+	d := driverLibraries{
+		logger:    logger,
+		root:      root,
+		libraries: libraries,
+	}
+
+	return &d, nil
+}
+
+// Devices returns no devices; this discoverer only contributes mounts and a hook.
+func (d *driverLibraries) Devices() ([]discover.Device, error) {
+	return nil, nil
+}
+
+// Mounts returns one mount per driver library, using the same path on the host and in the container.
+func (d *driverLibraries) Mounts() ([]discover.Mount, error) {
+	var mounts []discover.Mount
+	for _, lib := range d.libraries {
+		mount := discover.Mount{
+			HostPath: lib,
+			Path:     lib,
+		}
+		mounts = append(mounts, mount)
+	}
+
+	return mounts, nil
+}
+
+// Hooks returns a hook that updates the LDCache for the specified driver library paths.
+func (d *driverLibraries) Hooks() ([]discover.Hook, error) { + locator := lookup.NewExecutableLocator(d.logger, d.root) + + hook := discover.CreateLDCacheUpdateHook( + d.logger, + locator, + nvidiaCTKExecutable, + nvidiaCTKDefaultFilePath, + d.libraries, + ) + + return []discover.Hook{hook}, nil +} + +func findVersionLibs(logger *logrus.Logger, root string, version string) ([]string, error) { + logger.Infof("Using driver version %v", version) + + cache, err := ldcache.New(logger, root) + if err != nil { + return nil, fmt.Errorf("failed to load ldcache: %v", err) + } + + libs32, libs64 := cache.List() + + var libs []string + for _, l := range libs64 { + if strings.HasSuffix(l, version) { + logger.Infof("found 64-bit driver lib: %v", l) + libs = append(libs, l) + } + } + + for _, l := range libs32 { + if strings.HasSuffix(l, version) { + logger.Infof("found 32-bit driver lib: %v", l) + libs = append(libs, l) + } + } + + return libs, nil +} diff --git a/cmd/nvidia-ctk/cdi/generate/full-gpu.go b/cmd/nvidia-ctk/cdi/generate/full-gpu.go new file mode 100644 index 00000000..e9de7cb8 --- /dev/null +++ b/cmd/nvidia-ctk/cdi/generate/full-gpu.go @@ -0,0 +1,148 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ + +package generate + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" + "github.com/NVIDIA/nvidia-container-toolkit/internal/info/drm" + "github.com/sirupsen/logrus" + "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device" + "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml" +) + +// fullGPUDiscoverer wraps a deviceDiscoverer and adds specifics required for discovering full GPUs +type fullGPUDiscoverer struct { + deviceDiscoverer + + pciBusID string +} + +var _ discover.Discover = (*fullGPUDiscoverer)(nil) + +// NewFullGPUDiscoverer creates a discoverer for the full GPU defined by the specified device. +func NewFullGPUDiscoverer(logger *logrus.Logger, root string, d device.Device) (discover.Discover, error) { + // TODO: The functionality to get device paths should be integrated into the go-nvlib/pkg/device.Device interface. + // This will allow reuse here and in other code where the paths are queried such as the NVIDIA device plugin. + minor, ret := d.GetMinorNumber() + if ret != nvml.SUCCESS { + return nil, fmt.Errorf("error getting GPU device minor number: %v", ret) + } + path := fmt.Sprintf("/dev/nvidia%d", minor) + + pciInfo, ret := d.GetPciInfo() + if ret != nvml.SUCCESS { + return nil, fmt.Errorf("error getting PCI info for device: %v", ret) + } + pciBusID := getBusID(pciInfo) + + drmDeviceNodes, err := drm.GetDeviceNodesByBusID(pciBusID) + if err != nil { + return nil, fmt.Errorf("failed to determine DRM devices for %v: %v", pciBusID, err) + } + + deviceNodePaths := append([]string{path}, drmDeviceNodes...) + + device := fullGPUDiscoverer{ + deviceDiscoverer: deviceDiscoverer{ + logger: logger, + root: root, + deviceNodePaths: deviceNodePaths, + }, + pciBusID: pciBusID, + } + + return &device, nil +} + +// Hooks returns the hooks for the GPU device. +// The following hooks are detected: +// 1. 
A hook to create /dev/dri/by-path symlinks
+func (d *fullGPUDiscoverer) Hooks() ([]discover.Hook, error) {
+	links, err := d.deviceNodeLinks()
+	if err != nil {
+		return nil, fmt.Errorf("failed to discover DRM device links: %v", err)
+	}
+	if len(links) == 0 { // no by-path symlinks were resolved, so no hook is needed
+		return nil, nil
+	}
+
+	hookPath := "nvidia-ctk"
+	args := []string{hookPath, "hook", "create-symlinks"}
+	for _, l := range links {
+		args = append(args, "--link", l)
+	}
+
+	var hooks []discover.Hook
+	hook := discover.Hook{
+		Lifecycle: "createContainer",
+		Path:      hookPath,
+		Args:      args,
+	}
+	hooks = append(hooks, hook)
+
+	return hooks, nil
+}
+
+// Mounts returns an empty slice for a full GPU
+func (d *fullGPUDiscoverer) Mounts() ([]discover.Mount, error) {
+	return nil, nil
+}
+
+func (d *fullGPUDiscoverer) deviceNodeLinks() ([]string, error) { // returns "target::link" entries for the by-path symlinks that resolve
+	candidates := []string{
+		fmt.Sprintf("/dev/dri/by-path/pci-%s-card", d.pciBusID),
+		fmt.Sprintf("/dev/dri/by-path/pci-%s-render", d.pciBusID),
+	}
+
+	var links []string
+	for _, c := range candidates {
+		linkPath := filepath.Join(d.root, c)
+		device, err := os.Readlink(linkPath)
+		if err != nil { // best effort: a missing or unreadable link is logged and skipped
+			d.logger.Warningf("Failed to evaluate symlink %v: %v; ignoring", linkPath, err)
+			continue
+		}
+
+		d.logger.Debugf("adding device symlink %v -> %v", linkPath, device)
+		links = append(links, fmt.Sprintf("%v::%v", device, linkPath))
+	}
+
+	return links, nil
+}
+
+// getBusID provides a utility function that returns the string representation of the bus ID.
+func getBusID(p nvml.PciInfo) string { + var bytes []byte + for _, b := range p.BusId { + if byte(b) == '\x00' { + break + } + bytes = append(bytes, byte(b)) + } + id := strings.ToLower(string(bytes)) + + if id != "0000" { + id = strings.TrimPrefix(id, "0000") + } + + return id +} diff --git a/cmd/nvidia-ctk/cdi/generate/generate.go b/cmd/nvidia-ctk/cdi/generate/generate.go index 184e9b91..544f1219 100644 --- a/cmd/nvidia-ctk/cdi/generate/generate.go +++ b/cmd/nvidia-ctk/cdi/generate/generate.go @@ -23,10 +23,8 @@ import ( "path/filepath" "strings" - "github.com/NVIDIA/nvidia-container-toolkit/internal/config/image" "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" - "github.com/NVIDIA/nvidia-container-toolkit/internal/ldcache" - "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup" + "github.com/NVIDIA/nvidia-container-toolkit/internal/edits" "github.com/container-orchestrated-devices/container-device-interface/pkg/cdi" specs "github.com/container-orchestrated-devices/container-device-interface/specs-go" "github.com/sirupsen/logrus" @@ -195,406 +193,137 @@ func (m command) generateSpec() (*specs.Spec, error) { devicelib := device.New(device.WithNvml(nvmllib)) + deviceSpecs, err := m.generateDeviceSpecs(devicelib) + if err != nil { + return nil, fmt.Errorf("failed to create device CDI specs: %v", err) + } + + allDevice := createAllDevice(deviceSpecs) + + deviceSpecs = append(deviceSpecs, allDevice) + + allEdits := cdi.ContainerEdits{} + + ipcs, err := NewIPCDiscoverer(m.logger, "") + if err != nil { + return nil, fmt.Errorf("failed to create discoverer for IPC sockets: %v", err) + } + + ipcEdits, err := edits.FromDiscoverer(ipcs) + if err != nil { + return nil, fmt.Errorf("failed to create container edits for IPC sockets: %v", err) + } + // TODO: We should not have to update this after the fact + for _, s := range ipcEdits.Mounts { + s.Options = append(s.Options, "noexec") + } + + allEdits.Append(ipcEdits) + + common, err := 
NewCommonDiscoverer(m.logger, "", nvmllib) + if err != nil { + return nil, fmt.Errorf("failed to create discoverer for common entities: %v", err) + } + + deviceFolderPermissionHooks, err := NewDeviceFolderPermissionHookDiscoverer(m.logger, "", deviceSpecs) + if err != nil { + return nil, fmt.Errorf("failed to generated permission hooks for device nodes: %v", err) + } + + commonEdits, err := edits.FromDiscoverer(discover.Merge(common, deviceFolderPermissionHooks)) + if err != nil { + return nil, fmt.Errorf("failed to create container edits for common entities: %v", err) + } + + allEdits.Append(commonEdits) + + // Construct the spec + // TODO: Use the code to determine the minimal version spec := specs.Spec{ Version: "0.4.0", Kind: "nvidia.com/gpu", - ContainerEdits: specs.ContainerEdits{}, + Devices: deviceSpecs, + ContainerEdits: *allEdits.ContainerEdits, } + + return &spec, nil +} + +func (m command) generateDeviceSpecs(devicelib device.Interface) ([]specs.Device, error) { + var deviceSpecs []specs.Device + err := devicelib.VisitDevices(func(i int, d device.Device) error { - isMig, err := d.IsMigEnabled() + isMigEnabled, err := d.IsMigEnabled() if err != nil { return fmt.Errorf("failed to check whether device is MIG device: %v", err) } - if isMig { + if isMigEnabled { return nil } - device, err := generateEditsForDevice(newGPUDevice(i, d)) + device, err := NewFullGPUDiscoverer(m.logger, "", d) if err != nil { - return fmt.Errorf("failed to generate CDI spec for device %v: %v", i, err) + return fmt.Errorf("failed to create device: %v", err) } - graphicsEdits, err := m.editsForGraphicsDevice(d) + deviceEdits, err := edits.FromDiscoverer(device) if err != nil { - return fmt.Errorf("failed to generate CDI spec for DRM devices associated with device %v: %v", i, err) + return fmt.Errorf("failed to create container edits for device: %v", err) } - // We add the device nodes and hooks edits for the DRM devices; Mounts are added globally - for _, dn := range 
graphicsEdits.DeviceNodes { - device.ContainerEdits.DeviceNodes = append(device.ContainerEdits.DeviceNodes, dn) - } - for _, h := range graphicsEdits.Hooks { - device.ContainerEdits.Hooks = append(device.ContainerEdits.Hooks, h) + deviceSpec := specs.Device{ + Name: fmt.Sprintf("gpu%d", i), + ContainerEdits: *deviceEdits.ContainerEdits, } - spec.Devices = append(spec.Devices, device) + deviceSpecs = append(deviceSpecs, deviceSpec) return nil }) if err != nil { return nil, fmt.Errorf("failed to generate CDI spec for GPU devices: %v", err) } - err = devicelib.VisitMigDevices(func(i int, d device.Device, j int, m device.MigDevice) error { - device, err := generateEditsForDevice(newMigDevice(i, j, m)) + err = devicelib.VisitMigDevices(func(i int, d device.Device, j int, mig device.MigDevice) error { + device, err := NewMigDeviceDiscoverer(m.logger, "", d, mig) if err != nil { - return fmt.Errorf("failed to generate CDI spec for device %v: %v", i, err) + return fmt.Errorf("failed to create MIG device: %v", err) } - spec.Devices = append(spec.Devices, device) + deviceEdits, err := edits.FromDiscoverer(device) + if err != nil { + return fmt.Errorf("failed to create container edits for MIG device: %v", err) + } + + deviceSpec := specs.Device{ + Name: fmt.Sprintf("mig%v:%v", i, j), + ContainerEdits: *deviceEdits.ContainerEdits, + } + + deviceSpecs = append(deviceSpecs, deviceSpec) return nil }) if err != nil { return nil, fmt.Errorf("falied to generate CDI spec for MIG devices: %v", err) } - // We create an "all" device with all the discovered device nodes - var allDeviceNodes []*specs.DeviceNode - for _, d := range spec.Devices { - for _, dn := range d.ContainerEdits.DeviceNodes { - allDeviceNodes = append(allDeviceNodes, dn) + return deviceSpecs, nil +} + +// createAllDevice creates an 'all' device which combines the edits from the previous devices +func createAllDevice(deviceSpecs []specs.Device) specs.Device { + edits := cdi.ContainerEdits{} + + for _, d := range 
deviceSpecs { + edit := cdi.ContainerEdits{ + ContainerEdits: &d.ContainerEdits, } + edits.Append(&edit) } + all := specs.Device{ - Name: "all", - ContainerEdits: specs.ContainerEdits{ - DeviceNodes: allDeviceNodes, - }, + Name: "all", + ContainerEdits: *edits.ContainerEdits, } - - spec.Devices = append(spec.Devices, all) - spec.ContainerEdits.DeviceNodes = m.getExistingMetaDeviceNodes() - - libraries, err := m.findLibs(nvmllib) - if err != nil { - return nil, fmt.Errorf("failed to locate driver libraries: %v", err) - } - - binaries, err := m.findBinaries() - if err != nil { - return nil, fmt.Errorf("failed to locate driver binaries: %v", err) - } - - ipcs, err := m.findIPC() - if err != nil { - return nil, fmt.Errorf("failed to locate driver IPC sockets: %v", err) - } - - graphicsEdits, err := m.editsForGraphicsDevice(nil) - if err != nil { - return nil, fmt.Errorf("failed to generate edits for graphics libraries: %v", err) - } - - libOptions := []string{ - "ro", - "nosuid", - "nodev", - "bind", - } - ipcOptions := append(libOptions, "noexec") - - spec.ContainerEdits.Mounts = append( - generateMountsForPaths(libOptions, libraries, binaries), - generateMountsForPaths(ipcOptions, ipcs)..., - ) - - spec.ContainerEdits.Mounts = append(spec.ContainerEdits.Mounts, graphicsEdits.Mounts...) - - ldcacheUpdateHook := m.generateUpdateLdCacheHook(libraries) - - deviceFolderPermissionHooks, err := m.generateDeviceFolderPermissionHooks(ldcacheUpdateHook.Path, allDeviceNodes) - if err != nil { - return nil, fmt.Errorf("failed to generated permission hooks for device nodes: %v", err) - } - - spec.ContainerEdits.Hooks = append([]*specs.Hook{ldcacheUpdateHook}, deviceFolderPermissionHooks...) 
- - return &spec, nil -} - -func generateEditsForDevice(name string, d deviceInfo) (specs.Device, error) { - deviceNodePaths, err := d.GetDeviceNodes() - if err != nil { - return specs.Device{}, fmt.Errorf("failed to get paths for device: %v", err) - } - - deviceNodes := getDeviceNodesFromPaths(deviceNodePaths) - - device := specs.Device{ - Name: name, - ContainerEdits: specs.ContainerEdits{ - DeviceNodes: deviceNodes, - }, - } - - return device, nil -} - -func (m command) editsForGraphicsDevice(device device.Device) (*specs.ContainerEdits, error) { - selectedDevice := image.NewVisibleDevices("none") - if device != nil { - uuid, ret := device.GetUUID() - if ret != nvml.SUCCESS { - return nil, fmt.Errorf("error getting device UUID: %v", ret) - } - selectedDevice = image.NewVisibleDevices(uuid) - } - cfg := discover.Config{ - Root: "", - NVIDIAContainerToolkitCLIExecutablePath: "nvidia-ctk", - } - // Create a discoverer for the single device: - d, err := discover.NewGraphicsDiscoverer(m.logger, selectedDevice, &cfg) - if err != nil { - return nil, fmt.Errorf("error constructing discoverer: %v", err) - } - - devices, err := d.Devices() - if err != nil { - return nil, fmt.Errorf("error getting DRM devices: %v", err) - } - - var deviceNodes []*specs.DeviceNode - for _, d := range devices { - dn := specs.DeviceNode{ - Path: d.Path, - HostPath: d.HostPath, - } - deviceNodes = append(deviceNodes, &dn) - } - - hooks, err := d.Hooks() - if err != nil { - return nil, fmt.Errorf("error getting hooks: %v", err) - } - - var cdiHooks []*specs.Hook - for _, h := range hooks { - cdiHook := specs.Hook{ - HookName: h.Lifecycle, - Path: h.Path, - Args: h.Args, - } - cdiHooks = append(cdiHooks, &cdiHook) - } - - mounts, err := d.Mounts() - if err != nil { - return nil, fmt.Errorf("error getting mounts: %v", err) - } - - var cdiMounts []*specs.Mount - for _, m := range mounts { - cdiMount := specs.Mount{ - ContainerPath: m.Path, - HostPath: m.HostPath, - Options: []string{ - "ro", - 
"nosuid", - "nodev", - "bind", - }, - Type: "bind", - } - cdiMounts = append(cdiMounts, &cdiMount) - } - - edits := specs.ContainerEdits{ - DeviceNodes: deviceNodes, - Hooks: cdiHooks, - Mounts: cdiMounts, - } - - return &edits, nil -} - -func (m command) getExistingMetaDeviceNodes() []*specs.DeviceNode { - metaDeviceNodePaths := []string{ - "/dev/nvidia-modeset", - "/dev/nvidia-uvm-tools", - "/dev/nvidia-uvm", - "/dev/nvidiactl", - } - - var existingDeviceNodePaths []string - for _, p := range metaDeviceNodePaths { - if _, err := os.Stat(p); err != nil { - m.logger.Infof("Ignoring missing meta device %v", p) - continue - } - existingDeviceNodePaths = append(existingDeviceNodePaths, p) - } - - return getDeviceNodesFromPaths(existingDeviceNodePaths) -} - -func getDeviceNodesFromPaths(deviceNodePaths []string) []*specs.DeviceNode { - var deviceNodes []*specs.DeviceNode - for _, p := range deviceNodePaths { - deviceNode := specs.DeviceNode{ - Path: p, - } - deviceNodes = append(deviceNodes, &deviceNode) - } - - return deviceNodes -} - -func (m command) findLibs(nvmllib nvml.Interface) ([]string, error) { - version, r := nvmllib.SystemGetDriverVersion() - if r != nvml.SUCCESS { - return nil, fmt.Errorf("failed to determine driver version: %v", r) - } - m.logger.Infof("Using driver version %v", version) - - cache, err := ldcache.New(m.logger, "") - if err != nil { - return nil, fmt.Errorf("failed to load ldcache: %v", err) - } - - libs32, libs64 := cache.List() - - var libs []string - for _, l := range libs64 { - if strings.HasSuffix(l, version) { - m.logger.Infof("found 64-bit driver lib: %v", l) - libs = append(libs, l) - } - } - - for _, l := range libs32 { - if strings.HasSuffix(l, version) { - m.logger.Infof("found 32-bit driver lib: %v", l) - libs = append(libs, l) - } - } - - return libs, nil -} - -func (m command) findBinaries() ([]string, error) { - candidates := []string{ - "nvidia-smi", /* System management interface */ - "nvidia-debugdump", /* GPU coredump 
utility */ - "nvidia-persistenced", /* Persistence mode utility */ - "nvidia-cuda-mps-control", /* Multi process service CLI */ - "nvidia-cuda-mps-server", /* Multi process service server */ - } - - locator := lookup.NewExecutableLocator(m.logger, "") - - var binaries []string - for _, c := range candidates { - targets, err := locator.Locate(c) - if err != nil { - m.logger.Warningf("skipping %v: %v", c, err) - continue - } - - binaries = append(binaries, targets[0]) - } - return binaries, nil -} - -func (m command) findIPC() ([]string, error) { - candidates := []string{ - "/var/run/nvidia-persistenced/socket", - "/var/run/nvidia-fabricmanager/socket", - // TODO: This can be controlled by the NV_MPS_PIPE_DIR envvar - "/tmp/nvidia-mps", - } - - locator := lookup.NewFileLocator(m.logger, "") - - var ipcs []string - for _, c := range candidates { - targets, err := locator.Locate(c) - if err != nil { - m.logger.Warningf("skipping %v: %v", c, err) - continue - } - - ipcs = append(ipcs, targets[0]) - } - return ipcs, nil -} - -func generateMountsForPaths(options []string, pathSets ...[]string) []*specs.Mount { - var mounts []*specs.Mount - for _, paths := range pathSets { - for _, p := range paths { - mount := specs.Mount{ - HostPath: p, - // We may want to adjust the container path - ContainerPath: p, - Type: "bind", - Options: options, - } - mounts = append(mounts, &mount) - } - } - return mounts -} - -func (m command) generateUpdateLdCacheHook(libraries []string) *specs.Hook { - locator := lookup.NewExecutableLocator(m.logger, "") - - hook := discover.CreateLDCacheUpdateHook( - m.logger, - locator, - nvidiaCTKExecutable, - nvidiaCTKDefaultFilePath, - libraries, - ) - return &specs.Hook{ - HookName: hook.Lifecycle, - Path: hook.Path, - Args: hook.Args, - } -} - -func (m command) generateDeviceFolderPermissionHooks(nvidiaCTKPath string, deviceNodes []*specs.DeviceNode) ([]*specs.Hook, error) { - var deviceFolders []string - seen := make(map[string]bool) - - for _, dn := 
range deviceNodes { - if !strings.HasPrefix(dn.Path, "/dev") { - m.logger.Warningf("Skipping unexpected device folder path for device %v", dn.Path) - continue - } - for df := filepath.Dir(dn.Path); df != "/dev"; df = filepath.Dir(df) { - if seen[df] { - continue - } - deviceFolders = append(deviceFolders, df) - seen[df] = true - } - } - - foldersByMode := make(map[string][]string) - for _, p := range deviceFolders { - info, err := os.Stat(p) - if err != nil { - return nil, fmt.Errorf("failed to get info for path %v: %v", p, err) - } - mode := fmt.Sprintf("%o", info.Mode().Perm()) - foldersByMode[mode] = append(foldersByMode[mode], p) - } - - var hooks []*specs.Hook - for mode, folders := range foldersByMode { - args := []string{filepath.Base(nvidiaCTKPath), "hook", "chmod", "--mode", mode} - for _, folder := range folders { - args = append(args, "--path", folder) - } - hook := specs.Hook{ - HookName: cdi.CreateContainerHook, - Path: nvidiaCTKPath, - Args: args, - } - - hooks = append(hooks, &hook) - } - - return hooks, nil + return all } // createParentDirsIfRequired creates the parent folders of the specified path if requried. diff --git a/cmd/nvidia-ctk/cdi/generate/ipc.go b/cmd/nvidia-ctk/cdi/generate/ipc.go new file mode 100644 index 00000000..0f5157ee --- /dev/null +++ b/cmd/nvidia-ctk/cdi/generate/ipc.go @@ -0,0 +1,39 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ + +package generate + +import ( + "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" + "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup" + "github.com/sirupsen/logrus" +) + +// NewIPCDiscoverer creates a discoverer for NVIDIA IPC sockets. +func NewIPCDiscoverer(logger *logrus.Logger, root string) (discover.Discover, error) { + d := discover.NewMounts( + logger, + lookup.NewFileLocator(logger, root), + root, + []string{ + "/var/run/nvidia-persistenced/socket", + "/var/run/nvidia-fabricmanager/socket", + "/tmp/nvidia-mps", + }, + ) + + return d, nil +} diff --git a/cmd/nvidia-ctk/cdi/generate/mig-device.go b/cmd/nvidia-ctk/cdi/generate/mig-device.go new file mode 100644 index 00000000..76a4df1f --- /dev/null +++ b/cmd/nvidia-ctk/cdi/generate/mig-device.go @@ -0,0 +1,84 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+type migDeviceDiscoverer struct { + deviceDiscoverer +} + +var _ discover.Discover = (*migDeviceDiscoverer)(nil) + +// NewMigDeviceDiscoverer creates a discoverer for the specified mig device and its parent. +func NewMigDeviceDiscoverer(logger *logrus.Logger, root string, parent device.Device, d device.MigDevice) (discover.Discover, error) { + minor, ret := parent.GetMinorNumber() + if ret != nvml.SUCCESS { + return nil, fmt.Errorf("error getting GPU device minor number: %v", ret) + } + parentPath := fmt.Sprintf("/dev/nvidia%d", minor) + + migCaps, err := nvcaps.NewMigCaps() + if err != nil { + return nil, fmt.Errorf("error getting MIG capability device paths: %v", err) + } + + gi, ret := d.GetGpuInstanceId() + if ret != nvml.SUCCESS { + return nil, fmt.Errorf("error getting GPU Instance ID: %v", ret) + } + + ci, ret := d.GetComputeInstanceId() + if ret != nvml.SUCCESS { + return nil, fmt.Errorf("error getting Compute Instance ID: %v", ret) + } + + giCap := nvcaps.NewGPUInstanceCap(minor, gi) + giCapDevicePath, err := migCaps.GetCapDevicePath(giCap) + if err != nil { + return nil, fmt.Errorf("failed to get GI cap device path: %v", err) + } + + ciCap := nvcaps.NewComputeInstanceCap(minor, gi, ci) + ciCapDevicePath, err := migCaps.GetCapDevicePath(ciCap) + if err != nil { + return nil, fmt.Errorf("failed to get CI cap device path: %v", err) + } + + m := migDeviceDiscoverer{ + deviceDiscoverer: deviceDiscoverer{ + logger: logger, + root: root, + deviceNodePaths: []string{ + parentPath, + giCapDevicePath, + ciCapDevicePath, + }, + }, + } + + return &m, nil +} From 46667b5a8ca1e76d9396cd41c625f606a0392914 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Thu, 1 Dec 2022 17:51:29 +0100 Subject: [PATCH 7/7] Remove unused code Signed-off-by: Evan Lezar --- cmd/nvidia-ctk/cdi/generate/nvml_devices.go | 123 -------------------- 1 file changed, 123 deletions(-) delete mode 100644 cmd/nvidia-ctk/cdi/generate/nvml_devices.go diff --git 
a/cmd/nvidia-ctk/cdi/generate/nvml_devices.go b/cmd/nvidia-ctk/cdi/generate/nvml_devices.go deleted file mode 100644 index 7b89203a..00000000 --- a/cmd/nvidia-ctk/cdi/generate/nvml_devices.go +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY Type, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package generate - -import ( - "fmt" - - "github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps" - "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device" - "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml" -) - -// nvmlDevice wraps an nvml.Device with more functions. -type nvmlDevice struct { - nvml.Device -} - -// nvmlMigDevice allows for specific functions of nvmlDevice to be overridden. 
-type nvmlMigDevice nvmlDevice - -// deviceInfo defines the information the required to construct a Device -type deviceInfo interface { - GetUUID() (string, error) - GetDeviceNodes() ([]string, error) -} - -var _ deviceInfo = (*nvmlDevice)(nil) -var _ deviceInfo = (*nvmlMigDevice)(nil) - -func newGPUDevice(i int, gpu device.Device) (string, nvmlDevice) { - return fmt.Sprintf("gpu%v", i), nvmlDevice{gpu} -} - -func newMigDevice(i int, j int, mig device.MigDevice) (string, nvmlMigDevice) { - return fmt.Sprintf("mig%v:%v", i, j), nvmlMigDevice{mig} -} - -// GetUUID returns the UUID of the device -func (d nvmlDevice) GetUUID() (string, error) { - uuid, ret := d.Device.GetUUID() - if ret != nvml.SUCCESS { - return "", ret - } - return uuid, nil -} - -// GetUUID returns the UUID of the device -func (d nvmlMigDevice) GetUUID() (string, error) { - return nvmlDevice(d).GetUUID() -} - -// GetDeviceNodes returns the device node paths for a GPU device -func (d nvmlDevice) GetDeviceNodes() ([]string, error) { - minor, ret := d.GetMinorNumber() - if ret != nvml.SUCCESS { - return nil, fmt.Errorf("error getting GPU device minor number: %v", ret) - } - path := fmt.Sprintf("/dev/nvidia%d", minor) - - return []string{path}, nil -} - -// GetDeviceNodes returns the device node paths for a MIG device -func (d nvmlMigDevice) GetDeviceNodes() ([]string, error) { - parent, ret := d.GetDeviceHandleFromMigDeviceHandle() - if ret != nvml.SUCCESS { - return nil, fmt.Errorf("error getting parent device: %v", ret) - } - minor, ret := parent.GetMinorNumber() - if ret != nvml.SUCCESS { - return nil, fmt.Errorf("error getting GPU device minor number: %v", ret) - } - parentPath := fmt.Sprintf("/dev/nvidia%d", minor) - - migCaps, err := nvcaps.NewMigCaps() - if err != nil { - return nil, fmt.Errorf("error getting MIG capability device paths: %v", err) - } - - gi, ret := d.GetGpuInstanceId() - if ret != nvml.SUCCESS { - return nil, fmt.Errorf("error getting GPU Instance ID: %v", ret) - } - - ci, ret 
:= d.GetComputeInstanceId() - if ret != nvml.SUCCESS { - return nil, fmt.Errorf("error getting Compute Instance ID: %v", ret) - } - - giCap := nvcaps.NewGPUInstanceCap(minor, gi) - giCapDevicePath, err := migCaps.GetCapDevicePath(giCap) - if err != nil { - return nil, fmt.Errorf("failed to get GI cap device path: %v", err) - } - - ciCap := nvcaps.NewComputeInstanceCap(minor, gi, ci) - ciCapDevicePath, err := migCaps.GetCapDevicePath(ciCap) - if err != nil { - return nil, fmt.Errorf("failed to get CI cap device path: %v", err) - } - - devicePaths := []string{ - parentPath, - giCapDevicePath, - ciCapDevicePath, - } - - return devicePaths, nil -}