diff --git a/cmd/nvidia-ctk/cdi/generate/common.go b/cmd/nvidia-ctk/cdi/generate/common.go new file mode 100644 index 00000000..3e6cf944 --- /dev/null +++ b/cmd/nvidia-ctk/cdi/generate/common.go @@ -0,0 +1,60 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package generate + +import ( + "fmt" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" + "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup" + "github.com/sirupsen/logrus" + "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml" +) + +// NewCommonDiscoverer returns a discoverer for entities that are not associated with a specific CDI device. +// This includes driver libraries and meta devices, for example. 
+func NewCommonDiscoverer(logger *logrus.Logger, root string, nvmllib nvml.Interface) (discover.Discover, error) {
+	metaDevices := discover.NewDeviceDiscoverer(
+		logger,
+		lookup.NewCharDeviceLocator(logger, root),
+		root,
+		[]string{
+			"/dev/nvidia-modeset",
+			"/dev/nvidia-uvm-tools",
+			"/dev/nvidia-uvm",
+			"/dev/nvidiactl",
+		},
+	)
+
+	graphicsMounts, err := discover.NewGraphicsMountsDiscoverer(logger, root)
+	if err != nil {
+		return nil, fmt.Errorf("error constructing discoverer for graphics mounts: %v", err)
+	}
+
+	driverFiles, err := NewDriverDiscoverer(logger, root, nvmllib)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create discoverer for driver files: %v", err)
+	}
+
+	d := discover.Merge(
+		metaDevices,
+		graphicsMounts,
+		driverFiles,
+	)
+
+	return d, nil
+}
diff --git a/cmd/nvidia-ctk/cdi/generate/device-folder-permissions.go b/cmd/nvidia-ctk/cdi/generate/device-folder-permissions.go
new file mode 100644
index 00000000..9359d116
--- /dev/null
+++ b/cmd/nvidia-ctk/cdi/generate/device-folder-permissions.go
@@ -0,0 +1,136 @@
+/**
+# Copyright (c) NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package generate
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
+	"github.com/container-orchestrated-devices/container-device-interface/specs-go"
+	"github.com/sirupsen/logrus"
+)
+
+// deviceFolderPermissions is a discoverer that only yields chmod hooks for the parent folders of device nodes.
+type deviceFolderPermissions struct {
+	logger        *logrus.Logger
+	root          string
+	foldersByMode map[string][]string // folder paths keyed by their permission bits formatted in octal
+}
+
+var _ discover.Discover = (*deviceFolderPermissions)(nil)
+
+// NewDeviceFolderPermissionHookDiscoverer creates a discoverer that can be used to update the permissions for the parent folders of nested device nodes from the specified set of device specs.
+// This works around an issue with rootless podman when using crun as a low-level runtime.
+// See https://github.com/containers/crun/issues/1047
+// Device nodes whose path is not below /dev are skipped with a warning.
+// An error is returned if one of the parent folders cannot be inspected with os.Stat.
+// TODO: This currently assumes `root == ""`
+func NewDeviceFolderPermissionHookDiscoverer(logger *logrus.Logger, root string, deviceSpecs []specs.Device) (discover.Discover, error) {
+	var paths []string
+	seen := make(map[string]bool)
+
+	for _, device := range deviceSpecs {
+		for _, dn := range device.ContainerEdits.DeviceNodes {
+			// Normalise the path and require a separator after /dev. A bare prefix
+			// match would accept paths such as /devices/foo (or /dev itself), for
+			// which the walk below never reaches /dev and so never terminates.
+			devicePath := filepath.Clean(dn.Path)
+			if !strings.HasPrefix(devicePath, "/dev/") {
+				logger.Warningf("Skipping unexpected device folder path for device %v", dn.Path)
+				continue
+			}
+			// Record every folder between the device node and /dev (exclusive).
+			for df := filepath.Dir(devicePath); df != "/dev"; df = filepath.Dir(df) {
+				if seen[df] {
+					// The parents of a recorded folder were recorded together with it.
+					break
+				}
+				paths = append(paths, df)
+				seen[df] = true
+			}
+		}
+	}
+
+	// Group the folders by the permission bits that os.Stat reports for them.
+	foldersByMode := make(map[string][]string)
+	for _, p := range paths {
+		info, err := os.Stat(p)
+		if err != nil {
+			return nil, fmt.Errorf("failed to get info for path %v: %v", p, err)
+		}
+		mode := fmt.Sprintf("%o", info.Mode().Perm())
+		foldersByMode[mode] = append(foldersByMode[mode], p)
+	}
+
+	d := &deviceFolderPermissions{
+		logger:        logger,
+		root:          root,
+		foldersByMode: foldersByMode,
+	}
+
+	return d, nil
+}
+
+// Devices are empty for this discoverer
+func (d *deviceFolderPermissions) Devices() ([]discover.Device, error) {
+	return nil, nil
+}
+
+// Hooks returns a set of hooks that sets the file modes of parent folders for device nodes.
+// One hook is returned per mode.
+// The hooks are ordered by mode so that repeated runs produce the same result.
+func (d *deviceFolderPermissions) Hooks() ([]discover.Hook, error) {
+	locator := lookup.NewExecutableLocator(d.logger, d.root)
+
+	// Map iteration order is unspecified; sort the modes for reproducible output.
+	modes := make([]string, 0, len(d.foldersByMode))
+	for mode := range d.foldersByMode {
+		modes = append(modes, mode)
+	}
+	sort.Strings(modes)
+
+	var hooks []discover.Hook
+	for _, mode := range modes {
+		args := []string{"--mode", mode}
+		for _, folder := range d.foldersByMode[mode] {
+			args = append(args, "--path", folder)
+		}
+
+		hook := discover.CreateNvidiaCTKHook(
+			d.logger,
+			locator,
+			nvidiaCTKExecutable,
+			nvidiaCTKDefaultFilePath,
+			"chmod",
+			args...,
+		)
+
+		hooks = append(hooks, hook)
+	}
+
+	return hooks, nil
+}
+
+// Mounts are empty for this discoverer
+func (d *deviceFolderPermissions) Mounts() ([]discover.Mount, error) {
+	return nil, nil
+}
diff --git a/cmd/nvidia-ctk/cdi/generate/device.go b/cmd/nvidia-ctk/cdi/generate/device.go
new file mode 100644
index 00000000..3d908ace
--- /dev/null
+++ b/cmd/nvidia-ctk/cdi/generate/device.go
@@ -0,0 +1,57 @@
+/**
+# Copyright (c) NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package generate
+
+import (
+	"path/filepath"
+
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
+	"github.com/sirupsen/logrus"
+)
+
+// deviceDiscoverer defines a discoverer for device nodes
+type deviceDiscoverer struct {
+	logger          *logrus.Logger
+	root            string
+	deviceNodePaths []string
+}
+
+var _ discover.Discover = (*deviceDiscoverer)(nil)
+
+// Devices returns one device per configured device node path; the host path is the path joined onto root.
+func (d *deviceDiscoverer) Devices() ([]discover.Device, error) {
+	var deviceNodes []discover.Device
+	for _, dn := range d.deviceNodePaths {
+		deviceNode := discover.Device{
+			HostPath: filepath.Join(d.root, dn),
+			Path:     dn,
+		}
+		deviceNodes = append(deviceNodes, deviceNode)
+	}
+
+	return deviceNodes, nil
+}
+
+// Hooks returns no hooks for a device discoverer
+func (d *deviceDiscoverer) Hooks() ([]discover.Hook, error) {
+	return nil, nil
+}
+
+// Mounts returns no mounts for a device discoverer
+func (d *deviceDiscoverer) Mounts() ([]discover.Mount, error) {
+	return nil, nil
+}
diff --git a/cmd/nvidia-ctk/cdi/generate/driver.go b/cmd/nvidia-ctk/cdi/generate/driver.go
new file mode 100644
index 00000000..c5aec328
--- /dev/null
+++ b/cmd/nvidia-ctk/cdi/generate/driver.go
@@ -0,0 +1,153 @@
+/**
+# Copyright (c) NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package generate
+
+import (
+	"fmt"
+	"strings"
+
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/ldcache"
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
+	"github.com/sirupsen/logrus"
+	"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
+)
+
+// driverLibraries is a discoverer for a fixed list of driver library paths.
+type driverLibraries struct {
+	logger    *logrus.Logger
+	root      string
+	libraries []string
+}
+
+var _ discover.Discover = (*driverLibraries)(nil)
+
+// NewDriverDiscoverer creates a discoverer for the libraries and binaries associated with a driver installation.
+// The supplied NVML Library is used to query the expected driver version.
+func NewDriverDiscoverer(logger *logrus.Logger, root string, nvmllib nvml.Interface) (discover.Discover, error) {
+	libraries, err := NewDriverLibraryDiscoverer(logger, root, nvmllib)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create discoverer for driver libraries: %v", err)
+	}
+
+	binaries := discover.NewMounts(
+		logger,
+		lookup.NewExecutableLocator(logger, root),
+		root,
+		[]string{
+			"nvidia-smi",              /* System management interface */
+			"nvidia-debugdump",        /* GPU coredump utility */
+			"nvidia-persistenced",     /* Persistence mode utility */
+			"nvidia-cuda-mps-control", /* Multi process service CLI */
+			"nvidia-cuda-mps-server",  /* Multi process service server */
+		},
+	)
+
+	d := discover.Merge(
+		libraries,
+		binaries,
+	)
+
+	return d, nil
+}
+
+// NewDriverLibraryDiscoverer creates a discoverer for the libraries associated with the specified driver version.
+// The version is queried through the supplied NVML library; an error is returned if that query or the library lookup fails.
+func NewDriverLibraryDiscoverer(logger *logrus.Logger, root string, nvmllib nvml.Interface) (discover.Discover, error) {
+	version, r := nvmllib.SystemGetDriverVersion()
+	if r != nvml.SUCCESS {
+		return nil, fmt.Errorf("failed to determine driver version: %v", r)
+	}
+
+	libraries, err := findVersionLibs(logger, root, version)
+	if err != nil {
+		// Report the lookup error itself; r is nvml.SUCCESS on this path.
+		return nil, fmt.Errorf("failed to get libraries for driver version: %v", err)
+	}
+
+	d := driverLibraries{
+		logger:    logger,
+		root:      root,
+		libraries: libraries,
+	}
+
+	return &d, nil
+}
+
+// Devices are empty for this discoverer
+func (d *driverLibraries) Devices() ([]discover.Device, error) {
+	return nil, nil
+}
+
+// Mounts returns the mounts for the driver libraries
+// Each library uses the same value for HostPath and Path.
+func (d *driverLibraries) Mounts() ([]discover.Mount, error) {
+	var mounts []discover.Mount
+	for _, library := range d.libraries {
+		mount := discover.Mount{
+			HostPath: library,
+			Path:     library,
+		}
+		mounts = append(mounts, mount)
+	}
+
+	return mounts, nil
+}
+
+// Hooks returns a hook that updates the LDCache for the specified driver library paths.
+func (d *driverLibraries) Hooks() ([]discover.Hook, error) {
+	locator := lookup.NewExecutableLocator(d.logger, d.root)
+
+	hook := discover.CreateLDCacheUpdateHook(
+		d.logger,
+		locator,
+		nvidiaCTKExecutable,
+		nvidiaCTKDefaultFilePath,
+		d.libraries,
+	)
+
+	return []discover.Hook{hook}, nil
+}
+
+// findVersionLibs returns the entries of the ldcache loaded for root whose names end with the specified driver version.
+// Entries that the cache lists as 64-bit are returned before those it lists as 32-bit.
+func findVersionLibs(logger *logrus.Logger, root string, version string) ([]string, error) {
+	logger.Infof("Using driver version %v", version)
+
+	cache, err := ldcache.New(logger, root)
+	if err != nil {
+		return nil, fmt.Errorf("failed to load ldcache: %v", err)
+	}
+
+	libs32, libs64 := cache.List()
+
+	var libs []string
+	for _, l := range libs64 {
+		if strings.HasSuffix(l, version) {
+			logger.Infof("found 64-bit driver lib: %v", l)
+			libs = append(libs, l)
+		}
+	}
+
+	for _, l := range libs32 {
+		if strings.HasSuffix(l, version) {
+			logger.Infof("found 32-bit driver lib: %v", l)
+			libs = append(libs, l)
+		}
+	}
+
+	return libs, nil
+}
diff --git a/cmd/nvidia-ctk/cdi/generate/full-gpu.go b/cmd/nvidia-ctk/cdi/generate/full-gpu.go
new file mode 100644
index 00000000..e9de7cb8
--- /dev/null
+++ b/cmd/nvidia-ctk/cdi/generate/full-gpu.go
@@ -0,0 +1,148 @@
+/**
+# Copyright (c) NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package generate
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/info/drm"
+	"github.com/sirupsen/logrus"
+	"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
+	"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
+)
+
+// fullGPUDiscoverer wraps a deviceDiscoverer and adds specifics required for discovering full GPUs
+type fullGPUDiscoverer struct {
+	deviceDiscoverer
+
+	pciBusID string // bus ID as returned by getBusID; used to build the /dev/dri/by-path link names
+}
+
+var _ discover.Discover = (*fullGPUDiscoverer)(nil)
+
+// NewFullGPUDiscoverer creates a discoverer for the full GPU defined by the specified device.
+func NewFullGPUDiscoverer(logger *logrus.Logger, root string, d device.Device) (discover.Discover, error) {
+	// TODO: The functionality to get device paths should be integrated into the go-nvlib/pkg/device.Device interface.
+	// This will allow reuse here and in other code where the paths are queried such as the NVIDIA device plugin.
+	minor, ret := d.GetMinorNumber()
+	if ret != nvml.SUCCESS {
+		return nil, fmt.Errorf("error getting GPU device minor number: %v", ret)
+	}
+	path := fmt.Sprintf("/dev/nvidia%d", minor)
+
+	pciInfo, ret := d.GetPciInfo()
+	if ret != nvml.SUCCESS {
+		return nil, fmt.Errorf("error getting PCI info for device: %v", ret)
+	}
+	pciBusID := getBusID(pciInfo)
+
+	drmDeviceNodes, err := drm.GetDeviceNodesByBusID(pciBusID)
+	if err != nil {
+		return nil, fmt.Errorf("failed to determine DRM devices for %v: %v", pciBusID, err)
+	}
+
+	deviceNodePaths := append([]string{path}, drmDeviceNodes...) // the GPU node first, then its DRM nodes
+
+	discoverer := fullGPUDiscoverer{ // not named device, which would shadow the imported package
+		deviceDiscoverer: deviceDiscoverer{
+			logger:          logger,
+			root:            root,
+			deviceNodePaths: deviceNodePaths,
+		},
+		pciBusID: pciBusID,
+	}
+
+	return &discoverer, nil
+}
+
+// Hooks returns the hooks for the GPU device.
+// The following hooks are detected:
+// 1. A hook to create /dev/dri/by-path symlinks
+func (d *fullGPUDiscoverer) Hooks() ([]discover.Hook, error) {
+	links, err := d.deviceNodeLinks()
+	if err != nil {
+		return nil, fmt.Errorf("failed to discover DRM device links: %v", err)
+	}
+	if len(links) == 0 {
+		return nil, nil
+	}
+
+	hookPath := "nvidia-ctk"
+	args := []string{hookPath, "hook", "create-symlinks"}
+	for _, l := range links {
+		args = append(args, "--link", l)
+	}
+
+	hook := discover.Hook{
+		Lifecycle: "createContainer",
+		Path:      hookPath,
+		Args:      args,
+	}
+
+	return []discover.Hook{hook}, nil
+}
+
+// Mounts returns no mounts for a full GPU
+func (d *fullGPUDiscoverer) Mounts() ([]discover.Mount, error) {
+	return nil, nil
+}
+
+// deviceNodeLinks returns the /dev/dri/by-path symlinks found for the GPU as target::link pairs.
+// Candidates that cannot be read as symlinks are logged and skipped; the returned error is always nil.
+func (d *fullGPUDiscoverer) deviceNodeLinks() ([]string, error) {
+	candidates := []string{
+		fmt.Sprintf("/dev/dri/by-path/pci-%s-card", d.pciBusID),
+		fmt.Sprintf("/dev/dri/by-path/pci-%s-render", d.pciBusID),
+	}
+
+	var links []string
+	for _, c := range candidates {
+		linkPath := filepath.Join(d.root, c)
+		target, err := os.Readlink(linkPath)
+		if err != nil {
+			d.logger.Warningf("Failed to evaluate symlink %v; ignoring: %v", linkPath, err)
+			continue
+		}
+
+		d.logger.Debugf("adding device symlink %v -> %v", linkPath, target)
+		links = append(links, fmt.Sprintf("%v::%v", target, linkPath))
+	}
+
+	return links, nil
+}
+
+// getBusID provides a utility function that returns the string representation of the bus ID.
+func getBusID(p nvml.PciInfo) string { + var bytes []byte + for _, b := range p.BusId { + if byte(b) == '\x00' { + break + } + bytes = append(bytes, byte(b)) + } + id := strings.ToLower(string(bytes)) + + if id != "0000" { + id = strings.TrimPrefix(id, "0000") + } + + return id +} diff --git a/cmd/nvidia-ctk/cdi/generate/generate.go b/cmd/nvidia-ctk/cdi/generate/generate.go index 184e9b91..544f1219 100644 --- a/cmd/nvidia-ctk/cdi/generate/generate.go +++ b/cmd/nvidia-ctk/cdi/generate/generate.go @@ -23,10 +23,8 @@ import ( "path/filepath" "strings" - "github.com/NVIDIA/nvidia-container-toolkit/internal/config/image" "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" - "github.com/NVIDIA/nvidia-container-toolkit/internal/ldcache" - "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup" + "github.com/NVIDIA/nvidia-container-toolkit/internal/edits" "github.com/container-orchestrated-devices/container-device-interface/pkg/cdi" specs "github.com/container-orchestrated-devices/container-device-interface/specs-go" "github.com/sirupsen/logrus" @@ -195,406 +193,137 @@ func (m command) generateSpec() (*specs.Spec, error) { devicelib := device.New(device.WithNvml(nvmllib)) + deviceSpecs, err := m.generateDeviceSpecs(devicelib) + if err != nil { + return nil, fmt.Errorf("failed to create device CDI specs: %v", err) + } + + allDevice := createAllDevice(deviceSpecs) + + deviceSpecs = append(deviceSpecs, allDevice) + + allEdits := cdi.ContainerEdits{} + + ipcs, err := NewIPCDiscoverer(m.logger, "") + if err != nil { + return nil, fmt.Errorf("failed to create discoverer for IPC sockets: %v", err) + } + + ipcEdits, err := edits.FromDiscoverer(ipcs) + if err != nil { + return nil, fmt.Errorf("failed to create container edits for IPC sockets: %v", err) + } + // TODO: We should not have to update this after the fact + for _, s := range ipcEdits.Mounts { + s.Options = append(s.Options, "noexec") + } + + allEdits.Append(ipcEdits) + + common, err := 
NewCommonDiscoverer(m.logger, "", nvmllib) + if err != nil { + return nil, fmt.Errorf("failed to create discoverer for common entities: %v", err) + } + + deviceFolderPermissionHooks, err := NewDeviceFolderPermissionHookDiscoverer(m.logger, "", deviceSpecs) + if err != nil { + return nil, fmt.Errorf("failed to generated permission hooks for device nodes: %v", err) + } + + commonEdits, err := edits.FromDiscoverer(discover.Merge(common, deviceFolderPermissionHooks)) + if err != nil { + return nil, fmt.Errorf("failed to create container edits for common entities: %v", err) + } + + allEdits.Append(commonEdits) + + // Construct the spec + // TODO: Use the code to determine the minimal version spec := specs.Spec{ Version: "0.4.0", Kind: "nvidia.com/gpu", - ContainerEdits: specs.ContainerEdits{}, + Devices: deviceSpecs, + ContainerEdits: *allEdits.ContainerEdits, } + + return &spec, nil +} + +func (m command) generateDeviceSpecs(devicelib device.Interface) ([]specs.Device, error) { + var deviceSpecs []specs.Device + err := devicelib.VisitDevices(func(i int, d device.Device) error { - isMig, err := d.IsMigEnabled() + isMigEnabled, err := d.IsMigEnabled() if err != nil { return fmt.Errorf("failed to check whether device is MIG device: %v", err) } - if isMig { + if isMigEnabled { return nil } - device, err := generateEditsForDevice(newGPUDevice(i, d)) + device, err := NewFullGPUDiscoverer(m.logger, "", d) if err != nil { - return fmt.Errorf("failed to generate CDI spec for device %v: %v", i, err) + return fmt.Errorf("failed to create device: %v", err) } - graphicsEdits, err := m.editsForGraphicsDevice(d) + deviceEdits, err := edits.FromDiscoverer(device) if err != nil { - return fmt.Errorf("failed to generate CDI spec for DRM devices associated with device %v: %v", i, err) + return fmt.Errorf("failed to create container edits for device: %v", err) } - // We add the device nodes and hooks edits for the DRM devices; Mounts are added globally - for _, dn := range 
graphicsEdits.DeviceNodes { - device.ContainerEdits.DeviceNodes = append(device.ContainerEdits.DeviceNodes, dn) - } - for _, h := range graphicsEdits.Hooks { - device.ContainerEdits.Hooks = append(device.ContainerEdits.Hooks, h) + deviceSpec := specs.Device{ + Name: fmt.Sprintf("gpu%d", i), + ContainerEdits: *deviceEdits.ContainerEdits, } - spec.Devices = append(spec.Devices, device) + deviceSpecs = append(deviceSpecs, deviceSpec) return nil }) if err != nil { return nil, fmt.Errorf("failed to generate CDI spec for GPU devices: %v", err) } - err = devicelib.VisitMigDevices(func(i int, d device.Device, j int, m device.MigDevice) error { - device, err := generateEditsForDevice(newMigDevice(i, j, m)) + err = devicelib.VisitMigDevices(func(i int, d device.Device, j int, mig device.MigDevice) error { + device, err := NewMigDeviceDiscoverer(m.logger, "", d, mig) if err != nil { - return fmt.Errorf("failed to generate CDI spec for device %v: %v", i, err) + return fmt.Errorf("failed to create MIG device: %v", err) } - spec.Devices = append(spec.Devices, device) + deviceEdits, err := edits.FromDiscoverer(device) + if err != nil { + return fmt.Errorf("failed to create container edits for MIG device: %v", err) + } + + deviceSpec := specs.Device{ + Name: fmt.Sprintf("mig%v:%v", i, j), + ContainerEdits: *deviceEdits.ContainerEdits, + } + + deviceSpecs = append(deviceSpecs, deviceSpec) return nil }) if err != nil { return nil, fmt.Errorf("falied to generate CDI spec for MIG devices: %v", err) } - // We create an "all" device with all the discovered device nodes - var allDeviceNodes []*specs.DeviceNode - for _, d := range spec.Devices { - for _, dn := range d.ContainerEdits.DeviceNodes { - allDeviceNodes = append(allDeviceNodes, dn) + return deviceSpecs, nil +} + +// createAllDevice creates an 'all' device which combines the edits from the previous devices +func createAllDevice(deviceSpecs []specs.Device) specs.Device { + edits := cdi.ContainerEdits{} + + for _, d := range 
deviceSpecs { + edit := cdi.ContainerEdits{ + ContainerEdits: &d.ContainerEdits, } + edits.Append(&edit) } + all := specs.Device{ - Name: "all", - ContainerEdits: specs.ContainerEdits{ - DeviceNodes: allDeviceNodes, - }, + Name: "all", + ContainerEdits: *edits.ContainerEdits, } - - spec.Devices = append(spec.Devices, all) - spec.ContainerEdits.DeviceNodes = m.getExistingMetaDeviceNodes() - - libraries, err := m.findLibs(nvmllib) - if err != nil { - return nil, fmt.Errorf("failed to locate driver libraries: %v", err) - } - - binaries, err := m.findBinaries() - if err != nil { - return nil, fmt.Errorf("failed to locate driver binaries: %v", err) - } - - ipcs, err := m.findIPC() - if err != nil { - return nil, fmt.Errorf("failed to locate driver IPC sockets: %v", err) - } - - graphicsEdits, err := m.editsForGraphicsDevice(nil) - if err != nil { - return nil, fmt.Errorf("failed to generate edits for graphics libraries: %v", err) - } - - libOptions := []string{ - "ro", - "nosuid", - "nodev", - "bind", - } - ipcOptions := append(libOptions, "noexec") - - spec.ContainerEdits.Mounts = append( - generateMountsForPaths(libOptions, libraries, binaries), - generateMountsForPaths(ipcOptions, ipcs)..., - ) - - spec.ContainerEdits.Mounts = append(spec.ContainerEdits.Mounts, graphicsEdits.Mounts...) - - ldcacheUpdateHook := m.generateUpdateLdCacheHook(libraries) - - deviceFolderPermissionHooks, err := m.generateDeviceFolderPermissionHooks(ldcacheUpdateHook.Path, allDeviceNodes) - if err != nil { - return nil, fmt.Errorf("failed to generated permission hooks for device nodes: %v", err) - } - - spec.ContainerEdits.Hooks = append([]*specs.Hook{ldcacheUpdateHook}, deviceFolderPermissionHooks...) 
- - return &spec, nil -} - -func generateEditsForDevice(name string, d deviceInfo) (specs.Device, error) { - deviceNodePaths, err := d.GetDeviceNodes() - if err != nil { - return specs.Device{}, fmt.Errorf("failed to get paths for device: %v", err) - } - - deviceNodes := getDeviceNodesFromPaths(deviceNodePaths) - - device := specs.Device{ - Name: name, - ContainerEdits: specs.ContainerEdits{ - DeviceNodes: deviceNodes, - }, - } - - return device, nil -} - -func (m command) editsForGraphicsDevice(device device.Device) (*specs.ContainerEdits, error) { - selectedDevice := image.NewVisibleDevices("none") - if device != nil { - uuid, ret := device.GetUUID() - if ret != nvml.SUCCESS { - return nil, fmt.Errorf("error getting device UUID: %v", ret) - } - selectedDevice = image.NewVisibleDevices(uuid) - } - cfg := discover.Config{ - Root: "", - NVIDIAContainerToolkitCLIExecutablePath: "nvidia-ctk", - } - // Create a discoverer for the single device: - d, err := discover.NewGraphicsDiscoverer(m.logger, selectedDevice, &cfg) - if err != nil { - return nil, fmt.Errorf("error constructing discoverer: %v", err) - } - - devices, err := d.Devices() - if err != nil { - return nil, fmt.Errorf("error getting DRM devices: %v", err) - } - - var deviceNodes []*specs.DeviceNode - for _, d := range devices { - dn := specs.DeviceNode{ - Path: d.Path, - HostPath: d.HostPath, - } - deviceNodes = append(deviceNodes, &dn) - } - - hooks, err := d.Hooks() - if err != nil { - return nil, fmt.Errorf("error getting hooks: %v", err) - } - - var cdiHooks []*specs.Hook - for _, h := range hooks { - cdiHook := specs.Hook{ - HookName: h.Lifecycle, - Path: h.Path, - Args: h.Args, - } - cdiHooks = append(cdiHooks, &cdiHook) - } - - mounts, err := d.Mounts() - if err != nil { - return nil, fmt.Errorf("error getting mounts: %v", err) - } - - var cdiMounts []*specs.Mount - for _, m := range mounts { - cdiMount := specs.Mount{ - ContainerPath: m.Path, - HostPath: m.HostPath, - Options: []string{ - "ro", - 
"nosuid", - "nodev", - "bind", - }, - Type: "bind", - } - cdiMounts = append(cdiMounts, &cdiMount) - } - - edits := specs.ContainerEdits{ - DeviceNodes: deviceNodes, - Hooks: cdiHooks, - Mounts: cdiMounts, - } - - return &edits, nil -} - -func (m command) getExistingMetaDeviceNodes() []*specs.DeviceNode { - metaDeviceNodePaths := []string{ - "/dev/nvidia-modeset", - "/dev/nvidia-uvm-tools", - "/dev/nvidia-uvm", - "/dev/nvidiactl", - } - - var existingDeviceNodePaths []string - for _, p := range metaDeviceNodePaths { - if _, err := os.Stat(p); err != nil { - m.logger.Infof("Ignoring missing meta device %v", p) - continue - } - existingDeviceNodePaths = append(existingDeviceNodePaths, p) - } - - return getDeviceNodesFromPaths(existingDeviceNodePaths) -} - -func getDeviceNodesFromPaths(deviceNodePaths []string) []*specs.DeviceNode { - var deviceNodes []*specs.DeviceNode - for _, p := range deviceNodePaths { - deviceNode := specs.DeviceNode{ - Path: p, - } - deviceNodes = append(deviceNodes, &deviceNode) - } - - return deviceNodes -} - -func (m command) findLibs(nvmllib nvml.Interface) ([]string, error) { - version, r := nvmllib.SystemGetDriverVersion() - if r != nvml.SUCCESS { - return nil, fmt.Errorf("failed to determine driver version: %v", r) - } - m.logger.Infof("Using driver version %v", version) - - cache, err := ldcache.New(m.logger, "") - if err != nil { - return nil, fmt.Errorf("failed to load ldcache: %v", err) - } - - libs32, libs64 := cache.List() - - var libs []string - for _, l := range libs64 { - if strings.HasSuffix(l, version) { - m.logger.Infof("found 64-bit driver lib: %v", l) - libs = append(libs, l) - } - } - - for _, l := range libs32 { - if strings.HasSuffix(l, version) { - m.logger.Infof("found 32-bit driver lib: %v", l) - libs = append(libs, l) - } - } - - return libs, nil -} - -func (m command) findBinaries() ([]string, error) { - candidates := []string{ - "nvidia-smi", /* System management interface */ - "nvidia-debugdump", /* GPU coredump 
utility */ - "nvidia-persistenced", /* Persistence mode utility */ - "nvidia-cuda-mps-control", /* Multi process service CLI */ - "nvidia-cuda-mps-server", /* Multi process service server */ - } - - locator := lookup.NewExecutableLocator(m.logger, "") - - var binaries []string - for _, c := range candidates { - targets, err := locator.Locate(c) - if err != nil { - m.logger.Warningf("skipping %v: %v", c, err) - continue - } - - binaries = append(binaries, targets[0]) - } - return binaries, nil -} - -func (m command) findIPC() ([]string, error) { - candidates := []string{ - "/var/run/nvidia-persistenced/socket", - "/var/run/nvidia-fabricmanager/socket", - // TODO: This can be controlled by the NV_MPS_PIPE_DIR envvar - "/tmp/nvidia-mps", - } - - locator := lookup.NewFileLocator(m.logger, "") - - var ipcs []string - for _, c := range candidates { - targets, err := locator.Locate(c) - if err != nil { - m.logger.Warningf("skipping %v: %v", c, err) - continue - } - - ipcs = append(ipcs, targets[0]) - } - return ipcs, nil -} - -func generateMountsForPaths(options []string, pathSets ...[]string) []*specs.Mount { - var mounts []*specs.Mount - for _, paths := range pathSets { - for _, p := range paths { - mount := specs.Mount{ - HostPath: p, - // We may want to adjust the container path - ContainerPath: p, - Type: "bind", - Options: options, - } - mounts = append(mounts, &mount) - } - } - return mounts -} - -func (m command) generateUpdateLdCacheHook(libraries []string) *specs.Hook { - locator := lookup.NewExecutableLocator(m.logger, "") - - hook := discover.CreateLDCacheUpdateHook( - m.logger, - locator, - nvidiaCTKExecutable, - nvidiaCTKDefaultFilePath, - libraries, - ) - return &specs.Hook{ - HookName: hook.Lifecycle, - Path: hook.Path, - Args: hook.Args, - } -} - -func (m command) generateDeviceFolderPermissionHooks(nvidiaCTKPath string, deviceNodes []*specs.DeviceNode) ([]*specs.Hook, error) { - var deviceFolders []string - seen := make(map[string]bool) - - for _, dn := 
range deviceNodes { - if !strings.HasPrefix(dn.Path, "/dev") { - m.logger.Warningf("Skipping unexpected device folder path for device %v", dn.Path) - continue - } - for df := filepath.Dir(dn.Path); df != "/dev"; df = filepath.Dir(df) { - if seen[df] { - continue - } - deviceFolders = append(deviceFolders, df) - seen[df] = true - } - } - - foldersByMode := make(map[string][]string) - for _, p := range deviceFolders { - info, err := os.Stat(p) - if err != nil { - return nil, fmt.Errorf("failed to get info for path %v: %v", p, err) - } - mode := fmt.Sprintf("%o", info.Mode().Perm()) - foldersByMode[mode] = append(foldersByMode[mode], p) - } - - var hooks []*specs.Hook - for mode, folders := range foldersByMode { - args := []string{filepath.Base(nvidiaCTKPath), "hook", "chmod", "--mode", mode} - for _, folder := range folders { - args = append(args, "--path", folder) - } - hook := specs.Hook{ - HookName: cdi.CreateContainerHook, - Path: nvidiaCTKPath, - Args: args, - } - - hooks = append(hooks, &hook) - } - - return hooks, nil + return all } // createParentDirsIfRequired creates the parent folders of the specified path if requried. diff --git a/cmd/nvidia-ctk/cdi/generate/ipc.go b/cmd/nvidia-ctk/cdi/generate/ipc.go new file mode 100644 index 00000000..0f5157ee --- /dev/null +++ b/cmd/nvidia-ctk/cdi/generate/ipc.go @@ -0,0 +1,39 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/
+
+package generate
+
+import (
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
+	"github.com/sirupsen/logrus"
+)
+
+// NewIPCDiscoverer creates a discoverer for NVIDIA IPC sockets.
+func NewIPCDiscoverer(logger *logrus.Logger, root string) (discover.Discover, error) {
+	sockets := discover.NewMounts(
+		logger,
+		lookup.NewFileLocator(logger, root),
+		root,
+		[]string{
+			"/var/run/nvidia-persistenced/socket",
+			"/var/run/nvidia-fabricmanager/socket",
+			"/tmp/nvidia-mps",
+		},
+	)
+
+	return sockets, nil
+}
diff --git a/cmd/nvidia-ctk/cdi/generate/mig-device.go b/cmd/nvidia-ctk/cdi/generate/mig-device.go
new file mode 100644
index 00000000..76a4df1f
--- /dev/null
+++ b/cmd/nvidia-ctk/cdi/generate/mig-device.go
@@ -0,0 +1,83 @@
+/**
+# Copyright (c) NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package generate
+
+import (
+	"fmt"
+
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps"
+	"github.com/sirupsen/logrus"
+	"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
+	"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
+)
+
+// migDeviceDiscoverer wraps a deviceDiscoverer and adds specifics required for discovering MIG devices.
+type migDeviceDiscoverer struct {
+	deviceDiscoverer
+}
+
+var _ discover.Discover = (*migDeviceDiscoverer)(nil)
+
+// NewMigDeviceDiscoverer creates a discoverer for the specified mig device and its parent.
+// The discovered device nodes are the /dev/nvidia<minor> node of the parent followed by the
+// capability device paths looked up for the GPU instance and the compute instance of the MIG device.
+func NewMigDeviceDiscoverer(logger *logrus.Logger, root string, parent device.Device, d device.MigDevice) (discover.Discover, error) {
+	gpuMinor, ret := parent.GetMinorNumber()
+	if ret != nvml.SUCCESS {
+		return nil, fmt.Errorf("error getting GPU device minor number: %v", ret)
+	}
+
+	capabilities, err := nvcaps.NewMigCaps()
+	if err != nil {
+		return nil, fmt.Errorf("error getting MIG capability device paths: %v", err)
+	}
+
+	gpuInstanceID, ret := d.GetGpuInstanceId()
+	if ret != nvml.SUCCESS {
+		return nil, fmt.Errorf("error getting GPU Instance ID: %v", ret)
+	}
+
+	computeInstanceID, ret := d.GetComputeInstanceId()
+	if ret != nvml.SUCCESS {
+		return nil, fmt.Errorf("error getting Compute Instance ID: %v", ret)
+	}
+
+	gpuInstanceNode, err := capabilities.GetCapDevicePath(nvcaps.NewGPUInstanceCap(gpuMinor, gpuInstanceID))
+	if err != nil {
+		return nil, fmt.Errorf("failed to get GI cap device path: %v", err)
+	}
+
+	computeInstanceNode, err := capabilities.GetCapDevicePath(nvcaps.NewComputeInstanceCap(gpuMinor, gpuInstanceID, computeInstanceID))
+	if err != nil {
+		return nil, fmt.Errorf("failed to get CI cap device path: %v", err)
+	}
+
+	nodes := []string{
+		fmt.Sprintf("/dev/nvidia%d", gpuMinor),
+		gpuInstanceNode,
+		computeInstanceNode,
+	}
+
+	return &migDeviceDiscoverer{
+		deviceDiscoverer: deviceDiscoverer{
+			logger:          logger,
+			root:            root,
+			deviceNodePaths: nodes,
+		},
+	}, nil
+}