diff --git a/cmd/nvidia-ctk/cdi/generate/device-folder-permissions.go b/cmd/nvidia-ctk/cdi/generate/device-folder-permissions.go index 418a4a3d..6aca4f3d 100644 --- a/cmd/nvidia-ctk/cdi/generate/device-folder-permissions.go +++ b/cmd/nvidia-ctk/cdi/generate/device-folder-permissions.go @@ -17,9 +17,12 @@ package generate import ( + "fmt" "path/filepath" "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" + "github.com/NVIDIA/nvidia-container-toolkit/internal/edits" + "github.com/container-orchestrated-devices/container-device-interface/pkg/cdi" "github.com/container-orchestrated-devices/container-device-interface/specs-go" "github.com/sirupsen/logrus" ) @@ -33,6 +36,16 @@ type deviceFolderPermissions struct { var _ discover.Discover = (*deviceFolderPermissions)(nil) +// GetDeviceFolderPermissionHookEdits gets the edits required for device folder permissions discoverer +func GetDeviceFolderPermissionHookEdits(logger *logrus.Logger, driverRoot string, nvidiaCTKPath string, deviceSpecs []specs.Device) (*cdi.ContainerEdits, error) { + deviceFolderPermissionHooks, err := NewDeviceFolderPermissionHookDiscoverer(logger, driverRoot, nvidiaCTKPath, deviceSpecs) + if err != nil { + return nil, fmt.Errorf("failed to generated permission hooks for device nodes: %v", err) + } + + return edits.FromDiscoverer(deviceFolderPermissionHooks) +} + // NewDeviceFolderPermissionHookDiscoverer creates a discoverer that can be used to update the permissions for the parent folders of nested device nodes from the specified set of device specs. // This works around an issue with rootless podman when using crun as a low-level runtime. 
// See https://github.com/containers/crun/issues/1047 diff --git a/cmd/nvidia-ctk/cdi/generate/generate.go b/cmd/nvidia-ctk/cdi/generate/generate.go index c46eb147..e84dad49 100644 --- a/cmd/nvidia-ctk/cdi/generate/generate.go +++ b/cmd/nvidia-ctk/cdi/generate/generate.go @@ -23,8 +23,8 @@ import ( "path/filepath" "strings" - "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" "github.com/NVIDIA/nvidia-container-toolkit/internal/edits" + "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi" "github.com/container-orchestrated-devices/container-device-interface/pkg/cdi" specs "github.com/container-orchestrated-devices/container-device-interface/specs-go" "github.com/sirupsen/logrus" @@ -90,7 +90,7 @@ func (m command) build() *cli.Command { &cli.StringFlag{ Name: "device-name-strategy", Usage: "Specify the strategy for generating device names. One of [index | uuid | type-index]", - Value: deviceNameStrategyIndex, + Value: nvcdi.DeviceNameStrategyIndex, Destination: &cfg.deviceNameStrategy, }, &cli.StringFlag{ @@ -117,7 +117,7 @@ func (m command) validateFlags(r *cli.Context, cfg *config) error { return fmt.Errorf("invalid output format: %v", cfg.format) } - _, err := newDeviceNamer(cfg.deviceNameStrategy) + _, err := nvcdi.NewDeviceNamer(cfg.deviceNameStrategy) if err != nil { return err } @@ -126,16 +126,7 @@ func (m command) validateFlags(r *cli.Context, cfg *config) error { } func (m command) run(c *cli.Context, cfg *config) error { - deviceNamer, err := newDeviceNamer(cfg.deviceNameStrategy) - if err != nil { - return fmt.Errorf("failed to create device namer: %v", err) - } - - spec, err := m.generateSpec( - cfg.driverRoot, - discover.FindNvidiaCTK(m.logger, cfg.nvidiaCTKPath), - deviceNamer, - ) + spec, err := m.generateSpec(cfg) if err != nil { return fmt.Errorf("failed to generate CDI spec: %v", err) } @@ -214,7 +205,12 @@ func writeToOutput(format string, data []byte, output io.Writer) error { return nil } -func (m command) generateSpec(driverRoot 
string, nvidiaCTKPath string, namer deviceNamer) (*specs.Spec, error) { +func (m command) generateSpec(cfg *config) (*specs.Spec, error) { + deviceNamer, err := nvcdi.NewDeviceNamer(cfg.deviceNameStrategy) + if err != nil { + return nil, fmt.Errorf("failed to create device namer: %v", err) + } + nvmllib := nvml.New() if r := nvmllib.Init(); r != nvml.SUCCESS { return nil, r @@ -223,7 +219,16 @@ func (m command) generateSpec(driverRoot string, nvidiaCTKPath string, namer dev devicelib := device.New(device.WithNvml(nvmllib)) - deviceSpecs, err := m.generateDeviceSpecs(devicelib, driverRoot, nvidiaCTKPath, namer) + cdilib := nvcdi.New( + nvcdi.WithLogger(m.logger), + nvcdi.WithDriverRoot(cfg.driverRoot), + nvcdi.WithNVIDIACTKPath(cfg.nvidiaCTKPath), + nvcdi.WithDeviceNamer(deviceNamer), + nvcdi.WithDeviceLib(devicelib), + nvcdi.WithNvmlLib(nvmllib), + ) + + deviceSpecs, err := cdilib.GetAllDeviceSpecs() if err != nil { return nil, fmt.Errorf("failed to create device CDI specs: %v", err) } @@ -232,20 +237,16 @@ func (m command) generateSpec(driverRoot string, nvidiaCTKPath string, namer dev deviceSpecs = append(deviceSpecs, allDevice) - common, err := NewCommonDiscoverer(m.logger, driverRoot, nvidiaCTKPath, nvmllib) + commonEdits, err := cdilib.GetCommonEdits() if err != nil { - return nil, fmt.Errorf("failed to create discoverer for common entities: %v", err) + return nil, fmt.Errorf("failed to create edits for common entities: %v", err) + } + deviceFolderPermissionEdits, err := GetDeviceFolderPermissionHookEdits(m.logger, cfg.driverRoot, cfg.nvidiaCTKPath, deviceSpecs) + if err != nil { + return nil, fmt.Errorf("failed to generate edits for device folder permissions: %v", err) } - deviceFolderPermissionHooks, err := NewDeviceFolderPermissionHookDiscoverer(m.logger, driverRoot, nvidiaCTKPath, deviceSpecs) - if err != nil { - return nil, fmt.Errorf("failed to generated permission hooks for device nodes: %v", err) - } - - commonEdits, err := 
edits.FromDiscoverer(discover.Merge(common, deviceFolderPermissionHooks)) - if err != nil { - return nil, fmt.Errorf("failed to create container edits for common entities: %v", err) - } + commonEdits.Append(deviceFolderPermissionEdits) // We construct the spec and determine the minimum required version based on the specification. spec := specs.Spec{ @@ -266,73 +267,6 @@ func (m command) generateSpec(driverRoot string, nvidiaCTKPath string, namer dev return &spec, nil } -func (m command) generateDeviceSpecs(devicelib device.Interface, driverRoot string, nvidiaCTKPath string, namer deviceNamer) ([]specs.Device, error) { - var deviceSpecs []specs.Device - - err := devicelib.VisitDevices(func(i int, d device.Device) error { - isMigEnabled, err := d.IsMigEnabled() - if err != nil { - return fmt.Errorf("failed to check whether device is MIG device: %v", err) - } - if isMigEnabled { - return nil - } - device, err := NewFullGPUDiscoverer(m.logger, driverRoot, nvidiaCTKPath, d) - if err != nil { - return fmt.Errorf("failed to create device: %v", err) - } - - deviceEdits, err := edits.FromDiscoverer(device) - if err != nil { - return fmt.Errorf("failed to create container edits for device: %v", err) - } - - deviceName, err := namer.GetDeviceName(i, d) - if err != nil { - return fmt.Errorf("failed to get device name: %v", err) - } - deviceSpec := specs.Device{ - Name: deviceName, - ContainerEdits: *deviceEdits.ContainerEdits, - } - - deviceSpecs = append(deviceSpecs, deviceSpec) - return nil - }) - if err != nil { - return nil, fmt.Errorf("failed to generate CDI spec for GPU devices: %v", err) - } - - err = devicelib.VisitMigDevices(func(i int, d device.Device, j int, mig device.MigDevice) error { - device, err := NewMigDeviceDiscoverer(m.logger, "", d, mig) - if err != nil { - return fmt.Errorf("failed to create MIG device: %v", err) - } - - deviceEdits, err := edits.FromDiscoverer(device) - if err != nil { - return fmt.Errorf("failed to create container edits for MIG 
device: %v", err) - } - - deviceName, err := namer.GetMigDeviceName(i, j, mig) - if err != nil { - return fmt.Errorf("failed to get device name: %v", err) - } - deviceSpec := specs.Device{ - Name: deviceName, - ContainerEdits: *deviceEdits.ContainerEdits, - } - - deviceSpecs = append(deviceSpecs, deviceSpec) - return nil - }) - if err != nil { - return nil, fmt.Errorf("falied to generate CDI spec for MIG devices: %v", err) - } - - return deviceSpecs, nil -} - // createAllDevice creates an 'all' device which combines the edits from the previous devices func createAllDevice(deviceSpecs []specs.Device) specs.Device { edits := edits.NewContainerEdits() diff --git a/cmd/nvidia-ctk/cdi/generate/mig-device.go b/cmd/nvidia-ctk/cdi/generate/mig-device.go deleted file mode 100644 index ad18e6ad..00000000 --- a/cmd/nvidia-ctk/cdi/generate/mig-device.go +++ /dev/null @@ -1,75 +0,0 @@ -/** -# Copyright (c) NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -**/ - -package generate - -import ( - "fmt" - - "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" - "github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps" - "github.com/sirupsen/logrus" - "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device" - "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml" -) - -// NewMigDeviceDiscoverer creates a discoverer for the specified mig device and its parent. 
-func NewMigDeviceDiscoverer(logger *logrus.Logger, driverRoot string, parent device.Device, d device.MigDevice) (discover.Discover, error) { - minor, ret := parent.GetMinorNumber() - if ret != nvml.SUCCESS { - return nil, fmt.Errorf("error getting GPU device minor number: %v", ret) - } - parentPath := fmt.Sprintf("/dev/nvidia%d", minor) - - migCaps, err := nvcaps.NewMigCaps() - if err != nil { - return nil, fmt.Errorf("error getting MIG capability device paths: %v", err) - } - - gi, ret := d.GetGpuInstanceId() - if ret != nvml.SUCCESS { - return nil, fmt.Errorf("error getting GPU Instance ID: %v", ret) - } - - ci, ret := d.GetComputeInstanceId() - if ret != nvml.SUCCESS { - return nil, fmt.Errorf("error getting Compute Instance ID: %v", ret) - } - - giCap := nvcaps.NewGPUInstanceCap(minor, gi) - giCapDevicePath, err := migCaps.GetCapDevicePath(giCap) - if err != nil { - return nil, fmt.Errorf("failed to get GI cap device path: %v", err) - } - - ciCap := nvcaps.NewComputeInstanceCap(minor, gi, ci) - ciCapDevicePath, err := migCaps.GetCapDevicePath(ciCap) - if err != nil { - return nil, fmt.Errorf("failed to get CI cap device path: %v", err) - } - - deviceNodes := discover.NewCharDeviceDiscoverer( - logger, - []string{ - parentPath, - giCapDevicePath, - ciCapDevicePath, - }, - driverRoot, - ) - - return deviceNodes, nil -} diff --git a/pkg/nvcdi/api.go b/pkg/nvcdi/api.go new file mode 100644 index 00000000..0fd07e72 --- /dev/null +++ b/pkg/nvcdi/api.go @@ -0,0 +1,33 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package nvcdi + +import ( + "github.com/container-orchestrated-devices/container-device-interface/pkg/cdi" + "github.com/container-orchestrated-devices/container-device-interface/specs-go" + "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device" +) + +// Interface defines the API for the nvcdi package +type Interface interface { + GetCommonEdits() (*cdi.ContainerEdits, error) + GetAllDeviceSpecs() ([]specs.Device, error) + GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, error) + GetGPUDeviceSpecs(int, device.Device) (*specs.Device, error) + GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.ContainerEdits, error) + GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) +} diff --git a/cmd/nvidia-ctk/cdi/generate/common.go b/pkg/nvcdi/common.go similarity index 72% rename from cmd/nvidia-ctk/cdi/generate/common.go rename to pkg/nvcdi/common.go index 136e1264..1d04d420 100644 --- a/cmd/nvidia-ctk/cdi/generate/common.go +++ b/pkg/nvcdi/common.go @@ -14,20 +14,33 @@ # limitations under the License. **/ -package generate +package nvcdi import ( "fmt" "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" + "github.com/NVIDIA/nvidia-container-toolkit/internal/edits" "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup" + "github.com/container-orchestrated-devices/container-device-interface/pkg/cdi" + "github.com/sirupsen/logrus" "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml" ) -// NewCommonDiscoverer returns a discoverer for entities that are not associated with a specific CDI device. 
+// GetCommonEdits returns the container edits for entities that are not associated with a specific CDI device +func (l *nvcdilib) GetCommonEdits() (*cdi.ContainerEdits, error) { + common, err := newCommonDiscoverer(l.logger, l.driverRoot, l.nvidiaCTKPath, l.nvmllib) + if err != nil { + return nil, fmt.Errorf("failed to create discoverer for common entities: %v", err) + } + + return edits.FromDiscoverer(common) +} + +// newCommonDiscoverer returns a discoverer for entities that are not associated with a specific CDI device. // This includes driver libraries and meta devices, for example. -func NewCommonDiscoverer(logger *logrus.Logger, driverRoot string, nvidiaCTKPath string, nvmllib nvml.Interface) (discover.Discover, error) { +func newCommonDiscoverer(logger *logrus.Logger, driverRoot string, nvidiaCTKPath string, nvmllib nvml.Interface) (discover.Discover, error) { metaDevices := discover.NewDeviceDiscoverer( logger, lookup.NewCharDeviceLocator( diff --git a/cmd/nvidia-ctk/cdi/generate/driver.go b/pkg/nvcdi/driver.go similarity index 99% rename from cmd/nvidia-ctk/cdi/generate/driver.go rename to pkg/nvcdi/driver.go index 8a6af8de..e939ee76 100644 --- a/cmd/nvidia-ctk/cdi/generate/driver.go +++ b/pkg/nvcdi/driver.go @@ -14,7 +14,7 @@ # limitations under the License. **/ -package generate +package nvcdi import ( "fmt" diff --git a/cmd/nvidia-ctk/cdi/generate/full-gpu.go b/pkg/nvcdi/full-gpu.go similarity index 74% rename from cmd/nvidia-ctk/cdi/generate/full-gpu.go rename to pkg/nvcdi/full-gpu.go index 52039eaa..7e61477c 100644 --- a/cmd/nvidia-ctk/cdi/generate/full-gpu.go +++ b/pkg/nvcdi/full-gpu.go @@ -14,7 +14,7 @@ # limitations under the License. 
**/ -package generate +package nvcdi import ( "fmt" @@ -23,12 +23,50 @@ import ( "strings" "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" + "github.com/NVIDIA/nvidia-container-toolkit/internal/edits" "github.com/NVIDIA/nvidia-container-toolkit/internal/info/drm" + "github.com/container-orchestrated-devices/container-device-interface/pkg/cdi" + "github.com/container-orchestrated-devices/container-device-interface/specs-go" "github.com/sirupsen/logrus" "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device" "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml" ) +// GetGPUDeviceSpecs returns the CDI device specs for the full GPU represented by 'device'. +func (l *nvcdilib) GetGPUDeviceSpecs(i int, d device.Device) (*specs.Device, error) { + edits, err := l.GetGPUDeviceEdits(d) + if err != nil { + return nil, fmt.Errorf("failed to get edits for device: %v", err) + } + + name, err := l.deviceNamer.GetDeviceName(i, d) + if err != nil { + return nil, fmt.Errorf("failed to get device name: %v", err) + } + + spec := specs.Device{ + Name: name, + ContainerEdits: *edits.ContainerEdits, + } + + return &spec, nil +} + +// GetGPUDeviceEdits returns the CDI edits for the full GPU represented by 'device'. 
+func (l *nvcdilib) GetGPUDeviceEdits(d device.Device) (*cdi.ContainerEdits, error) { + device, err := newFullGPUDiscoverer(l.logger, l.driverRoot, l.nvidiaCTKPath, d) + if err != nil { + return nil, fmt.Errorf("failed to create device discoverer: %v", err) + } + + editsForDevice, err := edits.FromDiscoverer(device) + if err != nil { + return nil, fmt.Errorf("failed to create container edits for device: %v", err) + } + + return editsForDevice, nil +} + // byPathHookDiscoverer discovers the entities required for injecting by-path DRM device links type byPathHookDiscoverer struct { logger *logrus.Logger @@ -39,8 +77,8 @@ type byPathHookDiscoverer struct { var _ discover.Discover = (*byPathHookDiscoverer)(nil) -// NewFullGPUDiscoverer creates a discoverer for the full GPU defined by the specified device. -func NewFullGPUDiscoverer(logger *logrus.Logger, driverRoot string, nvidiaCTKPath string, d device.Device) (discover.Discover, error) { +// newFullGPUDiscoverer creates a discoverer for the full GPU defined by the specified device. +func newFullGPUDiscoverer(logger *logrus.Logger, driverRoot string, nvidiaCTKPath string, d device.Device) (discover.Discover, error) { // TODO: The functionality to get device paths should be integrated into the go-nvlib/pkg/device.Device interface. // This will allow reuse here and in other code where the paths are queried such as the NVIDIA device plugin. minor, ret := d.GetMinorNumber() diff --git a/pkg/nvcdi/lib.go b/pkg/nvcdi/lib.go new file mode 100644 index 00000000..c2348d4e --- /dev/null +++ b/pkg/nvcdi/lib.go @@ -0,0 +1,122 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package nvcdi + +import ( + "fmt" + + "github.com/container-orchestrated-devices/container-device-interface/specs-go" + "github.com/sirupsen/logrus" + "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device" + "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml" +) + +// nvcdilib is the Interface implementation returned by New; it holds the libraries and settings used to generate CDI specs. +type nvcdilib struct { + logger *logrus.Logger + nvmllib nvml.Interface + devicelib device.Interface + deviceNamer DeviceNamer + driverRoot string + nvidiaCTKPath string +} + +// New creates a new nvcdi library, applying opts in order and then falling +// back to a default for every dependency or path that is still unset. +func New(opts ...Option) Interface { + l := &nvcdilib{} + for _, opt := range opts { + opt(l) + } + + if l.nvmllib == nil { + l.nvmllib = nvml.New() + } + if l.devicelib == nil { + l.devicelib = device.New(device.WithNvml(l.nvmllib)) + } + if l.logger == nil { + l.logger = logrus.StandardLogger() + } + if l.deviceNamer == nil { + // The index strategy is always a supported strategy, so the error can be ignored. + l.deviceNamer, _ = NewDeviceNamer(DeviceNameStrategyIndex) + } + if l.driverRoot == "" { + l.driverRoot = "/" + } + if l.nvidiaCTKPath == "" { + l.nvidiaCTKPath = "/usr/bin/nvidia-ctk" + } + + return l +} + +// GetAllDeviceSpecs returns the device specs for all available devices. 
+func (l *nvcdilib) GetAllDeviceSpecs() ([]specs.Device, error) { + var deviceSpecs []specs.Device + + gpuDeviceSpecs, err := l.getGPUDeviceSpecs() + if err != nil { + return nil, err + } + deviceSpecs = append(deviceSpecs, gpuDeviceSpecs...) + + migDeviceSpecs, err := l.getMigDeviceSpecs() + if err != nil { + return nil, err + } + deviceSpecs = append(deviceSpecs, migDeviceSpecs...) 
+ + return deviceSpecs, nil +} + +// getGPUDeviceSpecs returns one CDI device spec for each device visited by devicelib.VisitDevices. +func (l *nvcdilib) getGPUDeviceSpecs() ([]specs.Device, error) { + var deviceSpecs []specs.Device + err := l.devicelib.VisitDevices(func(i int, d device.Device) error { + deviceSpec, err := l.GetGPUDeviceSpecs(i, d) + if err != nil { + return err + } + deviceSpecs = append(deviceSpecs, *deviceSpec) + + return nil + }) + if err != nil { + return nil, fmt.Errorf("failed to generate CDI edits for GPU devices: %v", err) + } + return deviceSpecs, nil +} + +// getMigDeviceSpecs returns one CDI device spec for each MIG device visited by devicelib.VisitMigDevices. +func (l *nvcdilib) getMigDeviceSpecs() ([]specs.Device, error) { + var deviceSpecs []specs.Device + err := l.devicelib.VisitMigDevices(func(i int, d device.Device, j int, mig device.MigDevice) error { + deviceSpec, err := l.GetMIGDeviceSpecs(i, d, j, mig) + if err != nil { + return err + } + deviceSpecs = append(deviceSpecs, *deviceSpec) + + return nil + }) + if err != nil { + return nil, fmt.Errorf("failed to generate CDI edits for MIG devices: %v", err) + } + return deviceSpecs, nil +} diff --git a/pkg/nvcdi/mig-device.go b/pkg/nvcdi/mig-device.go new file mode 100644 index 00000000..3d0a91f2 --- /dev/null +++ b/pkg/nvcdi/mig-device.go @@ -0,0 +1,124 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ + +package nvcdi + +import ( + "fmt" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" + "github.com/NVIDIA/nvidia-container-toolkit/internal/edits" + "github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps" + "github.com/container-orchestrated-devices/container-device-interface/pkg/cdi" + "github.com/container-orchestrated-devices/container-device-interface/specs-go" + "github.com/sirupsen/logrus" + "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device" + "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml" +) + +// GetMIGDeviceSpecs returns the CDI device specs for the MIG device 'mig' on the parent GPU 'd'; i and j are passed through to the device namer. +func (l *nvcdilib) GetMIGDeviceSpecs(i int, d device.Device, j int, mig device.MigDevice) (*specs.Device, error) { + edits, err := l.GetMIGDeviceEdits(d, mig) + if err != nil { + return nil, fmt.Errorf("failed to get edits for device: %v", err) + } + + name, err := l.deviceNamer.GetMigDeviceName(i, d, j, mig) + if err != nil { + return nil, fmt.Errorf("failed to get device name: %v", err) + } + + spec := specs.Device{ + Name: name, + ContainerEdits: *edits.ContainerEdits, + } + + return &spec, nil +} + +// GetMIGDeviceEdits returns the CDI edits for the MIG device represented by 'mig' on 'parent'. 
+func (l *nvcdilib) GetMIGDeviceEdits(parent device.Device, mig device.MigDevice) (*cdi.ContainerEdits, error) { + gpu, ret := parent.GetMinorNumber() + if ret != nvml.SUCCESS { + return nil, fmt.Errorf("error getting GPU minor: %v", ret) + } + + gi, ret := mig.GetGpuInstanceId() + if ret != nvml.SUCCESS { + return nil, fmt.Errorf("error getting GPU Instance ID: %v", ret) + } + + ci, ret := mig.GetComputeInstanceId() + if ret != nvml.SUCCESS { + return nil, fmt.Errorf("error getting Compute Instance ID: %v", ret) + } + + editsForDevice, err := GetEditsForComputeInstance(l.logger, l.driverRoot, gpu, gi, ci) + if err != nil { + return nil, fmt.Errorf("failed to create container edits for MIG device: %v", err) + } + + return editsForDevice, nil +} + +// GetEditsForComputeInstance returns the CDI edits for a particular compute instance defined by the (gpu, gi, ci) tuple +func GetEditsForComputeInstance(logger *logrus.Logger, driverRoot string, gpu int, gi int, ci int) (*cdi.ContainerEdits, error) { + computeInstance, err := newComputeInstanceDiscoverer(logger, driverRoot, gpu, gi, ci) + if err != nil { + return nil, fmt.Errorf("failed to create discoverer for Compute Instance: %v", err) + } + + editsForDevice, err := edits.FromDiscoverer(computeInstance) + if err != nil { + return nil, fmt.Errorf("failed to create container edits for Compute Instance: %v", err) + } + + return editsForDevice, nil +} + +// newComputeInstanceDiscoverer returns a discoverer for the specified compute instance +func newComputeInstanceDiscoverer(logger *logrus.Logger, driverRoot string, gpu int, gi int, ci int) (discover.Discover, error) { + parentPath := fmt.Sprintf("/dev/nvidia%d", gpu) + + migCaps, err := nvcaps.NewMigCaps() + if err != nil { + return nil, fmt.Errorf("error getting MIG capability device paths: %v", err) + } + + giCap := nvcaps.NewGPUInstanceCap(gpu, gi) + giCapDevicePath, err := migCaps.GetCapDevicePath(giCap) + if err != nil { + return nil, fmt.Errorf("failed to get GI 
cap device path: %v", err) + } + + ciCap := nvcaps.NewComputeInstanceCap(gpu, gi, ci) + ciCapDevicePath, err := migCaps.GetCapDevicePath(ciCap) + if err != nil { + return nil, fmt.Errorf("failed to get CI cap device path: %v", err) + } + + deviceNodes := discover.NewCharDeviceDiscoverer( + logger, + []string{ + parentPath, + giCapDevicePath, + ciCapDevicePath, + }, + driverRoot, + ) + + return deviceNodes, nil +} diff --git a/cmd/nvidia-ctk/cdi/generate/namer.go b/pkg/nvcdi/namer.go similarity index 67% rename from cmd/nvidia-ctk/cdi/generate/namer.go rename to pkg/nvcdi/namer.go index 06e90ea4..e7b850da 100644 --- a/cmd/nvidia-ctk/cdi/generate/namer.go +++ b/pkg/nvcdi/namer.go @@ -14,7 +14,7 @@ # limitations under the License. **/ -package generate +package nvcdi import ( "fmt" @@ -23,15 +23,20 @@ import ( "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml" ) -type deviceNamer interface { +// DeviceNamer is an interface for getting device names +type DeviceNamer interface { GetDeviceName(int, device.Device) (string, error) - GetMigDeviceName(int, int, device.MigDevice) (string, error) + GetMigDeviceName(int, device.Device, int, device.MigDevice) (string, error) } +// Supported device naming strategies const ( - deviceNameStrategyIndex = "index" - deviceNameStrategyTypeIndex = "type-index" - deviceNameStrategyUUID = "uuid" + // DeviceNameStrategyIndex generates device names such as 0 or 1:0 + DeviceNameStrategyIndex = "index" + // DeviceNameStrategyTypeIndex generates device names such as gpu0 or mig1:0 + DeviceNameStrategyTypeIndex = "type-index" + // DeviceNameStrategyUUID uses the device UUID as the name + DeviceNameStrategyUUID = "uuid" ) type deviceNameIndex struct { @@ -40,15 +45,15 @@ type deviceNameIndex struct { } type deviceNameUUID struct{} -// newDeviceNamer creates a Device Namer based on the supplied strategy. +// NewDeviceNamer creates a Device Namer based on the supplied strategy. 
// This namer can be used to construct the names for MIG and GPU devices when generating the CDI spec. -func newDeviceNamer(strategy string) (deviceNamer, error) { +func NewDeviceNamer(strategy string) (DeviceNamer, error) { switch strategy { - case deviceNameStrategyIndex: + case DeviceNameStrategyIndex: return deviceNameIndex{}, nil - case deviceNameStrategyTypeIndex: + case DeviceNameStrategyTypeIndex: return deviceNameIndex{gpuPrefix: "gpu", migPrefix: "mig"}, nil - case deviceNameStrategyUUID: + case DeviceNameStrategyUUID: return deviceNameUUID{}, nil } @@ -61,7 +66,7 @@ func (s deviceNameIndex) GetDeviceName(i int, d device.Device) (string, error) { } // GetMigDeviceName returns the name for the specified device based on the naming strategy -func (s deviceNameIndex) GetMigDeviceName(i int, j int, d device.MigDevice) (string, error) { +func (s deviceNameIndex) GetMigDeviceName(i int, d device.Device, j int, mig device.MigDevice) (string, error) { return fmt.Sprintf("%s%d:%d", s.migPrefix, i, j), nil } @@ -75,8 +80,8 @@ func (s deviceNameUUID) GetDeviceName(i int, d device.Device) (string, error) { } // GetMigDeviceName returns the name for the specified device based on the naming strategy -func (s deviceNameUUID) GetMigDeviceName(i int, j int, d device.MigDevice) (string, error) { - uuid, ret := d.GetUUID() +func (s deviceNameUUID) GetMigDeviceName(i int, d device.Device, j int, mig device.MigDevice) (string, error) { + uuid, ret := mig.GetUUID() if ret != nvml.SUCCESS { return "", fmt.Errorf("failed to get device UUID: %v", ret) } diff --git a/pkg/nvcdi/options.go b/pkg/nvcdi/options.go new file mode 100644 index 00000000..50b699a6 --- /dev/null +++ b/pkg/nvcdi/options.go @@ -0,0 +1,68 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package nvcdi + +import ( + "github.com/sirupsen/logrus" + "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device" + "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml" +) + +// Option is a function that configures the nvcdilib +type Option func(*nvcdilib) + +// WithDeviceLib sets the device library for the library +func WithDeviceLib(devicelib device.Interface) Option { + return func(l *nvcdilib) { + l.devicelib = devicelib + } +} + +// WithDeviceNamer sets the device namer for the library +func WithDeviceNamer(namer DeviceNamer) Option { + return func(l *nvcdilib) { + l.deviceNamer = namer + } +} + +// WithDriverRoot sets the driver root for the library +func WithDriverRoot(root string) Option { + return func(l *nvcdilib) { + l.driverRoot = root + } +} + +// WithLogger sets the logger for the library +func WithLogger(logger *logrus.Logger) Option { + return func(l *nvcdilib) { + l.logger = logger + } +} + +// WithNVIDIACTKPath sets the path to the NVIDIA Container Toolkit CLI path for the library +func WithNVIDIACTKPath(path string) Option { + return func(l *nvcdilib) { + l.nvidiaCTKPath = path + } +} + +// WithNvmlLib sets the nvml library for the library +func WithNvmlLib(nvmllib nvml.Interface) Option { + return func(l *nvcdilib) { + l.nvmllib = nvmllib + } +}