From b42a5d3e3a07ae956bd3128031bb6424ebf1ba28 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Tue, 11 Jun 2024 15:12:57 +0200 Subject: [PATCH 1/2] [no-relnote] Refactor dGPU device discovery Signed-off-by: Evan Lezar --- .../platform-support/dgpu/by-path-hooks.go | 117 +++++++++++++ internal/platform-support/dgpu/dgpu.go | 39 +++++ internal/platform-support/dgpu/nvml.go | 84 +++++++++ internal/platform-support/dgpu/nvml_test.go | 87 +++++++++ internal/platform-support/dgpu/options.go | 50 ++++++ pkg/nvcdi/full-gpu-nvml.go | 165 ++---------------- 6 files changed, 389 insertions(+), 153 deletions(-) create mode 100644 internal/platform-support/dgpu/by-path-hooks.go create mode 100644 internal/platform-support/dgpu/dgpu.go create mode 100644 internal/platform-support/dgpu/nvml.go create mode 100644 internal/platform-support/dgpu/nvml_test.go create mode 100644 internal/platform-support/dgpu/options.go diff --git a/internal/platform-support/dgpu/by-path-hooks.go b/internal/platform-support/dgpu/by-path-hooks.go new file mode 100644 index 00000000..cd38f5e7 --- /dev/null +++ b/internal/platform-support/dgpu/by-path-hooks.go @@ -0,0 +1,117 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package dgpu + +import ( + "fmt" + "os" + "path/filepath" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" + "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" +) + +// byPathHookDiscoverer discovers the entities required for injecting by-path DRM device links +type byPathHookDiscoverer struct { + logger logger.Interface + devRoot string + nvidiaCDIHookPath string + pciBusID string + deviceNodes discover.Discover +} + +var _ discover.Discover = (*byPathHookDiscoverer)(nil) + +// Devices returns the empty list for the by-path hook discoverer +func (d *byPathHookDiscoverer) Devices() ([]discover.Device, error) { + return nil, nil +} + +// Hooks returns the hooks for the GPU device. +// The following hooks are detected: +// 1. A hook to create /dev/dri/by-path symlinks +func (d *byPathHookDiscoverer) Hooks() ([]discover.Hook, error) { + links, err := d.deviceNodeLinks() + if err != nil { + return nil, fmt.Errorf("failed to discover DRA device links: %v", err) + } + if len(links) == 0 { + return nil, nil + } + + var args []string + for _, l := range links { + args = append(args, "--link", l) + } + + hook := discover.CreateNvidiaCDIHook( + d.nvidiaCDIHookPath, + "create-symlinks", + args..., + ) + + return []discover.Hook{hook}, nil +} + +// Mounts returns an empty slice for a full GPU +func (d *byPathHookDiscoverer) Mounts() ([]discover.Mount, error) { + return nil, nil +} + +func (d *byPathHookDiscoverer) deviceNodeLinks() ([]string, error) { + devices, err := d.deviceNodes.Devices() + if err != nil { + return nil, fmt.Errorf("failed to discover device nodes: %v", err) + } + + if len(devices) == 0 { + return nil, nil + } + + selectedDevices := make(map[string]bool) + for _, d := range devices { + selectedDevices[d.HostPath] = true + } + + candidates := []string{ + fmt.Sprintf("/dev/dri/by-path/pci-%s-card", d.pciBusID), + fmt.Sprintf("/dev/dri/by-path/pci-%s-render", d.pciBusID), + } + + var links []string + for _, c := range candidates { + linkPath := filepath.Join(d.devRoot, c) + device, err := os.Readlink(linkPath) + if err != nil { + d.logger.Warningf("Failed to evaluate symlink %v; ignoring", linkPath) + continue + } + + deviceNode := device + if !filepath.IsAbs(device) { + deviceNode = filepath.Join(filepath.Dir(linkPath), device) + } + if !selectedDevices[deviceNode] { + d.logger.Debugf("ignoring device symlink %v -> %v since %v is not mounted", linkPath, device, deviceNode) + continue + } + d.logger.Debugf("adding device symlink %v -> %v", linkPath, device) + links = append(links, fmt.Sprintf("%v::%v", device, linkPath)) + } + + return links, nil +} diff --git a/internal/platform-support/dgpu/dgpu.go b/internal/platform-support/dgpu/dgpu.go new file mode 100644 index 00000000..6d2da823 --- /dev/null +++ b/internal/platform-support/dgpu/dgpu.go @@ -0,0 +1,39 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package dgpu + +import ( + "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" + "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" +) + +// NewForDevice creates a discoverer for the specified Device. +// nvsandboxutils is used for discovery if specified, otherwise NVML is used. +func NewForDevice(d device.Device, opts ...Option) (discover.Discover, error) { + o := &options{} + for _, opt := range opts { + opt(o) + } + + if o.logger == nil { + o.logger = logger.New() + } + + return o.newNvmlDGPUDiscoverer(&toRequiredInfo{d}) +} diff --git a/internal/platform-support/dgpu/nvml.go b/internal/platform-support/dgpu/nvml.go new file mode 100644 index 00000000..14e81b9b --- /dev/null +++ b/internal/platform-support/dgpu/nvml.go @@ -0,0 +1,84 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package dgpu + +import ( + "fmt" + + "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" + "github.com/NVIDIA/go-nvml/pkg/nvml" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" + "github.com/NVIDIA/nvidia-container-toolkit/internal/info/drm" +) + +type requiredInfo interface { + GetMinorNumber() (int, error) + GetPCIBusID() (string, error) +} + +func (o *options) newNvmlDGPUDiscoverer(d requiredInfo) (discover.Discover, error) { + minor, err := d.GetMinorNumber() + if err != nil { + return nil, fmt.Errorf("error getting GPU device minor number: %w", err) + } + path := fmt.Sprintf("/dev/nvidia%d", minor) + + pciBusID, err := d.GetPCIBusID() + if err != nil { + return nil, fmt.Errorf("error getting PCI info for device: %w", err) + } + + drmDeviceNodes, err := drm.GetDeviceNodesByBusID(pciBusID) + if err != nil { + return nil, fmt.Errorf("failed to determine DRM devices for %v: %v", pciBusID, err) + } + + deviceNodePaths := append([]string{path}, drmDeviceNodes...) + + deviceNodes := discover.NewCharDeviceDiscoverer( + o.logger, + o.devRoot, + deviceNodePaths, + ) + + byPathHooks := &byPathHookDiscoverer{ + logger: o.logger, + devRoot: o.devRoot, + nvidiaCDIHookPath: o.nvidiaCDIHookPath, + pciBusID: pciBusID, + deviceNodes: deviceNodes, + } + + dd := discover.Merge( + deviceNodes, + byPathHooks, + ) + return dd, nil +} + +type toRequiredInfo struct { + device.Device +} + +func (d *toRequiredInfo) GetMinorNumber() (int, error) { + minor, ret := d.Device.GetMinorNumber() + if ret != nvml.SUCCESS { + return 0, ret + } + return minor, nil +} diff --git a/internal/platform-support/dgpu/nvml_test.go b/internal/platform-support/dgpu/nvml_test.go new file mode 100644 index 00000000..3e8306c9 --- /dev/null +++ b/internal/platform-support/dgpu/nvml_test.go @@ -0,0 +1,87 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package dgpu + +import ( + "testing" + + "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock" + testlog "github.com/sirupsen/logrus/hooks/test" + "github.com/stretchr/testify/require" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" +) + +// TODO: In order to properly test this, we need a mechanism to inject / +// override the char device discoverer. +func TestNewNvmlDGPUDiscoverer(t *testing.T) { + logger, _ := testlog.NewNullLogger() + + nvmllib := &mock.Interface{} + devicelib := device.New( + nvmllib, + ) + + testCases := []struct { + description string + device nvml.Device + expectedError error + expectedDevices []discover.Device + expectedHooks []discover.Hook + expectedMounts []discover.Mount + }{ + { + description: "", + device: &mock.Device{ + GetMinorNumberFunc: func() (int, nvml.Return) { + return 3, nvml.SUCCESS + }, + GetPciInfoFunc: func() (nvml.PciInfo, nvml.Return) { + var busID [32]int8 + for i, b := range []byte("00000000:45:00:00") { + busID[i] = int8(b) + } + info := nvml.PciInfo{ + BusId: busID, + } + return info, nvml.SUCCESS + }, + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + o := &options{logger: logger} + + device, err := devicelib.NewDevice(tc.device) + require.NoError(t, err) + + d, err := o.newNvmlDGPUDiscoverer(&toRequiredInfo{device}) + require.ErrorIs(t, err, tc.expectedError) + + devices, _ := d.Devices() + require.EqualValues(t, tc.expectedDevices, devices) + hooks, _ := d.Hooks() + require.EqualValues(t, tc.expectedHooks, hooks) + mounts, _ := d.Mounts() + require.EqualValues(t, tc.expectedMounts, mounts) + }) + } +} diff --git a/internal/platform-support/dgpu/options.go b/internal/platform-support/dgpu/options.go new file mode 100644 index 00000000..cea58c6d --- /dev/null +++ b/internal/platform-support/dgpu/options.go @@ -0,0 +1,50 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package dgpu + +import ( + "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" +) + +type options struct { + logger logger.Interface + devRoot string + nvidiaCDIHookPath string +} + +type Option func(*options) + +// WithDevRoot sets the root where /dev is located. +func WithDevRoot(root string) Option { + return func(l *options) { + l.devRoot = root + } +} + +// WithLogger sets the logger for the library +func WithLogger(logger logger.Interface) Option { + return func(l *options) { + l.logger = logger + } +} + +// WithNVIDIACDIHookPath sets the path to the NVIDIA Container Toolkit CLI path for the library +func WithNVIDIACDIHookPath(path string) Option { + return func(l *options) { + l.nvidiaCDIHookPath = path + } +} diff --git a/pkg/nvcdi/full-gpu-nvml.go b/pkg/nvcdi/full-gpu-nvml.go index aa46c4c6..881d102d 100644 --- a/pkg/nvcdi/full-gpu-nvml.go +++ b/pkg/nvcdi/full-gpu-nvml.go @@ -18,19 +18,14 @@ package nvcdi import ( "fmt" - "os" - "path/filepath" - "strings" "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" - "github.com/NVIDIA/go-nvml/pkg/nvml" "tags.cncf.io/container-device-interface/pkg/cdi" "tags.cncf.io/container-device-interface/specs-go" "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" "github.com/NVIDIA/nvidia-container-toolkit/internal/edits" - "github.com/NVIDIA/nvidia-container-toolkit/internal/info/drm" - "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" + "github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/dgpu" ) // GetGPUDeviceSpecs returns the CDI device specs for the full GPU represented by 'device'. @@ -58,7 +53,7 @@ func (l *nvmllib) GetGPUDeviceSpecs(i int, d device.Device) ([]specs.Device, err // GetGPUDeviceEdits returns the CDI edits for the full GPU represented by 'device'. func (l *nvmllib) GetGPUDeviceEdits(d device.Device) (*cdi.ContainerEdits, error) { - device, err := newFullGPUDiscoverer(l.logger, l.devRoot, l.nvidiaCDIHookPath, d) + device, err := l.newFullGPUDiscoverer(d) if err != nil { return nil, fmt.Errorf("failed to create device discoverer: %v", err) } @@ -71,164 +66,28 @@ func (l *nvmllib) GetGPUDeviceEdits(d device.Device) (*cdi.ContainerEdits, error return editsForDevice, nil } -// byPathHookDiscoverer discovers the entities required for injecting by-path DRM device links -type byPathHookDiscoverer struct { - logger logger.Interface - devRoot string - nvidiaCDIHookPath string - pciBusID string - deviceNodes discover.Discover -} - -var _ discover.Discover = (*byPathHookDiscoverer)(nil) - // newFullGPUDiscoverer creates a discoverer for the full GPU defined by the specified device. -func newFullGPUDiscoverer(logger logger.Interface, devRoot string, nvidiaCDIHookPath string, d device.Device) (discover.Discover, error) { - // TODO: The functionality to get device paths should be integrated into the go-nvlib/pkg/device.Device interface. - // This will allow reuse here and in other code where the paths are queried such as the NVIDIA device plugin. - minor, ret := d.GetMinorNumber() - if ret != nvml.SUCCESS { - return nil, fmt.Errorf("error getting GPU device minor number: %v", ret) - } - path := fmt.Sprintf("/dev/nvidia%d", minor) - - pciInfo, ret := d.GetPciInfo() - if ret != nvml.SUCCESS { - return nil, fmt.Errorf("error getting PCI info for device: %v", ret) - } - pciBusID := getBusID(pciInfo) - - drmDeviceNodes, err := drm.GetDeviceNodesByBusID(pciBusID) - if err != nil { - return nil, fmt.Errorf("failed to determine DRM devices for %v: %v", pciBusID, err) - } - - deviceNodePaths := append([]string{path}, drmDeviceNodes...) - - deviceNodes := discover.NewCharDeviceDiscoverer( - logger, - devRoot, - deviceNodePaths, +func (l *nvmllib) newFullGPUDiscoverer(d device.Device) (discover.Discover, error) { + deviceNodes, err := dgpu.NewForDevice(d, + dgpu.WithDevRoot(l.devRoot), + dgpu.WithLogger(l.logger), + dgpu.WithNVIDIACDIHookPath(l.nvidiaCDIHookPath), ) - - byPathHooks := &byPathHookDiscoverer{ - logger: logger, - devRoot: devRoot, - nvidiaCDIHookPath: nvidiaCDIHookPath, - pciBusID: pciBusID, - deviceNodes: deviceNodes, + if err != nil { + return nil, fmt.Errorf("failed to create device discoverer: %v", err) } deviceFolderPermissionHooks := newDeviceFolderPermissionHookDiscoverer( - logger, - devRoot, - nvidiaCDIHookPath, + l.logger, + l.devRoot, + l.nvidiaCDIHookPath, deviceNodes, ) dd := discover.Merge( deviceNodes, - byPathHooks, deviceFolderPermissionHooks, ) return dd, nil } - -// Devices returns the empty list for the by-path hook discoverer -func (d *byPathHookDiscoverer) Devices() ([]discover.Device, error) { - return nil, nil -} - -// Hooks returns the hooks for the GPU device. -// The following hooks are detected: -// 1. A hook to create /dev/dri/by-path symlinks -func (d *byPathHookDiscoverer) Hooks() ([]discover.Hook, error) { - links, err := d.deviceNodeLinks() - if err != nil { - return nil, fmt.Errorf("failed to discover DRA device links: %v", err) - } - if len(links) == 0 { - return nil, nil - } - - var args []string - for _, l := range links { - args = append(args, "--link", l) - } - - hook := discover.CreateNvidiaCDIHook( - d.nvidiaCDIHookPath, - "create-symlinks", - args..., - ) - - return []discover.Hook{hook}, nil -} - -// Mounts returns an empty slice for a full GPU -func (d *byPathHookDiscoverer) Mounts() ([]discover.Mount, error) { - return nil, nil -} - -func (d *byPathHookDiscoverer) deviceNodeLinks() ([]string, error) { - devices, err := d.deviceNodes.Devices() - if err != nil { - return nil, fmt.Errorf("failed to discover device nodes: %v", err) - } - - if len(devices) == 0 { - return nil, nil - } - - selectedDevices := make(map[string]bool) - for _, d := range devices { - selectedDevices[d.HostPath] = true - } - - candidates := []string{ - fmt.Sprintf("/dev/dri/by-path/pci-%s-card", d.pciBusID), - fmt.Sprintf("/dev/dri/by-path/pci-%s-render", d.pciBusID), - } - - var links []string - for _, c := range candidates { - linkPath := filepath.Join(d.devRoot, c) - device, err := os.Readlink(linkPath) - if err != nil { - d.logger.Warningf("Failed to evaluate symlink %v; ignoring", linkPath) - continue - } - - deviceNode := device - if !filepath.IsAbs(device) { - deviceNode = filepath.Join(filepath.Dir(linkPath), device) - } - if !selectedDevices[deviceNode] { - d.logger.Debugf("ignoring device symlink %v -> %v since %v is not mounted", linkPath, device, deviceNode) - continue - } - d.logger.Debugf("adding device symlink %v -> %v", linkPath, device) - links = append(links, fmt.Sprintf("%v::%v", device, linkPath)) - } - - return links, nil -} - -// getBusID provides a utility function that returns the string representation of the bus ID. -func getBusID(p nvml.PciInfo) string { - var bytes []byte - for _, b := range p.BusId { - if byte(b) == '\x00' { - break - } - bytes = append(bytes, byte(b)) - } - id := strings.ToLower(string(bytes)) - - if id != "0000" { - id = strings.TrimPrefix(id, "0000") - } - - return id -} From be11cf428b1af06a41f319cfab0f06f1942aa1da Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Tue, 2 Jul 2024 16:49:15 +0200 Subject: [PATCH 2/2] [no-relnote] Add MIG discoverer to dgpu package Signed-off-by: Evan Lezar --- internal/platform-support/dgpu/dgpu.go | 20 +++++- internal/platform-support/dgpu/nvml.go | 90 +++++++++++++++++++++++++- pkg/nvcdi/mig-device-nvml.go | 74 +++------------------ 3 files changed, 114 insertions(+), 70 deletions(-) diff --git a/internal/platform-support/dgpu/dgpu.go b/internal/platform-support/dgpu/dgpu.go index 6d2da823..00982a62 100644 --- a/internal/platform-support/dgpu/dgpu.go +++ b/internal/platform-support/dgpu/dgpu.go @@ -24,7 +24,6 @@ import ( ) // NewForDevice creates a discoverer for the specified Device. -// nvsandboxutils is used for discovery if specified, otherwise NVML is used. func NewForDevice(d device.Device, opts ...Option) (discover.Discover, error) { o := &options{} for _, opt := range opts { @@ -37,3 +36,22 @@ func NewForDevice(d device.Device, opts ...Option) (discover.Discover, error) { return o.newNvmlDGPUDiscoverer(&toRequiredInfo{d}) } + +// NewForDevice creates a discoverer for the specified device and its associated MIG device. +func NewForMigDevice(d device.Device, mig device.MigDevice, opts ...Option) (discover.Discover, error) { + o := &options{} + for _, opt := range opts { + opt(o) + } + + if o.logger == nil { + o.logger = logger.New() + } + + return o.newNvmlMigDiscoverer( + &toRequiredMigInfo{ + MigDevice: mig, + parent: &toRequiredInfo{d}, + }, + ) +} diff --git a/internal/platform-support/dgpu/nvml.go b/internal/platform-support/dgpu/nvml.go index 14e81b9b..e4b67641 100644 --- a/internal/platform-support/dgpu/nvml.go +++ b/internal/platform-support/dgpu/nvml.go @@ -24,19 +24,20 @@ import ( "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" "github.com/NVIDIA/nvidia-container-toolkit/internal/info/drm" + "github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps" ) type requiredInfo interface { GetMinorNumber() (int, error) GetPCIBusID() (string, error) + getDevNodePath() (string, error) } func (o *options) newNvmlDGPUDiscoverer(d requiredInfo) (discover.Discover, error) { - minor, err := d.GetMinorNumber() + path, err := d.getDevNodePath() if err != nil { - return nil, fmt.Errorf("error getting GPU device minor number: %w", err) + return nil, fmt.Errorf("error getting device node path: %w", err) } - path := fmt.Sprintf("/dev/nvidia%d", minor) pciBusID, err := d.GetPCIBusID() if err != nil { @@ -71,6 +72,52 @@ func (o *options) newNvmlDGPUDiscoverer(d requiredInfo) (discover.Discover, erro return dd, nil } +type requiredMigInfo interface { + getPlacementInfo() (int, int, int, error) + getDevNodePath() (string, error) +} + +func (o *options) newNvmlMigDiscoverer(d requiredMigInfo) (discover.Discover, error) { + gpu, gi, ci, err := d.getPlacementInfo() + if err != nil { + return nil, fmt.Errorf("error getting placement info: %w", err) + } + + migCaps, err := nvcaps.NewMigCaps() + if err != nil { + return nil, fmt.Errorf("error getting MIG capability device paths: %v", err) + } + + giCap := nvcaps.NewGPUInstanceCap(gpu, gi) + giCapDevicePath, err := migCaps.GetCapDevicePath(giCap) + if err != nil { + return nil, fmt.Errorf("failed to get GI cap device path: %v", err) + } + + ciCap := nvcaps.NewComputeInstanceCap(gpu, gi, ci) + ciCapDevicePath, err := migCaps.GetCapDevicePath(ciCap) + if err != nil { + return nil, fmt.Errorf("failed to get CI cap device path: %v", err) + } + + parentPath, err := d.getDevNodePath() + if err != nil { + return nil, err + } + + deviceNodes := discover.NewCharDeviceDiscoverer( + o.logger, + o.devRoot, + []string{ + parentPath, + giCapDevicePath, + ciCapDevicePath, + }, + ) + + return deviceNodes, nil +} + type toRequiredInfo struct { device.Device } @@ -82,3 +129,40 @@ func (d *toRequiredInfo) GetMinorNumber() (int, error) { } return minor, nil } + +func (d *toRequiredInfo) getDevNodePath() (string, error) { + minor, err := d.GetMinorNumber() + if err != nil { + return "", fmt.Errorf("error getting GPU device minor number: %w", err) + } + path := fmt.Sprintf("/dev/nvidia%d", minor) + return path, nil +} + +type toRequiredMigInfo struct { + device.MigDevice + parent requiredInfo +} + +func (d *toRequiredMigInfo) getPlacementInfo() (int, int, int, error) { + gpu, ret := d.parent.GetMinorNumber() + if ret != nvml.SUCCESS { + return 0, 0, 0, fmt.Errorf("error getting GPU minor: %v", ret) + } + + gi, ret := d.GetGpuInstanceId() + if ret != nvml.SUCCESS { + return 0, 0, 0, fmt.Errorf("error getting GPU Instance ID: %v", ret) + } + + ci, ret := d.GetComputeInstanceId() + if ret != nvml.SUCCESS { + return 0, 0, 0, fmt.Errorf("error getting Compute Instance ID: %v", ret) + } + + return gpu, gi, ci, nil +} + +func (d *toRequiredMigInfo) getDevNodePath() (string, error) { + return d.parent.getDevNodePath() +} diff --git a/pkg/nvcdi/mig-device-nvml.go b/pkg/nvcdi/mig-device-nvml.go index d67d7d46..91fe879c 100644 --- a/pkg/nvcdi/mig-device-nvml.go +++ b/pkg/nvcdi/mig-device-nvml.go @@ -20,14 +20,11 @@ import ( "fmt" "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" - "github.com/NVIDIA/go-nvml/pkg/nvml" "tags.cncf.io/container-device-interface/pkg/cdi" "tags.cncf.io/container-device-interface/specs-go" - "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" "github.com/NVIDIA/nvidia-container-toolkit/internal/edits" - "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" - "github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps" + "github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/dgpu" ) // GetMIGDeviceSpecs returns the CDI device specs for the full GPU represented by 'device'. @@ -54,74 +51,19 @@ func (l *nvmllib) GetMIGDeviceSpecs(i int, d device.Device, j int, mig device.Mi // GetMIGDeviceEdits returns the CDI edits for the MIG device represented by 'mig' on 'parent'. func (l *nvmllib) GetMIGDeviceEdits(parent device.Device, mig device.MigDevice) (*cdi.ContainerEdits, error) { - gpu, ret := parent.GetMinorNumber() - if ret != nvml.SUCCESS { - return nil, fmt.Errorf("error getting GPU minor: %v", ret) - } - - gi, ret := mig.GetGpuInstanceId() - if ret != nvml.SUCCESS { - return nil, fmt.Errorf("error getting GPU Instance ID: %v", ret) - } - - ci, ret := mig.GetComputeInstanceId() - if ret != nvml.SUCCESS { - return nil, fmt.Errorf("error getting Compute Instance ID: %v", ret) - } - - editsForDevice, err := l.GetEditsForComputeInstance(gpu, gi, ci) + deviceNodes, err := dgpu.NewForMigDevice(parent, mig, + dgpu.WithDevRoot(l.devRoot), + dgpu.WithLogger(l.logger), + dgpu.WithNVIDIACDIHookPath(l.nvidiaCDIHookPath), + ) if err != nil { - return nil, fmt.Errorf("failed to create container edits for MIG device: %v", err) + return nil, fmt.Errorf("failed to create device discoverer: %v", err) } - return editsForDevice, nil -} - -// GetEditsForComputeInstance returns the CDI edits for a particular compute instance defined by the (gpu, gi, ci) tuple -func (l *nvmllib) GetEditsForComputeInstance(gpu int, gi int, ci int) (*cdi.ContainerEdits, error) { - computeInstance, err := newComputeInstanceDiscoverer(l.logger, l.devRoot, gpu, gi, ci) - if err != nil { - return nil, fmt.Errorf("failed to create discoverer for Compute Instance: %v", err) - } - - editsForDevice, err := edits.FromDiscoverer(computeInstance) + editsForDevice, err := edits.FromDiscoverer(deviceNodes) if err != nil { return nil, fmt.Errorf("failed to create container edits for Compute Instance: %v", err) } return editsForDevice, nil } - -// newComputeInstanceDiscoverer returns a discoverer for the specified compute instance -func newComputeInstanceDiscoverer(logger logger.Interface, devRoot string, gpu int, gi int, ci int) (discover.Discover, error) { - parentPath := fmt.Sprintf("/dev/nvidia%d", gpu) - - migCaps, err := nvcaps.NewMigCaps() - if err != nil { - return nil, fmt.Errorf("error getting MIG capability device paths: %v", err) - } - - giCap := nvcaps.NewGPUInstanceCap(gpu, gi) - giCapDevicePath, err := migCaps.GetCapDevicePath(giCap) - if err != nil { - return nil, fmt.Errorf("failed to get GI cap device path: %v", err) - } - - ciCap := nvcaps.NewComputeInstanceCap(gpu, gi, ci) - ciCapDevicePath, err := migCaps.GetCapDevicePath(ciCap) - if err != nil { - return nil, fmt.Errorf("failed to get CI cap device path: %v", err) - } - - deviceNodes := discover.NewCharDeviceDiscoverer( - logger, - devRoot, - []string{ - parentPath, - giCapDevicePath, - ciCapDevicePath, - }, - ) - - return deviceNodes, nil -}