From b42a5d3e3a07ae956bd3128031bb6424ebf1ba28 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Tue, 11 Jun 2024 15:12:57 +0200 Subject: [PATCH] [no-relnote] Refactor dGPU device discovery Signed-off-by: Evan Lezar --- .../platform-support/dgpu/by-path-hooks.go | 117 +++++++++++++ internal/platform-support/dgpu/dgpu.go | 39 +++++ internal/platform-support/dgpu/nvml.go | 84 +++++++++ internal/platform-support/dgpu/nvml_test.go | 87 +++++++++ internal/platform-support/dgpu/options.go | 50 ++++++ pkg/nvcdi/full-gpu-nvml.go | 165 ++---------------- 6 files changed, 389 insertions(+), 153 deletions(-) create mode 100644 internal/platform-support/dgpu/by-path-hooks.go create mode 100644 internal/platform-support/dgpu/dgpu.go create mode 100644 internal/platform-support/dgpu/nvml.go create mode 100644 internal/platform-support/dgpu/nvml_test.go create mode 100644 internal/platform-support/dgpu/options.go diff --git a/internal/platform-support/dgpu/by-path-hooks.go b/internal/platform-support/dgpu/by-path-hooks.go new file mode 100644 index 00000000..cd38f5e7 --- /dev/null +++ b/internal/platform-support/dgpu/by-path-hooks.go @@ -0,0 +1,117 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package dgpu + +import ( + "fmt" + "os" + "path/filepath" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" + "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" +) + +// byPathHookDiscoverer discovers the entities required for injecting by-path DRM device links +type byPathHookDiscoverer struct { + logger logger.Interface + devRoot string + nvidiaCDIHookPath string + pciBusID string + deviceNodes discover.Discover +} + +var _ discover.Discover = (*byPathHookDiscoverer)(nil) + +// Devices returns the empty list for the by-path hook discoverer +func (d *byPathHookDiscoverer) Devices() ([]discover.Device, error) { + return nil, nil +} + +// Hooks returns the hooks for the GPU device. +// The following hooks are detected: +// 1. A hook to create /dev/dri/by-path symlinks +func (d *byPathHookDiscoverer) Hooks() ([]discover.Hook, error) { + links, err := d.deviceNodeLinks() + if err != nil { + return nil, fmt.Errorf("failed to discover DRA device links: %v", err) + } + if len(links) == 0 { + return nil, nil + } + + var args []string + for _, l := range links { + args = append(args, "--link", l) + } + + hook := discover.CreateNvidiaCDIHook( + d.nvidiaCDIHookPath, + "create-symlinks", + args..., + ) + + return []discover.Hook{hook}, nil +} + +// Mounts returns an empty slice for a full GPU +func (d *byPathHookDiscoverer) Mounts() ([]discover.Mount, error) { + return nil, nil +} + +func (d *byPathHookDiscoverer) deviceNodeLinks() ([]string, error) { + devices, err := d.deviceNodes.Devices() + if err != nil { + return nil, fmt.Errorf("failed to discover device nodes: %v", err) + } + + if len(devices) == 0 { + return nil, nil + } + + selectedDevices := make(map[string]bool) + for _, d := range devices { + selectedDevices[d.HostPath] = true + } + + candidates := []string{ + fmt.Sprintf("/dev/dri/by-path/pci-%s-card", d.pciBusID), + fmt.Sprintf("/dev/dri/by-path/pci-%s-render", d.pciBusID), + } + + var links []string + for _, c := range candidates { + linkPath := filepath.Join(d.devRoot, c) + device, err := os.Readlink(linkPath) + if err != nil { + d.logger.Warningf("Failed to evaluate symlink %v; ignoring", linkPath) + continue + } + + deviceNode := device + if !filepath.IsAbs(device) { + deviceNode = filepath.Join(filepath.Dir(linkPath), device) + } + if !selectedDevices[deviceNode] { + d.logger.Debugf("ignoring device symlink %v -> %v since %v is not mounted", linkPath, device, deviceNode) + continue + } + d.logger.Debugf("adding device symlink %v -> %v", linkPath, device) + links = append(links, fmt.Sprintf("%v::%v", device, linkPath)) + } + + return links, nil +} diff --git a/internal/platform-support/dgpu/dgpu.go b/internal/platform-support/dgpu/dgpu.go new file mode 100644 index 00000000..6d2da823 --- /dev/null +++ b/internal/platform-support/dgpu/dgpu.go @@ -0,0 +1,39 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package dgpu + +import ( + "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" + "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" +) + +// NewForDevice creates a discoverer for the specified Device. +// nvsandboxutils is used for discovery if specified, otherwise NVML is used. +func NewForDevice(d device.Device, opts ...Option) (discover.Discover, error) { + o := &options{} + for _, opt := range opts { + opt(o) + } + + if o.logger == nil { + o.logger = logger.New() + } + + return o.newNvmlDGPUDiscoverer(&toRequiredInfo{d}) +} diff --git a/internal/platform-support/dgpu/nvml.go b/internal/platform-support/dgpu/nvml.go new file mode 100644 index 00000000..14e81b9b --- /dev/null +++ b/internal/platform-support/dgpu/nvml.go @@ -0,0 +1,84 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package dgpu + +import ( + "fmt" + + "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" + "github.com/NVIDIA/go-nvml/pkg/nvml" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" + "github.com/NVIDIA/nvidia-container-toolkit/internal/info/drm" +) + +type requiredInfo interface { + GetMinorNumber() (int, error) + GetPCIBusID() (string, error) +} + +func (o *options) newNvmlDGPUDiscoverer(d requiredInfo) (discover.Discover, error) { + minor, err := d.GetMinorNumber() + if err != nil { + return nil, fmt.Errorf("error getting GPU device minor number: %w", err) + } + path := fmt.Sprintf("/dev/nvidia%d", minor) + + pciBusID, err := d.GetPCIBusID() + if err != nil { + return nil, fmt.Errorf("error getting PCI info for device: %w", err) + } + + drmDeviceNodes, err := drm.GetDeviceNodesByBusID(pciBusID) + if err != nil { + return nil, fmt.Errorf("failed to determine DRM devices for %v: %v", pciBusID, err) + } + + deviceNodePaths := append([]string{path}, drmDeviceNodes...) + + deviceNodes := discover.NewCharDeviceDiscoverer( + o.logger, + o.devRoot, + deviceNodePaths, + ) + + byPathHooks := &byPathHookDiscoverer{ + logger: o.logger, + devRoot: o.devRoot, + nvidiaCDIHookPath: o.nvidiaCDIHookPath, + pciBusID: pciBusID, + deviceNodes: deviceNodes, + } + + dd := discover.Merge( + deviceNodes, + byPathHooks, + ) + return dd, nil +} + +type toRequiredInfo struct { + device.Device +} + +func (d *toRequiredInfo) GetMinorNumber() (int, error) { + minor, ret := d.Device.GetMinorNumber() + if ret != nvml.SUCCESS { + return 0, ret + } + return minor, nil +} diff --git a/internal/platform-support/dgpu/nvml_test.go b/internal/platform-support/dgpu/nvml_test.go new file mode 100644 index 00000000..3e8306c9 --- /dev/null +++ b/internal/platform-support/dgpu/nvml_test.go @@ -0,0 +1,87 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package dgpu + +import ( + "testing" + + "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock" + testlog "github.com/sirupsen/logrus/hooks/test" + "github.com/stretchr/testify/require" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" +) + +// TODO: In order to properly test this, we need a mechanism to inject / +// override the char device discoverer. +func TestNewNvmlDGPUDiscoverer(t *testing.T) { + logger, _ := testlog.NewNullLogger() + + nvmllib := &mock.Interface{} + devicelib := device.New( + nvmllib, + ) + + testCases := []struct { + description string + device nvml.Device + expectedError error + expectedDevices []discover.Device + expectedHooks []discover.Hook + expectedMounts []discover.Mount + }{ + { + description: "", + device: &mock.Device{ + GetMinorNumberFunc: func() (int, nvml.Return) { + return 3, nvml.SUCCESS + }, + GetPciInfoFunc: func() (nvml.PciInfo, nvml.Return) { + var busID [32]int8 + for i, b := range []byte("00000000:45:00:00") { + busID[i] = int8(b) + } + info := nvml.PciInfo{ + BusId: busID, + } + return info, nvml.SUCCESS + }, + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + o := &options{logger: logger} + + device, err := devicelib.NewDevice(tc.device) + require.NoError(t, err) + + d, err := o.newNvmlDGPUDiscoverer(&toRequiredInfo{device}) + require.ErrorIs(t, err, tc.expectedError) + + devices, _ := d.Devices() + require.EqualValues(t, tc.expectedDevices, devices) + hooks, _ := d.Hooks() + require.EqualValues(t, tc.expectedHooks, hooks) + mounts, _ := d.Mounts() + require.EqualValues(t, tc.expectedMounts, mounts) + }) + } +} diff --git a/internal/platform-support/dgpu/options.go b/internal/platform-support/dgpu/options.go new file mode 100644 index 00000000..cea58c6d --- /dev/null +++ b/internal/platform-support/dgpu/options.go @@ -0,0 +1,50 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package dgpu + +import ( + "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" +) + +type options struct { + logger logger.Interface + devRoot string + nvidiaCDIHookPath string +} + +type Option func(*options) + +// WithDevRoot sets the root where /dev is located. +func WithDevRoot(root string) Option { + return func(l *options) { + l.devRoot = root + } +} + +// WithLogger sets the logger for the library +func WithLogger(logger logger.Interface) Option { + return func(l *options) { + l.logger = logger + } +} + +// WithNVIDIACDIHookPath sets the path to the NVIDIA Container Toolkit CLI path for the library +func WithNVIDIACDIHookPath(path string) Option { + return func(l *options) { + l.nvidiaCDIHookPath = path + } +} diff --git a/pkg/nvcdi/full-gpu-nvml.go b/pkg/nvcdi/full-gpu-nvml.go index aa46c4c6..881d102d 100644 --- a/pkg/nvcdi/full-gpu-nvml.go +++ b/pkg/nvcdi/full-gpu-nvml.go @@ -18,19 +18,14 @@ package nvcdi import ( "fmt" - "os" - "path/filepath" - "strings" "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" - "github.com/NVIDIA/go-nvml/pkg/nvml" "tags.cncf.io/container-device-interface/pkg/cdi" "tags.cncf.io/container-device-interface/specs-go" "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" "github.com/NVIDIA/nvidia-container-toolkit/internal/edits" - "github.com/NVIDIA/nvidia-container-toolkit/internal/info/drm" - "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" + "github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/dgpu" ) // GetGPUDeviceSpecs returns the CDI device specs for the full GPU represented by 'device'. @@ -58,7 +53,7 @@ func (l *nvmllib) GetGPUDeviceSpecs(i int, d device.Device) ([]specs.Device, err // GetGPUDeviceEdits returns the CDI edits for the full GPU represented by 'device'. func (l *nvmllib) GetGPUDeviceEdits(d device.Device) (*cdi.ContainerEdits, error) { - device, err := newFullGPUDiscoverer(l.logger, l.devRoot, l.nvidiaCDIHookPath, d) + device, err := l.newFullGPUDiscoverer(d) if err != nil { return nil, fmt.Errorf("failed to create device discoverer: %v", err) } @@ -71,164 +66,28 @@ func (l *nvmllib) GetGPUDeviceEdits(d device.Device) (*cdi.ContainerEdits, error return editsForDevice, nil } -// byPathHookDiscoverer discovers the entities required for injecting by-path DRM device links -type byPathHookDiscoverer struct { - logger logger.Interface - devRoot string - nvidiaCDIHookPath string - pciBusID string - deviceNodes discover.Discover -} - -var _ discover.Discover = (*byPathHookDiscoverer)(nil) - // newFullGPUDiscoverer creates a discoverer for the full GPU defined by the specified device. -func newFullGPUDiscoverer(logger logger.Interface, devRoot string, nvidiaCDIHookPath string, d device.Device) (discover.Discover, error) { - // TODO: The functionality to get device paths should be integrated into the go-nvlib/pkg/device.Device interface. - // This will allow reuse here and in other code where the paths are queried such as the NVIDIA device plugin. - minor, ret := d.GetMinorNumber() - if ret != nvml.SUCCESS { - return nil, fmt.Errorf("error getting GPU device minor number: %v", ret) - } - path := fmt.Sprintf("/dev/nvidia%d", minor) - - pciInfo, ret := d.GetPciInfo() - if ret != nvml.SUCCESS { - return nil, fmt.Errorf("error getting PCI info for device: %v", ret) - } - pciBusID := getBusID(pciInfo) - - drmDeviceNodes, err := drm.GetDeviceNodesByBusID(pciBusID) - if err != nil { - return nil, fmt.Errorf("failed to determine DRM devices for %v: %v", pciBusID, err) - } - - deviceNodePaths := append([]string{path}, drmDeviceNodes...) - - deviceNodes := discover.NewCharDeviceDiscoverer( - logger, - devRoot, - deviceNodePaths, +func (l *nvmllib) newFullGPUDiscoverer(d device.Device) (discover.Discover, error) { + deviceNodes, err := dgpu.NewForDevice(d, + dgpu.WithDevRoot(l.devRoot), + dgpu.WithLogger(l.logger), + dgpu.WithNVIDIACDIHookPath(l.nvidiaCDIHookPath), ) - - byPathHooks := &byPathHookDiscoverer{ - logger: logger, - devRoot: devRoot, - nvidiaCDIHookPath: nvidiaCDIHookPath, - pciBusID: pciBusID, - deviceNodes: deviceNodes, + if err != nil { + return nil, fmt.Errorf("failed to create device discoverer: %v", err) } deviceFolderPermissionHooks := newDeviceFolderPermissionHookDiscoverer( - logger, - devRoot, - nvidiaCDIHookPath, + l.logger, + l.devRoot, + l.nvidiaCDIHookPath, deviceNodes, ) dd := discover.Merge( deviceNodes, - byPathHooks, deviceFolderPermissionHooks, ) return dd, nil } - -// Devices returns the empty list for the by-path hook discoverer -func (d *byPathHookDiscoverer) Devices() ([]discover.Device, error) { - return nil, nil -} - -// Hooks returns the hooks for the GPU device. -// The following hooks are detected: -// 1. A hook to create /dev/dri/by-path symlinks -func (d *byPathHookDiscoverer) Hooks() ([]discover.Hook, error) { - links, err := d.deviceNodeLinks() - if err != nil { - return nil, fmt.Errorf("failed to discover DRA device links: %v", err) - } - if len(links) == 0 { - return nil, nil - } - - var args []string - for _, l := range links { - args = append(args, "--link", l) - } - - hook := discover.CreateNvidiaCDIHook( - d.nvidiaCDIHookPath, - "create-symlinks", - args..., - ) - - return []discover.Hook{hook}, nil -} - -// Mounts returns an empty slice for a full GPU -func (d *byPathHookDiscoverer) Mounts() ([]discover.Mount, error) { - return nil, nil -} - -func (d *byPathHookDiscoverer) deviceNodeLinks() ([]string, error) { - devices, err := d.deviceNodes.Devices() - if err != nil { - return nil, fmt.Errorf("failed to discover device nodes: %v", err) - } - - if len(devices) == 0 { - return nil, nil - } - - selectedDevices := make(map[string]bool) - for _, d := range devices { - selectedDevices[d.HostPath] = true - } - - candidates := []string{ - fmt.Sprintf("/dev/dri/by-path/pci-%s-card", d.pciBusID), - fmt.Sprintf("/dev/dri/by-path/pci-%s-render", d.pciBusID), - } - - var links []string - for _, c := range candidates { - linkPath := filepath.Join(d.devRoot, c) - device, err := os.Readlink(linkPath) - if err != nil { - d.logger.Warningf("Failed to evaluate symlink %v; ignoring", linkPath) - continue - } - - deviceNode := device - if !filepath.IsAbs(device) { - deviceNode = filepath.Join(filepath.Dir(linkPath), device) - } - if !selectedDevices[deviceNode] { - d.logger.Debugf("ignoring device symlink %v -> %v since %v is not mounted", linkPath, device, deviceNode) - continue - } - d.logger.Debugf("adding device symlink %v -> %v", linkPath, device) - links = append(links, fmt.Sprintf("%v::%v", device, linkPath)) - } - - return links, nil -} - -// getBusID provides a utility function that returns the string representation of the bus ID. -func getBusID(p nvml.PciInfo) string { - var bytes []byte - for _, b := range p.BusId { - if byte(b) == '\x00' { - break - } - bytes = append(bytes, byte(b)) - } - id := strings.ToLower(string(bytes)) - - if id != "0000" { - id = strings.TrimPrefix(id, "0000") - } - - return id -}