From be11cf428b1af06a41f319cfab0f06f1942aa1da Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Tue, 2 Jul 2024 16:49:15 +0200 Subject: [PATCH] [no-relnote] Add MIG discoverer to dgpu package Signed-off-by: Evan Lezar --- internal/platform-support/dgpu/dgpu.go | 20 +++++- internal/platform-support/dgpu/nvml.go | 90 +++++++++++++++++++++++++- pkg/nvcdi/mig-device-nvml.go | 74 +++------------------ 3 files changed, 114 insertions(+), 70 deletions(-) diff --git a/internal/platform-support/dgpu/dgpu.go b/internal/platform-support/dgpu/dgpu.go index 6d2da823..00982a62 100644 --- a/internal/platform-support/dgpu/dgpu.go +++ b/internal/platform-support/dgpu/dgpu.go @@ -24,7 +24,6 @@ import ( ) // NewForDevice creates a discoverer for the specified Device. -// nvsandboxutils is used for discovery if specified, otherwise NVML is used. func NewForDevice(d device.Device, opts ...Option) (discover.Discover, error) { o := &options{} for _, opt := range opts { @@ -37,3 +36,22 @@ func NewForDevice(d device.Device, opts ...Option) (discover.Discover, error) { return o.newNvmlDGPUDiscoverer(&toRequiredInfo{d}) } + +// NewForMigDevice creates a discoverer for the specified device and its associated MIG device. 
+func NewForMigDevice(d device.Device, mig device.MigDevice, opts ...Option) (discover.Discover, error) { + o := &options{} + for _, opt := range opts { + opt(o) + } + + if o.logger == nil { + o.logger = logger.New() + } + + return o.newNvmlMigDiscoverer( + &toRequiredMigInfo{ + MigDevice: mig, + parent: &toRequiredInfo{d}, + }, + ) +} diff --git a/internal/platform-support/dgpu/nvml.go b/internal/platform-support/dgpu/nvml.go index 14e81b9b..e4b67641 100644 --- a/internal/platform-support/dgpu/nvml.go +++ b/internal/platform-support/dgpu/nvml.go @@ -24,19 +24,20 @@ import ( "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" "github.com/NVIDIA/nvidia-container-toolkit/internal/info/drm" + "github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps" ) type requiredInfo interface { GetMinorNumber() (int, error) GetPCIBusID() (string, error) + getDevNodePath() (string, error) } func (o *options) newNvmlDGPUDiscoverer(d requiredInfo) (discover.Discover, error) { - minor, err := d.GetMinorNumber() + path, err := d.getDevNodePath() if err != nil { - return nil, fmt.Errorf("error getting GPU device minor number: %w", err) + return nil, fmt.Errorf("error getting device node path: %w", err) } - path := fmt.Sprintf("/dev/nvidia%d", minor) pciBusID, err := d.GetPCIBusID() if err != nil { @@ -71,6 +72,52 @@ func (o *options) newNvmlDGPUDiscoverer(d requiredInfo) (discover.Discover, erro return dd, nil } +type requiredMigInfo interface { + getPlacementInfo() (int, int, int, error) + getDevNodePath() (string, error) +} + +func (o *options) newNvmlMigDiscoverer(d requiredMigInfo) (discover.Discover, error) { + gpu, gi, ci, err := d.getPlacementInfo() + if err != nil { + return nil, fmt.Errorf("error getting placement info: %w", err) + } + + migCaps, err := nvcaps.NewMigCaps() + if err != nil { + return nil, fmt.Errorf("error getting MIG capability device paths: %v", err) + } + + giCap := nvcaps.NewGPUInstanceCap(gpu, gi) + giCapDevicePath, err := 
migCaps.GetCapDevicePath(giCap) + if err != nil { + return nil, fmt.Errorf("failed to get GI cap device path: %v", err) + } + + ciCap := nvcaps.NewComputeInstanceCap(gpu, gi, ci) + ciCapDevicePath, err := migCaps.GetCapDevicePath(ciCap) + if err != nil { + return nil, fmt.Errorf("failed to get CI cap device path: %v", err) + } + + parentPath, err := d.getDevNodePath() + if err != nil { + return nil, err + } + + deviceNodes := discover.NewCharDeviceDiscoverer( + o.logger, + o.devRoot, + []string{ + parentPath, + giCapDevicePath, + ciCapDevicePath, + }, + ) + + return deviceNodes, nil +} + type toRequiredInfo struct { device.Device } @@ -82,3 +129,40 @@ func (d *toRequiredInfo) GetMinorNumber() (int, error) { } return minor, nil } + +func (d *toRequiredInfo) getDevNodePath() (string, error) { + minor, err := d.GetMinorNumber() + if err != nil { + return "", fmt.Errorf("error getting GPU device minor number: %w", err) + } + path := fmt.Sprintf("/dev/nvidia%d", minor) + return path, nil +} + +type toRequiredMigInfo struct { + device.MigDevice + parent requiredInfo +} + +func (d *toRequiredMigInfo) getPlacementInfo() (int, int, int, error) { + gpu, err := d.parent.GetMinorNumber() + if err != nil { + return 0, 0, 0, fmt.Errorf("error getting GPU minor: %w", err) + } + + gi, ret := d.GetGpuInstanceId() + if ret != nvml.SUCCESS { + return 0, 0, 0, fmt.Errorf("error getting GPU Instance ID: %v", ret) + } + + ci, ret := d.GetComputeInstanceId() + if ret != nvml.SUCCESS { + return 0, 0, 0, fmt.Errorf("error getting Compute Instance ID: %v", ret) + } + + return gpu, gi, ci, nil +} + +func (d *toRequiredMigInfo) getDevNodePath() (string, error) { + return d.parent.getDevNodePath() +} diff --git a/pkg/nvcdi/mig-device-nvml.go b/pkg/nvcdi/mig-device-nvml.go index d67d7d46..91fe879c 100644 --- a/pkg/nvcdi/mig-device-nvml.go +++ b/pkg/nvcdi/mig-device-nvml.go @@ -20,14 +20,11 @@ import ( "fmt" "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" - 
"github.com/NVIDIA/go-nvml/pkg/nvml" "tags.cncf.io/container-device-interface/pkg/cdi" "tags.cncf.io/container-device-interface/specs-go" - "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" "github.com/NVIDIA/nvidia-container-toolkit/internal/edits" - "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" - "github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps" + "github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/dgpu" ) // GetMIGDeviceSpecs returns the CDI device specs for the full GPU represented by 'device'. @@ -54,74 +51,19 @@ func (l *nvmllib) GetMIGDeviceSpecs(i int, d device.Device, j int, mig device.Mi // GetMIGDeviceEdits returns the CDI edits for the MIG device represented by 'mig' on 'parent'. func (l *nvmllib) GetMIGDeviceEdits(parent device.Device, mig device.MigDevice) (*cdi.ContainerEdits, error) { - gpu, ret := parent.GetMinorNumber() - if ret != nvml.SUCCESS { - return nil, fmt.Errorf("error getting GPU minor: %v", ret) - } - - gi, ret := mig.GetGpuInstanceId() - if ret != nvml.SUCCESS { - return nil, fmt.Errorf("error getting GPU Instance ID: %v", ret) - } - - ci, ret := mig.GetComputeInstanceId() - if ret != nvml.SUCCESS { - return nil, fmt.Errorf("error getting Compute Instance ID: %v", ret) - } - - editsForDevice, err := l.GetEditsForComputeInstance(gpu, gi, ci) + deviceNodes, err := dgpu.NewForMigDevice(parent, mig, + dgpu.WithDevRoot(l.devRoot), + dgpu.WithLogger(l.logger), + dgpu.WithNVIDIACDIHookPath(l.nvidiaCDIHookPath), + ) if err != nil { - return nil, fmt.Errorf("failed to create container edits for MIG device: %v", err) + return nil, fmt.Errorf("failed to create device discoverer: %v", err) } - return editsForDevice, nil -} - -// GetEditsForComputeInstance returns the CDI edits for a particular compute instance defined by the (gpu, gi, ci) tuple -func (l *nvmllib) GetEditsForComputeInstance(gpu int, gi int, ci int) (*cdi.ContainerEdits, error) { - computeInstance, err := 
newComputeInstanceDiscoverer(l.logger, l.devRoot, gpu, gi, ci) - if err != nil { - return nil, fmt.Errorf("failed to create discoverer for Compute Instance: %v", err) - } - - editsForDevice, err := edits.FromDiscoverer(computeInstance) + editsForDevice, err := edits.FromDiscoverer(deviceNodes) if err != nil { return nil, fmt.Errorf("failed to create container edits for Compute Instance: %v", err) } return editsForDevice, nil } - -// newComputeInstanceDiscoverer returns a discoverer for the specified compute instance -func newComputeInstanceDiscoverer(logger logger.Interface, devRoot string, gpu int, gi int, ci int) (discover.Discover, error) { - parentPath := fmt.Sprintf("/dev/nvidia%d", gpu) - - migCaps, err := nvcaps.NewMigCaps() - if err != nil { - return nil, fmt.Errorf("error getting MIG capability device paths: %v", err) - } - - giCap := nvcaps.NewGPUInstanceCap(gpu, gi) - giCapDevicePath, err := migCaps.GetCapDevicePath(giCap) - if err != nil { - return nil, fmt.Errorf("failed to get GI cap device path: %v", err) - } - - ciCap := nvcaps.NewComputeInstanceCap(gpu, gi, ci) - ciCapDevicePath, err := migCaps.GetCapDevicePath(ciCap) - if err != nil { - return nil, fmt.Errorf("failed to get CI cap device path: %v", err) - } - - deviceNodes := discover.NewCharDeviceDiscoverer( - logger, - devRoot, - []string{ - parentPath, - giCapDevicePath, - ciCapDevicePath, - }, - ) - - return deviceNodes, nil -}