mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2025-06-26 18:18:24 +00:00
[no-relnote] Add MIG discoverer to dgpu package
Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
@@ -24,7 +24,6 @@ import (
|
||||
)
|
||||
|
||||
// NewForDevice creates a discoverer for the specified Device.
|
||||
// nvsandboxutils is used for discovery if specified, otherwise NVML is used.
|
||||
func NewForDevice(d device.Device, opts ...Option) (discover.Discover, error) {
|
||||
o := &options{}
|
||||
for _, opt := range opts {
|
||||
@@ -37,3 +36,22 @@ func NewForDevice(d device.Device, opts ...Option) (discover.Discover, error) {
|
||||
|
||||
return o.newNvmlDGPUDiscoverer(&toRequiredInfo{d})
|
||||
}
|
||||
|
||||
// NewForDevice creates a discoverer for the specified device and its associated MIG device.
|
||||
func NewForMigDevice(d device.Device, mig device.MigDevice, opts ...Option) (discover.Discover, error) {
|
||||
o := &options{}
|
||||
for _, opt := range opts {
|
||||
opt(o)
|
||||
}
|
||||
|
||||
if o.logger == nil {
|
||||
o.logger = logger.New()
|
||||
}
|
||||
|
||||
return o.newNvmlMigDiscoverer(
|
||||
&toRequiredMigInfo{
|
||||
MigDevice: mig,
|
||||
parent: &toRequiredInfo{d},
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
@@ -24,19 +24,20 @@ import (
|
||||
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/info/drm"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps"
|
||||
)
|
||||
|
||||
type requiredInfo interface {
|
||||
GetMinorNumber() (int, error)
|
||||
GetPCIBusID() (string, error)
|
||||
getDevNodePath() (string, error)
|
||||
}
|
||||
|
||||
func (o *options) newNvmlDGPUDiscoverer(d requiredInfo) (discover.Discover, error) {
|
||||
minor, err := d.GetMinorNumber()
|
||||
path, err := d.getDevNodePath()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error getting GPU device minor number: %w", err)
|
||||
return nil, fmt.Errorf("error getting device node path: %w", err)
|
||||
}
|
||||
path := fmt.Sprintf("/dev/nvidia%d", minor)
|
||||
|
||||
pciBusID, err := d.GetPCIBusID()
|
||||
if err != nil {
|
||||
@@ -71,6 +72,52 @@ func (o *options) newNvmlDGPUDiscoverer(d requiredInfo) (discover.Discover, erro
|
||||
return dd, nil
|
||||
}
|
||||
|
||||
type requiredMigInfo interface {
|
||||
getPlacementInfo() (int, int, int, error)
|
||||
getDevNodePath() (string, error)
|
||||
}
|
||||
|
||||
func (o *options) newNvmlMigDiscoverer(d requiredMigInfo) (discover.Discover, error) {
|
||||
gpu, gi, ci, err := d.getPlacementInfo()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error getting placement info: %w", err)
|
||||
}
|
||||
|
||||
migCaps, err := nvcaps.NewMigCaps()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error getting MIG capability device paths: %v", err)
|
||||
}
|
||||
|
||||
giCap := nvcaps.NewGPUInstanceCap(gpu, gi)
|
||||
giCapDevicePath, err := migCaps.GetCapDevicePath(giCap)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get GI cap device path: %v", err)
|
||||
}
|
||||
|
||||
ciCap := nvcaps.NewComputeInstanceCap(gpu, gi, ci)
|
||||
ciCapDevicePath, err := migCaps.GetCapDevicePath(ciCap)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get CI cap device path: %v", err)
|
||||
}
|
||||
|
||||
parentPath, err := d.getDevNodePath()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
deviceNodes := discover.NewCharDeviceDiscoverer(
|
||||
o.logger,
|
||||
o.devRoot,
|
||||
[]string{
|
||||
parentPath,
|
||||
giCapDevicePath,
|
||||
ciCapDevicePath,
|
||||
},
|
||||
)
|
||||
|
||||
return deviceNodes, nil
|
||||
}
|
||||
|
||||
type toRequiredInfo struct {
|
||||
device.Device
|
||||
}
|
||||
@@ -82,3 +129,40 @@ func (d *toRequiredInfo) GetMinorNumber() (int, error) {
|
||||
}
|
||||
return minor, nil
|
||||
}
|
||||
|
||||
func (d *toRequiredInfo) getDevNodePath() (string, error) {
|
||||
minor, err := d.GetMinorNumber()
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("error getting GPU device minor number: %w", err)
|
||||
}
|
||||
path := fmt.Sprintf("/dev/nvidia%d", minor)
|
||||
return path, nil
|
||||
}
|
||||
|
||||
type toRequiredMigInfo struct {
|
||||
device.MigDevice
|
||||
parent requiredInfo
|
||||
}
|
||||
|
||||
func (d *toRequiredMigInfo) getPlacementInfo() (int, int, int, error) {
|
||||
gpu, ret := d.parent.GetMinorNumber()
|
||||
if ret != nvml.SUCCESS {
|
||||
return 0, 0, 0, fmt.Errorf("error getting GPU minor: %v", ret)
|
||||
}
|
||||
|
||||
gi, ret := d.GetGpuInstanceId()
|
||||
if ret != nvml.SUCCESS {
|
||||
return 0, 0, 0, fmt.Errorf("error getting GPU Instance ID: %v", ret)
|
||||
}
|
||||
|
||||
ci, ret := d.GetComputeInstanceId()
|
||||
if ret != nvml.SUCCESS {
|
||||
return 0, 0, 0, fmt.Errorf("error getting Compute Instance ID: %v", ret)
|
||||
}
|
||||
|
||||
return gpu, gi, ci, nil
|
||||
}
|
||||
|
||||
func (d *toRequiredMigInfo) getDevNodePath() (string, error) {
|
||||
return d.parent.getDevNodePath()
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user