mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2024-11-24 13:05:17 +00:00
[no-relnote] Add MIG discoverer to dgpu package
Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
parent
b42a5d3e3a
commit
be11cf428b
@ -24,7 +24,6 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
// NewForDevice creates a discoverer for the specified Device.
|
// NewForDevice creates a discoverer for the specified Device.
|
||||||
// nvsandboxutils is used for discovery if specified, otherwise NVML is used.
|
|
||||||
func NewForDevice(d device.Device, opts ...Option) (discover.Discover, error) {
|
func NewForDevice(d device.Device, opts ...Option) (discover.Discover, error) {
|
||||||
o := &options{}
|
o := &options{}
|
||||||
for _, opt := range opts {
|
for _, opt := range opts {
|
||||||
@ -37,3 +36,22 @@ func NewForDevice(d device.Device, opts ...Option) (discover.Discover, error) {
|
|||||||
|
|
||||||
return o.newNvmlDGPUDiscoverer(&toRequiredInfo{d})
|
return o.newNvmlDGPUDiscoverer(&toRequiredInfo{d})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NewForDevice creates a discoverer for the specified device and its associated MIG device.
|
||||||
|
func NewForMigDevice(d device.Device, mig device.MigDevice, opts ...Option) (discover.Discover, error) {
|
||||||
|
o := &options{}
|
||||||
|
for _, opt := range opts {
|
||||||
|
opt(o)
|
||||||
|
}
|
||||||
|
|
||||||
|
if o.logger == nil {
|
||||||
|
o.logger = logger.New()
|
||||||
|
}
|
||||||
|
|
||||||
|
return o.newNvmlMigDiscoverer(
|
||||||
|
&toRequiredMigInfo{
|
||||||
|
MigDevice: mig,
|
||||||
|
parent: &toRequiredInfo{d},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
}
|
||||||
|
@ -24,19 +24,20 @@ import (
|
|||||||
|
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/info/drm"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/info/drm"
|
||||||
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps"
|
||||||
)
|
)
|
||||||
|
|
||||||
type requiredInfo interface {
|
type requiredInfo interface {
|
||||||
GetMinorNumber() (int, error)
|
GetMinorNumber() (int, error)
|
||||||
GetPCIBusID() (string, error)
|
GetPCIBusID() (string, error)
|
||||||
|
getDevNodePath() (string, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (o *options) newNvmlDGPUDiscoverer(d requiredInfo) (discover.Discover, error) {
|
func (o *options) newNvmlDGPUDiscoverer(d requiredInfo) (discover.Discover, error) {
|
||||||
minor, err := d.GetMinorNumber()
|
path, err := d.getDevNodePath()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("error getting GPU device minor number: %w", err)
|
return nil, fmt.Errorf("error getting device node path: %w", err)
|
||||||
}
|
}
|
||||||
path := fmt.Sprintf("/dev/nvidia%d", minor)
|
|
||||||
|
|
||||||
pciBusID, err := d.GetPCIBusID()
|
pciBusID, err := d.GetPCIBusID()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -71,6 +72,52 @@ func (o *options) newNvmlDGPUDiscoverer(d requiredInfo) (discover.Discover, erro
|
|||||||
return dd, nil
|
return dd, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// requiredMigInfo describes the information that newNvmlMigDiscoverer needs
// from a MIG device in order to construct a discoverer for it.
type requiredMigInfo interface {
	// getDevNodePath returns the path to the device node that is included
	// alongside the MIG capability device nodes.
	getDevNodePath() (string, error)
	// getPlacementInfo returns the three integers that newNvmlMigDiscoverer
	// uses as the (gpu, gi, ci) tuple when looking up capability devices.
	getPlacementInfo() (int, int, int, error)
}
|
||||||
|
|
||||||
|
func (o *options) newNvmlMigDiscoverer(d requiredMigInfo) (discover.Discover, error) {
|
||||||
|
gpu, gi, ci, err := d.getPlacementInfo()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("error getting placement info: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
migCaps, err := nvcaps.NewMigCaps()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("error getting MIG capability device paths: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
giCap := nvcaps.NewGPUInstanceCap(gpu, gi)
|
||||||
|
giCapDevicePath, err := migCaps.GetCapDevicePath(giCap)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to get GI cap device path: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
ciCap := nvcaps.NewComputeInstanceCap(gpu, gi, ci)
|
||||||
|
ciCapDevicePath, err := migCaps.GetCapDevicePath(ciCap)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to get CI cap device path: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
parentPath, err := d.getDevNodePath()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
deviceNodes := discover.NewCharDeviceDiscoverer(
|
||||||
|
o.logger,
|
||||||
|
o.devRoot,
|
||||||
|
[]string{
|
||||||
|
parentPath,
|
||||||
|
giCapDevicePath,
|
||||||
|
ciCapDevicePath,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
return deviceNodes, nil
|
||||||
|
}
|
||||||
|
|
||||||
type toRequiredInfo struct {
|
type toRequiredInfo struct {
|
||||||
device.Device
|
device.Device
|
||||||
}
|
}
|
||||||
@ -82,3 +129,40 @@ func (d *toRequiredInfo) GetMinorNumber() (int, error) {
|
|||||||
}
|
}
|
||||||
return minor, nil
|
return minor, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (d *toRequiredInfo) getDevNodePath() (string, error) {
|
||||||
|
minor, err := d.GetMinorNumber()
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("error getting GPU device minor number: %w", err)
|
||||||
|
}
|
||||||
|
path := fmt.Sprintf("/dev/nvidia%d", minor)
|
||||||
|
return path, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
type toRequiredMigInfo struct {
|
||||||
|
device.MigDevice
|
||||||
|
parent requiredInfo
|
||||||
|
}
|
||||||
|
|
||||||
|
func (d *toRequiredMigInfo) getPlacementInfo() (int, int, int, error) {
|
||||||
|
gpu, ret := d.parent.GetMinorNumber()
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
return 0, 0, 0, fmt.Errorf("error getting GPU minor: %v", ret)
|
||||||
|
}
|
||||||
|
|
||||||
|
gi, ret := d.GetGpuInstanceId()
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
return 0, 0, 0, fmt.Errorf("error getting GPU Instance ID: %v", ret)
|
||||||
|
}
|
||||||
|
|
||||||
|
ci, ret := d.GetComputeInstanceId()
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
return 0, 0, 0, fmt.Errorf("error getting Compute Instance ID: %v", ret)
|
||||||
|
}
|
||||||
|
|
||||||
|
return gpu, gi, ci, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (d *toRequiredMigInfo) getDevNodePath() (string, error) {
|
||||||
|
return d.parent.getDevNodePath()
|
||||||
|
}
|
||||||
|
@ -20,14 +20,11 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
|
|
||||||
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
|
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
|
||||||
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
|
||||||
"tags.cncf.io/container-device-interface/pkg/cdi"
|
"tags.cncf.io/container-device-interface/pkg/cdi"
|
||||||
"tags.cncf.io/container-device-interface/specs-go"
|
"tags.cncf.io/container-device-interface/specs-go"
|
||||||
|
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/edits"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/edits"
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/dgpu"
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// GetMIGDeviceSpecs returns the CDI device specs for the full GPU represented by 'device'.
|
// GetMIGDeviceSpecs returns the CDI device specs for the full GPU represented by 'device'.
|
||||||
@ -54,74 +51,19 @@ func (l *nvmllib) GetMIGDeviceSpecs(i int, d device.Device, j int, mig device.Mi
|
|||||||
|
|
||||||
// GetMIGDeviceEdits returns the CDI edits for the MIG device represented by 'mig' on 'parent'.
|
// GetMIGDeviceEdits returns the CDI edits for the MIG device represented by 'mig' on 'parent'.
|
||||||
func (l *nvmllib) GetMIGDeviceEdits(parent device.Device, mig device.MigDevice) (*cdi.ContainerEdits, error) {
|
func (l *nvmllib) GetMIGDeviceEdits(parent device.Device, mig device.MigDevice) (*cdi.ContainerEdits, error) {
|
||||||
gpu, ret := parent.GetMinorNumber()
|
deviceNodes, err := dgpu.NewForMigDevice(parent, mig,
|
||||||
if ret != nvml.SUCCESS {
|
dgpu.WithDevRoot(l.devRoot),
|
||||||
return nil, fmt.Errorf("error getting GPU minor: %v", ret)
|
dgpu.WithLogger(l.logger),
|
||||||
}
|
dgpu.WithNVIDIACDIHookPath(l.nvidiaCDIHookPath),
|
||||||
|
)
|
||||||
gi, ret := mig.GetGpuInstanceId()
|
|
||||||
if ret != nvml.SUCCESS {
|
|
||||||
return nil, fmt.Errorf("error getting GPU Instance ID: %v", ret)
|
|
||||||
}
|
|
||||||
|
|
||||||
ci, ret := mig.GetComputeInstanceId()
|
|
||||||
if ret != nvml.SUCCESS {
|
|
||||||
return nil, fmt.Errorf("error getting Compute Instance ID: %v", ret)
|
|
||||||
}
|
|
||||||
|
|
||||||
editsForDevice, err := l.GetEditsForComputeInstance(gpu, gi, ci)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed to create container edits for MIG device: %v", err)
|
return nil, fmt.Errorf("failed to create device discoverer: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
return editsForDevice, nil
|
editsForDevice, err := edits.FromDiscoverer(deviceNodes)
|
||||||
}
|
|
||||||
|
|
||||||
// GetEditsForComputeInstance returns the CDI edits for a particular compute instance defined by the (gpu, gi, ci) tuple
|
|
||||||
func (l *nvmllib) GetEditsForComputeInstance(gpu int, gi int, ci int) (*cdi.ContainerEdits, error) {
|
|
||||||
computeInstance, err := newComputeInstanceDiscoverer(l.logger, l.devRoot, gpu, gi, ci)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to create discoverer for Compute Instance: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
editsForDevice, err := edits.FromDiscoverer(computeInstance)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed to create container edits for Compute Instance: %v", err)
|
return nil, fmt.Errorf("failed to create container edits for Compute Instance: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
return editsForDevice, nil
|
return editsForDevice, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// newComputeInstanceDiscoverer returns a discoverer for the specified compute instance
|
|
||||||
func newComputeInstanceDiscoverer(logger logger.Interface, devRoot string, gpu int, gi int, ci int) (discover.Discover, error) {
|
|
||||||
parentPath := fmt.Sprintf("/dev/nvidia%d", gpu)
|
|
||||||
|
|
||||||
migCaps, err := nvcaps.NewMigCaps()
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("error getting MIG capability device paths: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
giCap := nvcaps.NewGPUInstanceCap(gpu, gi)
|
|
||||||
giCapDevicePath, err := migCaps.GetCapDevicePath(giCap)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to get GI cap device path: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
ciCap := nvcaps.NewComputeInstanceCap(gpu, gi, ci)
|
|
||||||
ciCapDevicePath, err := migCaps.GetCapDevicePath(ciCap)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to get CI cap device path: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
deviceNodes := discover.NewCharDeviceDiscoverer(
|
|
||||||
logger,
|
|
||||||
devRoot,
|
|
||||||
[]string{
|
|
||||||
parentPath,
|
|
||||||
giCapDevicePath,
|
|
||||||
ciCapDevicePath,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
return deviceNodes, nil
|
|
||||||
}
|
|
||||||
|
Loading…
Reference in New Issue
Block a user