Update vendoring for nvpci

Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
Evan Lezar
2023-01-24 10:37:46 +01:00
parent 1d7e419008
commit 540f4349f5
25 changed files with 37514 additions and 187 deletions

View File

@@ -26,7 +26,9 @@ type Interface interface {
GetMigDevices() ([]MigDevice, error)
GetMigProfiles() ([]MigProfile, error)
NewDevice(d nvml.Device) (Device, error)
NewDeviceByUUID(uuid string) (Device, error)
NewMigDevice(d nvml.Device) (MigDevice, error)
NewMigDeviceByUUID(uuid string) (MigDevice, error)
NewMigProfile(giProfileID, ciProfileID, ciEngProfileID int, migMemorySizeMB, deviceMemorySizeBytes uint64) (MigProfile, error)
ParseMigProfile(profile string) (MigProfile, error)
VisitDevices(func(i int, d Device) error) error
@@ -35,7 +37,8 @@ type Interface interface {
}
// devicelib is the implementation of Interface provided by this package.
type devicelib struct {
// nvml is the NVML interface used to look up device handles and counts.
nvml nvml.Interface
// NOTE(review): the nvml field is listed twice in this view — presumably the
// before/after sides of a diff hunk; confirm the real file declares it once.
nvml nvml.Interface
// skippedDevices is the set of device model names that VisitDevices skips.
skippedDevices map[string]struct{}
}
// Compile-time check that *devicelib satisfies Interface.
var _ Interface = &devicelib{}
@@ -49,6 +52,12 @@ func New(opts ...Option) Interface {
if d.nvml == nil {
d.nvml = nvml.New()
}
if d.skippedDevices == nil {
WithSkippedDevices(
"DGX Display",
"NVIDIA DGX Display",
)(d)
}
return d
}
@@ -59,5 +68,17 @@ func WithNvml(nvml nvml.Interface) Option {
}
}
// WithSkippedDevices returns an Option that registers the given model names
// as devices to be skipped. Names are added to any set already present on the
// devicelib; the set is created on first use, even when no names are passed.
func WithSkippedDevices(names ...string) Option {
	return func(d *devicelib) {
		skipped := d.skippedDevices
		if skipped == nil {
			// Lazily create the set so the option can be applied to a fresh devicelib.
			skipped = make(map[string]struct{})
			d.skippedDevices = skipped
		}
		for i := range names {
			skipped[names[i]] = struct{}{}
		}
	}
}
// Option defines a function for passing options to the New() call.
// An Option receives the devicelib being configured and modifies it in place.
type Option func(*devicelib)

View File

@@ -43,6 +43,20 @@ var _ Device = &device{}
// NewDevice wraps the given nvml.Device handle and returns it as a Device.
// It delegates to newDevice and passes that call's results through unchanged.
func (d *devicelib) NewDevice(dev nvml.Device) (Device, error) {
	wrapped, err := d.newDevice(dev)
	return wrapped, err
}
// NewDeviceByUUID looks up the NVML device handle for the given UUID and
// wraps it as a Device. An error is returned if NVML does not report SUCCESS
// for the handle lookup.
func (d *devicelib) NewDeviceByUUID(uuid string) (Device, error) {
	handle, ret := d.nvml.DeviceGetHandleByUUID(uuid)
	if ret != nvml.SUCCESS {
		err := fmt.Errorf("error getting device handle for uuid '%v': %v", uuid, ret)
		return nil, err
	}
	return d.newDevice(handle)
}
// newDevice pairs an nvml.Device handle with the owning devicelib and returns
// the concrete *device wrapper. The returned error is always nil.
func (d *devicelib) newDevice(dev nvml.Device) (*device, error) {
	wrapped := device{dev, d}
	return &wrapped, nil
}
@@ -130,6 +144,12 @@ func (d *device) VisitMigProfiles(visit func(MigProfile) error) error {
for i := 0; i < nvml.GPU_INSTANCE_PROFILE_COUNT; i++ {
giProfileInfo, ret := d.GetGpuInstanceProfileInfo(i)
if ret == nvml.ERROR_NOT_SUPPORTED {
continue
}
if ret == nvml.ERROR_INVALID_ARGUMENT {
continue
}
if ret != nvml.SUCCESS {
return fmt.Errorf("error getting GPU Instance profile info: %v", ret)
}
@@ -177,6 +197,20 @@ func (d *device) GetMigProfiles() ([]MigProfile, error) {
return profiles, nil
}
// isSkipped reports whether the device's name, as returned by GetName, is
// present in the owning library's skippedDevices set. An error is returned
// if the name cannot be retrieved.
func (d *device) isSkipped() (bool, error) {
	name, ret := d.GetName()
	if ret != nvml.SUCCESS {
		return false, fmt.Errorf("error getting device name: %v", ret)
	}

	// Membership in the set is the answer; a lookup on a nil map yields false.
	_, skip := d.lib.skippedDevices[name]
	return skip, nil
}
// VisitDevices visits each top-level device and invokes a callback function for it
func (d *devicelib) VisitDevices(visit func(int, Device) error) error {
count, ret := d.nvml.DeviceGetCount()
@@ -189,10 +223,19 @@ func (d *devicelib) VisitDevices(visit func(int, Device) error) error {
if ret != nvml.SUCCESS {
return fmt.Errorf("error getting device handle for index '%v': %v", i, ret)
}
dev, err := d.NewDevice(device)
dev, err := d.newDevice(device)
if err != nil {
return fmt.Errorf("error creating new device wrapper: %v", err)
}
isSkipped, err := dev.isSkipped()
if err != nil {
return fmt.Errorf("error checking whether device is skipped: %v", err)
}
if isSkipped {
continue
}
err = visit(i, dev)
if err != nil {
return fmt.Errorf("error visiting device: %v", err)

View File

@@ -48,6 +48,15 @@ func (d *devicelib) NewMigDevice(handle nvml.Device) (MigDevice, error) {
return &migdevice{handle, d, nil}, nil
}
// NewMigDeviceByUUID looks up the NVML device handle for the given UUID and
// passes it to NewMigDevice. An error is returned if NVML does not report
// SUCCESS for the handle lookup.
func (d *devicelib) NewMigDeviceByUUID(uuid string) (MigDevice, error) {
	handle, ret := d.nvml.DeviceGetHandleByUUID(uuid)
	if ret != nvml.SUCCESS {
		err := fmt.Errorf("error getting device handle for uuid '%v': %v", uuid, ret)
		return nil, err
	}
	return d.NewMigDevice(handle)
}
// GetProfile returns the MIG profile associated with a MIG device
func (m *migdevice) GetProfile() (MigProfile, error) {
if m.profile != nil {
@@ -101,6 +110,12 @@ func (m *migdevice) GetProfile() (MigProfile, error) {
for i := 0; i < nvml.GPU_INSTANCE_PROFILE_COUNT; i++ {
giProfileInfo, ret := parent.GetGpuInstanceProfileInfo(i)
if ret == nvml.ERROR_NOT_SUPPORTED {
continue
}
if ret == nvml.ERROR_INVALID_ARGUMENT {
continue
}
if ret != nvml.SUCCESS {
return nil, fmt.Errorf("error getting GPU Instance profile info: %v", ret)
}
@@ -112,6 +127,12 @@ func (m *migdevice) GetProfile() (MigProfile, error) {
for j := 0; j < nvml.COMPUTE_INSTANCE_PROFILE_COUNT; j++ {
for k := 0; k < nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT; k++ {
ciProfileInfo, ret := gi.GetComputeInstanceProfileInfo(j, k)
if ret == nvml.ERROR_NOT_SUPPORTED {
continue
}
if ret == nvml.ERROR_INVALID_ARGUMENT {
continue
}
if ret != nvml.SUCCESS {
return nil, fmt.Errorf("error getting Compute Instance profile info: %v", ret)