Fix mode detection on Thor-based systems

This change updates github.com/NVIDIA/go-nvlib from v0.7.1 to v0.7.2
to allow Thor systems to be detected as Tegra-based. This allows
automatic mode detection to work on these systems.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
Evan Lezar
2025-05-13 21:25:11 +02:00
parent adb5e6719d
commit a4dc28bb3f
13 changed files with 236 additions and 93 deletions

View File

@@ -63,7 +63,7 @@ func (m *migdevice) GetProfile() (MigProfile, error) {
return m.profile, nil
}
parent, ret := m.Device.GetDeviceHandleFromMigDeviceHandle()
parent, ret := m.GetDeviceHandleFromMigDeviceHandle()
if ret != nvml.SUCCESS {
return nil, fmt.Errorf("error getting parent device handle: %v", ret)
}
@@ -73,17 +73,17 @@ func (m *migdevice) GetProfile() (MigProfile, error) {
return nil, fmt.Errorf("error getting parent memory info: %v", ret)
}
attributes, ret := m.Device.GetAttributes()
attributes, ret := m.GetAttributes()
if ret != nvml.SUCCESS {
return nil, fmt.Errorf("error getting MIG device attributes: %v", ret)
}
giID, ret := m.Device.GetGpuInstanceId()
giID, ret := m.GetGpuInstanceId()
if ret != nvml.SUCCESS {
return nil, fmt.Errorf("error getting MIG device GPU Instance ID: %v", ret)
}
ciID, ret := m.Device.GetComputeInstanceId()
ciID, ret := m.GetComputeInstanceId()
if ret != nvml.SUCCESS {
return nil, fmt.Errorf("error getting MIG device Compute Instance ID: %v", ret)
}

View File

@@ -30,12 +30,14 @@ type PlatformResolver interface {
// PropertyExtractor provides a set of functions to query capabilities of the
// system.
//
//go:generate moq -rm -out property-extractor_mock.go . PropertyExtractor
//go:generate moq -rm -fmt=goimports -out property-extractor_mock.go . PropertyExtractor
type PropertyExtractor interface {
// HasDXCore returns a boolean result and a string describing the reason for it.
// NOTE(review): presumably reports DXCore (WSL) availability — confirm against the implementation.
HasDXCore() (bool, string)
// HasNvml returns whether the system is NVML-based and a string describing
// the reason for the result.
HasNvml() (bool, string)
// HasTegraFiles returns a boolean result and a string describing the reason for it.
// NOTE(review): presumably reports the presence of Tegra-specific files — confirm against the implementation.
HasTegraFiles() (bool, string)
// Deprecated: Use HasTegraFiles instead.
IsTegraSystem() (bool, string)
// Deprecated: Use HasOnlyIntegratedGPUs instead.
UsesOnlyNVGPUModule() (bool, string)
// HasOnlyIntegratedGPUs returns whether ALL GPUs are detected as integrated
// GPUs (iGPUs) and a string describing the reason for the result.
HasOnlyIntegratedGPUs() (bool, string)
}

View File

@@ -90,16 +90,24 @@ func (i *propertyExtractor) HasTegraFiles() (bool, string) {
}
// UsesOnlyNVGPUModule checks whether only the nvgpu module is used.
// This kernel module is used on Tegra-based systems when using the iGPU.
// Since some of these systems also support NVML, we use the device name
// reported by NVML to determine whether the system is an iGPU system.
//
// Devices that use the nvgpu module have their device names as:
// Deprecated: UsesOnlyNVGPUModule is deprecated, use HasOnlyIntegratedGPUs instead.
func (i *propertyExtractor) UsesOnlyNVGPUModule() (uses bool, reason string) {
	// Forward to the replacement API so that the deprecated and the current
	// entry points can never disagree.
	uses, reason = i.HasOnlyIntegratedGPUs()
	return uses, reason
}
// HasOnlyIntegratedGPUs checks whether all GPUs are iGPUs that use NVML.
//
// As of Orin-based systems iGPUs also support limited NVML queries.
// In the absence of a robust API, we rely on heuristics to make this decision.
//
// The following device names are checked:
//
// GPU 0: Orin (nvgpu) (UUID: 54d0709b-558d-5a59-9c65-0c5fc14a21a4)
// GPU 0: NVIDIA Thor (UUID: 54d0709b-558d-5a59-9c65-0c5fc14a21a4)
//
// This function returns true if ALL devices use the nvgpu module.
func (i *propertyExtractor) UsesOnlyNVGPUModule() (uses bool, reason string) {
// This function returns true if ALL devices are detected as iGPUs.
func (i *propertyExtractor) HasOnlyIntegratedGPUs() (uses bool, reason string) {
// We ensure that this function never panics
defer func() {
if err := recover(); err != nil {
@@ -135,9 +143,19 @@ func (i *propertyExtractor) UsesOnlyNVGPUModule() (uses bool, reason string) {
}
for _, name := range names {
if !strings.Contains(name, "(nvgpu)") {
if !isIntegratedGPUName(name) {
return false, fmt.Sprintf("device %q does not use nvgpu module", name)
}
}
return true, "all devices use nvgpu module"
}
// isIntegratedGPUName reports whether the supplied device name contains one of
// the substrings used to identify an integrated GPU: "(nvgpu)" or
// "NVIDIA Thor". The match is case-sensitive.
func isIntegratedGPUName(name string) bool {
	for _, marker := range []string{"(nvgpu)", "NVIDIA Thor"} {
		if strings.Contains(name, marker) {
			return true
		}
	}
	return false
}

View File

@@ -23,6 +23,9 @@ var _ PropertyExtractor = &PropertyExtractorMock{}
// HasNvmlFunc: func() (bool, string) {
// panic("mock out the HasNvml method")
// },
// HasOnlyIntegratedGPUsFunc: func() (bool, string) {
// panic("mock out the HasOnlyIntegratedGPUs method")
// },
// HasTegraFilesFunc: func() (bool, string) {
// panic("mock out the HasTegraFiles method")
// },
@@ -45,6 +48,9 @@ type PropertyExtractorMock struct {
// HasNvmlFunc mocks the HasNvml method.
HasNvmlFunc func() (bool, string)
// HasOnlyIntegratedGPUsFunc mocks the HasOnlyIntegratedGPUs method.
HasOnlyIntegratedGPUsFunc func() (bool, string)
// HasTegraFilesFunc mocks the HasTegraFiles method.
HasTegraFilesFunc func() (bool, string)
@@ -62,6 +68,9 @@ type PropertyExtractorMock struct {
// HasNvml holds details about calls to the HasNvml method.
HasNvml []struct {
}
// HasOnlyIntegratedGPUs holds details about calls to the HasOnlyIntegratedGPUs method.
HasOnlyIntegratedGPUs []struct {
}
// HasTegraFiles holds details about calls to the HasTegraFiles method.
HasTegraFiles []struct {
}
@@ -72,11 +81,12 @@ type PropertyExtractorMock struct {
UsesOnlyNVGPUModule []struct {
}
}
lockHasDXCore sync.RWMutex
lockHasNvml sync.RWMutex
lockHasTegraFiles sync.RWMutex
lockIsTegraSystem sync.RWMutex
lockUsesOnlyNVGPUModule sync.RWMutex
lockHasDXCore sync.RWMutex
lockHasNvml sync.RWMutex
lockHasOnlyIntegratedGPUs sync.RWMutex
lockHasTegraFiles sync.RWMutex
lockIsTegraSystem sync.RWMutex
lockUsesOnlyNVGPUModule sync.RWMutex
}
// HasDXCore calls HasDXCoreFunc.
@@ -133,6 +143,33 @@ func (mock *PropertyExtractorMock) HasNvmlCalls() []struct {
return calls
}
// HasOnlyIntegratedGPUs records the invocation and then forwards it to
// HasOnlyIntegratedGPUsFunc. It panics if HasOnlyIntegratedGPUsFunc is nil.
func (mock *PropertyExtractorMock) HasOnlyIntegratedGPUs() (bool, string) {
	if mock.HasOnlyIntegratedGPUsFunc == nil {
		panic("PropertyExtractorMock.HasOnlyIntegratedGPUsFunc: method is nil but PropertyExtractor.HasOnlyIntegratedGPUs was just called")
	}

	// The call is recorded under the write lock, which is released before the
	// user-supplied function runs.
	mock.lockHasOnlyIntegratedGPUs.Lock()
	mock.calls.HasOnlyIntegratedGPUs = append(mock.calls.HasOnlyIntegratedGPUs, struct{}{})
	mock.lockHasOnlyIntegratedGPUs.Unlock()

	return mock.HasOnlyIntegratedGPUsFunc()
}
// HasOnlyIntegratedGPUsCalls returns the calls recorded so far for
// HasOnlyIntegratedGPUs.
// Check the length with:
//
//	len(mockedPropertyExtractor.HasOnlyIntegratedGPUsCalls())
func (mock *PropertyExtractorMock) HasOnlyIntegratedGPUsCalls() []struct {
} {
	// The recorded slice is read under the read lock; the deferred unlock runs
	// after the return value has been evaluated.
	mock.lockHasOnlyIntegratedGPUs.RLock()
	defer mock.lockHasOnlyIntegratedGPUs.RUnlock()
	return mock.calls.HasOnlyIntegratedGPUs
}
// HasTegraFiles calls HasTegraFilesFunc.
func (mock *PropertyExtractorMock) HasTegraFiles() (bool, string) {
if mock.HasTegraFilesFunc == nil {

View File

@@ -48,13 +48,13 @@ func (p platformResolver) ResolvePlatform() Platform {
hasNVML, reason := p.propertyExtractor.HasNvml()
p.logger.Debugf("Is NVML-based system? %v: %v", hasNVML, reason)
usesOnlyNVGPUModule, reason := p.propertyExtractor.UsesOnlyNVGPUModule()
p.logger.Debugf("Uses nvgpu kernel module? %v: %v", usesOnlyNVGPUModule, reason)
hasOnlyIntegratedGPUs, reason := p.propertyExtractor.HasOnlyIntegratedGPUs()
p.logger.Debugf("Has only integrated GPUs? %v: %v", hasOnlyIntegratedGPUs, reason)
switch {
case hasDXCore:
return PlatformWSL
case (hasTegraFiles && !hasNVML), usesOnlyNVGPUModule:
case (hasTegraFiles && !hasNVML), hasOnlyIntegratedGPUs:
return PlatformTegra
case hasNVML:
return PlatformNVML