Merge branch 'fix-platform-detection' into 'main'

Add UsesNVGPUModule info function See merge request nvidia/container-toolkit/container-toolkit!473
2025-06-26 18:18:24 +00:00 · 2023-08-28 15:58:41 +00:00 · 2023-08-28 15:58:41 +00:00 · 2a3afdd5d9
commit 2a3afdd5d9
parent 72c56567fe 1dc028cdf2
6 changed files with 485 additions and 61 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -8,6 +8,7 @@
 * Added `--library-search-path` option to `nvidia-ctk cdi generate` command in `csv` mode. This allows folders where
  libraries are located to be specified explicitly.
 * Updated go-nvlib to support devices which are not present in the PCI device database. This allows the creation of dev/char symlinks on systems with such devices installed.
+* Added `UsesNVGPUModule` info function for more robust platform detection. This is required on Tegra-based systems where libnvidia-ml.so is also supported.

 * [toolkit-container] Set `NVIDIA_VISIBLE_DEVICES=void` to prevent injection of NVIDIA devices and drivers into the NVIDIA Container Toolkit container.

--- a/internal/info/additional_info.go
+++ b/internal/info/additional_info.go
@ -0,0 +1,81 @@
+/**
+# Copyright (c) NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package info
+
+import (
+	"fmt"
+	"strings"
+
+	"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
+	"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/info"
+	"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
+)
+
+// additionalInfo allows for the info.Interface to be extened to implement the infoInterface.
+type additionalInfo struct {
+	info.Interface
+	nvmllib   nvml.Interface
+	devicelib device.Interface
+}
+
+// UsesNVGPUModule checks whether the nvgpu module is used.
+// We use the device name to signal this, since devices that use the nvgpu module have their device
+// names as:
+//
+//	GPU 0: Orin (nvgpu) (UUID: 54d0709b-558d-5a59-9c65-0c5fc14a21a4)
+//
+// This function returns true if ALL devices use the nvgpu module.
+func (i additionalInfo) UsesNVGPUModule() (uses bool, reason string) {
+	// We ensure that this function never panics
+	defer func() {
+		if err := recover(); err != nil {
+			uses = false
+			reason = fmt.Sprintf("panic: %v", err)
+		}
+	}()
+
+	ret := i.nvmllib.Init()
+	if ret != nvml.SUCCESS {
+		return false, fmt.Sprintf("failed to initialize nvml: %v", ret)
+	}
+	defer i.nvmllib.Shutdown()
+
+	var names []string
+
+	err := i.devicelib.VisitDevices(func(i int, d device.Device) error {
+		name, ret := d.GetName()
+		if ret != nvml.SUCCESS {
+			return fmt.Errorf("device %v: %v", i, ret)
+		}
+		names = append(names, name)
+		return nil
+	})
+	if err != nil {
+		return false, fmt.Sprintf("failed to get device names: %v", err)
+	}
+
+	if len(names) == 0 {
+		return false, "no devices found"
+	}
+
+	for _, name := range names {
+		if !strings.Contains(name, "(nvgpu)") {
+			return false, fmt.Sprintf("device %q does not use nvgpu module", name)
+		}
+	}
+	return true, "all devices use nvgpu module"
+}
--- a/internal/info/additional_info_test.go
+++ b/internal/info/additional_info_test.go
@ -0,0 +1,252 @@
+/**
+# Copyright (c) NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package info
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/require"
+	"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
+	"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
+)
+
+func TestUsesNVGPUModule(t *testing.T) {
+	testCases := []struct {
+		description string
+		nvmllib     nvml.Interface
+		expected    bool
+	}{
+		{
+			description: "init failure returns false",
+			nvmllib: &nvml.InterfaceMock{
+				InitFunc: func() nvml.Return {
+					return nvml.ERROR_LIBRARY_NOT_FOUND
+				},
+			},
+			expected: false,
+		},
+		{
+			description: "no devices returns false",
+			nvmllib: &nvml.InterfaceMock{
+				InitFunc: func() nvml.Return {
+					return nvml.SUCCESS
+				},
+				ShutdownFunc: func() nvml.Return {
+					return nvml.SUCCESS
+				},
+				DeviceGetCountFunc: func() (int, nvml.Return) {
+					return 0, nvml.SUCCESS
+				},
+			},
+			expected: false,
+		},
+		{
+			description: "DeviceGetCount error returns false",
+			nvmllib: &nvml.InterfaceMock{
+				InitFunc: func() nvml.Return {
+					return nvml.SUCCESS
+				},
+				ShutdownFunc: func() nvml.Return {
+					return nvml.SUCCESS
+				},
+				DeviceGetCountFunc: func() (int, nvml.Return) {
+					return 0, nvml.ERROR_UNKNOWN
+				},
+			},
+			expected: false,
+		},
+		{
+			description: "Failure to get device name returns false",
+			nvmllib: &nvml.InterfaceMock{
+				InitFunc: func() nvml.Return {
+					return nvml.SUCCESS
+				},
+				ShutdownFunc: func() nvml.Return {
+					return nvml.SUCCESS
+				},
+				DeviceGetCountFunc: func() (int, nvml.Return) {
+					return 1, nvml.SUCCESS
+				},
+				DeviceGetHandleByIndexFunc: func(index int) (nvml.Device, nvml.Return) {
+					device := &nvml.DeviceMock{
+						GetNameFunc: func() (string, nvml.Return) {
+							return "", nvml.ERROR_UNKNOWN
+						},
+					}
+					return device, nvml.SUCCESS
+				},
+			},
+			expected: false,
+		},
+		{
+			description: "nested panic returns false",
+			nvmllib: &nvml.InterfaceMock{
+				InitFunc: func() nvml.Return {
+					return nvml.SUCCESS
+				},
+				ShutdownFunc: func() nvml.Return {
+					return nvml.SUCCESS
+				},
+				DeviceGetCountFunc: func() (int, nvml.Return) {
+					return 1, nvml.SUCCESS
+				},
+				DeviceGetHandleByIndexFunc: func(index int) (nvml.Device, nvml.Return) {
+					device := &nvml.DeviceMock{
+						GetNameFunc: func() (string, nvml.Return) {
+							panic("deep panic")
+						},
+					}
+					return device, nvml.SUCCESS
+				},
+			},
+			expected: false,
+		},
+		{
+			description: "Single device name with no nvgpu",
+			nvmllib: &nvml.InterfaceMock{
+				InitFunc: func() nvml.Return {
+					return nvml.SUCCESS
+				},
+				ShutdownFunc: func() nvml.Return {
+					return nvml.SUCCESS
+				},
+				DeviceGetCountFunc: func() (int, nvml.Return) {
+					return 1, nvml.SUCCESS
+				},
+				DeviceGetHandleByIndexFunc: func(index int) (nvml.Device, nvml.Return) {
+					device := &nvml.DeviceMock{
+						GetNameFunc: func() (string, nvml.Return) {
+							return "NVIDIA A100-SXM4-40GB", nvml.SUCCESS
+						},
+					}
+					return device, nvml.SUCCESS
+				},
+			},
+			expected: false,
+		},
+		{
+			description: "Single device name with nvgpu",
+			nvmllib: &nvml.InterfaceMock{
+				InitFunc: func() nvml.Return {
+					return nvml.SUCCESS
+				},
+				ShutdownFunc: func() nvml.Return {
+					return nvml.SUCCESS
+				},
+				DeviceGetCountFunc: func() (int, nvml.Return) {
+					return 1, nvml.SUCCESS
+				},
+				DeviceGetHandleByIndexFunc: func(index int) (nvml.Device, nvml.Return) {
+					device := &nvml.DeviceMock{
+						GetNameFunc: func() (string, nvml.Return) {
+							return "Orin (nvgpu)", nvml.SUCCESS
+						},
+					}
+					return device, nvml.SUCCESS
+				},
+			},
+			expected: true,
+		},
+		{
+			description: "Multiple device names with no nvgpu",
+			nvmllib: &nvml.InterfaceMock{
+				InitFunc: func() nvml.Return {
+					return nvml.SUCCESS
+				},
+				ShutdownFunc: func() nvml.Return {
+					return nvml.SUCCESS
+				},
+				DeviceGetCountFunc: func() (int, nvml.Return) {
+					return 2, nvml.SUCCESS
+				},
+				DeviceGetHandleByIndexFunc: func(index int) (nvml.Device, nvml.Return) {
+					device := &nvml.DeviceMock{
+						GetNameFunc: func() (string, nvml.Return) {
+							return "NVIDIA A100-SXM4-40GB", nvml.SUCCESS
+						},
+					}
+					return device, nvml.SUCCESS
+				},
+			},
+			expected: false,
+		},
+		{
+			description: "Multiple device names with nvgpu",
+			nvmllib: &nvml.InterfaceMock{
+				InitFunc: func() nvml.Return {
+					return nvml.SUCCESS
+				},
+				ShutdownFunc: func() nvml.Return {
+					return nvml.SUCCESS
+				},
+				DeviceGetCountFunc: func() (int, nvml.Return) {
+					return 2, nvml.SUCCESS
+				},
+				DeviceGetHandleByIndexFunc: func(index int) (nvml.Device, nvml.Return) {
+					device := &nvml.DeviceMock{
+						GetNameFunc: func() (string, nvml.Return) {
+							return "Orin (nvgpu)", nvml.SUCCESS
+						},
+					}
+					return device, nvml.SUCCESS
+				},
+			},
+			expected: true,
+		},
+		{
+			description: "Mixed device names",
+			nvmllib: &nvml.InterfaceMock{
+				InitFunc: func() nvml.Return {
+					return nvml.SUCCESS
+				},
+				ShutdownFunc: func() nvml.Return {
+					return nvml.SUCCESS
+				},
+				DeviceGetCountFunc: func() (int, nvml.Return) {
+					return 2, nvml.SUCCESS
+				},
+				DeviceGetHandleByIndexFunc: func(index int) (nvml.Device, nvml.Return) {
+					var deviceName string
+					if index == 0 {
+						deviceName = "NVIDIA A100-SXM4-40GB"
+					} else {
+						deviceName = "Orin (nvgpu)"
+					}
+					device := &nvml.DeviceMock{
+						GetNameFunc: func() (string, nvml.Return) {
+							return deviceName, nvml.SUCCESS
+						},
+					}
+					return device, nvml.SUCCESS
+				},
+			},
+			expected: false,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.description, func(t *testing.T) {
+			sut := additionalInfo{
+				nvmllib:   tc.nvmllib,
+				devicelib: device.New(device.WithNvml(tc.nvmllib)),
+			}
+
+			flag, _ := sut.UsesNVGPUModule()
+			require.Equal(t, tc.expected, flag)
+		})
+	}
+}
--- a/internal/info/auto.go
+++ b/internal/info/auto.go
@ -20,25 +20,42 @@ import (
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
 	cdi "github.com/container-orchestrated-devices/container-device-interface/pkg/parser"
+	"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
 	"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/info"
+	"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
 )

 // infoInterface provides an alias for mocking.
 //
 //go:generate moq -stub -out info-interface_mock.go . infoInterface
-type infoInterface info.Interface
+type infoInterface interface {
+	info.Interface
+	// UsesNVGPUModule indicates whether the system is using the nvgpu kernel module
+	UsesNVGPUModule() (bool, string)
+}

 type resolver struct {
 	logger logger.Interface
-	info   info.Interface
+	info   infoInterface
 }

 // ResolveAutoMode determines the correct mode for the platform if set to "auto"
 func ResolveAutoMode(logger logger.Interface, mode string, image image.CUDA) (rmode string) {
 	nvinfo := info.New()
+	nvmllib := nvml.New()
+	devicelib := device.New(
+		device.WithNvml(nvmllib),
+	)
+
+	info := additionalInfo{
+		Interface: nvinfo,
+		nvmllib:   nvmllib,
+		devicelib: devicelib,
+	}
+
 	r := resolver{
 		logger: logger,
-		info:   nvinfo,
+		info:   info,
 	}
 	return r.resolveMode(mode, image)
 }
@ -62,7 +79,10 @@ func (r resolver) resolveMode(mode string, image image.CUDA) (rmode string) {
 	hasNVML, reason := r.info.HasNvml()
 	r.logger.Debugf("Has NVML? %v: %v", hasNVML, reason)

-	if isTegra && !hasNVML {
+	usesNVGPUModule, reason := r.info.UsesNVGPUModule()
+	r.logger.Debugf("Uses nvgpu kernel module? %v: %v", usesNVGPUModule, reason)
+
+	if (isTegra && !hasNVML) || usesNVGPUModule {
 		return "csv"
 	}

--- a/internal/info/auto_test.go
+++ b/internal/info/auto_test.go
@ -22,7 +22,6 @@ import (
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
 	testlog "github.com/sirupsen/logrus/hooks/test"
 	"github.com/stretchr/testify/require"
-	"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/info"
 )

 func TestResolveAutoMode(t *testing.T) {
@ -32,7 +31,7 @@ func TestResolveAutoMode(t *testing.T) {
 		description  string
 		mode         string
 		expectedMode string
-		info         info.Interface
+		info         map[string]bool
 		image        image.CUDA
 	}{
 		{
@ -41,54 +40,78 @@ func TestResolveAutoMode(t *testing.T) {
 			expectedMode: "not-auto",
 		},
 		{
-			description: "nvml non-tegra resolves to legacy",
+			description:  "no info defaults to legacy",
+			mode:         "auto",
+			info:         map[string]bool{},
+			expectedMode: "legacy",
+		},
+		{
+			description: "non-nvml, non-tegra, nvgpu resolves to csv",
 			mode:        "auto",
-			info: &infoInterfaceMock{
-				HasNvmlFunc: func() (bool, string) {
-					return true, "nvml"
-				},
-				IsTegraSystemFunc: func() (bool, string) {
-					return false, "tegra"
-				},
+			info: map[string]bool{
+				"nvml":  false,
+				"tegra": false,
+				"nvgpu": true,
+			},
+			expectedMode: "csv",
+		},
+		{
+			description: "non-nvml, tegra, non-nvgpu resolves to csv",
+			mode:        "auto",
+			info: map[string]bool{
+				"nvml":  false,
+				"tegra": true,
+				"nvgpu": false,
+			},
+			expectedMode: "csv",
+		},
+		{
+			description: "non-nvml, tegra, nvgpu resolves to csv",
+			mode:        "auto",
+			info: map[string]bool{
+				"nvml":  false,
+				"tegra": true,
+				"nvgpu": true,
+			},
+			expectedMode: "csv",
+		},
+		{
+			description: "nvml, non-tegra, non-nvgpu resolves to legacy",
+			mode:        "auto",
+			info: map[string]bool{
+				"nvml":  true,
+				"tegra": false,
+				"nvgpu": false,
 			},
 			expectedMode: "legacy",
 		},
 		{
-			description: "non-nvml non-tegra resolves to legacy",
+			description: "nvml, non-tegra, nvgpu resolves to csv",
 			mode:        "auto",
-			info: &infoInterfaceMock{
-				HasNvmlFunc: func() (bool, string) {
-					return false, "nvml"
-				},
-				IsTegraSystemFunc: func() (bool, string) {
-					return false, "tegra"
-				},
+			info: map[string]bool{
+				"nvml":  true,
+				"tegra": false,
+				"nvgpu": true,
+			},
+			expectedMode: "csv",
+		},
+		{
+			description: "nvml, tegra, non-nvgpu resolves to legacy",
+			mode:        "auto",
+			info: map[string]bool{
+				"nvml":  true,
+				"tegra": true,
+				"nvgpu": false,
 			},
 			expectedMode: "legacy",
 		},
 		{
-			description: "nvml tegra resolves to legacy",
+			description: "nvml, tegra, nvgpu resolves to csv",
 			mode:        "auto",
-			info: &infoInterfaceMock{
-				HasNvmlFunc: func() (bool, string) {
-					return true, "nvml"
-				},
-				IsTegraSystemFunc: func() (bool, string) {
-					return true, "tegra"
-				},
-			},
-			expectedMode: "legacy",
-		},
-		{
-			description: "non-nvml tegra resolves to csv",
-			mode:        "auto",
-			info: &infoInterfaceMock{
-				HasNvmlFunc: func() (bool, string) {
-					return false, "nvml"
-				},
-				IsTegraSystemFunc: func() (bool, string) {
-					return true, "tegra"
-				},
+			info: map[string]bool{
+				"nvml":  true,
+				"tegra": true,
+				"nvgpu": true,
 			},
 			expectedMode: "csv",
 		},
@ -114,13 +137,10 @@ func TestResolveAutoMode(t *testing.T) {
 			image: image.CUDA{
 				"NVIDIA_VISIBLE_DEVICES": "nvidia.com/gpu=0,0",
 			},
-			info: &infoInterfaceMock{
-				HasNvmlFunc: func() (bool, string) {
-					return true, "nvml"
-				},
-				IsTegraSystemFunc: func() (bool, string) {
-					return true, "tegra"
-				},
+			info: map[string]bool{
+				"nvml":  true,
+				"tegra": false,
+				"nvgpu": false,
 			},
 			expectedMode: "legacy",
 		},
@ -130,13 +150,10 @@ func TestResolveAutoMode(t *testing.T) {
 			image: image.CUDA{
 				"NVIDIA_VISIBLE_DEVICES": "nvidia.com/gpu=0,0",
 			},
-			info: &infoInterfaceMock{
-				HasNvmlFunc: func() (bool, string) {
-					return false, "nvml"
-				},
-				IsTegraSystemFunc: func() (bool, string) {
-					return true, "tegra"
-				},
+			info: map[string]bool{
+				"nvml":  false,
+				"tegra": true,
+				"nvgpu": false,
 			},
 			expectedMode: "csv",
 		},
@ -144,9 +161,21 @@ func TestResolveAutoMode(t *testing.T) {

 	for _, tc := range testCases {
 		t.Run(tc.description, func(t *testing.T) {
+			info := &infoInterfaceMock{
+				HasNvmlFunc: func() (bool, string) {
+					return tc.info["nvml"], "nvml"
+				},
+				IsTegraSystemFunc: func() (bool, string) {
+					return tc.info["tegra"], "tegra"
+				},
+				UsesNVGPUModuleFunc: func() (bool, string) {
+					return tc.info["nvgpu"], "nvgpu"
+				},
+			}
+
 			r := resolver{
 				logger: logger,
-				info:   tc.info,
+				info:   info,
 			}
 			mode := r.resolveMode(tc.mode, tc.image)
 			require.EqualValues(t, tc.expectedMode, mode)
--- a/internal/info/info-interface_mock.go
+++ b/internal/info/info-interface_mock.go
@ -26,6 +26,9 @@ var _ infoInterface = &infoInterfaceMock{}
 //			IsTegraSystemFunc: func() (bool, string) {
 //				panic("mock out the IsTegraSystem method")
 //			},
+//			UsesNVGPUModuleFunc: func() (bool, string) {
+//				panic("mock out the UsesNVGPUModule method")
+//			},
 //		}
 //
 //		// use mockedinfoInterface in code that requires infoInterface
@ -42,6 +45,9 @@ type infoInterfaceMock struct {
 	// IsTegraSystemFunc mocks the IsTegraSystem method.
 	IsTegraSystemFunc func() (bool, string)

+	// UsesNVGPUModuleFunc mocks the UsesNVGPUModule method.
+	UsesNVGPUModuleFunc func() (bool, string)
+
 	// calls tracks calls to the methods.
 	calls struct {
 		// HasDXCore holds details about calls to the HasDXCore method.
@ -53,10 +59,14 @@ type infoInterfaceMock struct {
 		// IsTegraSystem holds details about calls to the IsTegraSystem method.
 		IsTegraSystem []struct {
 		}
+		// UsesNVGPUModule holds details about calls to the UsesNVGPUModule method.
+		UsesNVGPUModule []struct {
+		}
 	}
-	lockHasDXCore     sync.RWMutex
-	lockHasNvml       sync.RWMutex
-	lockIsTegraSystem sync.RWMutex
+	lockHasDXCore       sync.RWMutex
+	lockHasNvml         sync.RWMutex
+	lockIsTegraSystem   sync.RWMutex
+	lockUsesNVGPUModule sync.RWMutex
 }

 // HasDXCore calls HasDXCoreFunc.
@ -151,3 +161,34 @@ func (mock *infoInterfaceMock) IsTegraSystemCalls() []struct {
 	mock.lockIsTegraSystem.RUnlock()
 	return calls
 }
+
+// UsesNVGPUModule calls UsesNVGPUModuleFunc.
+func (mock *infoInterfaceMock) UsesNVGPUModule() (bool, string) {
+	callInfo := struct {
+	}{}
+	mock.lockUsesNVGPUModule.Lock()
+	mock.calls.UsesNVGPUModule = append(mock.calls.UsesNVGPUModule, callInfo)
+	mock.lockUsesNVGPUModule.Unlock()
+	if mock.UsesNVGPUModuleFunc == nil {
+		var (
+			bOut bool
+			sOut string
+		)
+		return bOut, sOut
+	}
+	return mock.UsesNVGPUModuleFunc()
+}
+
+// UsesNVGPUModuleCalls gets all the calls that were made to UsesNVGPUModule.
+// Check the length with:
+//
+//	len(mockedinfoInterface.UsesNVGPUModuleCalls())
+func (mock *infoInterfaceMock) UsesNVGPUModuleCalls() []struct {
+} {
+	var calls []struct {
+	}
+	mock.lockUsesNVGPUModule.RLock()
+	calls = mock.calls.UsesNVGPUModule
+	mock.lockUsesNVGPUModule.RUnlock()
+	return calls
+}