diff --git a/CHANGELOG.md b/CHANGELOG.md index 21aeedec..a710b952 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ * Added `--library-search-path` option to `nvidia-ctk cdi generate` command in `csv` mode. This allows folders where libraries are located to be specified explicitly. * Updated go-nvlib to support devices which are not present in the PCI device database. This allows the creation of dev/char symlinks on systems with such devices installed. +* Added `UsesNVGPUModule` info function for more robust platform detection. This is required on Tegra-based systems where libnvidia-ml.so is also supported. * [toolkit-container] Set `NVIDIA_VISIBLE_DEVICES=void` to prevent injection of NVIDIA devices and drivers into the NVIDIA Container Toolkit container. diff --git a/internal/info/additional_info.go b/internal/info/additional_info.go new file mode 100644 index 00000000..fbb9ed23 --- /dev/null +++ b/internal/info/additional_info.go @@ -0,0 +1,81 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package info + +import ( + "fmt" + "strings" + + "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device" + "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/info" + "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml" +) + +// additionalInfo allows for the info.Interface to be extened to implement the infoInterface. +type additionalInfo struct { + info.Interface + nvmllib nvml.Interface + devicelib device.Interface +} + +// UsesNVGPUModule checks whether the nvgpu module is used. +// We use the device name to signal this, since devices that use the nvgpu module have their device +// names as: +// +// GPU 0: Orin (nvgpu) (UUID: 54d0709b-558d-5a59-9c65-0c5fc14a21a4) +// +// This function returns true if ALL devices use the nvgpu module. +func (i additionalInfo) UsesNVGPUModule() (uses bool, reason string) { + // We ensure that this function never panics + defer func() { + if err := recover(); err != nil { + uses = false + reason = fmt.Sprintf("panic: %v", err) + } + }() + + ret := i.nvmllib.Init() + if ret != nvml.SUCCESS { + return false, fmt.Sprintf("failed to initialize nvml: %v", ret) + } + defer i.nvmllib.Shutdown() + + var names []string + + err := i.devicelib.VisitDevices(func(i int, d device.Device) error { + name, ret := d.GetName() + if ret != nvml.SUCCESS { + return fmt.Errorf("device %v: %v", i, ret) + } + names = append(names, name) + return nil + }) + if err != nil { + return false, fmt.Sprintf("failed to get device names: %v", err) + } + + if len(names) == 0 { + return false, "no devices found" + } + + for _, name := range names { + if !strings.Contains(name, "(nvgpu)") { + return false, fmt.Sprintf("device %q does not use nvgpu module", name) + } + } + return true, "all devices use nvgpu module" +} diff --git a/internal/info/additional_info_test.go b/internal/info/additional_info_test.go new file mode 100644 index 00000000..7281f324 --- /dev/null +++ b/internal/info/additional_info_test.go @@ -0,0 +1,252 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package info + +import ( + "testing" + + "github.com/stretchr/testify/require" + "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device" + "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml" +) + +func TestUsesNVGPUModule(t *testing.T) { + testCases := []struct { + description string + nvmllib nvml.Interface + expected bool + }{ + { + description: "init failure returns false", + nvmllib: &nvml.InterfaceMock{ + InitFunc: func() nvml.Return { + return nvml.ERROR_LIBRARY_NOT_FOUND + }, + }, + expected: false, + }, + { + description: "no devices returns false", + nvmllib: &nvml.InterfaceMock{ + InitFunc: func() nvml.Return { + return nvml.SUCCESS + }, + ShutdownFunc: func() nvml.Return { + return nvml.SUCCESS + }, + DeviceGetCountFunc: func() (int, nvml.Return) { + return 0, nvml.SUCCESS + }, + }, + expected: false, + }, + { + description: "DeviceGetCount error returns false", + nvmllib: &nvml.InterfaceMock{ + InitFunc: func() nvml.Return { + return nvml.SUCCESS + }, + ShutdownFunc: func() nvml.Return { + return nvml.SUCCESS + }, + DeviceGetCountFunc: func() (int, nvml.Return) { + return 0, nvml.ERROR_UNKNOWN + }, + }, + expected: false, + }, + { + description: "Failure to get device name returns false", + nvmllib: &nvml.InterfaceMock{ + InitFunc: func() nvml.Return { + return nvml.SUCCESS + }, + ShutdownFunc: func() nvml.Return { + return nvml.SUCCESS + }, + DeviceGetCountFunc: func() (int, nvml.Return) { + return 1, nvml.SUCCESS + }, + DeviceGetHandleByIndexFunc: func(index int) (nvml.Device, nvml.Return) { + device := &nvml.DeviceMock{ + GetNameFunc: func() (string, nvml.Return) { + return "", nvml.ERROR_UNKNOWN + }, + } + return device, nvml.SUCCESS + }, + }, + expected: false, + }, + { + description: "nested panic returns false", + nvmllib: &nvml.InterfaceMock{ + InitFunc: func() nvml.Return { + return nvml.SUCCESS + }, + ShutdownFunc: func() nvml.Return { + return nvml.SUCCESS + }, + DeviceGetCountFunc: func() (int, nvml.Return) { + return 1, nvml.SUCCESS + }, + DeviceGetHandleByIndexFunc: func(index int) (nvml.Device, nvml.Return) { + device := &nvml.DeviceMock{ + GetNameFunc: func() (string, nvml.Return) { + panic("deep panic") + }, + } + return device, nvml.SUCCESS + }, + }, + expected: false, + }, + { + description: "Single device name with no nvgpu", + nvmllib: &nvml.InterfaceMock{ + InitFunc: func() nvml.Return { + return nvml.SUCCESS + }, + ShutdownFunc: func() nvml.Return { + return nvml.SUCCESS + }, + DeviceGetCountFunc: func() (int, nvml.Return) { + return 1, nvml.SUCCESS + }, + DeviceGetHandleByIndexFunc: func(index int) (nvml.Device, nvml.Return) { + device := &nvml.DeviceMock{ + GetNameFunc: func() (string, nvml.Return) { + return "NVIDIA A100-SXM4-40GB", nvml.SUCCESS + }, + } + return device, nvml.SUCCESS + }, + }, + expected: false, + }, + { + description: "Single device name with nvgpu", + nvmllib: &nvml.InterfaceMock{ + InitFunc: func() nvml.Return { + return nvml.SUCCESS + }, + ShutdownFunc: func() nvml.Return { + return nvml.SUCCESS + }, + DeviceGetCountFunc: func() (int, nvml.Return) { + return 1, nvml.SUCCESS + }, + DeviceGetHandleByIndexFunc: func(index int) (nvml.Device, nvml.Return) { + device := &nvml.DeviceMock{ + GetNameFunc: func() (string, nvml.Return) { + return "Orin (nvgpu)", nvml.SUCCESS + }, + } + return device, nvml.SUCCESS + }, + }, + expected: true, + }, + { + description: "Multiple device names with no nvgpu", + nvmllib: &nvml.InterfaceMock{ + InitFunc: func() nvml.Return { + return nvml.SUCCESS + }, + ShutdownFunc: func() nvml.Return { + return nvml.SUCCESS + }, + DeviceGetCountFunc: func() (int, nvml.Return) { + return 2, nvml.SUCCESS + }, + DeviceGetHandleByIndexFunc: func(index int) (nvml.Device, nvml.Return) { + device := &nvml.DeviceMock{ + GetNameFunc: func() (string, nvml.Return) { + return "NVIDIA A100-SXM4-40GB", nvml.SUCCESS + }, + } + return device, nvml.SUCCESS + }, + }, + expected: false, + }, + { + description: "Multiple device names with nvgpu", + nvmllib: &nvml.InterfaceMock{ + InitFunc: func() nvml.Return { + return nvml.SUCCESS + }, + ShutdownFunc: func() nvml.Return { + return nvml.SUCCESS + }, + DeviceGetCountFunc: func() (int, nvml.Return) { + return 2, nvml.SUCCESS + }, + DeviceGetHandleByIndexFunc: func(index int) (nvml.Device, nvml.Return) { + device := &nvml.DeviceMock{ + GetNameFunc: func() (string, nvml.Return) { + return "Orin (nvgpu)", nvml.SUCCESS + }, + } + return device, nvml.SUCCESS + }, + }, + expected: true, + }, + { + description: "Mixed device names", + nvmllib: &nvml.InterfaceMock{ + InitFunc: func() nvml.Return { + return nvml.SUCCESS + }, + ShutdownFunc: func() nvml.Return { + return nvml.SUCCESS + }, + DeviceGetCountFunc: func() (int, nvml.Return) { + return 2, nvml.SUCCESS + }, + DeviceGetHandleByIndexFunc: func(index int) (nvml.Device, nvml.Return) { + var deviceName string + if index == 0 { + deviceName = "NVIDIA A100-SXM4-40GB" + } else { + deviceName = "Orin (nvgpu)" + } + device := &nvml.DeviceMock{ + GetNameFunc: func() (string, nvml.Return) { + return deviceName, nvml.SUCCESS + }, + } + return device, nvml.SUCCESS + }, + }, + expected: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + sut := additionalInfo{ + nvmllib: tc.nvmllib, + devicelib: device.New(device.WithNvml(tc.nvmllib)), + } + + flag, _ := sut.UsesNVGPUModule() + require.Equal(t, tc.expected, flag) + }) + } +} diff --git a/internal/info/auto.go b/internal/info/auto.go index 2b7794fa..760d33d9 100644 --- a/internal/info/auto.go +++ b/internal/info/auto.go @@ -20,25 +20,42 @@ import ( "github.com/NVIDIA/nvidia-container-toolkit/internal/config/image" "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" cdi "github.com/container-orchestrated-devices/container-device-interface/pkg/parser" + "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device" "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/info" + "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml" ) // infoInterface provides an alias for mocking. // //go:generate moq -stub -out info-interface_mock.go . infoInterface -type infoInterface info.Interface +type infoInterface interface { + info.Interface + // UsesNVGPUModule indicates whether the system is using the nvgpu kernel module + UsesNVGPUModule() (bool, string) +} type resolver struct { logger logger.Interface - info info.Interface + info infoInterface } // ResolveAutoMode determines the correct mode for the platform if set to "auto" func ResolveAutoMode(logger logger.Interface, mode string, image image.CUDA) (rmode string) { nvinfo := info.New() + nvmllib := nvml.New() + devicelib := device.New( + device.WithNvml(nvmllib), + ) + + info := additionalInfo{ + Interface: nvinfo, + nvmllib: nvmllib, + devicelib: devicelib, + } + r := resolver{ logger: logger, - info: nvinfo, + info: info, } return r.resolveMode(mode, image) } @@ -62,7 +79,10 @@ func (r resolver) resolveMode(mode string, image image.CUDA) (rmode string) { hasNVML, reason := r.info.HasNvml() r.logger.Debugf("Has NVML? %v: %v", hasNVML, reason) - if isTegra && !hasNVML { + usesNVGPUModule, reason := r.info.UsesNVGPUModule() + r.logger.Debugf("Uses nvgpu kernel module? %v: %v", usesNVGPUModule, reason) + + if (isTegra && !hasNVML) || usesNVGPUModule { return "csv" } diff --git a/internal/info/auto_test.go b/internal/info/auto_test.go index 71919c01..f91682f6 100644 --- a/internal/info/auto_test.go +++ b/internal/info/auto_test.go @@ -22,7 +22,6 @@ import ( "github.com/NVIDIA/nvidia-container-toolkit/internal/config/image" testlog "github.com/sirupsen/logrus/hooks/test" "github.com/stretchr/testify/require" - "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/info" ) func TestResolveAutoMode(t *testing.T) { @@ -32,7 +31,7 @@ func TestResolveAutoMode(t *testing.T) { description string mode string expectedMode string - info info.Interface + info map[string]bool image image.CUDA }{ { @@ -41,54 +40,78 @@ func TestResolveAutoMode(t *testing.T) { expectedMode: "not-auto", }, { - description: "nvml non-tegra resolves to legacy", + description: "no info defaults to legacy", + mode: "auto", + info: map[string]bool{}, + expectedMode: "legacy", + }, + { + description: "non-nvml, non-tegra, nvgpu resolves to csv", mode: "auto", - info: &infoInterfaceMock{ - HasNvmlFunc: func() (bool, string) { - return true, "nvml" - }, - IsTegraSystemFunc: func() (bool, string) { - return false, "tegra" - }, + info: map[string]bool{ + "nvml": false, + "tegra": false, + "nvgpu": true, + }, + expectedMode: "csv", + }, + { + description: "non-nvml, tegra, non-nvgpu resolves to csv", + mode: "auto", + info: map[string]bool{ + "nvml": false, + "tegra": true, + "nvgpu": false, + }, + expectedMode: "csv", + }, + { + description: "non-nvml, tegra, nvgpu resolves to csv", + mode: "auto", + info: map[string]bool{ + "nvml": false, + "tegra": true, + "nvgpu": true, + }, + expectedMode: "csv", + }, + { + description: "nvml, non-tegra, non-nvgpu resolves to legacy", + mode: "auto", + info: map[string]bool{ + "nvml": true, + "tegra": false, + "nvgpu": false, }, expectedMode: "legacy", }, { - description: "non-nvml non-tegra resolves to legacy", + description: "nvml, non-tegra, nvgpu resolves to csv", mode: "auto", - info: &infoInterfaceMock{ - HasNvmlFunc: func() (bool, string) { - return false, "nvml" - }, - IsTegraSystemFunc: func() (bool, string) { - return false, "tegra" - }, + info: map[string]bool{ + "nvml": true, + "tegra": false, + "nvgpu": true, + }, + expectedMode: "csv", + }, + { + description: "nvml, tegra, non-nvgpu resolves to legacy", + mode: "auto", + info: map[string]bool{ + "nvml": true, + "tegra": true, + "nvgpu": false, }, expectedMode: "legacy", }, { - description: "nvml tegra resolves to legacy", + description: "nvml, tegra, nvgpu resolves to csv", mode: "auto", - info: &infoInterfaceMock{ - HasNvmlFunc: func() (bool, string) { - return true, "nvml" - }, - IsTegraSystemFunc: func() (bool, string) { - return true, "tegra" - }, - }, - expectedMode: "legacy", - }, - { - description: "non-nvml tegra resolves to csv", - mode: "auto", - info: &infoInterfaceMock{ - HasNvmlFunc: func() (bool, string) { - return false, "nvml" - }, - IsTegraSystemFunc: func() (bool, string) { - return true, "tegra" - }, + info: map[string]bool{ + "nvml": true, + "tegra": true, + "nvgpu": true, }, expectedMode: "csv", }, @@ -114,13 +137,10 @@ func TestResolveAutoMode(t *testing.T) { image: image.CUDA{ "NVIDIA_VISIBLE_DEVICES": "nvidia.com/gpu=0,0", }, - info: &infoInterfaceMock{ - HasNvmlFunc: func() (bool, string) { - return true, "nvml" - }, - IsTegraSystemFunc: func() (bool, string) { - return true, "tegra" - }, + info: map[string]bool{ + "nvml": true, + "tegra": false, + "nvgpu": false, }, expectedMode: "legacy", }, @@ -130,13 +150,10 @@ func TestResolveAutoMode(t *testing.T) { image: image.CUDA{ "NVIDIA_VISIBLE_DEVICES": "nvidia.com/gpu=0,0", }, - info: &infoInterfaceMock{ - HasNvmlFunc: func() (bool, string) { - return false, "nvml" - }, - IsTegraSystemFunc: func() (bool, string) { - return true, "tegra" - }, + info: map[string]bool{ + "nvml": false, + "tegra": true, + "nvgpu": false, }, expectedMode: "csv", }, @@ -144,9 +161,21 @@ func TestResolveAutoMode(t *testing.T) { for _, tc := range testCases { t.Run(tc.description, func(t *testing.T) { + info := &infoInterfaceMock{ + HasNvmlFunc: func() (bool, string) { + return tc.info["nvml"], "nvml" + }, + IsTegraSystemFunc: func() (bool, string) { + return tc.info["tegra"], "tegra" + }, + UsesNVGPUModuleFunc: func() (bool, string) { + return tc.info["nvgpu"], "nvgpu" + }, + } + r := resolver{ logger: logger, - info: tc.info, + info: info, } mode := r.resolveMode(tc.mode, tc.image) require.EqualValues(t, tc.expectedMode, mode) diff --git a/internal/info/info-interface_mock.go b/internal/info/info-interface_mock.go index 2375a4f1..c1e491c9 100644 --- a/internal/info/info-interface_mock.go +++ b/internal/info/info-interface_mock.go @@ -26,6 +26,9 @@ var _ infoInterface = &infoInterfaceMock{} // IsTegraSystemFunc: func() (bool, string) { // panic("mock out the IsTegraSystem method") // }, +// UsesNVGPUModuleFunc: func() (bool, string) { +// panic("mock out the UsesNVGPUModule method") +// }, // } // // // use mockedinfoInterface in code that requires infoInterface @@ -42,6 +45,9 @@ type infoInterfaceMock struct { // IsTegraSystemFunc mocks the IsTegraSystem method. IsTegraSystemFunc func() (bool, string) + // UsesNVGPUModuleFunc mocks the UsesNVGPUModule method. + UsesNVGPUModuleFunc func() (bool, string) + // calls tracks calls to the methods. calls struct { // HasDXCore holds details about calls to the HasDXCore method. @@ -53,10 +59,14 @@ type infoInterfaceMock struct { // IsTegraSystem holds details about calls to the IsTegraSystem method. IsTegraSystem []struct { } + // UsesNVGPUModule holds details about calls to the UsesNVGPUModule method. + UsesNVGPUModule []struct { + } } - lockHasDXCore sync.RWMutex - lockHasNvml sync.RWMutex - lockIsTegraSystem sync.RWMutex + lockHasDXCore sync.RWMutex + lockHasNvml sync.RWMutex + lockIsTegraSystem sync.RWMutex + lockUsesNVGPUModule sync.RWMutex } // HasDXCore calls HasDXCoreFunc. @@ -151,3 +161,34 @@ func (mock *infoInterfaceMock) IsTegraSystemCalls() []struct { mock.lockIsTegraSystem.RUnlock() return calls } + +// UsesNVGPUModule calls UsesNVGPUModuleFunc. +func (mock *infoInterfaceMock) UsesNVGPUModule() (bool, string) { + callInfo := struct { + }{} + mock.lockUsesNVGPUModule.Lock() + mock.calls.UsesNVGPUModule = append(mock.calls.UsesNVGPUModule, callInfo) + mock.lockUsesNVGPUModule.Unlock() + if mock.UsesNVGPUModuleFunc == nil { + var ( + bOut bool + sOut string + ) + return bOut, sOut + } + return mock.UsesNVGPUModuleFunc() +} + +// UsesNVGPUModuleCalls gets all the calls that were made to UsesNVGPUModule. +// Check the length with: +// +// len(mockedinfoInterface.UsesNVGPUModuleCalls()) +func (mock *infoInterfaceMock) UsesNVGPUModuleCalls() []struct { +} { + var calls []struct { + } + mock.lockUsesNVGPUModule.RLock() + calls = mock.calls.UsesNVGPUModule + mock.lockUsesNVGPUModule.RUnlock() + return calls +}