Merge branch 'fix-platform-detection' into 'main'

Add UsesNVGPUModule info function

See merge request nvidia/container-toolkit/container-toolkit!473
This commit is contained in:
Evan Lezar 2023-08-28 15:58:41 +00:00
commit 2a3afdd5d9
6 changed files with 485 additions and 61 deletions

View File

@ -8,6 +8,7 @@
* Added `--library-search-path` option to `nvidia-ctk cdi generate` command in `csv` mode. This allows folders where
libraries are located to be specified explicitly.
* Updated go-nvlib to support devices which are not present in the PCI device database. This allows the creation of dev/char symlinks on systems with such devices installed.
* Added `UsesNVGPUModule` info function for more robust platform detection. This is required on Tegra-based systems where libnvidia-ml.so is also supported.
* [toolkit-container] Set `NVIDIA_VISIBLE_DEVICES=void` to prevent injection of NVIDIA devices and drivers into the NVIDIA Container Toolkit container.

View File

@ -0,0 +1,81 @@
/**
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package info
import (
"fmt"
"strings"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/info"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
)
// additionalInfo allows for the info.Interface to be extened to implement the infoInterface.
type additionalInfo struct {
info.Interface
nvmllib nvml.Interface
devicelib device.Interface
}
// UsesNVGPUModule checks whether the nvgpu module is used.
// We use the device name to signal this, since devices that use the nvgpu module have their device
// names as:
//
// GPU 0: Orin (nvgpu) (UUID: 54d0709b-558d-5a59-9c65-0c5fc14a21a4)
//
// This function returns true if ALL devices use the nvgpu module.
func (i additionalInfo) UsesNVGPUModule() (uses bool, reason string) {
// We ensure that this function never panics
defer func() {
if err := recover(); err != nil {
uses = false
reason = fmt.Sprintf("panic: %v", err)
}
}()
ret := i.nvmllib.Init()
if ret != nvml.SUCCESS {
return false, fmt.Sprintf("failed to initialize nvml: %v", ret)
}
defer i.nvmllib.Shutdown()
var names []string
err := i.devicelib.VisitDevices(func(i int, d device.Device) error {
name, ret := d.GetName()
if ret != nvml.SUCCESS {
return fmt.Errorf("device %v: %v", i, ret)
}
names = append(names, name)
return nil
})
if err != nil {
return false, fmt.Sprintf("failed to get device names: %v", err)
}
if len(names) == 0 {
return false, "no devices found"
}
for _, name := range names {
if !strings.Contains(name, "(nvgpu)") {
return false, fmt.Sprintf("device %q does not use nvgpu module", name)
}
}
return true, "all devices use nvgpu module"
}

View File

@ -0,0 +1,252 @@
/**
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package info
import (
"testing"
"github.com/stretchr/testify/require"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
)
func TestUsesNVGPUModule(t *testing.T) {
testCases := []struct {
description string
nvmllib nvml.Interface
expected bool
}{
{
description: "init failure returns false",
nvmllib: &nvml.InterfaceMock{
InitFunc: func() nvml.Return {
return nvml.ERROR_LIBRARY_NOT_FOUND
},
},
expected: false,
},
{
description: "no devices returns false",
nvmllib: &nvml.InterfaceMock{
InitFunc: func() nvml.Return {
return nvml.SUCCESS
},
ShutdownFunc: func() nvml.Return {
return nvml.SUCCESS
},
DeviceGetCountFunc: func() (int, nvml.Return) {
return 0, nvml.SUCCESS
},
},
expected: false,
},
{
description: "DeviceGetCount error returns false",
nvmllib: &nvml.InterfaceMock{
InitFunc: func() nvml.Return {
return nvml.SUCCESS
},
ShutdownFunc: func() nvml.Return {
return nvml.SUCCESS
},
DeviceGetCountFunc: func() (int, nvml.Return) {
return 0, nvml.ERROR_UNKNOWN
},
},
expected: false,
},
{
description: "Failure to get device name returns false",
nvmllib: &nvml.InterfaceMock{
InitFunc: func() nvml.Return {
return nvml.SUCCESS
},
ShutdownFunc: func() nvml.Return {
return nvml.SUCCESS
},
DeviceGetCountFunc: func() (int, nvml.Return) {
return 1, nvml.SUCCESS
},
DeviceGetHandleByIndexFunc: func(index int) (nvml.Device, nvml.Return) {
device := &nvml.DeviceMock{
GetNameFunc: func() (string, nvml.Return) {
return "", nvml.ERROR_UNKNOWN
},
}
return device, nvml.SUCCESS
},
},
expected: false,
},
{
description: "nested panic returns false",
nvmllib: &nvml.InterfaceMock{
InitFunc: func() nvml.Return {
return nvml.SUCCESS
},
ShutdownFunc: func() nvml.Return {
return nvml.SUCCESS
},
DeviceGetCountFunc: func() (int, nvml.Return) {
return 1, nvml.SUCCESS
},
DeviceGetHandleByIndexFunc: func(index int) (nvml.Device, nvml.Return) {
device := &nvml.DeviceMock{
GetNameFunc: func() (string, nvml.Return) {
panic("deep panic")
},
}
return device, nvml.SUCCESS
},
},
expected: false,
},
{
description: "Single device name with no nvgpu",
nvmllib: &nvml.InterfaceMock{
InitFunc: func() nvml.Return {
return nvml.SUCCESS
},
ShutdownFunc: func() nvml.Return {
return nvml.SUCCESS
},
DeviceGetCountFunc: func() (int, nvml.Return) {
return 1, nvml.SUCCESS
},
DeviceGetHandleByIndexFunc: func(index int) (nvml.Device, nvml.Return) {
device := &nvml.DeviceMock{
GetNameFunc: func() (string, nvml.Return) {
return "NVIDIA A100-SXM4-40GB", nvml.SUCCESS
},
}
return device, nvml.SUCCESS
},
},
expected: false,
},
{
description: "Single device name with nvgpu",
nvmllib: &nvml.InterfaceMock{
InitFunc: func() nvml.Return {
return nvml.SUCCESS
},
ShutdownFunc: func() nvml.Return {
return nvml.SUCCESS
},
DeviceGetCountFunc: func() (int, nvml.Return) {
return 1, nvml.SUCCESS
},
DeviceGetHandleByIndexFunc: func(index int) (nvml.Device, nvml.Return) {
device := &nvml.DeviceMock{
GetNameFunc: func() (string, nvml.Return) {
return "Orin (nvgpu)", nvml.SUCCESS
},
}
return device, nvml.SUCCESS
},
},
expected: true,
},
{
description: "Multiple device names with no nvgpu",
nvmllib: &nvml.InterfaceMock{
InitFunc: func() nvml.Return {
return nvml.SUCCESS
},
ShutdownFunc: func() nvml.Return {
return nvml.SUCCESS
},
DeviceGetCountFunc: func() (int, nvml.Return) {
return 2, nvml.SUCCESS
},
DeviceGetHandleByIndexFunc: func(index int) (nvml.Device, nvml.Return) {
device := &nvml.DeviceMock{
GetNameFunc: func() (string, nvml.Return) {
return "NVIDIA A100-SXM4-40GB", nvml.SUCCESS
},
}
return device, nvml.SUCCESS
},
},
expected: false,
},
{
description: "Multiple device names with nvgpu",
nvmllib: &nvml.InterfaceMock{
InitFunc: func() nvml.Return {
return nvml.SUCCESS
},
ShutdownFunc: func() nvml.Return {
return nvml.SUCCESS
},
DeviceGetCountFunc: func() (int, nvml.Return) {
return 2, nvml.SUCCESS
},
DeviceGetHandleByIndexFunc: func(index int) (nvml.Device, nvml.Return) {
device := &nvml.DeviceMock{
GetNameFunc: func() (string, nvml.Return) {
return "Orin (nvgpu)", nvml.SUCCESS
},
}
return device, nvml.SUCCESS
},
},
expected: true,
},
{
description: "Mixed device names",
nvmllib: &nvml.InterfaceMock{
InitFunc: func() nvml.Return {
return nvml.SUCCESS
},
ShutdownFunc: func() nvml.Return {
return nvml.SUCCESS
},
DeviceGetCountFunc: func() (int, nvml.Return) {
return 2, nvml.SUCCESS
},
DeviceGetHandleByIndexFunc: func(index int) (nvml.Device, nvml.Return) {
var deviceName string
if index == 0 {
deviceName = "NVIDIA A100-SXM4-40GB"
} else {
deviceName = "Orin (nvgpu)"
}
device := &nvml.DeviceMock{
GetNameFunc: func() (string, nvml.Return) {
return deviceName, nvml.SUCCESS
},
}
return device, nvml.SUCCESS
},
},
expected: false,
},
}
for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
sut := additionalInfo{
nvmllib: tc.nvmllib,
devicelib: device.New(device.WithNvml(tc.nvmllib)),
}
flag, _ := sut.UsesNVGPUModule()
require.Equal(t, tc.expected, flag)
})
}
}

View File

@ -20,25 +20,42 @@ import (
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
cdi "github.com/container-orchestrated-devices/container-device-interface/pkg/parser"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/info"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
)
// infoInterface provides an alias for mocking.
//
//go:generate moq -stub -out info-interface_mock.go . infoInterface
type infoInterface info.Interface
type infoInterface interface {
info.Interface
// UsesNVGPUModule indicates whether the system is using the nvgpu kernel module
UsesNVGPUModule() (bool, string)
}
type resolver struct {
logger logger.Interface
info info.Interface
info infoInterface
}
// ResolveAutoMode determines the correct mode for the platform if set to "auto"
func ResolveAutoMode(logger logger.Interface, mode string, image image.CUDA) (rmode string) {
nvinfo := info.New()
nvmllib := nvml.New()
devicelib := device.New(
device.WithNvml(nvmllib),
)
info := additionalInfo{
Interface: nvinfo,
nvmllib: nvmllib,
devicelib: devicelib,
}
r := resolver{
logger: logger,
info: nvinfo,
info: info,
}
return r.resolveMode(mode, image)
}
@ -62,7 +79,10 @@ func (r resolver) resolveMode(mode string, image image.CUDA) (rmode string) {
hasNVML, reason := r.info.HasNvml()
r.logger.Debugf("Has NVML? %v: %v", hasNVML, reason)
if isTegra && !hasNVML {
usesNVGPUModule, reason := r.info.UsesNVGPUModule()
r.logger.Debugf("Uses nvgpu kernel module? %v: %v", usesNVGPUModule, reason)
if (isTegra && !hasNVML) || usesNVGPUModule {
return "csv"
}

View File

@ -22,7 +22,6 @@ import (
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
testlog "github.com/sirupsen/logrus/hooks/test"
"github.com/stretchr/testify/require"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/info"
)
func TestResolveAutoMode(t *testing.T) {
@ -32,7 +31,7 @@ func TestResolveAutoMode(t *testing.T) {
description string
mode string
expectedMode string
info info.Interface
info map[string]bool
image image.CUDA
}{
{
@ -41,54 +40,78 @@ func TestResolveAutoMode(t *testing.T) {
expectedMode: "not-auto",
},
{
description: "nvml non-tegra resolves to legacy",
description: "no info defaults to legacy",
mode: "auto",
info: map[string]bool{},
expectedMode: "legacy",
},
{
description: "non-nvml, non-tegra, nvgpu resolves to csv",
mode: "auto",
info: &infoInterfaceMock{
HasNvmlFunc: func() (bool, string) {
return true, "nvml"
},
IsTegraSystemFunc: func() (bool, string) {
return false, "tegra"
},
info: map[string]bool{
"nvml": false,
"tegra": false,
"nvgpu": true,
},
expectedMode: "csv",
},
{
description: "non-nvml, tegra, non-nvgpu resolves to csv",
mode: "auto",
info: map[string]bool{
"nvml": false,
"tegra": true,
"nvgpu": false,
},
expectedMode: "csv",
},
{
description: "non-nvml, tegra, nvgpu resolves to csv",
mode: "auto",
info: map[string]bool{
"nvml": false,
"tegra": true,
"nvgpu": true,
},
expectedMode: "csv",
},
{
description: "nvml, non-tegra, non-nvgpu resolves to legacy",
mode: "auto",
info: map[string]bool{
"nvml": true,
"tegra": false,
"nvgpu": false,
},
expectedMode: "legacy",
},
{
description: "non-nvml non-tegra resolves to legacy",
description: "nvml, non-tegra, nvgpu resolves to csv",
mode: "auto",
info: &infoInterfaceMock{
HasNvmlFunc: func() (bool, string) {
return false, "nvml"
},
IsTegraSystemFunc: func() (bool, string) {
return false, "tegra"
},
info: map[string]bool{
"nvml": true,
"tegra": false,
"nvgpu": true,
},
expectedMode: "csv",
},
{
description: "nvml, tegra, non-nvgpu resolves to legacy",
mode: "auto",
info: map[string]bool{
"nvml": true,
"tegra": true,
"nvgpu": false,
},
expectedMode: "legacy",
},
{
description: "nvml tegra resolves to legacy",
description: "nvml, tegra, nvgpu resolves to csv",
mode: "auto",
info: &infoInterfaceMock{
HasNvmlFunc: func() (bool, string) {
return true, "nvml"
},
IsTegraSystemFunc: func() (bool, string) {
return true, "tegra"
},
},
expectedMode: "legacy",
},
{
description: "non-nvml tegra resolves to csv",
mode: "auto",
info: &infoInterfaceMock{
HasNvmlFunc: func() (bool, string) {
return false, "nvml"
},
IsTegraSystemFunc: func() (bool, string) {
return true, "tegra"
},
info: map[string]bool{
"nvml": true,
"tegra": true,
"nvgpu": true,
},
expectedMode: "csv",
},
@ -114,13 +137,10 @@ func TestResolveAutoMode(t *testing.T) {
image: image.CUDA{
"NVIDIA_VISIBLE_DEVICES": "nvidia.com/gpu=0,0",
},
info: &infoInterfaceMock{
HasNvmlFunc: func() (bool, string) {
return true, "nvml"
},
IsTegraSystemFunc: func() (bool, string) {
return true, "tegra"
},
info: map[string]bool{
"nvml": true,
"tegra": false,
"nvgpu": false,
},
expectedMode: "legacy",
},
@ -130,13 +150,10 @@ func TestResolveAutoMode(t *testing.T) {
image: image.CUDA{
"NVIDIA_VISIBLE_DEVICES": "nvidia.com/gpu=0,0",
},
info: &infoInterfaceMock{
HasNvmlFunc: func() (bool, string) {
return false, "nvml"
},
IsTegraSystemFunc: func() (bool, string) {
return true, "tegra"
},
info: map[string]bool{
"nvml": false,
"tegra": true,
"nvgpu": false,
},
expectedMode: "csv",
},
@ -144,9 +161,21 @@ func TestResolveAutoMode(t *testing.T) {
for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
info := &infoInterfaceMock{
HasNvmlFunc: func() (bool, string) {
return tc.info["nvml"], "nvml"
},
IsTegraSystemFunc: func() (bool, string) {
return tc.info["tegra"], "tegra"
},
UsesNVGPUModuleFunc: func() (bool, string) {
return tc.info["nvgpu"], "nvgpu"
},
}
r := resolver{
logger: logger,
info: tc.info,
info: info,
}
mode := r.resolveMode(tc.mode, tc.image)
require.EqualValues(t, tc.expectedMode, mode)

View File

@ -26,6 +26,9 @@ var _ infoInterface = &infoInterfaceMock{}
// IsTegraSystemFunc: func() (bool, string) {
// panic("mock out the IsTegraSystem method")
// },
// UsesNVGPUModuleFunc: func() (bool, string) {
// panic("mock out the UsesNVGPUModule method")
// },
// }
//
// // use mockedinfoInterface in code that requires infoInterface
@ -42,6 +45,9 @@ type infoInterfaceMock struct {
// IsTegraSystemFunc mocks the IsTegraSystem method.
IsTegraSystemFunc func() (bool, string)
// UsesNVGPUModuleFunc mocks the UsesNVGPUModule method.
UsesNVGPUModuleFunc func() (bool, string)
// calls tracks calls to the methods.
calls struct {
// HasDXCore holds details about calls to the HasDXCore method.
@ -53,10 +59,14 @@ type infoInterfaceMock struct {
// IsTegraSystem holds details about calls to the IsTegraSystem method.
IsTegraSystem []struct {
}
// UsesNVGPUModule holds details about calls to the UsesNVGPUModule method.
UsesNVGPUModule []struct {
}
}
lockHasDXCore sync.RWMutex
lockHasNvml sync.RWMutex
lockIsTegraSystem sync.RWMutex
lockHasDXCore sync.RWMutex
lockHasNvml sync.RWMutex
lockIsTegraSystem sync.RWMutex
lockUsesNVGPUModule sync.RWMutex
}
// HasDXCore calls HasDXCoreFunc.
@ -151,3 +161,34 @@ func (mock *infoInterfaceMock) IsTegraSystemCalls() []struct {
mock.lockIsTegraSystem.RUnlock()
return calls
}
// UsesNVGPUModule calls UsesNVGPUModuleFunc.
func (mock *infoInterfaceMock) UsesNVGPUModule() (bool, string) {
callInfo := struct {
}{}
mock.lockUsesNVGPUModule.Lock()
mock.calls.UsesNVGPUModule = append(mock.calls.UsesNVGPUModule, callInfo)
mock.lockUsesNVGPUModule.Unlock()
if mock.UsesNVGPUModuleFunc == nil {
var (
bOut bool
sOut string
)
return bOut, sOut
}
return mock.UsesNVGPUModuleFunc()
}
// UsesNVGPUModuleCalls gets all the calls that were made to UsesNVGPUModule.
// Check the length with:
//
// len(mockedinfoInterface.UsesNVGPUModuleCalls())
func (mock *infoInterfaceMock) UsesNVGPUModuleCalls() []struct {
} {
var calls []struct {
}
mock.lockUsesNVGPUModule.RLock()
calls = mock.calls.UsesNVGPUModule
mock.lockUsesNVGPUModule.RUnlock()
return calls
}