[no-relnote] Use image.CUDA to extract visible devices

Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
Evan Lezar
2024-10-14 15:06:06 +02:00
parent 1991b3ef2a
commit 92df542f2f
11 changed files with 313 additions and 337 deletions

View File

@@ -47,7 +47,7 @@ func New(opt ...Option) (CUDA, error) {
// build creates a CUDA image from the builder.
func (b builder) build() (CUDA, error) {
if b.disableRequire {
b.env[envNVDisableRequire] = "true"
b.env[EnvVarNvidiaDisableRequire] = "true"
}
c := CUDA{

View File

@@ -28,12 +28,9 @@ import (
)
const (
envCUDAVersion = "CUDA_VERSION"
envNVRequirePrefix = "NVIDIA_REQUIRE_"
envNVRequireCUDA = envNVRequirePrefix + "CUDA"
envNVRequireJetpack = envNVRequirePrefix + "JETPACK"
envNVDisableRequire = "NVIDIA_DISABLE_REQUIRE"
envNVDriverCapabilities = "NVIDIA_DRIVER_CAPABILITIES"
// DeviceListAsVolumeMountsRoot is the container path root under which
// devices requested as mounts are looked up.
DeviceListAsVolumeMountsRoot = "/var/run/nvidia-container-devices"
// volumeMountDevicePrefixCDI is the prefix that marks a device requested as
// a mount as a CDI device.
volumeMountDevicePrefixCDI = "cdi/"
)
// CUDA represents a CUDA image that can be used for GPU computing. This wraps
@@ -80,8 +77,8 @@ func (i CUDA) HasEnvvar(key string) bool {
// image is considered legacy if it has a CUDA_VERSION environment variable defined
// and no NVIDIA_REQUIRE_CUDA environment variable defined.
func (i CUDA) IsLegacy() bool {
legacyCudaVersion := i.env[envCUDAVersion]
cudaRequire := i.env[envNVRequireCUDA]
legacyCudaVersion := i.env[EnvVarCudaVersion]
cudaRequire := i.env[EnvVarNvidiaRequireCuda]
// Legacy means CUDA_VERSION has a non-empty value while NVIDIA_REQUIRE_CUDA
// has an empty one.
return len(legacyCudaVersion) > 0 && len(cudaRequire) == 0
}
@@ -95,7 +92,7 @@ func (i CUDA) GetRequirements() ([]string, error) {
// All variables with the "NVIDIA_REQUIRE_" prefix are passed to nvidia-container-cli
var requirements []string
for name, value := range i.env {
if strings.HasPrefix(name, envNVRequirePrefix) && !strings.HasPrefix(name, envNVRequireJetpack) {
if strings.HasPrefix(name, NvidiaRequirePrefix) && !strings.HasPrefix(name, EnvVarNvidiaRequireJetpack) {
requirements = append(requirements, value)
}
}
@@ -113,7 +110,7 @@ func (i CUDA) GetRequirements() ([]string, error) {
// HasDisableRequire checks the value of the NVIDIA_DISABLE_REQUIRE environment variable.
// If it is set to a valid (true) boolean value, it can be used to disable the requirement checks.
func (i CUDA) HasDisableRequire() bool {
if disable, exists := i.env[envNVDisableRequire]; exists {
if disable, exists := i.env[EnvVarNvidiaDisableRequire]; exists {
// i.logger.Debugf("NVIDIA_DISABLE_REQUIRE=%v; skipping requirement checks", disable)
d, _ := strconv.ParseBool(disable)
return d
@@ -157,7 +154,7 @@ func (i CUDA) DevicesFromEnvvars(envVars ...string) VisibleDevices {
// GetDriverCapabilities returns the requested driver capabilities.
func (i CUDA) GetDriverCapabilities() DriverCapabilities {
env := i.env[envNVDriverCapabilities]
env := i.env[EnvVarNvidiaDriverCapabilities]
capabilities := make(DriverCapabilities)
for _, c := range strings.Split(env, ",") {
@@ -168,7 +165,7 @@ func (i CUDA) GetDriverCapabilities() DriverCapabilities {
}
func (i CUDA) legacyVersion() (string, error) {
cudaVersion := i.env[envCUDAVersion]
cudaVersion := i.env[EnvVarCudaVersion]
majorMinor, err := parseMajorMinorVersion(cudaVersion)
if err != nil {
return "", fmt.Errorf("invalid CUDA version %v: %v", cudaVersion, err)
@@ -202,7 +199,7 @@ func parseMajorMinorVersion(version string) (string, error) {
// OnlyFullyQualifiedCDIDevices returns true if all devices requested in the image are requested as CDI devices.
func (i CUDA) OnlyFullyQualifiedCDIDevices() bool {
var hasCDIdevice bool
for _, device := range i.DevicesFromEnvvars("NVIDIA_VISIBLE_DEVICES").List() {
for _, device := range i.VisibleDevicesFromEnvVar() {
if !parser.IsQualifiedName(device) {
return false
}
@@ -218,14 +215,28 @@ func (i CUDA) OnlyFullyQualifiedCDIDevices() bool {
return hasCDIdevice
}
const (
// deviceListAsVolumeMountsRoot is the path that DevicesFromMounts cleans and
// uses as its root.
deviceListAsVolumeMountsRoot = "/var/run/nvidia-container-devices"
)
// VisibleDevicesFromEnvVar lists the devices that the image requests through
// the NVIDIA_VISIBLE_DEVICES environment variable.
func (i CUDA) VisibleDevicesFromEnvVar() []string {
	visible := i.DevicesFromEnvvars(EnvVarNvidiaVisibleDevices)
	return visible.List()
}
// VisibleDevicesFromMounts returns the devices requested as mounts, leaving out
// every entry that carries the CDI device prefix.
func (i CUDA) VisibleDevicesFromMounts() []string {
	var visible []string
	for _, name := range i.DevicesFromMounts() {
		// Only names without the CDI prefix are kept.
		if !strings.HasPrefix(name, volumeMountDevicePrefixCDI) {
			visible = append(visible, name)
		}
	}
	return visible
}
// DevicesFromMounts returns a list of devices specified as mounts.
// TODO: This should be merged with getDevicesFromMounts used in the NVIDIA Container Runtime
func (i CUDA) DevicesFromMounts() []string {
root := filepath.Clean(deviceListAsVolumeMountsRoot)
root := filepath.Clean(DeviceListAsVolumeMountsRoot)
seen := make(map[string]bool)
var devices []string
for _, m := range i.mounts {
@@ -260,10 +271,10 @@ func (i CUDA) DevicesFromMounts() []string {
func (i CUDA) CDIDevicesFromMounts() []string {
var devices []string
for _, mountDevice := range i.DevicesFromMounts() {
if !strings.HasPrefix(mountDevice, "cdi/") {
if !strings.HasPrefix(mountDevice, volumeMountDevicePrefixCDI) {
continue
}
parts := strings.SplitN(strings.TrimPrefix(mountDevice, "cdi/"), "/", 3)
parts := strings.SplitN(strings.TrimPrefix(mountDevice, volumeMountDevicePrefixCDI), "/", 3)
if len(parts) != 3 {
continue
}

View File

@@ -17,8 +17,10 @@
package image
import (
"path/filepath"
"testing"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/stretchr/testify/require"
)
@@ -130,3 +132,80 @@ func TestGetRequirements(t *testing.T) {
}
}
// TestGetVisibleDevicesFromMounts checks which devices VisibleDevicesFromMounts
// reports for a range of mount configurations: mounts whose source is not
// /dev/null, mounts outside of (or exactly at) DeviceListAsVolumeMountsRoot,
// plain device names, device names containing slashes, and CDI device names,
// which are expected to be left out.
func TestGetVisibleDevicesFromMounts(t *testing.T) {
	tests := []struct {
		description     string
		mounts          []specs.Mount
		expectedDevices []string
	}{
		{
			description:     "No mounts",
			mounts:          nil,
			expectedDevices: nil,
		},
		{
			description: "Host path is not /dev/null",
			mounts: []specs.Mount{
				{
					Source:      "/not/dev/null",
					Destination: filepath.Join(DeviceListAsVolumeMountsRoot, "GPU0"),
				},
			},
			expectedDevices: nil,
		},
		{
			description: "Container path is not prefixed by 'root'",
			mounts: []specs.Mount{
				{
					Source:      "/dev/null",
					Destination: filepath.Join("/other/prefix", "GPU0"),
				},
			},
			expectedDevices: nil,
		},
		{
			description: "Container path is only 'root'",
			mounts: []specs.Mount{
				{
					Source:      "/dev/null",
					Destination: DeviceListAsVolumeMountsRoot,
				},
			},
			expectedDevices: nil,
		},
		{
			description:     "Discover 2 devices",
			mounts:          makeTestMounts("GPU0", "GPU1"),
			expectedDevices: []string{"GPU0", "GPU1"},
		},
		{
			description:     "Discover 2 devices with slashes in the name",
			mounts:          makeTestMounts("GPU0-MIG0/0/1", "GPU1-MIG0/0/1"),
			expectedDevices: []string{"GPU0-MIG0/0/1", "GPU1-MIG0/0/1"},
		},
		{
			description:     "cdi devices are ignored",
			mounts:          makeTestMounts("GPU0", "cdi/nvidia.com/gpu=all", "GPU1"),
			expectedDevices: []string{"GPU0", "GPU1"},
		},
	}

	for _, tc := range tests {
		t.Run(tc.description, func(t *testing.T) {
			// Fail on a construction error instead of discarding it, so that a
			// broken image is never compared against the expected devices.
			img, err := New(WithMounts(tc.mounts))
			require.NoError(t, err)

			require.Equal(t, tc.expectedDevices, img.VisibleDevicesFromMounts())
		})
	}
}
// makeTestMounts builds one mount per given path. Every mount uses /dev/null
// as its source and the path joined onto DeviceListAsVolumeMountsRoot as its
// destination. The result is nil when no paths are given.
func makeTestMounts(paths ...string) []specs.Mount {
	var result []specs.Mount
	for idx := range paths {
		result = append(result, specs.Mount{
			Source:      "/dev/null",
			Destination: filepath.Join(DeviceListAsVolumeMountsRoot, paths[idx]),
		})
	}
	return result
}

View File

@@ -0,0 +1,31 @@
/**
# Copyright 2024 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package image
// Environment variable names used by the image package, together with the
// prefix shared by the NVIDIA_REQUIRE_* variables.
const (
	// NvidiaRequirePrefix is the common prefix of the NVIDIA_REQUIRE_*
	// environment variables.
	NvidiaRequirePrefix = "NVIDIA_REQUIRE_"

	// EnvVarCudaVersion is the name of the CUDA_VERSION environment variable.
	EnvVarCudaVersion = "CUDA_VERSION"

	// EnvVarNvidiaDisableRequire is the name of the NVIDIA_DISABLE_REQUIRE
	// environment variable.
	EnvVarNvidiaDisableRequire = "NVIDIA_DISABLE_REQUIRE"

	// EnvVarNvidiaDriverCapabilities is the name of the
	// NVIDIA_DRIVER_CAPABILITIES environment variable.
	EnvVarNvidiaDriverCapabilities = "NVIDIA_DRIVER_CAPABILITIES"

	// EnvVarNvidiaImexChannels is the name of the NVIDIA_IMEX_CHANNELS
	// environment variable.
	EnvVarNvidiaImexChannels = "NVIDIA_IMEX_CHANNELS"

	// EnvVarNvidiaMigConfigDevices is the name of the
	// NVIDIA_MIG_CONFIG_DEVICES environment variable.
	EnvVarNvidiaMigConfigDevices = "NVIDIA_MIG_CONFIG_DEVICES"

	// EnvVarNvidiaMigMonitorDevices is the name of the
	// NVIDIA_MIG_MONITOR_DEVICES environment variable.
	EnvVarNvidiaMigMonitorDevices = "NVIDIA_MIG_MONITOR_DEVICES"

	// EnvVarNvidiaRequireCuda is the name of the NVIDIA_REQUIRE_CUDA
	// environment variable.
	EnvVarNvidiaRequireCuda = NvidiaRequirePrefix + "CUDA"

	// EnvVarNvidiaRequireJetpack is the name of the NVIDIA_REQUIRE_JETPACK
	// environment variable.
	EnvVarNvidiaRequireJetpack = NvidiaRequirePrefix + "JETPACK"

	// EnvVarNvidiaVisibleDevices is the name of the NVIDIA_VISIBLE_DEVICES
	// environment variable.
	EnvVarNvidiaVisibleDevices = "NVIDIA_VISIBLE_DEVICES"
)