mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2025-06-16 11:30:20 +00:00
The gated modifiers used to add support for GDS, MOFED, and CUDA Forward Compatibility only check the NVIDIA_VISIBLE_DEVICES envvar to determine whether GPUs are requested and modifications should be made. This means that use cases where volume mounts are used to request devices (e.g. when using the GPU Device Plugin) are not supported. This patch makes visibleDevicesFromEnvVar private, leaving VisibleDevices as the only exported method to query valid devices, and updates the gated modifiers to use this function, ensuring that device requests via mounts are also taken into account. Signed-off-by: Carlos Eduardo Arango Gutierrez <eduardoa@nvidia.com>
377 lines
11 KiB
Go
377 lines
11 KiB
Go
/**
|
|
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
**/
|
|
|
|
package image
|
|
|
|
import (
|
|
"fmt"
|
|
"path/filepath"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"github.com/opencontainers/runtime-spec/specs-go"
|
|
"golang.org/x/mod/semver"
|
|
"tags.cncf.io/container-device-interface/pkg/parser"
|
|
|
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
|
)
|
|
|
|
// DeviceListAsVolumeMountsRoot is the container path below which mounts of
// /dev/null are interpreted as device requests.
const DeviceListAsVolumeMountsRoot = "/var/run/nvidia-container-devices"

// Prefixes that distinguish the kind of device requested through a volume
// mount below DeviceListAsVolumeMountsRoot.
const (
	// volumeMountDevicePrefixCDI marks a mount-based request for a CDI device.
	volumeMountDevicePrefixCDI = "cdi/"
	// volumeMountDevicePrefixImex marks a mount-based request for an IMEX channel.
	volumeMountDevicePrefixImex = "imex/"
)
|
|
|
|
// CUDA represents a CUDA image that can be used for GPU computing. This wraps
|
|
// a map of environment variable to values that can be used to perform lookups
|
|
// such as requirements.
|
|
type CUDA struct {
|
|
logger logger.Interface
|
|
|
|
env map[string]string
|
|
isPrivileged bool
|
|
mounts []specs.Mount
|
|
|
|
acceptDeviceListAsVolumeMounts bool
|
|
acceptEnvvarUnprivileged bool
|
|
preferredVisibleDeviceEnvVars []string
|
|
}
|
|
|
|
// NewCUDAImageFromSpec creates a CUDA image from the input OCI runtime spec.
|
|
// The process environment is read (if present) to construc the CUDA Image.
|
|
func NewCUDAImageFromSpec(spec *specs.Spec, opts ...Option) (CUDA, error) {
|
|
var env []string
|
|
if spec != nil && spec.Process != nil {
|
|
env = spec.Process.Env
|
|
}
|
|
|
|
specOpts := []Option{
|
|
WithEnv(env),
|
|
WithMounts(spec.Mounts),
|
|
WithPrivileged(IsPrivileged((*OCISpec)(spec))),
|
|
}
|
|
|
|
return New(append(opts, specOpts...)...)
|
|
}
|
|
|
|
// newCUDAImageFromEnv creates a CUDA image from the input environment. The environment
|
|
// is a list of strings of the form ENVAR=VALUE.
|
|
func newCUDAImageFromEnv(env []string) (CUDA, error) {
|
|
return New(WithEnv(env))
|
|
}
|
|
|
|
// Getenv returns the value of the specified environment variable.
|
|
// If the environment variable is not specified, an empty string is returned.
|
|
func (i CUDA) Getenv(key string) string {
|
|
return i.env[key]
|
|
}
|
|
|
|
// HasEnvvar checks whether the specified envvar is defined in the image.
|
|
func (i CUDA) HasEnvvar(key string) bool {
|
|
_, exists := i.env[key]
|
|
return exists
|
|
}
|
|
|
|
// IsLegacy returns whether the associated CUDA image is a "legacy" image. An
|
|
// image is considered legacy if it has a CUDA_VERSION environment variable defined
|
|
// and no NVIDIA_REQUIRE_CUDA environment variable defined.
|
|
func (i CUDA) IsLegacy() bool {
|
|
legacyCudaVersion := i.env[EnvVarCudaVersion]
|
|
cudaRequire := i.env[EnvVarNvidiaRequireCuda]
|
|
return len(legacyCudaVersion) > 0 && len(cudaRequire) == 0
|
|
}
|
|
|
|
// GetRequirements returns the requirements from all NVIDIA_REQUIRE_ environment
|
|
// variables.
|
|
func (i CUDA) GetRequirements() ([]string, error) {
|
|
if i.HasDisableRequire() {
|
|
return nil, nil
|
|
}
|
|
|
|
// All variables with the "NVIDIA_REQUIRE_" prefix are passed to nvidia-container-cli
|
|
var requirements []string
|
|
for name, value := range i.env {
|
|
if strings.HasPrefix(name, NvidiaRequirePrefix) && !strings.HasPrefix(name, EnvVarNvidiaRequireJetpack) {
|
|
requirements = append(requirements, value)
|
|
}
|
|
}
|
|
if i.IsLegacy() {
|
|
v, err := i.legacyVersion()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to get version: %v", err)
|
|
}
|
|
cudaRequire := fmt.Sprintf("cuda>=%s", v)
|
|
requirements = append(requirements, cudaRequire)
|
|
}
|
|
return requirements, nil
|
|
}
|
|
|
|
// HasDisableRequire checks for the value of the NVIDIA_DISABLE_REQUIRE. If set
|
|
// to a valid (true) boolean value this can be used to disable the requirement checks
|
|
func (i CUDA) HasDisableRequire() bool {
|
|
if disable, exists := i.env[EnvVarNvidiaDisableRequire]; exists {
|
|
// i.logger.Debugf("NVIDIA_DISABLE_REQUIRE=%v; skipping requirement checks", disable)
|
|
d, _ := strconv.ParseBool(disable)
|
|
return d
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
// DevicesFromEnvvars returns the devices requested by the image through environment variables
|
|
func (i CUDA) DevicesFromEnvvars(envVars ...string) VisibleDevices {
|
|
// We concantenate all the devices from the specified env.
|
|
var isSet bool
|
|
var devices []string
|
|
requested := make(map[string]bool)
|
|
for _, envVar := range envVars {
|
|
if devs, ok := i.env[envVar]; ok {
|
|
isSet = true
|
|
for _, d := range strings.Split(devs, ",") {
|
|
trimmed := strings.TrimSpace(d)
|
|
if len(trimmed) == 0 {
|
|
continue
|
|
}
|
|
devices = append(devices, trimmed)
|
|
requested[trimmed] = true
|
|
}
|
|
}
|
|
}
|
|
|
|
// Environment variable unset with legacy image: default to "all".
|
|
if !isSet && len(devices) == 0 && i.IsLegacy() {
|
|
return NewVisibleDevices("all")
|
|
}
|
|
|
|
// Environment variable unset or empty or "void": return nil
|
|
if len(devices) == 0 || requested["void"] {
|
|
return NewVisibleDevices("void")
|
|
}
|
|
|
|
return NewVisibleDevices(devices...)
|
|
}
|
|
|
|
// GetDriverCapabilities returns the requested driver capabilities.
|
|
func (i CUDA) GetDriverCapabilities() DriverCapabilities {
|
|
env := i.env[EnvVarNvidiaDriverCapabilities]
|
|
|
|
capabilities := make(DriverCapabilities)
|
|
for _, c := range strings.Split(env, ",") {
|
|
capabilities[DriverCapability(c)] = true
|
|
}
|
|
|
|
return capabilities
|
|
}
|
|
|
|
func (i CUDA) legacyVersion() (string, error) {
|
|
cudaVersion := i.env[EnvVarCudaVersion]
|
|
majorMinor, err := parseMajorMinorVersion(cudaVersion)
|
|
if err != nil {
|
|
return "", fmt.Errorf("invalid CUDA version %v: %v", cudaVersion, err)
|
|
}
|
|
|
|
return majorMinor, nil
|
|
}
|
|
|
|
func parseMajorMinorVersion(version string) (string, error) {
|
|
vVersion := "v" + strings.TrimPrefix(version, "v")
|
|
|
|
if !semver.IsValid(vVersion) {
|
|
return "", fmt.Errorf("invalid version string")
|
|
}
|
|
|
|
majorMinor := strings.TrimPrefix(semver.MajorMinor(vVersion), "v")
|
|
parts := strings.Split(majorMinor, ".")
|
|
|
|
var err error
|
|
_, err = strconv.ParseUint(parts[0], 10, 32)
|
|
if err != nil {
|
|
return "", fmt.Errorf("invalid major version")
|
|
}
|
|
_, err = strconv.ParseUint(parts[1], 10, 32)
|
|
if err != nil {
|
|
return "", fmt.Errorf("invalid minor version")
|
|
}
|
|
return majorMinor, nil
|
|
}
|
|
|
|
// OnlyFullyQualifiedCDIDevices returns true if all devices requested in the image are requested as CDI devices/
|
|
func (i CUDA) OnlyFullyQualifiedCDIDevices() bool {
|
|
var hasCDIdevice bool
|
|
for _, device := range i.visibleDevicesFromEnvVar() {
|
|
if !parser.IsQualifiedName(device) {
|
|
return false
|
|
}
|
|
hasCDIdevice = true
|
|
}
|
|
|
|
for _, device := range i.DevicesFromMounts() {
|
|
if !strings.HasPrefix(device, "cdi/") {
|
|
return false
|
|
}
|
|
hasCDIdevice = true
|
|
}
|
|
return hasCDIdevice
|
|
}
|
|
|
|
// visibleEnvVars returns the environment variables that are used to determine device visibility.
|
|
// It returns the preferred environment variables that are set, or NVIDIA_VISIBLE_DEVICES if none are set.
|
|
func (i CUDA) visibleEnvVars() []string {
|
|
var envVars []string
|
|
for _, envVar := range i.preferredVisibleDeviceEnvVars {
|
|
if i.HasEnvvar(envVar) {
|
|
envVars = append(envVars, envVar)
|
|
}
|
|
}
|
|
if len(envVars) == 0 {
|
|
envVars = append(envVars, EnvVarNvidiaVisibleDevices)
|
|
}
|
|
return envVars
|
|
}
|
|
|
|
// VisibleDevices returns a list of devices requested in the container image.
|
|
// If volume mount requests are enabled these are returned if requested,
|
|
// otherwise device requests through environment variables are considered.
|
|
// In cases where environment variable requests required privileged containers,
|
|
// such devices requests are ignored.
|
|
func (i CUDA) VisibleDevices() []string {
|
|
// If enabled, try and get the device list from volume mounts first
|
|
if i.acceptDeviceListAsVolumeMounts {
|
|
volumeMountDeviceRequests := i.visibleDevicesFromMounts()
|
|
if len(volumeMountDeviceRequests) > 0 {
|
|
return volumeMountDeviceRequests
|
|
}
|
|
}
|
|
|
|
// Get the Fallback to reading from the environment variable if privileges are correct
|
|
envVarDeviceRequests := i.visibleDevicesFromEnvVar()
|
|
if len(envVarDeviceRequests) == 0 {
|
|
return nil
|
|
}
|
|
|
|
// If the container is privileged, or environment variable requests are
|
|
// allowed for unprivileged containers, these devices are returned.
|
|
if i.isPrivileged || i.acceptEnvvarUnprivileged {
|
|
return envVarDeviceRequests
|
|
}
|
|
|
|
// We log a warning if we are ignoring the environment variable requests.
|
|
envVars := i.visibleEnvVars()
|
|
if len(envVars) > 0 {
|
|
i.logger.Warningf("Ignoring devices requested by environment variable(s) in unprivileged container: %v", envVars)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// visibleDevicesFromEnvVar returns the set of visible devices requested through environment variables.
|
|
// If any of the preferredVisibleDeviceEnvVars are present in the image, they
|
|
// are used to determine the visible devices. If this is not the case, the
|
|
// NVIDIA_VISIBLE_DEVICES environment variable is used.
|
|
func (i CUDA) visibleDevicesFromEnvVar() []string {
|
|
envVars := i.visibleEnvVars()
|
|
return i.DevicesFromEnvvars(envVars...).List()
|
|
}
|
|
|
|
// visibleDevicesFromMounts returns the set of visible devices requested as mounts.
|
|
func (i CUDA) visibleDevicesFromMounts() []string {
|
|
var devices []string
|
|
for _, device := range i.DevicesFromMounts() {
|
|
switch {
|
|
case strings.HasPrefix(device, volumeMountDevicePrefixCDI):
|
|
continue
|
|
case strings.HasPrefix(device, volumeMountDevicePrefixImex):
|
|
continue
|
|
}
|
|
devices = append(devices, device)
|
|
}
|
|
return devices
|
|
}
|
|
|
|
// DevicesFromMounts returns a list of device specified as mounts.
|
|
func (i CUDA) DevicesFromMounts() []string {
|
|
root := filepath.Clean(DeviceListAsVolumeMountsRoot)
|
|
seen := make(map[string]bool)
|
|
var devices []string
|
|
for _, m := range i.mounts {
|
|
source := filepath.Clean(m.Source)
|
|
// Only consider mounts who's host volume is /dev/null
|
|
if source != "/dev/null" {
|
|
continue
|
|
}
|
|
|
|
destination := filepath.Clean(m.Destination)
|
|
if seen[destination] {
|
|
continue
|
|
}
|
|
seen[destination] = true
|
|
|
|
// Only consider container mount points that begin with 'root'
|
|
if !strings.HasPrefix(destination, root) {
|
|
continue
|
|
}
|
|
|
|
// Grab the full path beyond 'root' and add it to the list of devices
|
|
device := strings.Trim(strings.TrimPrefix(destination, root), "/")
|
|
if len(device) == 0 {
|
|
continue
|
|
}
|
|
devices = append(devices, device)
|
|
}
|
|
return devices
|
|
}
|
|
|
|
// CDIDevicesFromMounts returns a list of CDI devices specified as mounts on the image.
|
|
func (i CUDA) CDIDevicesFromMounts() []string {
|
|
var devices []string
|
|
for _, mountDevice := range i.DevicesFromMounts() {
|
|
if !strings.HasPrefix(mountDevice, volumeMountDevicePrefixCDI) {
|
|
continue
|
|
}
|
|
parts := strings.SplitN(strings.TrimPrefix(mountDevice, volumeMountDevicePrefixCDI), "/", 3)
|
|
if len(parts) != 3 {
|
|
continue
|
|
}
|
|
vendor := parts[0]
|
|
class := parts[1]
|
|
device := parts[2]
|
|
devices = append(devices, fmt.Sprintf("%s/%s=%s", vendor, class, device))
|
|
}
|
|
return devices
|
|
}
|
|
|
|
// ImexChannelsFromEnvVar returns the list of IMEX channels requested for the image.
|
|
func (i CUDA) ImexChannelsFromEnvVar() []string {
|
|
imexChannels := i.DevicesFromEnvvars(EnvVarNvidiaImexChannels).List()
|
|
if len(imexChannels) == 1 && imexChannels[0] == "all" {
|
|
return nil
|
|
}
|
|
return imexChannels
|
|
}
|
|
|
|
// ImexChannelsFromMounts returns the list of IMEX channels requested for the image.
|
|
func (i CUDA) ImexChannelsFromMounts() []string {
|
|
var channels []string
|
|
for _, mountDevice := range i.DevicesFromMounts() {
|
|
if !strings.HasPrefix(mountDevice, volumeMountDevicePrefixImex) {
|
|
continue
|
|
}
|
|
channels = append(channels, strings.TrimPrefix(mountDevice, volumeMountDevicePrefixImex))
|
|
}
|
|
return channels
|
|
}
|