2022-04-26 10:07:01 +00:00
|
|
|
/**
|
|
|
|
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
|
|
|
#
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
**/
|
|
|
|
|
|
|
|
package image
|
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
2023-10-10 11:48:38 +00:00
|
|
|
"path/filepath"
|
2022-04-26 10:07:01 +00:00
|
|
|
"strconv"
|
|
|
|
"strings"
|
|
|
|
|
|
|
|
"github.com/opencontainers/runtime-spec/specs-go"
|
|
|
|
"golang.org/x/mod/semver"
|
2023-11-01 11:40:51 +00:00
|
|
|
"tags.cncf.io/container-device-interface/pkg/parser"
|
2022-04-26 10:07:01 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
// Names of the environment variables that are inspected on a CUDA image.
const (
	// envNVRequirePrefix is the common prefix of the variables that are
	// collected as requirements by GetRequirements.
	envNVRequirePrefix = "NVIDIA_REQUIRE_"
	// envNVRequireCUDA holds the CUDA requirement; IsLegacy treats an image
	// with a non-empty value for this variable as non-legacy.
	envNVRequireCUDA = envNVRequirePrefix + "CUDA"
	// envNVRequireJetpack is the name prefix of the requirement variables
	// that GetRequirements skips.
	envNVRequireJetpack = envNVRequirePrefix + "JETPACK"

	// envCUDAVersion holds the CUDA version that is used for legacy images.
	envCUDAVersion = "CUDA_VERSION"
	// envNVDisableRequire holds a boolean that, when true, disables the
	// requirements returned by GetRequirements.
	envNVDisableRequire = "NVIDIA_DISABLE_REQUIRE"
	// envNVDriverCapabilities holds the comma-separated list of requested
	// driver capabilities.
	envNVDriverCapabilities = "NVIDIA_DRIVER_CAPABILITIES"
)
|
|
|
|
|
|
|
|
// CUDA represents a CUDA image that can be used for GPU computing. This wraps
// a map of environment variable to values that can be used to perform lookups
// such as requirements.
type CUDA struct {
	// env maps environment variable names to their values.
	env map[string]string
	// mounts lists the mounts that DevicesFromMounts inspects for device
	// requests. NOTE(review): presumably populated through WithMounts from
	// the OCI spec mounts -- confirm against the option implementation.
	mounts []specs.Mount
}
|
2022-04-26 10:07:01 +00:00
|
|
|
|
|
|
|
// NewCUDAImageFromSpec creates a CUDA image from the input OCI runtime spec.
|
|
|
|
// The process environment is read (if present) to construc the CUDA Image.
|
|
|
|
func NewCUDAImageFromSpec(spec *specs.Spec) (CUDA, error) {
|
2023-07-07 10:34:43 +00:00
|
|
|
var env []string
|
|
|
|
if spec != nil && spec.Process != nil {
|
|
|
|
env = spec.Process.Env
|
2022-04-26 10:07:01 +00:00
|
|
|
}
|
|
|
|
|
2023-10-10 11:48:38 +00:00
|
|
|
return New(
|
|
|
|
WithEnv(env),
|
|
|
|
WithMounts(spec.Mounts),
|
|
|
|
)
|
2022-04-26 10:07:01 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// NewCUDAImageFromEnv creates a CUDA image from the input environment. The environment
|
|
|
|
// is a list of strings of the form ENVAR=VALUE.
|
|
|
|
func NewCUDAImageFromEnv(env []string) (CUDA, error) {
|
2023-07-07 10:34:43 +00:00
|
|
|
return New(WithEnv(env))
|
2022-04-26 10:07:01 +00:00
|
|
|
}
|
|
|
|
|
2023-10-10 11:48:38 +00:00
|
|
|
// Getenv returns the value of the specified environment variable.
|
|
|
|
// If the environment variable is not specified, an empty string is returned.
|
|
|
|
func (i CUDA) Getenv(key string) string {
|
|
|
|
return i.env[key]
|
|
|
|
}
|
|
|
|
|
|
|
|
// HasEnvvar checks whether the specified envvar is defined in the image.
|
|
|
|
func (i CUDA) HasEnvvar(key string) bool {
|
|
|
|
_, exists := i.env[key]
|
|
|
|
return exists
|
|
|
|
}
|
|
|
|
|
2022-04-26 10:07:01 +00:00
|
|
|
// IsLegacy returns whether the associated CUDA image is a "legacy" image. An
|
|
|
|
// image is considered legacy if it has a CUDA_VERSION environment variable defined
|
|
|
|
// and no NVIDIA_REQUIRE_CUDA environment variable defined.
|
|
|
|
func (i CUDA) IsLegacy() bool {
|
2023-10-10 11:48:38 +00:00
|
|
|
legacyCudaVersion := i.env[envCUDAVersion]
|
|
|
|
cudaRequire := i.env[envNVRequireCUDA]
|
2022-04-26 10:07:01 +00:00
|
|
|
return len(legacyCudaVersion) > 0 && len(cudaRequire) == 0
|
|
|
|
}
|
|
|
|
|
2022-05-06 11:22:34 +00:00
|
|
|
// GetRequirements returns the requirements from all NVIDIA_REQUIRE_ environment
|
2022-04-26 10:07:01 +00:00
|
|
|
// variables.
|
|
|
|
func (i CUDA) GetRequirements() ([]string, error) {
|
2023-07-07 10:34:43 +00:00
|
|
|
if i.HasDisableRequire() {
|
|
|
|
return nil, nil
|
|
|
|
}
|
2022-04-26 10:07:01 +00:00
|
|
|
|
|
|
|
// All variables with the "NVIDIA_REQUIRE_" prefix are passed to nvidia-container-cli
|
|
|
|
var requirements []string
|
2023-10-10 11:48:38 +00:00
|
|
|
for name, value := range i.env {
|
2022-05-24 07:34:35 +00:00
|
|
|
if strings.HasPrefix(name, envNVRequirePrefix) && !strings.HasPrefix(name, envNVRequireJetpack) {
|
2022-04-26 10:07:01 +00:00
|
|
|
requirements = append(requirements, value)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if i.IsLegacy() {
|
|
|
|
v, err := i.legacyVersion()
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to get version: %v", err)
|
|
|
|
}
|
|
|
|
cudaRequire := fmt.Sprintf("cuda>=%s", v)
|
|
|
|
requirements = append(requirements, cudaRequire)
|
|
|
|
}
|
|
|
|
return requirements, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// HasDisableRequire checks for the value of the NVIDIA_DISABLE_REQUIRE. If set
|
|
|
|
// to a valid (true) boolean value this can be used to disable the requirement checks
|
|
|
|
func (i CUDA) HasDisableRequire() bool {
|
2023-10-10 11:48:38 +00:00
|
|
|
if disable, exists := i.env[envNVDisableRequire]; exists {
|
2022-04-26 10:07:01 +00:00
|
|
|
// i.logger.Debugf("NVIDIA_DISABLE_REQUIRE=%v; skipping requirement checks", disable)
|
|
|
|
d, _ := strconv.ParseBool(disable)
|
|
|
|
return d
|
|
|
|
}
|
|
|
|
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
2022-06-29 15:33:05 +00:00
|
|
|
// DevicesFromEnvvars returns the devices requested by the image through environment variables
|
2022-10-26 10:37:23 +00:00
|
|
|
func (i CUDA) DevicesFromEnvvars(envVars ...string) VisibleDevices {
|
2023-10-10 11:48:38 +00:00
|
|
|
// We concantenate all the devices from the specified env.
|
2022-10-05 10:32:28 +00:00
|
|
|
var isSet bool
|
|
|
|
var devices []string
|
|
|
|
requested := make(map[string]bool)
|
2022-06-29 15:33:05 +00:00
|
|
|
for _, envVar := range envVars {
|
2023-10-10 11:48:38 +00:00
|
|
|
if devs, ok := i.env[envVar]; ok {
|
2022-10-05 10:32:28 +00:00
|
|
|
isSet = true
|
|
|
|
for _, d := range strings.Split(devs, ",") {
|
|
|
|
trimmed := strings.TrimSpace(d)
|
|
|
|
if len(trimmed) == 0 {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
devices = append(devices, trimmed)
|
|
|
|
requested[trimmed] = true
|
|
|
|
}
|
2022-06-29 15:33:05 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Environment variable unset with legacy image: default to "all".
|
2022-10-05 10:32:28 +00:00
|
|
|
if !isSet && len(devices) == 0 && i.IsLegacy() {
|
2022-11-14 11:19:59 +00:00
|
|
|
return NewVisibleDevices("all")
|
2022-06-29 15:33:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Environment variable unset or empty or "void": return nil
|
2022-10-05 10:32:28 +00:00
|
|
|
if len(devices) == 0 || requested["void"] {
|
2022-11-14 11:19:59 +00:00
|
|
|
return NewVisibleDevices("void")
|
2022-06-29 15:33:05 +00:00
|
|
|
}
|
|
|
|
|
2022-11-14 11:19:59 +00:00
|
|
|
return NewVisibleDevices(devices...)
|
2022-06-29 15:33:05 +00:00
|
|
|
}
|
|
|
|
|
2022-09-29 12:39:56 +00:00
|
|
|
// GetDriverCapabilities returns the requested driver capabilities.
|
|
|
|
func (i CUDA) GetDriverCapabilities() DriverCapabilities {
|
2023-10-10 11:48:38 +00:00
|
|
|
env := i.env[envNVDriverCapabilities]
|
2022-09-29 12:39:56 +00:00
|
|
|
|
2023-08-25 14:14:06 +00:00
|
|
|
capabilities := make(DriverCapabilities)
|
2022-09-29 12:39:56 +00:00
|
|
|
for _, c := range strings.Split(env, ",") {
|
2023-08-25 14:14:06 +00:00
|
|
|
capabilities[DriverCapability(c)] = true
|
2022-09-29 12:39:56 +00:00
|
|
|
}
|
|
|
|
|
2023-08-25 14:14:06 +00:00
|
|
|
return capabilities
|
2022-09-29 12:39:56 +00:00
|
|
|
}
|
|
|
|
|
2022-04-26 10:07:01 +00:00
|
|
|
func (i CUDA) legacyVersion() (string, error) {
|
2023-10-10 11:48:38 +00:00
|
|
|
cudaVersion := i.env[envCUDAVersion]
|
2023-07-07 10:38:38 +00:00
|
|
|
majorMinor, err := parseMajorMinorVersion(cudaVersion)
|
2022-04-26 10:07:01 +00:00
|
|
|
if err != nil {
|
2023-07-07 10:38:38 +00:00
|
|
|
return "", fmt.Errorf("invalid CUDA version %v: %v", cudaVersion, err)
|
2022-04-26 10:07:01 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return majorMinor, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func parseMajorMinorVersion(version string) (string, error) {
|
|
|
|
vVersion := "v" + strings.TrimPrefix(version, "v")
|
|
|
|
|
|
|
|
if !semver.IsValid(vVersion) {
|
|
|
|
return "", fmt.Errorf("invalid version string")
|
|
|
|
}
|
|
|
|
|
|
|
|
majorMinor := strings.TrimPrefix(semver.MajorMinor(vVersion), "v")
|
|
|
|
parts := strings.Split(majorMinor, ".")
|
|
|
|
|
|
|
|
var err error
|
|
|
|
_, err = strconv.ParseUint(parts[0], 10, 32)
|
|
|
|
if err != nil {
|
|
|
|
return "", fmt.Errorf("invalid major version")
|
|
|
|
}
|
|
|
|
_, err = strconv.ParseUint(parts[1], 10, 32)
|
|
|
|
if err != nil {
|
|
|
|
return "", fmt.Errorf("invalid minor version")
|
|
|
|
}
|
|
|
|
return majorMinor, nil
|
|
|
|
}
|
2023-10-10 11:48:38 +00:00
|
|
|
|
|
|
|
// OnlyFullyQualifiedCDIDevices returns true if all devices requested in the image are requested as CDI devices/
|
|
|
|
func (i CUDA) OnlyFullyQualifiedCDIDevices() bool {
|
|
|
|
var hasCDIdevice bool
|
|
|
|
for _, device := range i.DevicesFromEnvvars("NVIDIA_VISIBLE_DEVICES").List() {
|
|
|
|
if !parser.IsQualifiedName(device) {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
hasCDIdevice = true
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, device := range i.DevicesFromMounts() {
|
|
|
|
if !strings.HasPrefix(device, "cdi/") {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
hasCDIdevice = true
|
|
|
|
}
|
|
|
|
return hasCDIdevice
|
|
|
|
}
|
|
|
|
|
|
|
|
// deviceListAsVolumeMountsRoot is the container path below which mount
// destinations are interpreted as device requests by DevicesFromMounts.
const deviceListAsVolumeMountsRoot = "/var/run/nvidia-container-devices"
|
|
|
|
|
|
|
|
// DevicesFromMounts returns a list of device specified as mounts.
|
|
|
|
// TODO: This should be merged with getDevicesFromMounts used in the NVIDIA Container Runtime
|
|
|
|
func (i CUDA) DevicesFromMounts() []string {
|
|
|
|
root := filepath.Clean(deviceListAsVolumeMountsRoot)
|
|
|
|
seen := make(map[string]bool)
|
|
|
|
var devices []string
|
|
|
|
for _, m := range i.mounts {
|
|
|
|
source := filepath.Clean(m.Source)
|
|
|
|
// Only consider mounts who's host volume is /dev/null
|
|
|
|
if source != "/dev/null" {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
destination := filepath.Clean(m.Destination)
|
|
|
|
if seen[destination] {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
seen[destination] = true
|
|
|
|
|
|
|
|
// Only consider container mount points that begin with 'root'
|
|
|
|
if !strings.HasPrefix(destination, root) {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
// Grab the full path beyond 'root' and add it to the list of devices
|
|
|
|
device := strings.Trim(strings.TrimPrefix(destination, root), "/")
|
|
|
|
if len(device) == 0 {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
devices = append(devices, device)
|
|
|
|
}
|
|
|
|
return devices
|
|
|
|
}
|
|
|
|
|
|
|
|
// CDIDevicesFromMounts returns a list of CDI devices specified as mounts on the image.
|
|
|
|
func (i CUDA) CDIDevicesFromMounts() []string {
|
|
|
|
var devices []string
|
|
|
|
for _, mountDevice := range i.DevicesFromMounts() {
|
|
|
|
if !strings.HasPrefix(mountDevice, "cdi/") {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
parts := strings.SplitN(strings.TrimPrefix(mountDevice, "cdi/"), "/", 3)
|
|
|
|
if len(parts) != 3 {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
vendor := parts[0]
|
|
|
|
class := parts[1]
|
|
|
|
device := parts[2]
|
|
|
|
devices = append(devices, fmt.Sprintf("%s/%s=%s", vendor, class, device))
|
|
|
|
}
|
|
|
|
return devices
|
|
|
|
}
|