/** # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. **/ package image import ( "fmt" "path/filepath" "strconv" "strings" "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/mod/semver" "tags.cncf.io/container-device-interface/pkg/parser" ) const ( DeviceListAsVolumeMountsRoot = "/var/run/nvidia-container-devices" volumeMountDevicePrefixCDI = "cdi/" volumeMountDevicePrefixImex = "imex/" ) // CUDA represents a CUDA image that can be used for GPU computing. This wraps // a map of environment variable to values that can be used to perform lookups // such as requirements. type CUDA struct { env map[string]string mounts []specs.Mount } // NewCUDAImageFromSpec creates a CUDA image from the input OCI runtime spec. // The process environment is read (if present) to construc the CUDA Image. func NewCUDAImageFromSpec(spec *specs.Spec) (CUDA, error) { var env []string if spec != nil && spec.Process != nil { env = spec.Process.Env } return New( WithEnv(env), WithMounts(spec.Mounts), ) } // NewCUDAImageFromEnv creates a CUDA image from the input environment. The environment // is a list of strings of the form ENVAR=VALUE. func NewCUDAImageFromEnv(env []string) (CUDA, error) { return New(WithEnv(env)) } // Getenv returns the value of the specified environment variable. // If the environment variable is not specified, an empty string is returned. func (i CUDA) Getenv(key string) string { return i.env[key] } // HasEnvvar checks whether the specified envvar is defined in the image. func (i CUDA) HasEnvvar(key string) bool { _, exists := i.env[key] return exists } // IsLegacy returns whether the associated CUDA image is a "legacy" image. An // image is considered legacy if it has a CUDA_VERSION environment variable defined // and no NVIDIA_REQUIRE_CUDA environment variable defined. func (i CUDA) IsLegacy() bool { legacyCudaVersion := i.env[EnvVarCudaVersion] cudaRequire := i.env[EnvVarNvidiaRequireCuda] return len(legacyCudaVersion) > 0 && len(cudaRequire) == 0 } // GetRequirements returns the requirements from all NVIDIA_REQUIRE_ environment // variables. func (i CUDA) GetRequirements() ([]string, error) { if i.HasDisableRequire() { return nil, nil } // All variables with the "NVIDIA_REQUIRE_" prefix are passed to nvidia-container-cli var requirements []string for name, value := range i.env { if strings.HasPrefix(name, NvidiaRequirePrefix) && !strings.HasPrefix(name, EnvVarNvidiaRequireJetpack) { requirements = append(requirements, value) } } if i.IsLegacy() { v, err := i.legacyVersion() if err != nil { return nil, fmt.Errorf("failed to get version: %v", err) } cudaRequire := fmt.Sprintf("cuda>=%s", v) requirements = append(requirements, cudaRequire) } return requirements, nil } // HasDisableRequire checks for the value of the NVIDIA_DISABLE_REQUIRE. If set // to a valid (true) boolean value this can be used to disable the requirement checks func (i CUDA) HasDisableRequire() bool { if disable, exists := i.env[EnvVarNvidiaDisableRequire]; exists { // i.logger.Debugf("NVIDIA_DISABLE_REQUIRE=%v; skipping requirement checks", disable) d, _ := strconv.ParseBool(disable) return d } return false } // DevicesFromEnvvars returns the devices requested by the image through environment variables func (i CUDA) DevicesFromEnvvars(envVars ...string) VisibleDevices { // We concantenate all the devices from the specified env. var isSet bool var devices []string requested := make(map[string]bool) for _, envVar := range envVars { if devs, ok := i.env[envVar]; ok { isSet = true for _, d := range strings.Split(devs, ",") { trimmed := strings.TrimSpace(d) if len(trimmed) == 0 { continue } devices = append(devices, trimmed) requested[trimmed] = true } } } // Environment variable unset with legacy image: default to "all". if !isSet && len(devices) == 0 && i.IsLegacy() { return NewVisibleDevices("all") } // Environment variable unset or empty or "void": return nil if len(devices) == 0 || requested["void"] { return NewVisibleDevices("void") } return NewVisibleDevices(devices...) } // GetDriverCapabilities returns the requested driver capabilities. func (i CUDA) GetDriverCapabilities() DriverCapabilities { env := i.env[EnvVarNvidiaDriverCapabilities] capabilities := make(DriverCapabilities) for _, c := range strings.Split(env, ",") { capabilities[DriverCapability(c)] = true } return capabilities } func (i CUDA) legacyVersion() (string, error) { cudaVersion := i.env[EnvVarCudaVersion] majorMinor, err := parseMajorMinorVersion(cudaVersion) if err != nil { return "", fmt.Errorf("invalid CUDA version %v: %v", cudaVersion, err) } return majorMinor, nil } func parseMajorMinorVersion(version string) (string, error) { vVersion := "v" + strings.TrimPrefix(version, "v") if !semver.IsValid(vVersion) { return "", fmt.Errorf("invalid version string") } majorMinor := strings.TrimPrefix(semver.MajorMinor(vVersion), "v") parts := strings.Split(majorMinor, ".") var err error _, err = strconv.ParseUint(parts[0], 10, 32) if err != nil { return "", fmt.Errorf("invalid major version") } _, err = strconv.ParseUint(parts[1], 10, 32) if err != nil { return "", fmt.Errorf("invalid minor version") } return majorMinor, nil } // OnlyFullyQualifiedCDIDevices returns true if all devices requested in the image are requested as CDI devices/ func (i CUDA) OnlyFullyQualifiedCDIDevices() bool { var hasCDIdevice bool for _, device := range i.VisibleDevicesFromEnvVar() { if !parser.IsQualifiedName(device) { return false } hasCDIdevice = true } for _, device := range i.DevicesFromMounts() { if !strings.HasPrefix(device, "cdi/") { return false } hasCDIdevice = true } return hasCDIdevice } // VisibleDevicesFromEnvVar returns the set of visible devices requested through // the NVIDIA_VISIBLE_DEVICES environment variable. func (i CUDA) VisibleDevicesFromEnvVar() []string { return i.DevicesFromEnvvars(EnvVarNvidiaVisibleDevices).List() } // VisibleDevicesFromMounts returns the set of visible devices requested as mounts. func (i CUDA) VisibleDevicesFromMounts() []string { var devices []string for _, device := range i.DevicesFromMounts() { switch { case strings.HasPrefix(device, volumeMountDevicePrefixCDI): continue case strings.HasPrefix(device, volumeMountDevicePrefixImex): continue } devices = append(devices, device) } return devices } // DevicesFromMounts returns a list of device specified as mounts. // TODO: This should be merged with getDevicesFromMounts used in the NVIDIA Container Runtime func (i CUDA) DevicesFromMounts() []string { root := filepath.Clean(DeviceListAsVolumeMountsRoot) seen := make(map[string]bool) var devices []string for _, m := range i.mounts { source := filepath.Clean(m.Source) // Only consider mounts who's host volume is /dev/null if source != "/dev/null" { continue } destination := filepath.Clean(m.Destination) if seen[destination] { continue } seen[destination] = true // Only consider container mount points that begin with 'root' if !strings.HasPrefix(destination, root) { continue } // Grab the full path beyond 'root' and add it to the list of devices device := strings.Trim(strings.TrimPrefix(destination, root), "/") if len(device) == 0 { continue } devices = append(devices, device) } return devices } // CDIDevicesFromMounts returns a list of CDI devices specified as mounts on the image. func (i CUDA) CDIDevicesFromMounts() []string { var devices []string for _, mountDevice := range i.DevicesFromMounts() { if !strings.HasPrefix(mountDevice, volumeMountDevicePrefixCDI) { continue } parts := strings.SplitN(strings.TrimPrefix(mountDevice, volumeMountDevicePrefixCDI), "/", 3) if len(parts) != 3 { continue } vendor := parts[0] class := parts[1] device := parts[2] devices = append(devices, fmt.Sprintf("%s/%s=%s", vendor, class, device)) } return devices } // ImexChannelsFromEnvVar returns the list of IMEX channels requested for the image. func (i CUDA) ImexChannelsFromEnvVar() []string { imexChannels := i.DevicesFromEnvvars(EnvVarNvidiaImexChannels).List() if len(imexChannels) == 1 && imexChannels[0] == "all" { return nil } return imexChannels } // ImexChannelsFromMounts returns the list of IMEX channels requested for the image. func (i CUDA) ImexChannelsFromMounts() []string { var channels []string for _, mountDevice := range i.DevicesFromMounts() { if !strings.HasPrefix(mountDevice, volumeMountDevicePrefixImex) { continue } channels = append(channels, strings.TrimPrefix(mountDevice, volumeMountDevicePrefixImex)) } return channels }