mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2024-11-22 00:08:11 +00:00
Merge pull request #740 from elezar/imex-by-volume-mount
Allow IMEX channel requests by volume mount
This commit is contained in:
commit
2987c4d670
@ -6,8 +6,6 @@ import (
|
||||
"log"
|
||||
"os"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/opencontainers/runtime-spec/specs-go"
|
||||
"golang.org/x/mod/semver"
|
||||
@ -15,31 +13,15 @@ import (
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
|
||||
)
|
||||
|
||||
const (
|
||||
envCUDAVersion = "CUDA_VERSION"
|
||||
envNVRequirePrefix = "NVIDIA_REQUIRE_"
|
||||
envNVRequireCUDA = envNVRequirePrefix + "CUDA"
|
||||
envNVDisableRequire = "NVIDIA_DISABLE_REQUIRE"
|
||||
envNVVisibleDevices = "NVIDIA_VISIBLE_DEVICES"
|
||||
envNVMigConfigDevices = "NVIDIA_MIG_CONFIG_DEVICES"
|
||||
envNVMigMonitorDevices = "NVIDIA_MIG_MONITOR_DEVICES"
|
||||
envNVImexChannels = "NVIDIA_IMEX_CHANNELS"
|
||||
envNVDriverCapabilities = "NVIDIA_DRIVER_CAPABILITIES"
|
||||
)
|
||||
|
||||
const (
|
||||
capSysAdmin = "CAP_SYS_ADMIN"
|
||||
)
|
||||
|
||||
const (
|
||||
deviceListAsVolumeMountsRoot = "/var/run/nvidia-container-devices"
|
||||
)
|
||||
|
||||
type nvidiaConfig struct {
|
||||
Devices string
|
||||
Devices []string
|
||||
MigConfigDevices string
|
||||
MigMonitorDevices string
|
||||
ImexChannels string
|
||||
ImexChannels []string
|
||||
DriverCapabilities string
|
||||
// Requirements defines the requirements DSL for the container to run.
|
||||
// This is empty if no specific requirements are needed, or if requirements are
|
||||
@ -77,23 +59,14 @@ type LinuxCapabilities struct {
|
||||
Ambient []string `json:"ambient,omitempty" platform:"linux"`
|
||||
}
|
||||
|
||||
// Mount from OCI runtime spec
|
||||
// https://github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L103
|
||||
type Mount struct {
|
||||
Destination string `json:"destination"`
|
||||
Type string `json:"type,omitempty" platform:"linux,solaris"`
|
||||
Source string `json:"source,omitempty"`
|
||||
Options []string `json:"options,omitempty"`
|
||||
}
|
||||
|
||||
// Spec from OCI runtime spec
|
||||
// We use pointers to structs, similarly to the latest version of runtime-spec:
|
||||
// https://github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L5-L28
|
||||
type Spec struct {
|
||||
Version *string `json:"ociVersion"`
|
||||
Process *Process `json:"process,omitempty"`
|
||||
Root *Root `json:"root,omitempty"`
|
||||
Mounts []Mount `json:"mounts,omitempty"`
|
||||
Version *string `json:"ociVersion"`
|
||||
Process *Process `json:"process,omitempty"`
|
||||
Root *Root `json:"root,omitempty"`
|
||||
Mounts []specs.Mount `json:"mounts,omitempty"`
|
||||
}
|
||||
|
||||
// HookState holds state information about the hook
|
||||
@ -172,82 +145,30 @@ func isPrivileged(s *Spec) bool {
|
||||
return image.IsPrivileged(&fullSpec)
|
||||
}
|
||||
|
||||
func getDevicesFromEnvvar(image image.CUDA, swarmResourceEnvvars []string) *string {
|
||||
func getDevicesFromEnvvar(containerImage image.CUDA, swarmResourceEnvvars []string) []string {
|
||||
// We check if the image has at least one of the Swarm resource envvars defined and use this
|
||||
// if specified.
|
||||
var hasSwarmEnvvar bool
|
||||
for _, envvar := range swarmResourceEnvvars {
|
||||
if image.HasEnvvar(envvar) {
|
||||
hasSwarmEnvvar = true
|
||||
break
|
||||
if containerImage.HasEnvvar(envvar) {
|
||||
return containerImage.DevicesFromEnvvars(swarmResourceEnvvars...).List()
|
||||
}
|
||||
}
|
||||
|
||||
var devices []string
|
||||
if hasSwarmEnvvar {
|
||||
devices = image.DevicesFromEnvvars(swarmResourceEnvvars...).List()
|
||||
} else {
|
||||
devices = image.DevicesFromEnvvars(envNVVisibleDevices).List()
|
||||
}
|
||||
|
||||
if len(devices) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
devicesString := strings.Join(devices, ",")
|
||||
|
||||
return &devicesString
|
||||
return containerImage.VisibleDevicesFromEnvVar()
|
||||
}
|
||||
|
||||
func getDevicesFromMounts(mounts []Mount) *string {
|
||||
var devices []string
|
||||
for _, m := range mounts {
|
||||
root := filepath.Clean(deviceListAsVolumeMountsRoot)
|
||||
source := filepath.Clean(m.Source)
|
||||
destination := filepath.Clean(m.Destination)
|
||||
|
||||
// Only consider mounts whose host volume is /dev/null
|
||||
if source != "/dev/null" {
|
||||
continue
|
||||
}
|
||||
// Only consider container mount points that begin with 'root'
|
||||
if len(destination) < len(root) {
|
||||
continue
|
||||
}
|
||||
if destination[:len(root)] != root {
|
||||
continue
|
||||
}
|
||||
// Grab the full path beyond 'root' and add it to the list of devices
|
||||
device := destination[len(root):]
|
||||
if len(device) > 0 && device[0] == '/' {
|
||||
device = device[1:]
|
||||
}
|
||||
if len(device) == 0 {
|
||||
continue
|
||||
}
|
||||
devices = append(devices, device)
|
||||
}
|
||||
|
||||
if devices == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
ret := strings.Join(devices, ",")
|
||||
return &ret
|
||||
}
|
||||
|
||||
func getDevices(hookConfig *HookConfig, image image.CUDA, mounts []Mount, privileged bool) *string {
|
||||
func getDevices(hookConfig *HookConfig, image image.CUDA, privileged bool) []string {
|
||||
// If enabled, try and get the device list from volume mounts first
|
||||
if hookConfig.AcceptDeviceListAsVolumeMounts {
|
||||
devices := getDevicesFromMounts(mounts)
|
||||
if devices != nil {
|
||||
devices := image.VisibleDevicesFromMounts()
|
||||
if len(devices) > 0 {
|
||||
return devices
|
||||
}
|
||||
}
|
||||
|
||||
// Fall back to reading from the environment variable if privileges are correct
|
||||
devices := getDevicesFromEnvvar(image, hookConfig.getSwarmResourceEnvvars())
|
||||
if devices == nil {
|
||||
if len(devices) == 0 {
|
||||
return nil
|
||||
}
|
||||
if privileged || hookConfig.AcceptEnvvarUnprivileged {
|
||||
@ -260,12 +181,12 @@ func getDevices(hookConfig *HookConfig, image image.CUDA, mounts []Mount, privil
|
||||
return nil
|
||||
}
|
||||
|
||||
func getMigConfigDevices(image image.CUDA) *string {
|
||||
return getMigDevices(image, envNVMigConfigDevices)
|
||||
func getMigConfigDevices(i image.CUDA) *string {
|
||||
return getMigDevices(i, image.EnvVarNvidiaMigConfigDevices)
|
||||
}
|
||||
|
||||
func getMigMonitorDevices(image image.CUDA) *string {
|
||||
return getMigDevices(image, envNVMigMonitorDevices)
|
||||
func getMigMonitorDevices(i image.CUDA) *string {
|
||||
return getMigDevices(i, image.EnvVarNvidiaMigMonitorDevices)
|
||||
}
|
||||
|
||||
func getMigDevices(image image.CUDA, envvar string) *string {
|
||||
@ -276,12 +197,24 @@ func getMigDevices(image image.CUDA, envvar string) *string {
|
||||
return &devices
|
||||
}
|
||||
|
||||
func getImexChannels(image image.CUDA) *string {
|
||||
if !image.HasEnvvar(envNVImexChannels) {
|
||||
func getImexChannels(hookConfig *HookConfig, image image.CUDA, privileged bool) []string {
|
||||
// If enabled, try to get the list of IMEX channels from volume mounts first
|
||||
if hookConfig.AcceptDeviceListAsVolumeMounts {
|
||||
devices := image.ImexChannelsFromMounts()
|
||||
if len(devices) > 0 {
|
||||
return devices
|
||||
}
|
||||
}
|
||||
devices := image.ImexChannelsFromEnvVar()
|
||||
if len(devices) == 0 {
|
||||
return nil
|
||||
}
|
||||
chans := image.Getenv(envNVImexChannels)
|
||||
return &chans
|
||||
|
||||
if privileged || hookConfig.AcceptEnvvarUnprivileged {
|
||||
return devices
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *HookConfig) getDriverCapabilities(cudaImage image.CUDA, legacyImage bool) image.DriverCapabilities {
|
||||
@ -291,8 +224,8 @@ func (c *HookConfig) getDriverCapabilities(cudaImage image.CUDA, legacyImage boo
|
||||
|
||||
capabilities := supportedDriverCapabilities.Intersection(image.DefaultDriverCapabilities)
|
||||
|
||||
capsEnvSpecified := cudaImage.HasEnvvar(envNVDriverCapabilities)
|
||||
capsEnv := cudaImage.Getenv(envNVDriverCapabilities)
|
||||
capsEnvSpecified := cudaImage.HasEnvvar(image.EnvVarNvidiaDriverCapabilities)
|
||||
capsEnv := cudaImage.Getenv(image.EnvVarNvidiaDriverCapabilities)
|
||||
|
||||
if !capsEnvSpecified && legacyImage {
|
||||
// Environment variable unset with legacy image: set all capabilities.
|
||||
@ -311,14 +244,12 @@ func (c *HookConfig) getDriverCapabilities(cudaImage image.CUDA, legacyImage boo
|
||||
return capabilities
|
||||
}
|
||||
|
||||
func getNvidiaConfig(hookConfig *HookConfig, image image.CUDA, mounts []Mount, privileged bool) *nvidiaConfig {
|
||||
func getNvidiaConfig(hookConfig *HookConfig, image image.CUDA, privileged bool) *nvidiaConfig {
|
||||
legacyImage := image.IsLegacy()
|
||||
|
||||
var devices string
|
||||
if d := getDevices(hookConfig, image, mounts, privileged); d != nil {
|
||||
devices = *d
|
||||
} else {
|
||||
// 'nil' devices means this is not a GPU container.
|
||||
devices := getDevices(hookConfig, image, privileged)
|
||||
if len(devices) == 0 {
|
||||
// empty devices means this is not a GPU container.
|
||||
return nil
|
||||
}
|
||||
|
||||
@ -338,10 +269,7 @@ func getNvidiaConfig(hookConfig *HookConfig, image image.CUDA, mounts []Mount, p
|
||||
log.Panicln("cannot set MIG_MONITOR_DEVICES in non privileged container")
|
||||
}
|
||||
|
||||
var imexChannels string
|
||||
if c := getImexChannels(image); c != nil {
|
||||
imexChannels = *c
|
||||
}
|
||||
imexChannels := getImexChannels(hookConfig, image, privileged)
|
||||
|
||||
driverCapabilities := hookConfig.getDriverCapabilities(image, legacyImage).String()
|
||||
|
||||
@ -376,6 +304,7 @@ func getContainerConfig(hook HookConfig) (config containerConfig) {
|
||||
|
||||
image, err := image.New(
|
||||
image.WithEnv(s.Process.Env),
|
||||
image.WithMounts(s.Mounts),
|
||||
image.WithDisableRequire(hook.DisableRequire),
|
||||
)
|
||||
if err != nil {
|
||||
@ -387,6 +316,6 @@ func getContainerConfig(hook HookConfig) (config containerConfig) {
|
||||
Pid: h.Pid,
|
||||
Rootfs: s.Root.Path,
|
||||
Image: image,
|
||||
Nvidia: getNvidiaConfig(&hook, image, s.Mounts, privileged),
|
||||
Nvidia: getNvidiaConfig(&hook, image, privileged),
|
||||
}
|
||||
}
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -120,8 +120,8 @@ func doPrestart() {
|
||||
if cli.NoCgroups {
|
||||
args = append(args, "--no-cgroups")
|
||||
}
|
||||
if len(nvidia.Devices) > 0 {
|
||||
args = append(args, fmt.Sprintf("--device=%s", nvidia.Devices))
|
||||
if devicesString := strings.Join(nvidia.Devices, ","); len(devicesString) > 0 {
|
||||
args = append(args, fmt.Sprintf("--device=%s", devicesString))
|
||||
}
|
||||
if len(nvidia.MigConfigDevices) > 0 {
|
||||
args = append(args, fmt.Sprintf("--mig-config=%s", nvidia.MigConfigDevices))
|
||||
@ -129,8 +129,8 @@ func doPrestart() {
|
||||
if len(nvidia.MigMonitorDevices) > 0 {
|
||||
args = append(args, fmt.Sprintf("--mig-monitor=%s", nvidia.MigMonitorDevices))
|
||||
}
|
||||
if len(nvidia.ImexChannels) > 0 {
|
||||
args = append(args, fmt.Sprintf("--imex-channel=%s", nvidia.ImexChannels))
|
||||
if imexString := strings.Join(nvidia.ImexChannels, ","); len(imexString) > 0 {
|
||||
args = append(args, fmt.Sprintf("--imex-channel=%s", imexString))
|
||||
}
|
||||
|
||||
for _, cap := range strings.Split(nvidia.DriverCapabilities, ",") {
|
||||
|
@ -47,7 +47,7 @@ func New(opt ...Option) (CUDA, error) {
|
||||
// build creates a CUDA image from the builder.
|
||||
func (b builder) build() (CUDA, error) {
|
||||
if b.disableRequire {
|
||||
b.env[envNVDisableRequire] = "true"
|
||||
b.env[EnvVarNvidiaDisableRequire] = "true"
|
||||
}
|
||||
|
||||
c := CUDA{
|
||||
|
@ -28,12 +28,10 @@ import (
|
||||
)
|
||||
|
||||
const (
|
||||
envCUDAVersion = "CUDA_VERSION"
|
||||
envNVRequirePrefix = "NVIDIA_REQUIRE_"
|
||||
envNVRequireCUDA = envNVRequirePrefix + "CUDA"
|
||||
envNVRequireJetpack = envNVRequirePrefix + "JETPACK"
|
||||
envNVDisableRequire = "NVIDIA_DISABLE_REQUIRE"
|
||||
envNVDriverCapabilities = "NVIDIA_DRIVER_CAPABILITIES"
|
||||
DeviceListAsVolumeMountsRoot = "/var/run/nvidia-container-devices"
|
||||
|
||||
volumeMountDevicePrefixCDI = "cdi/"
|
||||
volumeMountDevicePrefixImex = "imex/"
|
||||
)
|
||||
|
||||
// CUDA represents a CUDA image that can be used for GPU computing. This wraps
|
||||
@ -80,8 +78,8 @@ func (i CUDA) HasEnvvar(key string) bool {
|
||||
// image is considered legacy if it has a CUDA_VERSION environment variable defined
|
||||
// and no NVIDIA_REQUIRE_CUDA environment variable defined.
|
||||
func (i CUDA) IsLegacy() bool {
|
||||
legacyCudaVersion := i.env[envCUDAVersion]
|
||||
cudaRequire := i.env[envNVRequireCUDA]
|
||||
legacyCudaVersion := i.env[EnvVarCudaVersion]
|
||||
cudaRequire := i.env[EnvVarNvidiaRequireCuda]
|
||||
return len(legacyCudaVersion) > 0 && len(cudaRequire) == 0
|
||||
}
|
||||
|
||||
@ -95,7 +93,7 @@ func (i CUDA) GetRequirements() ([]string, error) {
|
||||
// All variables with the "NVIDIA_REQUIRE_" prefix are passed to nvidia-container-cli
|
||||
var requirements []string
|
||||
for name, value := range i.env {
|
||||
if strings.HasPrefix(name, envNVRequirePrefix) && !strings.HasPrefix(name, envNVRequireJetpack) {
|
||||
if strings.HasPrefix(name, NvidiaRequirePrefix) && !strings.HasPrefix(name, EnvVarNvidiaRequireJetpack) {
|
||||
requirements = append(requirements, value)
|
||||
}
|
||||
}
|
||||
@ -113,7 +111,7 @@ func (i CUDA) GetRequirements() ([]string, error) {
|
||||
// HasDisableRequire checks for the value of the NVIDIA_DISABLE_REQUIRE. If set
|
||||
// to a valid (true) boolean value this can be used to disable the requirement checks
|
||||
func (i CUDA) HasDisableRequire() bool {
|
||||
if disable, exists := i.env[envNVDisableRequire]; exists {
|
||||
if disable, exists := i.env[EnvVarNvidiaDisableRequire]; exists {
|
||||
// i.logger.Debugf("NVIDIA_DISABLE_REQUIRE=%v; skipping requirement checks", disable)
|
||||
d, _ := strconv.ParseBool(disable)
|
||||
return d
|
||||
@ -157,7 +155,7 @@ func (i CUDA) DevicesFromEnvvars(envVars ...string) VisibleDevices {
|
||||
|
||||
// GetDriverCapabilities returns the requested driver capabilities.
|
||||
func (i CUDA) GetDriverCapabilities() DriverCapabilities {
|
||||
env := i.env[envNVDriverCapabilities]
|
||||
env := i.env[EnvVarNvidiaDriverCapabilities]
|
||||
|
||||
capabilities := make(DriverCapabilities)
|
||||
for _, c := range strings.Split(env, ",") {
|
||||
@ -168,7 +166,7 @@ func (i CUDA) GetDriverCapabilities() DriverCapabilities {
|
||||
}
|
||||
|
||||
func (i CUDA) legacyVersion() (string, error) {
|
||||
cudaVersion := i.env[envCUDAVersion]
|
||||
cudaVersion := i.env[EnvVarCudaVersion]
|
||||
majorMinor, err := parseMajorMinorVersion(cudaVersion)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("invalid CUDA version %v: %v", cudaVersion, err)
|
||||
@ -202,7 +200,7 @@ func parseMajorMinorVersion(version string) (string, error) {
|
||||
// OnlyFullyQualifiedCDIDevices returns true if all devices requested in the image are requested as CDI devices.
|
||||
func (i CUDA) OnlyFullyQualifiedCDIDevices() bool {
|
||||
var hasCDIdevice bool
|
||||
for _, device := range i.DevicesFromEnvvars("NVIDIA_VISIBLE_DEVICES").List() {
|
||||
for _, device := range i.VisibleDevicesFromEnvVar() {
|
||||
if !parser.IsQualifiedName(device) {
|
||||
return false
|
||||
}
|
||||
@ -218,14 +216,31 @@ func (i CUDA) OnlyFullyQualifiedCDIDevices() bool {
|
||||
return hasCDIdevice
|
||||
}
|
||||
|
||||
const (
|
||||
deviceListAsVolumeMountsRoot = "/var/run/nvidia-container-devices"
|
||||
)
|
||||
// VisibleDevicesFromEnvVar returns the set of visible devices requested through
|
||||
// the NVIDIA_VISIBLE_DEVICES environment variable.
|
||||
func (i CUDA) VisibleDevicesFromEnvVar() []string {
|
||||
return i.DevicesFromEnvvars(EnvVarNvidiaVisibleDevices).List()
|
||||
}
|
||||
|
||||
// VisibleDevicesFromMounts returns the set of visible devices requested as mounts.
|
||||
func (i CUDA) VisibleDevicesFromMounts() []string {
|
||||
var devices []string
|
||||
for _, device := range i.DevicesFromMounts() {
|
||||
switch {
|
||||
case strings.HasPrefix(device, volumeMountDevicePrefixCDI):
|
||||
continue
|
||||
case strings.HasPrefix(device, volumeMountDevicePrefixImex):
|
||||
continue
|
||||
}
|
||||
devices = append(devices, device)
|
||||
}
|
||||
return devices
|
||||
}
|
||||
|
||||
// DevicesFromMounts returns a list of device specified as mounts.
|
||||
// TODO: This should be merged with getDevicesFromMounts used in the NVIDIA Container Runtime
|
||||
func (i CUDA) DevicesFromMounts() []string {
|
||||
root := filepath.Clean(deviceListAsVolumeMountsRoot)
|
||||
root := filepath.Clean(DeviceListAsVolumeMountsRoot)
|
||||
seen := make(map[string]bool)
|
||||
var devices []string
|
||||
for _, m := range i.mounts {
|
||||
@ -260,10 +275,10 @@ func (i CUDA) DevicesFromMounts() []string {
|
||||
func (i CUDA) CDIDevicesFromMounts() []string {
|
||||
var devices []string
|
||||
for _, mountDevice := range i.DevicesFromMounts() {
|
||||
if !strings.HasPrefix(mountDevice, "cdi/") {
|
||||
if !strings.HasPrefix(mountDevice, volumeMountDevicePrefixCDI) {
|
||||
continue
|
||||
}
|
||||
parts := strings.SplitN(strings.TrimPrefix(mountDevice, "cdi/"), "/", 3)
|
||||
parts := strings.SplitN(strings.TrimPrefix(mountDevice, volumeMountDevicePrefixCDI), "/", 3)
|
||||
if len(parts) != 3 {
|
||||
continue
|
||||
}
|
||||
@ -275,6 +290,19 @@ func (i CUDA) CDIDevicesFromMounts() []string {
|
||||
return devices
|
||||
}
|
||||
|
||||
func (i CUDA) IsEnabled(envvar string) bool {
|
||||
return i.Getenv(envvar) == "enabled"
|
||||
// ImexChannelsFromEnvVar returns the list of IMEX channels requested through
// the NVIDIA_IMEX_CHANNELS environment variable.
|
||||
func (i CUDA) ImexChannelsFromEnvVar() []string {
|
||||
return i.DevicesFromEnvvars(EnvVarNvidiaImexChannels).List()
|
||||
}
|
||||
|
||||
// ImexChannelsFromMounts returns the list of IMEX channels requested as volume
// mounts, i.e. mount-derived device names carrying the "imex/" prefix (with the
// prefix removed).
|
||||
func (i CUDA) ImexChannelsFromMounts() []string {
|
||||
var channels []string
|
||||
for _, mountDevice := range i.DevicesFromMounts() {
|
||||
if !strings.HasPrefix(mountDevice, volumeMountDevicePrefixImex) {
|
||||
continue
|
||||
}
|
||||
channels = append(channels, strings.TrimPrefix(mountDevice, volumeMountDevicePrefixImex))
|
||||
}
|
||||
return channels
|
||||
}
|
||||
|
@ -17,8 +17,10 @@
|
||||
package image
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"github.com/opencontainers/runtime-spec/specs-go"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
@ -130,3 +132,85 @@ func TestGetRequirements(t *testing.T) {
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetVisibleDevicesFromMounts(t *testing.T) {
|
||||
var tests = []struct {
|
||||
description string
|
||||
mounts []specs.Mount
|
||||
expectedDevices []string
|
||||
}{
|
||||
{
|
||||
description: "No mounts",
|
||||
mounts: nil,
|
||||
expectedDevices: nil,
|
||||
},
|
||||
{
|
||||
description: "Host path is not /dev/null",
|
||||
mounts: []specs.Mount{
|
||||
{
|
||||
Source: "/not/dev/null",
|
||||
Destination: filepath.Join(DeviceListAsVolumeMountsRoot, "GPU0"),
|
||||
},
|
||||
},
|
||||
expectedDevices: nil,
|
||||
},
|
||||
{
|
||||
description: "Container path is not prefixed by 'root'",
|
||||
mounts: []specs.Mount{
|
||||
{
|
||||
Source: "/dev/null",
|
||||
Destination: filepath.Join("/other/prefix", "GPU0"),
|
||||
},
|
||||
},
|
||||
expectedDevices: nil,
|
||||
},
|
||||
{
|
||||
description: "Container path is only 'root'",
|
||||
mounts: []specs.Mount{
|
||||
{
|
||||
Source: "/dev/null",
|
||||
Destination: DeviceListAsVolumeMountsRoot,
|
||||
},
|
||||
},
|
||||
expectedDevices: nil,
|
||||
},
|
||||
{
|
||||
description: "Discover 2 devices",
|
||||
mounts: makeTestMounts("GPU0", "GPU1"),
|
||||
expectedDevices: []string{"GPU0", "GPU1"},
|
||||
},
|
||||
{
|
||||
description: "Discover 2 devices with slashes in the name",
|
||||
mounts: makeTestMounts("GPU0-MIG0/0/1", "GPU1-MIG0/0/1"),
|
||||
expectedDevices: []string{"GPU0-MIG0/0/1", "GPU1-MIG0/0/1"},
|
||||
},
|
||||
{
|
||||
description: "cdi devices are ignored",
|
||||
mounts: makeTestMounts("GPU0", "cdi/nvidia.com/gpu=all", "GPU1"),
|
||||
expectedDevices: []string{"GPU0", "GPU1"},
|
||||
},
|
||||
{
|
||||
description: "imex devices are ignored",
|
||||
mounts: makeTestMounts("GPU0", "imex/0", "GPU1"),
|
||||
expectedDevices: []string{"GPU0", "GPU1"},
|
||||
},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
t.Run(tc.description, func(t *testing.T) {
|
||||
image, _ := New(WithMounts(tc.mounts))
|
||||
require.Equal(t, tc.expectedDevices, image.VisibleDevicesFromMounts())
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func makeTestMounts(paths ...string) []specs.Mount {
|
||||
var mounts []specs.Mount
|
||||
for _, path := range paths {
|
||||
mount := specs.Mount{
|
||||
Source: "/dev/null",
|
||||
Destination: filepath.Join(DeviceListAsVolumeMountsRoot, path),
|
||||
}
|
||||
mounts = append(mounts, mount)
|
||||
}
|
||||
return mounts
|
||||
}
|
||||
|
31
internal/config/image/envvars.go
Normal file
31
internal/config/image/envvars.go
Normal file
@ -0,0 +1,31 @@
|
||||
/**
|
||||
# Copyright 2024 NVIDIA CORPORATION
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
**/
|
||||
|
||||
package image
|
||||
|
||||
const (
|
||||
EnvVarCudaVersion = "CUDA_VERSION"
|
||||
EnvVarNvidiaDisableRequire = "NVIDIA_DISABLE_REQUIRE"
|
||||
EnvVarNvidiaDriverCapabilities = "NVIDIA_DRIVER_CAPABILITIES"
|
||||
EnvVarNvidiaImexChannels = "NVIDIA_IMEX_CHANNELS"
|
||||
EnvVarNvidiaMigConfigDevices = "NVIDIA_MIG_CONFIG_DEVICES"
|
||||
EnvVarNvidiaMigMonitorDevices = "NVIDIA_MIG_MONITOR_DEVICES"
|
||||
EnvVarNvidiaRequireCuda = NvidiaRequirePrefix + "CUDA"
|
||||
EnvVarNvidiaRequireJetpack = NvidiaRequirePrefix + "JETPACK"
|
||||
EnvVarNvidiaVisibleDevices = "NVIDIA_VISIBLE_DEVICES"
|
||||
|
||||
NvidiaRequirePrefix = "NVIDIA_REQUIRE_"
|
||||
)
|
@ -90,11 +90,9 @@ func getDevicesFromSpec(logger logger.Interface, ociSpec oci.Spec, cfg *config.C
|
||||
}
|
||||
}
|
||||
|
||||
envDevices := container.DevicesFromEnvvars(visibleDevicesEnvvar)
|
||||
|
||||
var devices []string
|
||||
seen := make(map[string]bool)
|
||||
for _, name := range envDevices.List() {
|
||||
for _, name := range container.VisibleDevicesFromEnvVar() {
|
||||
if !parser.IsQualifiedName(name) {
|
||||
name = fmt.Sprintf("%s=%s", cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.DefaultKind, name)
|
||||
}
|
||||
|
@ -30,23 +30,16 @@ import (
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi"
|
||||
)
|
||||
|
||||
const (
|
||||
visibleDevicesEnvvar = "NVIDIA_VISIBLE_DEVICES"
|
||||
visibleDevicesVoid = "void"
|
||||
|
||||
nvidiaRequireJetpackEnvvar = "NVIDIA_REQUIRE_JETPACK"
|
||||
)
|
||||
|
||||
// NewCSVModifier creates a modifier that applies modifications to an OCI spec if required by the runtime wrapper.
|
||||
// The modifications are defined by CSV MountSpecs.
|
||||
func NewCSVModifier(logger logger.Interface, cfg *config.Config, image image.CUDA) (oci.SpecModifier, error) {
|
||||
if devices := image.DevicesFromEnvvars(visibleDevicesEnvvar); len(devices.List()) == 0 {
|
||||
func NewCSVModifier(logger logger.Interface, cfg *config.Config, container image.CUDA) (oci.SpecModifier, error) {
|
||||
if devices := container.VisibleDevicesFromEnvVar(); len(devices) == 0 {
|
||||
logger.Infof("No modification required; no devices requested")
|
||||
return nil, nil
|
||||
}
|
||||
logger.Infof("Constructing modifier from config: %+v", *cfg)
|
||||
|
||||
if err := checkRequirements(logger, image); err != nil {
|
||||
if err := checkRequirements(logger, container); err != nil {
|
||||
return nil, fmt.Errorf("requirements not met: %v", err)
|
||||
}
|
||||
|
||||
@ -55,7 +48,7 @@ func NewCSVModifier(logger logger.Interface, cfg *config.Config, image image.CUD
|
||||
return nil, fmt.Errorf("failed to get list of CSV files: %v", err)
|
||||
}
|
||||
|
||||
if image.Getenv(nvidiaRequireJetpackEnvvar) != "csv-mounts=all" {
|
||||
if container.Getenv(image.EnvVarNvidiaRequireJetpack) != "csv-mounts=all" {
|
||||
csvFiles = csv.BaseFilesOnly(csvFiles)
|
||||
}
|
||||
|
||||
|
@ -36,7 +36,7 @@ import (
|
||||
//
|
||||
// If no devices are selected, no changes are made.
|
||||
func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image image.CUDA) (oci.SpecModifier, error) {
|
||||
if devices := image.DevicesFromEnvvars(visibleDevicesEnvvar); len(devices.List()) == 0 {
|
||||
if devices := image.VisibleDevicesFromEnvVar(); len(devices) == 0 {
|
||||
logger.Infof("No modification required; no devices requested")
|
||||
return nil, nil
|
||||
}
|
||||
|
@ -29,8 +29,8 @@ import (
|
||||
|
||||
// NewGraphicsModifier constructs a modifier that injects graphics-related modifications into an OCI runtime specification.
|
||||
// The value of the NVIDIA_DRIVER_CAPABILITIES environment variable is checked to determine if this modification should be made.
|
||||
func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, image image.CUDA, driver *root.Driver) (oci.SpecModifier, error) {
|
||||
if required, reason := requiresGraphicsModifier(image); !required {
|
||||
func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, containerImage image.CUDA, driver *root.Driver) (oci.SpecModifier, error) {
|
||||
if required, reason := requiresGraphicsModifier(containerImage); !required {
|
||||
logger.Infof("No graphics modifier required: %v", reason)
|
||||
return nil, nil
|
||||
}
|
||||
@ -50,7 +50,7 @@ func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, image imag
|
||||
devRoot := driver.Root
|
||||
drmNodes, err := discover.NewDRMNodesDiscoverer(
|
||||
logger,
|
||||
image.DevicesFromEnvvars(visibleDevicesEnvvar),
|
||||
containerImage.DevicesFromEnvvars(image.EnvVarNvidiaVisibleDevices),
|
||||
devRoot,
|
||||
nvidiaCDIHookPath,
|
||||
)
|
||||
@ -67,7 +67,7 @@ func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, image imag
|
||||
|
||||
// requiresGraphicsModifier determines whether a graphics modifier is required.
|
||||
func requiresGraphicsModifier(cudaImage image.CUDA) (bool, string) {
|
||||
if devices := cudaImage.DevicesFromEnvvars(visibleDevicesEnvvar); len(devices.List()) == 0 {
|
||||
if devices := cudaImage.VisibleDevicesFromEnvVar(); len(devices) == 0 {
|
||||
return false, "no devices requested"
|
||||
}
|
||||
|
||||
|
@ -24,6 +24,7 @@ import (
|
||||
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
||||
"tags.cncf.io/container-device-interface/pkg/cdi"
|
||||
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/nvsandboxutils"
|
||||
@ -200,7 +201,7 @@ func (m *wrapper) GetCommonEdits() (*cdi.ContainerEdits, error) {
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
edits.Env = append(edits.Env, "NVIDIA_VISIBLE_DEVICES=void")
|
||||
edits.Env = append(edits.Env, image.EnvVarNvidiaVisibleDevices+"=void")
|
||||
|
||||
return edits, nil
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user