mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2025-04-03 12:20:46 +00:00
Merge branch 'refactor-envvar-devices' into 'main'
Add DevicesFromEnvvars function to CUDA image abstraction. See merge request nvidia/container-toolkit/container-toolkit!178.
This commit is contained in:
commit
89824849d3
@ -165,7 +165,7 @@ func isPrivileged(s *Spec) bool {
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
func getDevicesFromEnvvar(env map[string]string, legacyImage bool) *string {
|
func getDevicesFromEnvvar(image image.CUDA) *string {
|
||||||
// Build a list of envvars to consider.
|
// Build a list of envvars to consider.
|
||||||
envVars := []string{envNVVisibleDevices}
|
envVars := []string{envNVVisibleDevices}
|
||||||
if envSwarmGPU != nil {
|
if envSwarmGPU != nil {
|
||||||
@ -173,35 +173,14 @@ func getDevicesFromEnvvar(env map[string]string, legacyImage bool) *string {
|
|||||||
envVars = append([]string{*envSwarmGPU}, envVars...)
|
envVars = append([]string{*envSwarmGPU}, envVars...)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Grab a reference to devices from the first envvar
|
devices := image.DevicesFromEnvvars(envVars...)
|
||||||
// in the list that actually exists in the environment.
|
if len(devices) == 0 {
|
||||||
var devices *string
|
|
||||||
for _, envVar := range envVars {
|
|
||||||
if devs, ok := env[envVar]; ok {
|
|
||||||
devices = &devs
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Environment variable unset with legacy image: default to "all".
|
|
||||||
if devices == nil && legacyImage {
|
|
||||||
all := "all"
|
|
||||||
return &all
|
|
||||||
}
|
|
||||||
|
|
||||||
// Environment variable unset or empty or "void": return nil
|
|
||||||
if devices == nil || len(*devices) == 0 || *devices == "void" {
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Environment variable set to "none": reset to "".
|
devicesString := strings.Join(devices, ",")
|
||||||
if *devices == "none" {
|
|
||||||
empty := ""
|
|
||||||
return &empty
|
|
||||||
}
|
|
||||||
|
|
||||||
// Any other value.
|
return &devicesString
|
||||||
return devices
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func getDevicesFromMounts(mounts []Mount) *string {
|
func getDevicesFromMounts(mounts []Mount) *string {
|
||||||
@ -241,7 +220,7 @@ func getDevicesFromMounts(mounts []Mount) *string {
|
|||||||
return &ret
|
return &ret
|
||||||
}
|
}
|
||||||
|
|
||||||
func getDevices(hookConfig *HookConfig, env map[string]string, mounts []Mount, privileged bool, legacyImage bool) *string {
|
func getDevices(hookConfig *HookConfig, image image.CUDA, mounts []Mount, privileged bool) *string {
|
||||||
// If enabled, try and get the device list from volume mounts first
|
// If enabled, try and get the device list from volume mounts first
|
||||||
if hookConfig.AcceptDeviceListAsVolumeMounts {
|
if hookConfig.AcceptDeviceListAsVolumeMounts {
|
||||||
devices := getDevicesFromMounts(mounts)
|
devices := getDevicesFromMounts(mounts)
|
||||||
@ -251,7 +230,7 @@ func getDevices(hookConfig *HookConfig, env map[string]string, mounts []Mount, p
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Fallback to reading from the environment variable if privileges are correct
|
// Fallback to reading from the environment variable if privileges are correct
|
||||||
devices := getDevicesFromEnvvar(env, legacyImage)
|
devices := getDevicesFromEnvvar(image)
|
||||||
if devices == nil {
|
if devices == nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@ -307,7 +286,7 @@ func getNvidiaConfig(hookConfig *HookConfig, image image.CUDA, mounts []Mount, p
|
|||||||
legacyImage := image.IsLegacy()
|
legacyImage := image.IsLegacy()
|
||||||
|
|
||||||
var devices string
|
var devices string
|
||||||
if d := getDevices(hookConfig, image, mounts, privileged, legacyImage); d != nil {
|
if d := getDevices(hookConfig, image, mounts, privileged); d != nil {
|
||||||
devices = *d
|
devices = *d
|
||||||
} else {
|
} else {
|
||||||
// 'nil' devices means this is not a GPU container.
|
// 'nil' devices means this is not a GPU container.
|
||||||
|
@ -4,6 +4,7 @@ import (
|
|||||||
"path/filepath"
|
"path/filepath"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
|
||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -671,7 +672,7 @@ func TestDeviceListSourcePriority(t *testing.T) {
|
|||||||
hookConfig := getDefaultHookConfig()
|
hookConfig := getDefaultHookConfig()
|
||||||
hookConfig.AcceptEnvvarUnprivileged = tc.acceptUnprivileged
|
hookConfig.AcceptEnvvarUnprivileged = tc.acceptUnprivileged
|
||||||
hookConfig.AcceptDeviceListAsVolumeMounts = tc.acceptMounts
|
hookConfig.AcceptDeviceListAsVolumeMounts = tc.acceptMounts
|
||||||
devices = getDevices(&hookConfig, env, tc.mountDevices, tc.privileged, false)
|
devices = getDevices(&hookConfig, env, tc.mountDevices, tc.privileged)
|
||||||
}
|
}
|
||||||
|
|
||||||
// For all other tests, just grab the devices and check the results
|
// For all other tests, just grab the devices and check the results
|
||||||
@ -693,7 +694,6 @@ func TestGetDevicesFromEnvvar(t *testing.T) {
|
|||||||
description string
|
description string
|
||||||
envSwarmGPU *string
|
envSwarmGPU *string
|
||||||
env map[string]string
|
env map[string]string
|
||||||
legacyImage bool
|
|
||||||
expectedDevices *string
|
expectedDevices *string
|
||||||
}{
|
}{
|
||||||
{
|
{
|
||||||
@ -729,13 +729,15 @@ func TestGetDevicesFromEnvvar(t *testing.T) {
|
|||||||
description: "NVIDIA_VISIBLE_DEVICES set returns value for legacy image",
|
description: "NVIDIA_VISIBLE_DEVICES set returns value for legacy image",
|
||||||
env: map[string]string{
|
env: map[string]string{
|
||||||
envNVVisibleDevices: gpuID,
|
envNVVisibleDevices: gpuID,
|
||||||
|
envCUDAVersion: "legacy",
|
||||||
},
|
},
|
||||||
legacyImage: true,
|
|
||||||
expectedDevices: &gpuID,
|
expectedDevices: &gpuID,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
description: "empty env returns all for legacy image",
|
description: "empty env returns all for legacy image",
|
||||||
legacyImage: true,
|
env: map[string]string{
|
||||||
|
envCUDAVersion: "legacy",
|
||||||
|
},
|
||||||
expectedDevices: &all,
|
expectedDevices: &all,
|
||||||
},
|
},
|
||||||
// Add the `DOCKER_RESOURCE_GPUS` envvar and ensure that this is ignored when
|
// Add the `DOCKER_RESOURCE_GPUS` envvar and ensure that this is ignored when
|
||||||
@ -781,16 +783,16 @@ func TestGetDevicesFromEnvvar(t *testing.T) {
|
|||||||
env: map[string]string{
|
env: map[string]string{
|
||||||
envNVVisibleDevices: gpuID,
|
envNVVisibleDevices: gpuID,
|
||||||
envDockerResourceGPUs: anotherGPUID,
|
envDockerResourceGPUs: anotherGPUID,
|
||||||
|
envCUDAVersion: "legacy",
|
||||||
},
|
},
|
||||||
legacyImage: true,
|
|
||||||
expectedDevices: &gpuID,
|
expectedDevices: &gpuID,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
description: "empty env returns all for legacy image",
|
description: "empty env returns all for legacy image",
|
||||||
env: map[string]string{
|
env: map[string]string{
|
||||||
envDockerResourceGPUs: anotherGPUID,
|
envDockerResourceGPUs: anotherGPUID,
|
||||||
|
envCUDAVersion: "legacy",
|
||||||
},
|
},
|
||||||
legacyImage: true,
|
|
||||||
expectedDevices: &all,
|
expectedDevices: &all,
|
||||||
},
|
},
|
||||||
// Add the `DOCKER_RESOURCE_GPUS` envvar and ensure that this is selected when
|
// Add the `DOCKER_RESOURCE_GPUS` envvar and ensure that this is selected when
|
||||||
@ -834,8 +836,8 @@ func TestGetDevicesFromEnvvar(t *testing.T) {
|
|||||||
envSwarmGPU: &envDockerResourceGPUs,
|
envSwarmGPU: &envDockerResourceGPUs,
|
||||||
env: map[string]string{
|
env: map[string]string{
|
||||||
envDockerResourceGPUs: gpuID,
|
envDockerResourceGPUs: gpuID,
|
||||||
|
envCUDAVersion: "legacy",
|
||||||
},
|
},
|
||||||
legacyImage: true,
|
|
||||||
expectedDevices: &gpuID,
|
expectedDevices: &gpuID,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -860,7 +862,7 @@ func TestGetDevicesFromEnvvar(t *testing.T) {
|
|||||||
for i, tc := range tests {
|
for i, tc := range tests {
|
||||||
t.Run(tc.description, func(t *testing.T) {
|
t.Run(tc.description, func(t *testing.T) {
|
||||||
envSwarmGPU = tc.envSwarmGPU
|
envSwarmGPU = tc.envSwarmGPU
|
||||||
devices := getDevicesFromEnvvar(tc.env, tc.legacyImage)
|
devices := getDevicesFromEnvvar(image.CUDA(tc.env))
|
||||||
if tc.expectedDevices == nil {
|
if tc.expectedDevices == nil {
|
||||||
require.Nil(t, devices, "%d: %v", i, tc)
|
require.Nil(t, devices, "%d: %v", i, tc)
|
||||||
return
|
return
|
||||||
|
@ -112,6 +112,36 @@ func (i CUDA) HasDisableRequire() bool {
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// DevicesFromEnvvars returns the devices requested by the image through the first of the specified environment variables that is set.
|
||||||
|
func (i CUDA) DevicesFromEnvvars(envVars ...string) []string {
|
||||||
|
// Grab a reference to devices from the first envvar
|
||||||
|
// in the list that actually exists in the environment.
|
||||||
|
var devices *string
|
||||||
|
for _, envVar := range envVars {
|
||||||
|
if devs, ok := i[envVar]; ok {
|
||||||
|
devices = &devs
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Environment variable unset with legacy image: default to "all".
|
||||||
|
if devices == nil && i.IsLegacy() {
|
||||||
|
return []string{"all"}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Environment variable unset or empty or "void": return nil
|
||||||
|
if devices == nil || len(*devices) == 0 || *devices == "void" {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Environment variable set to "none": reset to "".
|
||||||
|
if *devices == "none" {
|
||||||
|
return []string{""}
|
||||||
|
}
|
||||||
|
|
||||||
|
return strings.Split(*devices, ",")
|
||||||
|
}
|
||||||
|
|
||||||
func (i CUDA) legacyVersion() (string, error) {
|
func (i CUDA) legacyVersion() (string, error) {
|
||||||
majorMinor, err := parseMajorMinorVersion(i[envCUDAVersion])
|
majorMinor, err := parseMajorMinorVersion(i[envCUDAVersion])
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -50,10 +50,13 @@ func NewCSVModifier(logger *logrus.Logger, cfg *config.Config, ociSpec oci.Spec)
|
|||||||
return nil, fmt.Errorf("failed to load OCI spec: %v", err)
|
return nil, fmt.Errorf("failed to load OCI spec: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
// We check whether a modification is required and return a nil modifier if this is not the case.
|
image, err := image.NewCUDAImageFromSpec(rawSpec)
|
||||||
visibleDevices, exists := ociSpec.LookupEnv(visibleDevicesEnvvar)
|
if err != nil {
|
||||||
if !exists || visibleDevices == "" || visibleDevices == visibleDevicesVoid {
|
return nil, err
|
||||||
logger.Infof("No modification required: %v=%v (exists=%v)", visibleDevicesEnvvar, visibleDevices, exists)
|
}
|
||||||
|
|
||||||
|
if devices := image.DevicesFromEnvvars(visibleDevicesEnvvar); len(devices) == 0 {
|
||||||
|
logger.Infof("No modification required; no devices requested")
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
logger.Infof("Constructing modifier from config: %+v", *cfg)
|
logger.Infof("Constructing modifier from config: %+v", *cfg)
|
||||||
@ -63,14 +66,7 @@ func NewCSVModifier(logger *logrus.Logger, cfg *config.Config, ociSpec oci.Spec)
|
|||||||
NVIDIAContainerToolkitCLIExecutablePath: cfg.NVIDIACTKConfig.Path,
|
NVIDIAContainerToolkitCLIExecutablePath: cfg.NVIDIACTKConfig.Path,
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: Once the devices have been encapsulated in the CUDA image, this can be moved to before the
|
if err := checkRequirements(logger, image); err != nil {
|
||||||
// visible devices are checked.
|
|
||||||
image, err := image.NewCUDAImageFromSpec(rawSpec)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := checkRequirements(logger, &image); err != nil {
|
|
||||||
return nil, fmt.Errorf("requirements not met: %v", err)
|
return nil, fmt.Errorf("requirements not met: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -79,8 +75,7 @@ func NewCSVModifier(logger *logrus.Logger, cfg *config.Config, ociSpec oci.Spec)
|
|||||||
return nil, fmt.Errorf("failed to get list of CSV files: %v", err)
|
return nil, fmt.Errorf("failed to get list of CSV files: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
nvidiaRequireJetpack, _ := ociSpec.LookupEnv(nvidiaRequireJetpackEnvvar)
|
if nvidiaRequireJetpack, _ := image[nvidiaRequireJetpackEnvvar]; nvidiaRequireJetpack != "csv-mounts=all" {
|
||||||
if nvidiaRequireJetpack != "csv-mounts=all" {
|
|
||||||
csvFiles = csv.BaseFilesOnly(csvFiles)
|
csvFiles = csv.BaseFilesOnly(csvFiles)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -114,7 +109,7 @@ func NewCSVModifier(logger *logrus.Logger, cfg *config.Config, ociSpec oci.Spec)
|
|||||||
return modifiers, nil
|
return modifiers, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func checkRequirements(logger *logrus.Logger, image *image.CUDA) error {
|
func checkRequirements(logger *logrus.Logger, image image.CUDA) error {
|
||||||
if image.HasDisableRequire() {
|
if image.HasDisableRequire() {
|
||||||
// TODO: We could print the real value here instead
|
// TODO: We could print the real value here instead
|
||||||
logger.Debugf("NVIDIA_DISABLE_REQUIRE=%v; skipping requirement checks", true)
|
logger.Debugf("NVIDIA_DISABLE_REQUIRE=%v; skipping requirement checks", true)
|
||||||
|
@ -20,6 +20,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
|
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
|
||||||
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
|
||||||
"github.com/sirupsen/logrus"
|
"github.com/sirupsen/logrus"
|
||||||
@ -32,19 +33,22 @@ const (
|
|||||||
// NewGDSModifier creates the modifiers for GDS devices.
|
// NewGDSModifier creates the modifiers for GDS devices.
|
||||||
// If the spec does not contain the NVIDIA_GDS=enabled environment variable no changes are made.
|
// If the spec does not contain the NVIDIA_GDS=enabled environment variable no changes are made.
|
||||||
func NewGDSModifier(logger *logrus.Logger, cfg *config.Config, ociSpec oci.Spec) (oci.SpecModifier, error) {
|
func NewGDSModifier(logger *logrus.Logger, cfg *config.Config, ociSpec oci.Spec) (oci.SpecModifier, error) {
|
||||||
_, err := ociSpec.Load()
|
rawSpec, err := ociSpec.Load()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed to load OCI spec: %v", err)
|
return nil, fmt.Errorf("failed to load OCI spec: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
// We check whether a modification is required and return a nil modifier if this is not the case.
|
image, err := image.NewCUDAImageFromSpec(rawSpec)
|
||||||
visibleDevices, exists := ociSpec.LookupEnv(visibleDevicesEnvvar)
|
if err != nil {
|
||||||
if !exists || visibleDevices == "" || visibleDevices == visibleDevicesVoid {
|
return nil, err
|
||||||
logger.Infof("No modification required: %v=%v (exists=%v)", visibleDevicesEnvvar, visibleDevices, exists)
|
}
|
||||||
|
|
||||||
|
if devices := image.DevicesFromEnvvars(visibleDevicesEnvvar); len(devices) == 0 {
|
||||||
|
logger.Infof("No modification required; no devices requested")
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
if gds, _ := ociSpec.LookupEnv(nvidiaGDSEnvvar); gds != "enabled" {
|
if gds, _ := image[nvidiaGDSEnvvar]; gds != "enabled" {
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -20,6 +20,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
|
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
|
||||||
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
|
||||||
"github.com/sirupsen/logrus"
|
"github.com/sirupsen/logrus"
|
||||||
@ -32,19 +33,22 @@ const (
|
|||||||
// NewMOFEDModifier creates the modifiers for MOFED devices.
|
// NewMOFEDModifier creates the modifiers for MOFED devices.
|
||||||
// If the spec does not contain the NVIDIA_MOFED=enabled environment variable no changes are made.
|
// If the spec does not contain the NVIDIA_MOFED=enabled environment variable no changes are made.
|
||||||
func NewMOFEDModifier(logger *logrus.Logger, cfg *config.Config, ociSpec oci.Spec) (oci.SpecModifier, error) {
|
func NewMOFEDModifier(logger *logrus.Logger, cfg *config.Config, ociSpec oci.Spec) (oci.SpecModifier, error) {
|
||||||
_, err := ociSpec.Load()
|
rawSpec, err := ociSpec.Load()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed to load OCI spec: %v", err)
|
return nil, fmt.Errorf("failed to load OCI spec: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
// We check whether a modification is required and return a nil modifier if this is not the case.
|
image, err := image.NewCUDAImageFromSpec(rawSpec)
|
||||||
visibleDevices, exists := ociSpec.LookupEnv(visibleDevicesEnvvar)
|
if err != nil {
|
||||||
if !exists || visibleDevices == "" || visibleDevices == visibleDevicesVoid {
|
return nil, err
|
||||||
logger.Infof("No modification required: %v=%v (exists=%v)", visibleDevicesEnvvar, visibleDevices, exists)
|
}
|
||||||
|
|
||||||
|
if devices := image.DevicesFromEnvvars(visibleDevicesEnvvar); len(devices) == 0 {
|
||||||
|
logger.Infof("No modification required; no devices requested")
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
if mofed, _ := ociSpec.LookupEnv(nvidiaMOFEDEnvvar); mofed != "enabled" {
|
if mofed, _ := image[nvidiaMOFEDEnvvar]; mofed != "enabled" {
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user