mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2024-11-26 13:59:45 +00:00
d37c17857e
This change adds functionality to generate CDI specifications for all devices detected on the system. A specification containing all GPUs and MIG devices is generated. All libraries on the host ldcache that have an NVIDIA Driver Version suffix are included as are the required binaries and IPC sockets. A hook (based on the nvidia-ctk hook subcommand) to update the ldcache in the container for the libraries being injected is also added to the CDI specificiation. Signed-off-by: Evan Lezar <elezar@nvidia.com>
124 lines
3.5 KiB
Go
124 lines
3.5 KiB
Go
/*
|
|
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY Type, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
package cdi
|
|
|
|
import (
|
|
"fmt"
|
|
|
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps"
|
|
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
|
|
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
|
|
)
|
|
|
|
// nvmlDevice wraps an nvml.Device with more functions.
|
|
type nvmlDevice struct {
|
|
nvml.Device
|
|
}
|
|
|
|
// nvmlMigDevice allows for specific functions of nvmlDevice to be overridden.
|
|
type nvmlMigDevice nvmlDevice
|
|
|
|
// deviceInfo defines the information the required to construct a Device
|
|
type deviceInfo interface {
|
|
GetUUID() (string, error)
|
|
GetDeviceNodes() ([]string, error)
|
|
}
|
|
|
|
var _ deviceInfo = (*nvmlDevice)(nil)
|
|
var _ deviceInfo = (*nvmlMigDevice)(nil)
|
|
|
|
func newGPUDevice(i int, gpu device.Device) (string, nvmlDevice) {
|
|
return fmt.Sprintf("%v", i), nvmlDevice{gpu}
|
|
}
|
|
|
|
func newMigDevice(i int, j int, mig device.MigDevice) (string, nvmlMigDevice) {
|
|
return fmt.Sprintf("%v:%v", i, j), nvmlMigDevice{mig}
|
|
}
|
|
|
|
// GetUUID returns the UUID of the device
|
|
func (d nvmlDevice) GetUUID() (string, error) {
|
|
uuid, ret := d.Device.GetUUID()
|
|
if ret != nvml.SUCCESS {
|
|
return "", ret
|
|
}
|
|
return uuid, nil
|
|
}
|
|
|
|
// GetUUID returns the UUID of the device
|
|
func (d nvmlMigDevice) GetUUID() (string, error) {
|
|
return nvmlDevice(d).GetUUID()
|
|
}
|
|
|
|
// GetDeviceNodes returns the device node paths for a GPU device
|
|
func (d nvmlDevice) GetDeviceNodes() ([]string, error) {
|
|
minor, ret := d.GetMinorNumber()
|
|
if ret != nvml.SUCCESS {
|
|
return nil, fmt.Errorf("error getting GPU device minor number: %v", ret)
|
|
}
|
|
path := fmt.Sprintf("/dev/nvidia%d", minor)
|
|
|
|
return []string{path}, nil
|
|
}
|
|
|
|
// GetDeviceNodes returns the device node paths for a MIG device
|
|
func (d nvmlMigDevice) GetDeviceNodes() ([]string, error) {
|
|
parent, ret := d.GetDeviceHandleFromMigDeviceHandle()
|
|
if ret != nvml.SUCCESS {
|
|
return nil, fmt.Errorf("error getting parent device: %v", ret)
|
|
}
|
|
minor, ret := parent.GetMinorNumber()
|
|
if ret != nvml.SUCCESS {
|
|
return nil, fmt.Errorf("error getting GPU device minor number: %v", ret)
|
|
}
|
|
parentPath := fmt.Sprintf("/dev/nvidia%d", minor)
|
|
|
|
migCaps, err := nvcaps.NewMigCaps()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("error getting MIG capability device paths: %v", err)
|
|
}
|
|
|
|
gi, ret := d.GetGpuInstanceId()
|
|
if ret != nvml.SUCCESS {
|
|
return nil, fmt.Errorf("error getting GPU Instance ID: %v", ret)
|
|
}
|
|
|
|
ci, ret := d.GetComputeInstanceId()
|
|
if ret != nvml.SUCCESS {
|
|
return nil, fmt.Errorf("error getting Compute Instance ID: %v", ret)
|
|
}
|
|
|
|
giCap := nvcaps.NewGPUInstanceCap(minor, gi)
|
|
giCapDevicePath, err := migCaps.GetCapDevicePath(giCap)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to get GI cap device path: %v", err)
|
|
}
|
|
|
|
ciCap := nvcaps.NewComputeInstanceCap(minor, gi, ci)
|
|
ciCapDevicePath, err := migCaps.GetCapDevicePath(ciCap)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to get CI cap device path: %v", err)
|
|
}
|
|
|
|
devicePaths := []string{
|
|
parentPath,
|
|
giCapDevicePath,
|
|
ciCapDevicePath,
|
|
}
|
|
|
|
return devicePaths, nil
|
|
}
|