# Patch summary (descriptive preamble; text before the first "diff --git"
# header is not part of any hunk).
#
# cmd/nvidia-ctk/info/generate-cdi/generate-cdi.go
#   * generateSpec: the spec Version is set to the literal "0.4.0" instead of
#     specs.CurrentVersion.
#   * generateSpec: after the MIG devices are generated, an extra device named
#     "all" is appended to spec.Devices. Its DeviceNodes are the device nodes
#     of every device already present in spec.Devices, concatenated in order.
#   * generateSpec: spec.ContainerEdits.DeviceNodes is set from
#     m.getExistingMetaDeviceNodes().
#   * getExistingMetaDeviceNodes (new): keeps those of /dev/nvidia-modeset,
#     /dev/nvidia-uvm-tools, /dev/nvidia-uvm and /dev/nvidiactl for which
#     os.Stat returns no error, and logs the others at Info level.
#   * getDeviceNodesFromPaths (new): builds one specs.DeviceNode per path,
#     setting only Path. generateEditsForDevice now calls it instead of its
#     inline loop; the removed loop also set HostPath.
#
# cmd/nvidia-ctk/info/generate-cdi/nvml_devices.go
#   * newGPUDevice names devices "gpu<i>" instead of "<i>".
#   * newMigDevice names devices "mig<i>:<j>" instead of "<i>:<j>".
#
# NOTE(review): the "all" device is built without de-duplication; if two
#   devices list the same device node it will appear twice -- confirm whether
#   that can happen for MIG devices and their parent GPU.
# NOTE(review): any os.Stat error (not only "does not exist") causes a meta
#   device to be skipped -- confirm this is intended.
# NOTE(review): the unchanged context line "falied to generate CDI spec ..."
#   contains a typo; it belongs to the pre-image and is left untouched here.
diff --git a/cmd/nvidia-ctk/info/generate-cdi/generate-cdi.go b/cmd/nvidia-ctk/info/generate-cdi/generate-cdi.go
index d5438d5e..7ef290b4 100644
--- a/cmd/nvidia-ctk/info/generate-cdi/generate-cdi.go
+++ b/cmd/nvidia-ctk/info/generate-cdi/generate-cdi.go
@@ -139,7 +139,7 @@ func (m command) generateSpec() (*specs.Spec, error) {
 	devicelib := device.New(device.WithNvml(nvmllib))
 
 	spec := specs.Spec{
-		Version:        specs.CurrentVersion,
+		Version:        "0.4.0",
 		Kind:           "nvidia.com/gpu",
 		ContainerEdits: specs.ContainerEdits{},
 	}
@@ -176,6 +176,23 @@ func (m command) generateSpec() (*specs.Spec, error) {
 		return nil, fmt.Errorf("falied to generate CDI spec for MIG devices: %v", err)
 	}
 
+	// We create an "all" device with all the discovered device nodes
+	var allDeviceNodes []*specs.DeviceNode
+	for _, d := range spec.Devices {
+		for _, dn := range d.ContainerEdits.DeviceNodes {
+			allDeviceNodes = append(allDeviceNodes, dn)
+		}
+	}
+	all := specs.Device{
+		Name: "all",
+		ContainerEdits: specs.ContainerEdits{
+			DeviceNodes: allDeviceNodes,
+		},
+	}
+
+	spec.Devices = append(spec.Devices, all)
+	spec.ContainerEdits.DeviceNodes = m.getExistingMetaDeviceNodes()
+
 	libraries, err := m.findLibs(nvmllib)
 	if err != nil {
 		return nil, fmt.Errorf("failed to locate driver libraries: %v", err)
@@ -201,20 +218,13 @@ func (m command) generateSpec() (*specs.Spec, error) {
 }
 
 func generateEditsForDevice(name string, d deviceInfo) (specs.Device, error) {
-	var deviceNodes []*specs.DeviceNode
-
 	deviceNodePaths, err := d.GetDeviceNodes()
 	if err != nil {
 		return specs.Device{}, fmt.Errorf("failed to get paths for device: %v", err)
 	}
-	for _, p := range deviceNodePaths {
-		deviceNode := specs.DeviceNode{
-			Path: p,
-			// TODO: Set the host path dependent on the root
-			HostPath: p,
-		}
-		deviceNodes = append(deviceNodes, &deviceNode)
-	}
+
+	deviceNodes := getDeviceNodesFromPaths(deviceNodePaths)
+
 	device := specs.Device{
 		Name: name,
 		ContainerEdits: specs.ContainerEdits{
@@ -225,6 +235,38 @@ func generateEditsForDevice(name string, d deviceInfo) (specs.Device, error) {
 	return device, nil
 }
 
+func (m command) getExistingMetaDeviceNodes() []*specs.DeviceNode {
+	metaDeviceNodePaths := []string{
+		"/dev/nvidia-modeset",
+		"/dev/nvidia-uvm-tools",
+		"/dev/nvidia-uvm",
+		"/dev/nvidiactl",
+	}
+
+	var existingDeviceNodePaths []string
+	for _, p := range metaDeviceNodePaths {
+		if _, err := os.Stat(p); err != nil {
+			m.logger.Infof("Ignoring missing meta device %v", p)
+			continue
+		}
+		existingDeviceNodePaths = append(existingDeviceNodePaths, p)
+	}
+
+	return getDeviceNodesFromPaths(existingDeviceNodePaths)
+}
+
+func getDeviceNodesFromPaths(deviceNodePaths []string) []*specs.DeviceNode {
+	var deviceNodes []*specs.DeviceNode
+	for _, p := range deviceNodePaths {
+		deviceNode := specs.DeviceNode{
+			Path: p,
+		}
+		deviceNodes = append(deviceNodes, &deviceNode)
+	}
+
+	return deviceNodes
+}
+
 func (m command) findLibs(nvmllib nvml.Interface) ([]string, error) {
 	version, r := nvmllib.SystemGetDriverVersion()
 	if r != nvml.SUCCESS {
diff --git a/cmd/nvidia-ctk/info/generate-cdi/nvml_devices.go b/cmd/nvidia-ctk/info/generate-cdi/nvml_devices.go
index 95265bf0..221c4005 100644
--- a/cmd/nvidia-ctk/info/generate-cdi/nvml_devices.go
+++ b/cmd/nvidia-ctk/info/generate-cdi/nvml_devices.go
@@ -42,11 +42,11 @@ var _ deviceInfo = (*nvmlDevice)(nil)
 var _ deviceInfo = (*nvmlMigDevice)(nil)
 
 func newGPUDevice(i int, gpu device.Device) (string, nvmlDevice) {
-	return fmt.Sprintf("%v", i), nvmlDevice{gpu}
+	return fmt.Sprintf("gpu%v", i), nvmlDevice{gpu}
 }
 
 func newMigDevice(i int, j int, mig device.MigDevice) (string, nvmlMigDevice) {
-	return fmt.Sprintf("%v:%v", i, j), nvmlMigDevice{mig}
+	return fmt.Sprintf("mig%v:%v", i, j), nvmlMigDevice{mig}
 }
 
 // GetUUID returns the UUID of the device