[no-relnote] Refactor dGPU device discovery

Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
Evan Lezar 2024-06-11 15:12:57 +02:00
parent 6732f6d13b
commit b42a5d3e3a
6 changed files with 389 additions and 153 deletions

View File

@ -0,0 +1,117 @@
/**
# Copyright 2024 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package dgpu
import (
"fmt"
"os"
"path/filepath"
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
)
// byPathHookDiscoverer discovers the entities required for injecting by-path DRM device links
type byPathHookDiscoverer struct {
logger logger.Interface
devRoot string
nvidiaCDIHookPath string
pciBusID string
deviceNodes discover.Discover
}
var _ discover.Discover = (*byPathHookDiscoverer)(nil)
// Devices returns the empty list for the by-path hook discoverer
func (d *byPathHookDiscoverer) Devices() ([]discover.Device, error) {
return nil, nil
}
// Hooks returns the hooks for the GPU device.
// The following hooks are detected:
// 1. A hook to create /dev/dri/by-path symlinks
func (d *byPathHookDiscoverer) Hooks() ([]discover.Hook, error) {
links, err := d.deviceNodeLinks()
if err != nil {
return nil, fmt.Errorf("failed to discover DRA device links: %v", err)
}
if len(links) == 0 {
return nil, nil
}
var args []string
for _, l := range links {
args = append(args, "--link", l)
}
hook := discover.CreateNvidiaCDIHook(
d.nvidiaCDIHookPath,
"create-symlinks",
args...,
)
return []discover.Hook{hook}, nil
}
// Mounts returns an empty slice for a full GPU
func (d *byPathHookDiscoverer) Mounts() ([]discover.Mount, error) {
return nil, nil
}
func (d *byPathHookDiscoverer) deviceNodeLinks() ([]string, error) {
devices, err := d.deviceNodes.Devices()
if err != nil {
return nil, fmt.Errorf("failed to discover device nodes: %v", err)
}
if len(devices) == 0 {
return nil, nil
}
selectedDevices := make(map[string]bool)
for _, d := range devices {
selectedDevices[d.HostPath] = true
}
candidates := []string{
fmt.Sprintf("/dev/dri/by-path/pci-%s-card", d.pciBusID),
fmt.Sprintf("/dev/dri/by-path/pci-%s-render", d.pciBusID),
}
var links []string
for _, c := range candidates {
linkPath := filepath.Join(d.devRoot, c)
device, err := os.Readlink(linkPath)
if err != nil {
d.logger.Warningf("Failed to evaluate symlink %v; ignoring", linkPath)
continue
}
deviceNode := device
if !filepath.IsAbs(device) {
deviceNode = filepath.Join(filepath.Dir(linkPath), device)
}
if !selectedDevices[deviceNode] {
d.logger.Debugf("ignoring device symlink %v -> %v since %v is not mounted", linkPath, device, deviceNode)
continue
}
d.logger.Debugf("adding device symlink %v -> %v", linkPath, device)
links = append(links, fmt.Sprintf("%v::%v", device, linkPath))
}
return links, nil
}

View File

@ -0,0 +1,39 @@
/**
# Copyright 2024 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package dgpu
import (
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
)
// NewForDevice creates a discoverer for the specified Device.
// nvsandboxutils is used for discovery if specified, otherwise NVML is used.
func NewForDevice(d device.Device, opts ...Option) (discover.Discover, error) {
o := &options{}
for _, opt := range opts {
opt(o)
}
if o.logger == nil {
o.logger = logger.New()
}
return o.newNvmlDGPUDiscoverer(&toRequiredInfo{d})
}

View File

@ -0,0 +1,84 @@
/**
# Copyright 2024 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package dgpu
import (
"fmt"
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
"github.com/NVIDIA/go-nvml/pkg/nvml"
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
"github.com/NVIDIA/nvidia-container-toolkit/internal/info/drm"
)
type requiredInfo interface {
GetMinorNumber() (int, error)
GetPCIBusID() (string, error)
}
func (o *options) newNvmlDGPUDiscoverer(d requiredInfo) (discover.Discover, error) {
minor, err := d.GetMinorNumber()
if err != nil {
return nil, fmt.Errorf("error getting GPU device minor number: %w", err)
}
path := fmt.Sprintf("/dev/nvidia%d", minor)
pciBusID, err := d.GetPCIBusID()
if err != nil {
return nil, fmt.Errorf("error getting PCI info for device: %w", err)
}
drmDeviceNodes, err := drm.GetDeviceNodesByBusID(pciBusID)
if err != nil {
return nil, fmt.Errorf("failed to determine DRM devices for %v: %v", pciBusID, err)
}
deviceNodePaths := append([]string{path}, drmDeviceNodes...)
deviceNodes := discover.NewCharDeviceDiscoverer(
o.logger,
o.devRoot,
deviceNodePaths,
)
byPathHooks := &byPathHookDiscoverer{
logger: o.logger,
devRoot: o.devRoot,
nvidiaCDIHookPath: o.nvidiaCDIHookPath,
pciBusID: pciBusID,
deviceNodes: deviceNodes,
}
dd := discover.Merge(
deviceNodes,
byPathHooks,
)
return dd, nil
}
type toRequiredInfo struct {
device.Device
}
func (d *toRequiredInfo) GetMinorNumber() (int, error) {
minor, ret := d.Device.GetMinorNumber()
if ret != nvml.SUCCESS {
return 0, ret
}
return minor, nil
}

View File

@ -0,0 +1,87 @@
/**
# Copyright 2024 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package dgpu
import (
"testing"
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
"github.com/NVIDIA/go-nvml/pkg/nvml"
"github.com/NVIDIA/go-nvml/pkg/nvml/mock"
testlog "github.com/sirupsen/logrus/hooks/test"
"github.com/stretchr/testify/require"
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
)
// TODO: In order to properly test this, we need a mechanism to inject /
// override the char device discoverer.
func TestNewNvmlDGPUDiscoverer(t *testing.T) {
logger, _ := testlog.NewNullLogger()
nvmllib := &mock.Interface{}
devicelib := device.New(
nvmllib,
)
testCases := []struct {
description string
device nvml.Device
expectedError error
expectedDevices []discover.Device
expectedHooks []discover.Hook
expectedMounts []discover.Mount
}{
{
description: "",
device: &mock.Device{
GetMinorNumberFunc: func() (int, nvml.Return) {
return 3, nvml.SUCCESS
},
GetPciInfoFunc: func() (nvml.PciInfo, nvml.Return) {
var busID [32]int8
for i, b := range []byte("00000000:45:00:00") {
busID[i] = int8(b)
}
info := nvml.PciInfo{
BusId: busID,
}
return info, nvml.SUCCESS
},
},
},
}
for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
o := &options{logger: logger}
device, err := devicelib.NewDevice(tc.device)
require.NoError(t, err)
d, err := o.newNvmlDGPUDiscoverer(&toRequiredInfo{device})
require.ErrorIs(t, err, tc.expectedError)
devices, _ := d.Devices()
require.EqualValues(t, tc.expectedDevices, devices)
hooks, _ := d.Hooks()
require.EqualValues(t, tc.expectedHooks, hooks)
mounts, _ := d.Mounts()
require.EqualValues(t, tc.expectedMounts, mounts)
})
}
}

View File

@ -0,0 +1,50 @@
/**
# Copyright 2024 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package dgpu
import (
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
)
type options struct {
logger logger.Interface
devRoot string
nvidiaCDIHookPath string
}
type Option func(*options)
// WithDevRoot sets the root where /dev is located.
func WithDevRoot(root string) Option {
return func(l *options) {
l.devRoot = root
}
}
// WithLogger sets the logger for the library
func WithLogger(logger logger.Interface) Option {
return func(l *options) {
l.logger = logger
}
}
// WithNVIDIACDIHookPath sets the path to the NVIDIA Container Toolkit CLI path for the library
func WithNVIDIACDIHookPath(path string) Option {
return func(l *options) {
l.nvidiaCDIHookPath = path
}
}

View File

@ -18,19 +18,14 @@ package nvcdi
import (
"fmt"
"os"
"path/filepath"
"strings"
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
"github.com/NVIDIA/go-nvml/pkg/nvml"
"tags.cncf.io/container-device-interface/pkg/cdi"
"tags.cncf.io/container-device-interface/specs-go"
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
"github.com/NVIDIA/nvidia-container-toolkit/internal/edits"
"github.com/NVIDIA/nvidia-container-toolkit/internal/info/drm"
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
"github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/dgpu"
)
// GetGPUDeviceSpecs returns the CDI device specs for the full GPU represented by 'device'.
@ -58,7 +53,7 @@ func (l *nvmllib) GetGPUDeviceSpecs(i int, d device.Device) ([]specs.Device, err
// GetGPUDeviceEdits returns the CDI edits for the full GPU represented by 'device'.
func (l *nvmllib) GetGPUDeviceEdits(d device.Device) (*cdi.ContainerEdits, error) {
device, err := newFullGPUDiscoverer(l.logger, l.devRoot, l.nvidiaCDIHookPath, d)
device, err := l.newFullGPUDiscoverer(d)
if err != nil {
return nil, fmt.Errorf("failed to create device discoverer: %v", err)
}
@ -71,164 +66,28 @@ func (l *nvmllib) GetGPUDeviceEdits(d device.Device) (*cdi.ContainerEdits, error
return editsForDevice, nil
}
// byPathHookDiscoverer discovers the entities required for injecting by-path DRM device links
type byPathHookDiscoverer struct {
logger logger.Interface
devRoot string
nvidiaCDIHookPath string
pciBusID string
deviceNodes discover.Discover
}
var _ discover.Discover = (*byPathHookDiscoverer)(nil)
// newFullGPUDiscoverer creates a discoverer for the full GPU defined by the specified device.
func newFullGPUDiscoverer(logger logger.Interface, devRoot string, nvidiaCDIHookPath string, d device.Device) (discover.Discover, error) {
// TODO: The functionality to get device paths should be integrated into the go-nvlib/pkg/device.Device interface.
// This will allow reuse here and in other code where the paths are queried such as the NVIDIA device plugin.
minor, ret := d.GetMinorNumber()
if ret != nvml.SUCCESS {
return nil, fmt.Errorf("error getting GPU device minor number: %v", ret)
}
path := fmt.Sprintf("/dev/nvidia%d", minor)
pciInfo, ret := d.GetPciInfo()
if ret != nvml.SUCCESS {
return nil, fmt.Errorf("error getting PCI info for device: %v", ret)
}
pciBusID := getBusID(pciInfo)
drmDeviceNodes, err := drm.GetDeviceNodesByBusID(pciBusID)
if err != nil {
return nil, fmt.Errorf("failed to determine DRM devices for %v: %v", pciBusID, err)
}
deviceNodePaths := append([]string{path}, drmDeviceNodes...)
deviceNodes := discover.NewCharDeviceDiscoverer(
logger,
devRoot,
deviceNodePaths,
func (l *nvmllib) newFullGPUDiscoverer(d device.Device) (discover.Discover, error) {
deviceNodes, err := dgpu.NewForDevice(d,
dgpu.WithDevRoot(l.devRoot),
dgpu.WithLogger(l.logger),
dgpu.WithNVIDIACDIHookPath(l.nvidiaCDIHookPath),
)
byPathHooks := &byPathHookDiscoverer{
logger: logger,
devRoot: devRoot,
nvidiaCDIHookPath: nvidiaCDIHookPath,
pciBusID: pciBusID,
deviceNodes: deviceNodes,
if err != nil {
return nil, fmt.Errorf("failed to create device discoverer: %v", err)
}
deviceFolderPermissionHooks := newDeviceFolderPermissionHookDiscoverer(
logger,
devRoot,
nvidiaCDIHookPath,
l.logger,
l.devRoot,
l.nvidiaCDIHookPath,
deviceNodes,
)
dd := discover.Merge(
deviceNodes,
byPathHooks,
deviceFolderPermissionHooks,
)
return dd, nil
}
// Devices returns the empty list for the by-path hook discoverer
func (d *byPathHookDiscoverer) Devices() ([]discover.Device, error) {
return nil, nil
}
// Hooks returns the hooks for the GPU device.
// The following hooks are detected:
// 1. A hook to create /dev/dri/by-path symlinks
func (d *byPathHookDiscoverer) Hooks() ([]discover.Hook, error) {
links, err := d.deviceNodeLinks()
if err != nil {
return nil, fmt.Errorf("failed to discover DRA device links: %v", err)
}
if len(links) == 0 {
return nil, nil
}
var args []string
for _, l := range links {
args = append(args, "--link", l)
}
hook := discover.CreateNvidiaCDIHook(
d.nvidiaCDIHookPath,
"create-symlinks",
args...,
)
return []discover.Hook{hook}, nil
}
// Mounts returns an empty slice for a full GPU
func (d *byPathHookDiscoverer) Mounts() ([]discover.Mount, error) {
return nil, nil
}
func (d *byPathHookDiscoverer) deviceNodeLinks() ([]string, error) {
devices, err := d.deviceNodes.Devices()
if err != nil {
return nil, fmt.Errorf("failed to discover device nodes: %v", err)
}
if len(devices) == 0 {
return nil, nil
}
selectedDevices := make(map[string]bool)
for _, d := range devices {
selectedDevices[d.HostPath] = true
}
candidates := []string{
fmt.Sprintf("/dev/dri/by-path/pci-%s-card", d.pciBusID),
fmt.Sprintf("/dev/dri/by-path/pci-%s-render", d.pciBusID),
}
var links []string
for _, c := range candidates {
linkPath := filepath.Join(d.devRoot, c)
device, err := os.Readlink(linkPath)
if err != nil {
d.logger.Warningf("Failed to evaluate symlink %v; ignoring", linkPath)
continue
}
deviceNode := device
if !filepath.IsAbs(device) {
deviceNode = filepath.Join(filepath.Dir(linkPath), device)
}
if !selectedDevices[deviceNode] {
d.logger.Debugf("ignoring device symlink %v -> %v since %v is not mounted", linkPath, device, deviceNode)
continue
}
d.logger.Debugf("adding device symlink %v -> %v", linkPath, device)
links = append(links, fmt.Sprintf("%v::%v", device, linkPath))
}
return links, nil
}
// getBusID provides a utility function that returns the string representation of the bus ID.
func getBusID(p nvml.PciInfo) string {
var bytes []byte
for _, b := range p.BusId {
if byte(b) == '\x00' {
break
}
bytes = append(bytes, byte(b))
}
id := strings.ToLower(string(bytes))
if id != "0000" {
id = strings.TrimPrefix(id, "0000")
}
return id
}