mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2024-11-22 00:08:11 +00:00
Merge pull request #552 from elezar/refactor-dgpu-discovery
Refactor dGPU device discovery
This commit is contained in:
commit
448a3853ad
117
internal/platform-support/dgpu/by-path-hooks.go
Normal file
117
internal/platform-support/dgpu/by-path-hooks.go
Normal file
@ -0,0 +1,117 @@
|
|||||||
|
/**
|
||||||
|
# Copyright 2024 NVIDIA CORPORATION
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
**/
|
||||||
|
|
||||||
|
package dgpu
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
|
||||||
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
||||||
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
||||||
|
)
|
||||||
|
|
||||||
|
// byPathHookDiscoverer discovers the entities required for injecting by-path DRM device links
|
||||||
|
type byPathHookDiscoverer struct {
|
||||||
|
logger logger.Interface
|
||||||
|
devRoot string
|
||||||
|
nvidiaCDIHookPath string
|
||||||
|
pciBusID string
|
||||||
|
deviceNodes discover.Discover
|
||||||
|
}
|
||||||
|
|
||||||
|
var _ discover.Discover = (*byPathHookDiscoverer)(nil)
|
||||||
|
|
||||||
|
// Devices returns the empty list for the by-path hook discoverer
|
||||||
|
func (d *byPathHookDiscoverer) Devices() ([]discover.Device, error) {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Hooks returns the hooks for the GPU device.
|
||||||
|
// The following hooks are detected:
|
||||||
|
// 1. A hook to create /dev/dri/by-path symlinks
|
||||||
|
func (d *byPathHookDiscoverer) Hooks() ([]discover.Hook, error) {
|
||||||
|
links, err := d.deviceNodeLinks()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to discover DRA device links: %v", err)
|
||||||
|
}
|
||||||
|
if len(links) == 0 {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var args []string
|
||||||
|
for _, l := range links {
|
||||||
|
args = append(args, "--link", l)
|
||||||
|
}
|
||||||
|
|
||||||
|
hook := discover.CreateNvidiaCDIHook(
|
||||||
|
d.nvidiaCDIHookPath,
|
||||||
|
"create-symlinks",
|
||||||
|
args...,
|
||||||
|
)
|
||||||
|
|
||||||
|
return []discover.Hook{hook}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Mounts returns an empty slice for a full GPU
|
||||||
|
func (d *byPathHookDiscoverer) Mounts() ([]discover.Mount, error) {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (d *byPathHookDiscoverer) deviceNodeLinks() ([]string, error) {
|
||||||
|
devices, err := d.deviceNodes.Devices()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to discover device nodes: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(devices) == 0 {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
selectedDevices := make(map[string]bool)
|
||||||
|
for _, d := range devices {
|
||||||
|
selectedDevices[d.HostPath] = true
|
||||||
|
}
|
||||||
|
|
||||||
|
candidates := []string{
|
||||||
|
fmt.Sprintf("/dev/dri/by-path/pci-%s-card", d.pciBusID),
|
||||||
|
fmt.Sprintf("/dev/dri/by-path/pci-%s-render", d.pciBusID),
|
||||||
|
}
|
||||||
|
|
||||||
|
var links []string
|
||||||
|
for _, c := range candidates {
|
||||||
|
linkPath := filepath.Join(d.devRoot, c)
|
||||||
|
device, err := os.Readlink(linkPath)
|
||||||
|
if err != nil {
|
||||||
|
d.logger.Warningf("Failed to evaluate symlink %v; ignoring", linkPath)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
deviceNode := device
|
||||||
|
if !filepath.IsAbs(device) {
|
||||||
|
deviceNode = filepath.Join(filepath.Dir(linkPath), device)
|
||||||
|
}
|
||||||
|
if !selectedDevices[deviceNode] {
|
||||||
|
d.logger.Debugf("ignoring device symlink %v -> %v since %v is not mounted", linkPath, device, deviceNode)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
d.logger.Debugf("adding device symlink %v -> %v", linkPath, device)
|
||||||
|
links = append(links, fmt.Sprintf("%v::%v", device, linkPath))
|
||||||
|
}
|
||||||
|
|
||||||
|
return links, nil
|
||||||
|
}
|
57
internal/platform-support/dgpu/dgpu.go
Normal file
57
internal/platform-support/dgpu/dgpu.go
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
/**
|
||||||
|
# Copyright 2024 NVIDIA CORPORATION
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
**/
|
||||||
|
|
||||||
|
package dgpu
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
|
||||||
|
|
||||||
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
||||||
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
||||||
|
)
|
||||||
|
|
||||||
|
// NewForDevice creates a discoverer for the specified Device.
|
||||||
|
func NewForDevice(d device.Device, opts ...Option) (discover.Discover, error) {
|
||||||
|
o := &options{}
|
||||||
|
for _, opt := range opts {
|
||||||
|
opt(o)
|
||||||
|
}
|
||||||
|
|
||||||
|
if o.logger == nil {
|
||||||
|
o.logger = logger.New()
|
||||||
|
}
|
||||||
|
|
||||||
|
return o.newNvmlDGPUDiscoverer(&toRequiredInfo{d})
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewForDevice creates a discoverer for the specified device and its associated MIG device.
|
||||||
|
func NewForMigDevice(d device.Device, mig device.MigDevice, opts ...Option) (discover.Discover, error) {
|
||||||
|
o := &options{}
|
||||||
|
for _, opt := range opts {
|
||||||
|
opt(o)
|
||||||
|
}
|
||||||
|
|
||||||
|
if o.logger == nil {
|
||||||
|
o.logger = logger.New()
|
||||||
|
}
|
||||||
|
|
||||||
|
return o.newNvmlMigDiscoverer(
|
||||||
|
&toRequiredMigInfo{
|
||||||
|
MigDevice: mig,
|
||||||
|
parent: &toRequiredInfo{d},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
}
|
168
internal/platform-support/dgpu/nvml.go
Normal file
168
internal/platform-support/dgpu/nvml.go
Normal file
@ -0,0 +1,168 @@
|
|||||||
|
/**
|
||||||
|
# Copyright 2024 NVIDIA CORPORATION
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
**/
|
||||||
|
|
||||||
|
package dgpu
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
|
||||||
|
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
||||||
|
|
||||||
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
||||||
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/info/drm"
|
||||||
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps"
|
||||||
|
)
|
||||||
|
|
||||||
|
// requiredInfo is the device information consumed when constructing the
// discoverers in this package.
type requiredInfo interface {
	// GetMinorNumber returns the minor number of the device.
	GetMinorNumber() (int, error)
	// GetPCIBusID returns the PCI bus ID of the device.
	GetPCIBusID() (string, error)
	// getDevNodePath returns the path of the device node for the device.
	getDevNodePath() (string, error)
}
|
||||||
|
|
||||||
|
func (o *options) newNvmlDGPUDiscoverer(d requiredInfo) (discover.Discover, error) {
|
||||||
|
path, err := d.getDevNodePath()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("error getting device node path: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
pciBusID, err := d.GetPCIBusID()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("error getting PCI info for device: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
drmDeviceNodes, err := drm.GetDeviceNodesByBusID(pciBusID)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to determine DRM devices for %v: %v", pciBusID, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
deviceNodePaths := append([]string{path}, drmDeviceNodes...)
|
||||||
|
|
||||||
|
deviceNodes := discover.NewCharDeviceDiscoverer(
|
||||||
|
o.logger,
|
||||||
|
o.devRoot,
|
||||||
|
deviceNodePaths,
|
||||||
|
)
|
||||||
|
|
||||||
|
byPathHooks := &byPathHookDiscoverer{
|
||||||
|
logger: o.logger,
|
||||||
|
devRoot: o.devRoot,
|
||||||
|
nvidiaCDIHookPath: o.nvidiaCDIHookPath,
|
||||||
|
pciBusID: pciBusID,
|
||||||
|
deviceNodes: deviceNodes,
|
||||||
|
}
|
||||||
|
|
||||||
|
dd := discover.Merge(
|
||||||
|
deviceNodes,
|
||||||
|
byPathHooks,
|
||||||
|
)
|
||||||
|
return dd, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// requiredMigInfo is the MIG device information consumed by
// newNvmlMigDiscoverer.
type requiredMigInfo interface {
	// getPlacementInfo returns the (gpu, gi, ci) values that are passed to
	// nvcaps to look up the GPU instance and compute instance capabilities.
	getPlacementInfo() (int, int, int, error)
	// getDevNodePath returns the path of the device node to include for the
	// MIG device.
	getDevNodePath() (string, error)
}
|
||||||
|
|
||||||
|
func (o *options) newNvmlMigDiscoverer(d requiredMigInfo) (discover.Discover, error) {
|
||||||
|
gpu, gi, ci, err := d.getPlacementInfo()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("error getting placement info: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
migCaps, err := nvcaps.NewMigCaps()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("error getting MIG capability device paths: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
giCap := nvcaps.NewGPUInstanceCap(gpu, gi)
|
||||||
|
giCapDevicePath, err := migCaps.GetCapDevicePath(giCap)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to get GI cap device path: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
ciCap := nvcaps.NewComputeInstanceCap(gpu, gi, ci)
|
||||||
|
ciCapDevicePath, err := migCaps.GetCapDevicePath(ciCap)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to get CI cap device path: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
parentPath, err := d.getDevNodePath()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
deviceNodes := discover.NewCharDeviceDiscoverer(
|
||||||
|
o.logger,
|
||||||
|
o.devRoot,
|
||||||
|
[]string{
|
||||||
|
parentPath,
|
||||||
|
giCapDevicePath,
|
||||||
|
ciCapDevicePath,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
return deviceNodes, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
type toRequiredInfo struct {
|
||||||
|
device.Device
|
||||||
|
}
|
||||||
|
|
||||||
|
func (d *toRequiredInfo) GetMinorNumber() (int, error) {
|
||||||
|
minor, ret := d.Device.GetMinorNumber()
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
return 0, ret
|
||||||
|
}
|
||||||
|
return minor, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (d *toRequiredInfo) getDevNodePath() (string, error) {
|
||||||
|
minor, err := d.GetMinorNumber()
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("error getting GPU device minor number: %w", err)
|
||||||
|
}
|
||||||
|
path := fmt.Sprintf("/dev/nvidia%d", minor)
|
||||||
|
return path, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
type toRequiredMigInfo struct {
|
||||||
|
device.MigDevice
|
||||||
|
parent requiredInfo
|
||||||
|
}
|
||||||
|
|
||||||
|
func (d *toRequiredMigInfo) getPlacementInfo() (int, int, int, error) {
|
||||||
|
gpu, ret := d.parent.GetMinorNumber()
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
return 0, 0, 0, fmt.Errorf("error getting GPU minor: %v", ret)
|
||||||
|
}
|
||||||
|
|
||||||
|
gi, ret := d.GetGpuInstanceId()
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
return 0, 0, 0, fmt.Errorf("error getting GPU Instance ID: %v", ret)
|
||||||
|
}
|
||||||
|
|
||||||
|
ci, ret := d.GetComputeInstanceId()
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
return 0, 0, 0, fmt.Errorf("error getting Compute Instance ID: %v", ret)
|
||||||
|
}
|
||||||
|
|
||||||
|
return gpu, gi, ci, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (d *toRequiredMigInfo) getDevNodePath() (string, error) {
|
||||||
|
return d.parent.getDevNodePath()
|
||||||
|
}
|
87
internal/platform-support/dgpu/nvml_test.go
Normal file
87
internal/platform-support/dgpu/nvml_test.go
Normal file
@ -0,0 +1,87 @@
|
|||||||
|
/**
|
||||||
|
# Copyright 2024 NVIDIA CORPORATION
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
**/
|
||||||
|
|
||||||
|
package dgpu
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
|
||||||
|
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
||||||
|
"github.com/NVIDIA/go-nvml/pkg/nvml/mock"
|
||||||
|
testlog "github.com/sirupsen/logrus/hooks/test"
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
|
|
||||||
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TODO: In order to properly test this, we need a mechanism to inject /
|
||||||
|
// override the char device discoverer.
|
||||||
|
func TestNewNvmlDGPUDiscoverer(t *testing.T) {
|
||||||
|
logger, _ := testlog.NewNullLogger()
|
||||||
|
|
||||||
|
nvmllib := &mock.Interface{}
|
||||||
|
devicelib := device.New(
|
||||||
|
nvmllib,
|
||||||
|
)
|
||||||
|
|
||||||
|
testCases := []struct {
|
||||||
|
description string
|
||||||
|
device nvml.Device
|
||||||
|
expectedError error
|
||||||
|
expectedDevices []discover.Device
|
||||||
|
expectedHooks []discover.Hook
|
||||||
|
expectedMounts []discover.Mount
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
description: "",
|
||||||
|
device: &mock.Device{
|
||||||
|
GetMinorNumberFunc: func() (int, nvml.Return) {
|
||||||
|
return 3, nvml.SUCCESS
|
||||||
|
},
|
||||||
|
GetPciInfoFunc: func() (nvml.PciInfo, nvml.Return) {
|
||||||
|
var busID [32]int8
|
||||||
|
for i, b := range []byte("00000000:45:00:00") {
|
||||||
|
busID[i] = int8(b)
|
||||||
|
}
|
||||||
|
info := nvml.PciInfo{
|
||||||
|
BusId: busID,
|
||||||
|
}
|
||||||
|
return info, nvml.SUCCESS
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range testCases {
|
||||||
|
t.Run(tc.description, func(t *testing.T) {
|
||||||
|
o := &options{logger: logger}
|
||||||
|
|
||||||
|
device, err := devicelib.NewDevice(tc.device)
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
d, err := o.newNvmlDGPUDiscoverer(&toRequiredInfo{device})
|
||||||
|
require.ErrorIs(t, err, tc.expectedError)
|
||||||
|
|
||||||
|
devices, _ := d.Devices()
|
||||||
|
require.EqualValues(t, tc.expectedDevices, devices)
|
||||||
|
hooks, _ := d.Hooks()
|
||||||
|
require.EqualValues(t, tc.expectedHooks, hooks)
|
||||||
|
mounts, _ := d.Mounts()
|
||||||
|
require.EqualValues(t, tc.expectedMounts, mounts)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
50
internal/platform-support/dgpu/options.go
Normal file
50
internal/platform-support/dgpu/options.go
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
/**
|
||||||
|
# Copyright 2024 NVIDIA CORPORATION
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
**/
|
||||||
|
|
||||||
|
package dgpu
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
||||||
|
)
|
||||||
|
|
||||||
|
type options struct {
|
||||||
|
logger logger.Interface
|
||||||
|
devRoot string
|
||||||
|
nvidiaCDIHookPath string
|
||||||
|
}
|
||||||
|
|
||||||
|
type Option func(*options)
|
||||||
|
|
||||||
|
// WithDevRoot sets the root where /dev is located.
|
||||||
|
func WithDevRoot(root string) Option {
|
||||||
|
return func(l *options) {
|
||||||
|
l.devRoot = root
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// WithLogger sets the logger for the library
|
||||||
|
func WithLogger(logger logger.Interface) Option {
|
||||||
|
return func(l *options) {
|
||||||
|
l.logger = logger
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// WithNVIDIACDIHookPath sets the path to the NVIDIA Container Toolkit CLI path for the library
|
||||||
|
func WithNVIDIACDIHookPath(path string) Option {
|
||||||
|
return func(l *options) {
|
||||||
|
l.nvidiaCDIHookPath = path
|
||||||
|
}
|
||||||
|
}
|
@ -18,19 +18,14 @@ package nvcdi
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
|
||||||
"path/filepath"
|
|
||||||
"strings"
|
|
||||||
|
|
||||||
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
|
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
|
||||||
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
|
||||||
"tags.cncf.io/container-device-interface/pkg/cdi"
|
"tags.cncf.io/container-device-interface/pkg/cdi"
|
||||||
"tags.cncf.io/container-device-interface/specs-go"
|
"tags.cncf.io/container-device-interface/specs-go"
|
||||||
|
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/edits"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/edits"
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/info/drm"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/dgpu"
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// GetGPUDeviceSpecs returns the CDI device specs for the full GPU represented by 'device'.
|
// GetGPUDeviceSpecs returns the CDI device specs for the full GPU represented by 'device'.
|
||||||
@ -58,7 +53,7 @@ func (l *nvmllib) GetGPUDeviceSpecs(i int, d device.Device) ([]specs.Device, err
|
|||||||
|
|
||||||
// GetGPUDeviceEdits returns the CDI edits for the full GPU represented by 'device'.
|
// GetGPUDeviceEdits returns the CDI edits for the full GPU represented by 'device'.
|
||||||
func (l *nvmllib) GetGPUDeviceEdits(d device.Device) (*cdi.ContainerEdits, error) {
|
func (l *nvmllib) GetGPUDeviceEdits(d device.Device) (*cdi.ContainerEdits, error) {
|
||||||
device, err := newFullGPUDiscoverer(l.logger, l.devRoot, l.nvidiaCDIHookPath, d)
|
device, err := l.newFullGPUDiscoverer(d)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed to create device discoverer: %v", err)
|
return nil, fmt.Errorf("failed to create device discoverer: %v", err)
|
||||||
}
|
}
|
||||||
@ -71,164 +66,28 @@ func (l *nvmllib) GetGPUDeviceEdits(d device.Device) (*cdi.ContainerEdits, error
|
|||||||
return editsForDevice, nil
|
return editsForDevice, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// byPathHookDiscoverer discovers the entities required for injecting by-path DRM device links
|
|
||||||
type byPathHookDiscoverer struct {
|
|
||||||
logger logger.Interface
|
|
||||||
devRoot string
|
|
||||||
nvidiaCDIHookPath string
|
|
||||||
pciBusID string
|
|
||||||
deviceNodes discover.Discover
|
|
||||||
}
|
|
||||||
|
|
||||||
var _ discover.Discover = (*byPathHookDiscoverer)(nil)
|
|
||||||
|
|
||||||
// newFullGPUDiscoverer creates a discoverer for the full GPU defined by the specified device.
|
// newFullGPUDiscoverer creates a discoverer for the full GPU defined by the specified device.
|
||||||
func newFullGPUDiscoverer(logger logger.Interface, devRoot string, nvidiaCDIHookPath string, d device.Device) (discover.Discover, error) {
|
func (l *nvmllib) newFullGPUDiscoverer(d device.Device) (discover.Discover, error) {
|
||||||
// TODO: The functionality to get device paths should be integrated into the go-nvlib/pkg/device.Device interface.
|
deviceNodes, err := dgpu.NewForDevice(d,
|
||||||
// This will allow reuse here and in other code where the paths are queried such as the NVIDIA device plugin.
|
dgpu.WithDevRoot(l.devRoot),
|
||||||
minor, ret := d.GetMinorNumber()
|
dgpu.WithLogger(l.logger),
|
||||||
if ret != nvml.SUCCESS {
|
dgpu.WithNVIDIACDIHookPath(l.nvidiaCDIHookPath),
|
||||||
return nil, fmt.Errorf("error getting GPU device minor number: %v", ret)
|
|
||||||
}
|
|
||||||
path := fmt.Sprintf("/dev/nvidia%d", minor)
|
|
||||||
|
|
||||||
pciInfo, ret := d.GetPciInfo()
|
|
||||||
if ret != nvml.SUCCESS {
|
|
||||||
return nil, fmt.Errorf("error getting PCI info for device: %v", ret)
|
|
||||||
}
|
|
||||||
pciBusID := getBusID(pciInfo)
|
|
||||||
|
|
||||||
drmDeviceNodes, err := drm.GetDeviceNodesByBusID(pciBusID)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to determine DRM devices for %v: %v", pciBusID, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
deviceNodePaths := append([]string{path}, drmDeviceNodes...)
|
|
||||||
|
|
||||||
deviceNodes := discover.NewCharDeviceDiscoverer(
|
|
||||||
logger,
|
|
||||||
devRoot,
|
|
||||||
deviceNodePaths,
|
|
||||||
)
|
)
|
||||||
|
if err != nil {
|
||||||
byPathHooks := &byPathHookDiscoverer{
|
return nil, fmt.Errorf("failed to create device discoverer: %v", err)
|
||||||
logger: logger,
|
|
||||||
devRoot: devRoot,
|
|
||||||
nvidiaCDIHookPath: nvidiaCDIHookPath,
|
|
||||||
pciBusID: pciBusID,
|
|
||||||
deviceNodes: deviceNodes,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
deviceFolderPermissionHooks := newDeviceFolderPermissionHookDiscoverer(
|
deviceFolderPermissionHooks := newDeviceFolderPermissionHookDiscoverer(
|
||||||
logger,
|
l.logger,
|
||||||
devRoot,
|
l.devRoot,
|
||||||
nvidiaCDIHookPath,
|
l.nvidiaCDIHookPath,
|
||||||
deviceNodes,
|
deviceNodes,
|
||||||
)
|
)
|
||||||
|
|
||||||
dd := discover.Merge(
|
dd := discover.Merge(
|
||||||
deviceNodes,
|
deviceNodes,
|
||||||
byPathHooks,
|
|
||||||
deviceFolderPermissionHooks,
|
deviceFolderPermissionHooks,
|
||||||
)
|
)
|
||||||
|
|
||||||
return dd, nil
|
return dd, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Devices returns the empty list for the by-path hook discoverer
|
|
||||||
func (d *byPathHookDiscoverer) Devices() ([]discover.Device, error) {
|
|
||||||
return nil, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// Hooks returns the hooks for the GPU device.
|
|
||||||
// The following hooks are detected:
|
|
||||||
// 1. A hook to create /dev/dri/by-path symlinks
|
|
||||||
func (d *byPathHookDiscoverer) Hooks() ([]discover.Hook, error) {
|
|
||||||
links, err := d.deviceNodeLinks()
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to discover DRA device links: %v", err)
|
|
||||||
}
|
|
||||||
if len(links) == 0 {
|
|
||||||
return nil, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
var args []string
|
|
||||||
for _, l := range links {
|
|
||||||
args = append(args, "--link", l)
|
|
||||||
}
|
|
||||||
|
|
||||||
hook := discover.CreateNvidiaCDIHook(
|
|
||||||
d.nvidiaCDIHookPath,
|
|
||||||
"create-symlinks",
|
|
||||||
args...,
|
|
||||||
)
|
|
||||||
|
|
||||||
return []discover.Hook{hook}, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// Mounts returns an empty slice for a full GPU
|
|
||||||
func (d *byPathHookDiscoverer) Mounts() ([]discover.Mount, error) {
|
|
||||||
return nil, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (d *byPathHookDiscoverer) deviceNodeLinks() ([]string, error) {
|
|
||||||
devices, err := d.deviceNodes.Devices()
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to discover device nodes: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(devices) == 0 {
|
|
||||||
return nil, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
selectedDevices := make(map[string]bool)
|
|
||||||
for _, d := range devices {
|
|
||||||
selectedDevices[d.HostPath] = true
|
|
||||||
}
|
|
||||||
|
|
||||||
candidates := []string{
|
|
||||||
fmt.Sprintf("/dev/dri/by-path/pci-%s-card", d.pciBusID),
|
|
||||||
fmt.Sprintf("/dev/dri/by-path/pci-%s-render", d.pciBusID),
|
|
||||||
}
|
|
||||||
|
|
||||||
var links []string
|
|
||||||
for _, c := range candidates {
|
|
||||||
linkPath := filepath.Join(d.devRoot, c)
|
|
||||||
device, err := os.Readlink(linkPath)
|
|
||||||
if err != nil {
|
|
||||||
d.logger.Warningf("Failed to evaluate symlink %v; ignoring", linkPath)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
deviceNode := device
|
|
||||||
if !filepath.IsAbs(device) {
|
|
||||||
deviceNode = filepath.Join(filepath.Dir(linkPath), device)
|
|
||||||
}
|
|
||||||
if !selectedDevices[deviceNode] {
|
|
||||||
d.logger.Debugf("ignoring device symlink %v -> %v since %v is not mounted", linkPath, device, deviceNode)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
d.logger.Debugf("adding device symlink %v -> %v", linkPath, device)
|
|
||||||
links = append(links, fmt.Sprintf("%v::%v", device, linkPath))
|
|
||||||
}
|
|
||||||
|
|
||||||
return links, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// getBusID provides a utility function that returns the string representation of the bus ID.
|
|
||||||
func getBusID(p nvml.PciInfo) string {
|
|
||||||
var bytes []byte
|
|
||||||
for _, b := range p.BusId {
|
|
||||||
if byte(b) == '\x00' {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
bytes = append(bytes, byte(b))
|
|
||||||
}
|
|
||||||
id := strings.ToLower(string(bytes))
|
|
||||||
|
|
||||||
if id != "0000" {
|
|
||||||
id = strings.TrimPrefix(id, "0000")
|
|
||||||
}
|
|
||||||
|
|
||||||
return id
|
|
||||||
}
|
|
||||||
|
@ -20,14 +20,11 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
|
|
||||||
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
|
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
|
||||||
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
|
||||||
"tags.cncf.io/container-device-interface/pkg/cdi"
|
"tags.cncf.io/container-device-interface/pkg/cdi"
|
||||||
"tags.cncf.io/container-device-interface/specs-go"
|
"tags.cncf.io/container-device-interface/specs-go"
|
||||||
|
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/edits"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/edits"
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/dgpu"
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// GetMIGDeviceSpecs returns the CDI device specs for the full GPU represented by 'device'.
|
// GetMIGDeviceSpecs returns the CDI device specs for the full GPU represented by 'device'.
|
||||||
@ -54,74 +51,19 @@ func (l *nvmllib) GetMIGDeviceSpecs(i int, d device.Device, j int, mig device.Mi
|
|||||||
|
|
||||||
// GetMIGDeviceEdits returns the CDI edits for the MIG device represented by 'mig' on 'parent'.
|
// GetMIGDeviceEdits returns the CDI edits for the MIG device represented by 'mig' on 'parent'.
|
||||||
func (l *nvmllib) GetMIGDeviceEdits(parent device.Device, mig device.MigDevice) (*cdi.ContainerEdits, error) {
|
func (l *nvmllib) GetMIGDeviceEdits(parent device.Device, mig device.MigDevice) (*cdi.ContainerEdits, error) {
|
||||||
gpu, ret := parent.GetMinorNumber()
|
deviceNodes, err := dgpu.NewForMigDevice(parent, mig,
|
||||||
if ret != nvml.SUCCESS {
|
dgpu.WithDevRoot(l.devRoot),
|
||||||
return nil, fmt.Errorf("error getting GPU minor: %v", ret)
|
dgpu.WithLogger(l.logger),
|
||||||
}
|
dgpu.WithNVIDIACDIHookPath(l.nvidiaCDIHookPath),
|
||||||
|
)
|
||||||
gi, ret := mig.GetGpuInstanceId()
|
|
||||||
if ret != nvml.SUCCESS {
|
|
||||||
return nil, fmt.Errorf("error getting GPU Instance ID: %v", ret)
|
|
||||||
}
|
|
||||||
|
|
||||||
ci, ret := mig.GetComputeInstanceId()
|
|
||||||
if ret != nvml.SUCCESS {
|
|
||||||
return nil, fmt.Errorf("error getting Compute Instance ID: %v", ret)
|
|
||||||
}
|
|
||||||
|
|
||||||
editsForDevice, err := l.GetEditsForComputeInstance(gpu, gi, ci)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed to create container edits for MIG device: %v", err)
|
return nil, fmt.Errorf("failed to create device discoverer: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
return editsForDevice, nil
|
editsForDevice, err := edits.FromDiscoverer(deviceNodes)
|
||||||
}
|
|
||||||
|
|
||||||
// GetEditsForComputeInstance returns the CDI edits for a particular compute instance defined by the (gpu, gi, ci) tuple
|
|
||||||
func (l *nvmllib) GetEditsForComputeInstance(gpu int, gi int, ci int) (*cdi.ContainerEdits, error) {
|
|
||||||
computeInstance, err := newComputeInstanceDiscoverer(l.logger, l.devRoot, gpu, gi, ci)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to create discoverer for Compute Instance: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
editsForDevice, err := edits.FromDiscoverer(computeInstance)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed to create container edits for Compute Instance: %v", err)
|
return nil, fmt.Errorf("failed to create container edits for Compute Instance: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
return editsForDevice, nil
|
return editsForDevice, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// newComputeInstanceDiscoverer returns a discoverer for the specified compute instance
|
|
||||||
func newComputeInstanceDiscoverer(logger logger.Interface, devRoot string, gpu int, gi int, ci int) (discover.Discover, error) {
|
|
||||||
parentPath := fmt.Sprintf("/dev/nvidia%d", gpu)
|
|
||||||
|
|
||||||
migCaps, err := nvcaps.NewMigCaps()
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("error getting MIG capability device paths: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
giCap := nvcaps.NewGPUInstanceCap(gpu, gi)
|
|
||||||
giCapDevicePath, err := migCaps.GetCapDevicePath(giCap)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to get GI cap device path: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
ciCap := nvcaps.NewComputeInstanceCap(gpu, gi, ci)
|
|
||||||
ciCapDevicePath, err := migCaps.GetCapDevicePath(ciCap)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to get CI cap device path: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
deviceNodes := discover.NewCharDeviceDiscoverer(
|
|
||||||
logger,
|
|
||||||
devRoot,
|
|
||||||
[]string{
|
|
||||||
parentPath,
|
|
||||||
giCapDevicePath,
|
|
||||||
ciCapDevicePath,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
return deviceNodes, nil
|
|
||||||
}
|
|
||||||
|
Loading…
Reference in New Issue
Block a user