mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2024-11-30 00:11:59 +00:00
149 lines
4.1 KiB
Go
149 lines
4.1 KiB
Go
|
/**
|
||
|
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
|
||
|
#
|
||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
# you may not use this file except in compliance with the License.
|
||
|
# You may obtain a copy of the License at
|
||
|
#
|
||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||
|
#
|
||
|
# Unless required by applicable law or agreed to in writing, software
|
||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
# See the License for the specific language governing permissions and
|
||
|
# limitations under the License.
|
||
|
**/
|
||
|
|
||
|
package generate
|
||
|
|
||
|
import (
|
||
|
"fmt"
|
||
|
"os"
|
||
|
"path/filepath"
|
||
|
"strings"
|
||
|
|
||
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
||
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/info/drm"
|
||
|
"github.com/sirupsen/logrus"
|
||
|
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
|
||
|
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
|
||
|
)
|
||
|
|
||
|
// fullGPUDiscoverer wraps a deviceDiscoverer and adds specifics required for discovering full GPUs
|
||
|
type fullGPUDiscoverer struct {
|
||
|
deviceDiscoverer
|
||
|
|
||
|
pciBusID string
|
||
|
}
|
||
|
|
||
|
var _ discover.Discover = (*fullGPUDiscoverer)(nil)
|
||
|
|
||
|
// NewFullGPUDiscoverer creates a discoverer for the full GPU defined by the specified device.
|
||
|
func NewFullGPUDiscoverer(logger *logrus.Logger, root string, d device.Device) (discover.Discover, error) {
|
||
|
// TODO: The functionality to get device paths should be integrated into the go-nvlib/pkg/device.Device interface.
|
||
|
// This will allow reuse here and in other code where the paths are queried such as the NVIDIA device plugin.
|
||
|
minor, ret := d.GetMinorNumber()
|
||
|
if ret != nvml.SUCCESS {
|
||
|
return nil, fmt.Errorf("error getting GPU device minor number: %v", ret)
|
||
|
}
|
||
|
path := fmt.Sprintf("/dev/nvidia%d", minor)
|
||
|
|
||
|
pciInfo, ret := d.GetPciInfo()
|
||
|
if ret != nvml.SUCCESS {
|
||
|
return nil, fmt.Errorf("error getting PCI info for device: %v", ret)
|
||
|
}
|
||
|
pciBusID := getBusID(pciInfo)
|
||
|
|
||
|
drmDeviceNodes, err := drm.GetDeviceNodesByBusID(pciBusID)
|
||
|
if err != nil {
|
||
|
return nil, fmt.Errorf("failed to determine DRM devices for %v: %v", pciBusID, err)
|
||
|
}
|
||
|
|
||
|
deviceNodePaths := append([]string{path}, drmDeviceNodes...)
|
||
|
|
||
|
device := fullGPUDiscoverer{
|
||
|
deviceDiscoverer: deviceDiscoverer{
|
||
|
logger: logger,
|
||
|
root: root,
|
||
|
deviceNodePaths: deviceNodePaths,
|
||
|
},
|
||
|
pciBusID: pciBusID,
|
||
|
}
|
||
|
|
||
|
return &device, nil
|
||
|
}
|
||
|
|
||
|
// Hooks returns the hooks for the GPU device.
|
||
|
// The following hooks are detected:
|
||
|
// 1. A hook to create /dev/dri/by-path symlinks
|
||
|
func (d *fullGPUDiscoverer) Hooks() ([]discover.Hook, error) {
|
||
|
links, err := d.deviceNodeLinks()
|
||
|
if err != nil {
|
||
|
return nil, fmt.Errorf("failed to discover DRA device links: %v", err)
|
||
|
}
|
||
|
if len(links) == 0 {
|
||
|
return nil, nil
|
||
|
}
|
||
|
|
||
|
hookPath := "nvidia-ctk"
|
||
|
args := []string{hookPath, "hook", "create-symlinks"}
|
||
|
for _, l := range links {
|
||
|
args = append(args, "--link", l)
|
||
|
}
|
||
|
|
||
|
var hooks []discover.Hook
|
||
|
hook := discover.Hook{
|
||
|
Lifecycle: "createContainer",
|
||
|
Path: hookPath,
|
||
|
Args: args,
|
||
|
}
|
||
|
hooks = append(hooks, hook)
|
||
|
|
||
|
return hooks, nil
|
||
|
}
|
||
|
|
||
|
// Mounts returns an empty slice for a full GPU
|
||
|
func (d *fullGPUDiscoverer) Mounts() ([]discover.Mount, error) {
|
||
|
return nil, nil
|
||
|
}
|
||
|
|
||
|
func (d *fullGPUDiscoverer) deviceNodeLinks() ([]string, error) {
|
||
|
candidates := []string{
|
||
|
fmt.Sprintf("/dev/dri/by-path/pci-%s-card", d.pciBusID),
|
||
|
fmt.Sprintf("/dev/dri/by-path/pci-%s-render", d.pciBusID),
|
||
|
}
|
||
|
|
||
|
var links []string
|
||
|
for _, c := range candidates {
|
||
|
linkPath := filepath.Join(d.root, c)
|
||
|
device, err := os.Readlink(linkPath)
|
||
|
if err != nil {
|
||
|
d.logger.Warningf("Failed to evaluate symlink %v; ignoring", linkPath)
|
||
|
continue
|
||
|
}
|
||
|
|
||
|
d.logger.Debugf("adding device symlink %v -> %v", linkPath, device)
|
||
|
links = append(links, fmt.Sprintf("%v::%v", device, linkPath))
|
||
|
}
|
||
|
|
||
|
return links, nil
|
||
|
}
|
||
|
|
||
|
// getBusID provides a utility function that returns the string representation of the bus ID.
|
||
|
func getBusID(p nvml.PciInfo) string {
|
||
|
var bytes []byte
|
||
|
for _, b := range p.BusId {
|
||
|
if byte(b) == '\x00' {
|
||
|
break
|
||
|
}
|
||
|
bytes = append(bytes, byte(b))
|
||
|
}
|
||
|
id := strings.ToLower(string(bytes))
|
||
|
|
||
|
if id != "0000" {
|
||
|
id = strings.TrimPrefix(id, "0000")
|
||
|
}
|
||
|
|
||
|
return id
|
||
|
}
|