From 4ccb0b9a5392f3fad365d701119898631280a022 Mon Sep 17 00:00:00 2001
From: Evan Lezar
Date: Mon, 13 Feb 2023 16:04:30 +0100
Subject: [PATCH 1/2] Add and resolve auto discovery mode for cdi generation

Signed-off-by: Evan Lezar
---
 cmd/nvidia-ctk/cdi/generate/generate.go |  6 ++++--
 pkg/nvcdi/lib.go                        | 26 +++++++++++++++++++++++--
 2 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/cmd/nvidia-ctk/cdi/generate/generate.go b/cmd/nvidia-ctk/cdi/generate/generate.go
index 9daf97ca..7aef47b8 100644
--- a/cmd/nvidia-ctk/cdi/generate/generate.go
+++ b/cmd/nvidia-ctk/cdi/generate/generate.go
@@ -36,6 +36,7 @@ import (
 )
 
 const (
+    discoveryModeAuto = "auto"
     discoveryModeNVML = "nvml"
     discoveryModeWSL  = "wsl"
 
@@ -96,8 +97,8 @@ func (m command) build() *cli.Command {
         },
         &cli.StringFlag{
             Name:        "discovery-mode",
-            Usage:       "The mode to use when discovering the available entities. One of [nvml | wsl]",
-            Value:       discoveryModeNVML,
+            Usage:       "The mode to use when discovering the available entities. One of [auto | nvml | wsl]. If mode is set to 'auto', the mode will be determined based on the system configuration.",
+            Value:       discoveryModeAuto,
             Destination: &cfg.discoveryMode,
         },
         &cli.StringFlag{
@@ -132,6 +133,7 @@ func (m command) validateFlags(r *cli.Context, cfg *config) error {
     cfg.discoveryMode = strings.ToLower(cfg.discoveryMode)
     switch cfg.discoveryMode {
+    case discoveryModeAuto:
     case discoveryModeNVML:
     case discoveryModeWSL:
     default:
diff --git a/pkg/nvcdi/lib.go b/pkg/nvcdi/lib.go
index 4081e524..f3d7dba0 100644
--- a/pkg/nvcdi/lib.go
+++ b/pkg/nvcdi/lib.go
@@ -19,6 +19,7 @@ package nvcdi
 import (
     "github.com/sirupsen/logrus"
     "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
+    "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/info"
     "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
 )
 
@@ -39,7 +40,7 @@ func New(opts ...Option) Interface {
         opt(l)
     }
     if l.mode == "" {
-        l.mode = "nvml"
+        l.mode = "auto"
     }
     if l.logger == nil {
         l.logger = logrus.StandardLogger()
@@ -54,7 +55,7 @@ func New(opts ...Option) Interface {
         l.nvidiaCTKPath = "/usr/bin/nvidia-ctk"
     }
 
-    switch l.mode {
+    switch l.resolveMode() {
     case "nvml":
         if l.nvmllib == nil {
             l.nvmllib = nvml.New()
@@ -71,3 +72,24 @@ func New(opts ...Option) Interface {
     // TODO: We want an error here.
     return nil
 }
+
+// resolveMode resolves the mode for CDI spec generation based on the current system.
+func (l *nvcdilib) resolveMode() (rmode string) {
+    if l.mode != "auto" {
+        return l.mode
+    }
+    defer func() {
+        l.logger.Infof("Auto-detected mode as %q", rmode)
+    }()
+
+    nvinfo := info.New()
+
+    isWSL, reason := nvinfo.HasDXCore()
+    l.logger.Debugf("Is WSL-based system? %v: %v", isWSL, reason)
+
+    if isWSL {
+        return "wsl"
+    }
+
+    return "nvml"
+}

From d9859d66bf11bdea97203205716dd7e6bf6ef871 Mon Sep 17 00:00:00 2001
From: Evan Lezar
Date: Mon, 13 Feb 2023 16:05:26 +0100
Subject: [PATCH 2/2] Update go vendoring

Signed-off-by: Evan Lezar
---
 go.mod                                        |    4 +-
 go.sum                                        |    4 +
 .../NVIDIA/go-nvml/pkg/nvml/const.go          |  320 +++-
 .../NVIDIA/go-nvml/pkg/nvml/device.go         |  363 ++++-
 .../NVIDIA/go-nvml/pkg/nvml/nvml.go           |  387 +++++
 .../github.com/NVIDIA/go-nvml/pkg/nvml/nvml.h | 1336 ++++++++++++++++-
 .../NVIDIA/go-nvml/pkg/nvml/types_gen.go      |  138 ++
 .../NVIDIA/go-nvml/pkg/nvml/vgpu.go           |   18 +
 .../go-nvlib/pkg/nvlib/info/info.go           |   36 +-
 vendor/modules.txt                            |    4 +-
 10 files changed, 2502 insertions(+), 108 deletions(-)

diff --git a/go.mod b/go.mod
index 597d84e5..a0368e77 100644
--- a/go.mod
+++ b/go.mod
@@ -4,7 +4,7 @@ go 1.18
 
 require (
     github.com/BurntSushi/toml v1.0.0
-    github.com/NVIDIA/go-nvml v0.11.6-0.0.20220823120812-7e2082095e82
+    github.com/NVIDIA/go-nvml v0.12.0-0
     github.com/container-orchestrated-devices/container-device-interface v0.5.4-0.20230111111500-5b3b5d81179a
     github.com/fsnotify/fsnotify v1.5.4
     github.com/opencontainers/runtime-spec v1.0.3-0.20220825212826-86290f6a00fb
@@ -12,7 +12,7 @@
     github.com/sirupsen/logrus v1.9.0
     github.com/stretchr/testify v1.7.0
     github.com/urfave/cli/v2 v2.3.0
-    gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230119114711-6fe07bb33342
+    gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230209143738-95328d8c4438
     golang.org/x/mod v0.5.0
     golang.org/x/sys v0.0.0-20220927170352-d9d178bc13c6
     sigs.k8s.io/yaml v1.3.0
diff --git a/go.sum b/go.sum
index e4d04212..c37db91d 100644
--- a/go.sum
+++ b/go.sum
@@ -3,6 +3,8 @@ github.com/BurntSushi/toml v1.0.0 h1:dtDWrepsVPfW9H/4y7dDgFc2MBUSeJhlaDtK13CxFlU
 github.com/BurntSushi/toml v1.0.0/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ=
 github.com/NVIDIA/go-nvml v0.11.6-0.0.20220823120812-7e2082095e82 h1:x751Xx1tdxkiA/sdkv2J769n21UbYKzVOpe9S/h1M3k=
 github.com/NVIDIA/go-nvml v0.11.6-0.0.20220823120812-7e2082095e82/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs=
+github.com/NVIDIA/go-nvml v0.12.0-0 h1:eHYNHbzAsMgWYshf6dEmTY66/GCXnORJFnzm3TNH4mc=
+github.com/NVIDIA/go-nvml v0.12.0-0/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs=
 github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM=
 github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ=
 github.com/checkpoint-restore/go-criu/v5 v5.3.0/go.mod h1:E/eQpaFtUKGOOSEBZgmKAcn+zUUwWxqcaKZlF54wK8E=
@@ -88,6 +90,8 @@ github.com/xeipuuv/gojsonschema v1.2.0 h1:LhYJRs+L4fBtjZUfuSZIKGeVu0QRy8e5Xi7D17
 github.com/xeipuuv/gojsonschema v1.2.0/go.mod h1:anYRn/JVcOK2ZgGU+IjEV4nwlhoK5sQluxsYJ78Id3Y=
 gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230119114711-6fe07bb33342 h1:083n9fJt2dWOpJd/X/q9Xgl5XtQLL22uSFYbzVqJssg=
 gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230119114711-6fe07bb33342/go.mod h1:GStidGxhaqJhYFW1YpOnLvYCbL2EsM0od7IW4u7+JgU=
+gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230209143738-95328d8c4438 h1:+qRai7XRl8omFQVCeHcaWzL542Yw64vfmuXG+79ZCIc=
+gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230209143738-95328d8c4438/go.mod h1:GStidGxhaqJhYFW1YpOnLvYCbL2EsM0od7IW4u7+JgU=
 golang.org/x/mod v0.5.0 h1:UG21uOlmZabA4fW5i7ZX6bjw1xELEGg/ZLgZq9auk/Q=
 golang.org/x/mod v0.5.0/go.mod h1:5OXOZSfqPIIbmVBIIKWRFfZjPR0E5r58TLhUjH0a2Ro=
 golang.org/x/net v0.0.0-20201224014010-6772e930b67b/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
diff --git
a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/const.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/const.go index 420f0ae9..1a0efaf6 100644 --- a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/const.go +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/const.go @@ -44,11 +44,13 @@ const ( // DEVICE_PCI_BUS_ID_FMT as defined in nvml/nvml.h DEVICE_PCI_BUS_ID_FMT = "%08X:%02X:%02X.0" // NVLINK_MAX_LINKS as defined in nvml/nvml.h - NVLINK_MAX_LINKS = 12 + NVLINK_MAX_LINKS = 18 // TOPOLOGY_CPU as defined in nvml/nvml.h TOPOLOGY_CPU = 0 // MAX_PHYSICAL_BRIDGE as defined in nvml/nvml.h MAX_PHYSICAL_BRIDGE = 128 + // MAX_THERMAL_SENSORS_PER_GPU as defined in nvml/nvml.h + MAX_THERMAL_SENSORS_PER_GPU = 3 // FlagDefault as defined in nvml/nvml.h FlagDefault = 0 // FlagForce as defined in nvml/nvml.h @@ -57,6 +59,8 @@ const ( SINGLE_BIT_ECC = 0 // DOUBLE_BIT_ECC as defined in nvml/nvml.h DOUBLE_BIT_ECC = 0 + // MAX_GPU_PERF_PSTATES as defined in nvml/nvml.h + MAX_GPU_PERF_PSTATES = 16 // GRID_LICENSE_EXPIRY_NOT_AVAILABLE as defined in nvml/nvml.h GRID_LICENSE_EXPIRY_NOT_AVAILABLE = 0 // GRID_LICENSE_EXPIRY_INVALID as defined in nvml/nvml.h @@ -73,6 +77,18 @@ const ( VGPU_NAME_BUFFER_SIZE = 64 // GRID_LICENSE_FEATURE_MAX_COUNT as defined in nvml/nvml.h GRID_LICENSE_FEATURE_MAX_COUNT = 3 + // VGPU_SCHEDULER_POLICY_UNKNOWN as defined in nvml/nvml.h + VGPU_SCHEDULER_POLICY_UNKNOWN = 0 + // VGPU_SCHEDULER_POLICY_BEST_EFFORT as defined in nvml/nvml.h + VGPU_SCHEDULER_POLICY_BEST_EFFORT = 1 + // VGPU_SCHEDULER_POLICY_EQUAL_SHARE as defined in nvml/nvml.h + VGPU_SCHEDULER_POLICY_EQUAL_SHARE = 2 + // VGPU_SCHEDULER_POLICY_FIXED_SHARE as defined in nvml/nvml.h + VGPU_SCHEDULER_POLICY_FIXED_SHARE = 3 + // SUPPORTED_VGPU_SCHEDULER_POLICY_COUNT as defined in nvml/nvml.h + SUPPORTED_VGPU_SCHEDULER_POLICY_COUNT = 3 + // SCHEDULER_SW_MAX_LOG_ENTRIES as defined in nvml/nvml.h + SCHEDULER_SW_MAX_LOG_ENTRIES = 200 // GRID_LICENSE_STATE_UNKNOWN as defined in nvml/nvml.h GRID_LICENSE_STATE_UNKNOWN = 0 // GRID_LICENSE_STATE_UNINITIALIZED as defined in nvml/nvml.h @@ -85,6 +101,8 @@ const ( GRID_LICENSE_STATE_UNLICENSED = 4 // GRID_LICENSE_STATE_LICENSED as defined in nvml/nvml.h GRID_LICENSE_STATE_LICENSED = 5 + // GSP_FIRMWARE_VERSION_BUF_SIZE as defined in nvml/nvml.h + GSP_FIRMWARE_VERSION_BUF_SIZE = 64 // DEVICE_ARCH_KEPLER as defined in nvml/nvml.h DEVICE_ARCH_KEPLER = 2 // DEVICE_ARCH_MAXWELL as defined in nvml/nvml.h @@ -97,6 +115,10 @@ const ( DEVICE_ARCH_TURING = 6 // DEVICE_ARCH_AMPERE as defined in nvml/nvml.h DEVICE_ARCH_AMPERE = 7 + // DEVICE_ARCH_ADA as defined in nvml/nvml.h + DEVICE_ARCH_ADA = 8 + // DEVICE_ARCH_HOPPER as defined in nvml/nvml.h + DEVICE_ARCH_HOPPER = 9 // DEVICE_ARCH_UNKNOWN as defined in nvml/nvml.h DEVICE_ARCH_UNKNOWN = 4294967295 // BUS_TYPE_UNKNOWN as defined in nvml/nvml.h @@ -109,6 +131,10 @@ const ( BUS_TYPE_FPCI = 3 // BUS_TYPE_AGP as defined in nvml/nvml.h BUS_TYPE_AGP = 4 + // FAN_POLICY_TEMPERATURE_CONTINOUS_SW as defined in nvml/nvml.h + FAN_POLICY_TEMPERATURE_CONTINOUS_SW = 0 + // FAN_POLICY_MANUAL as defined in nvml/nvml.h + FAN_POLICY_MANUAL = 1 // POWER_SOURCE_AC as defined in nvml/nvml.h POWER_SOURCE_AC = 0 // POWER_SOURCE_BATTERY as defined in nvml/nvml.h @@ -125,10 +151,14 @@ const ( PCIE_LINK_MAX_SPEED_16000MBPS = 4 // PCIE_LINK_MAX_SPEED_32000MBPS as defined in nvml/nvml.h PCIE_LINK_MAX_SPEED_32000MBPS = 5 + // PCIE_LINK_MAX_SPEED_64000MBPS as defined in nvml/nvml.h + PCIE_LINK_MAX_SPEED_64000MBPS = 6 // ADAPTIVE_CLOCKING_INFO_STATUS_DISABLED as defined in nvml/nvml.h 
ADAPTIVE_CLOCKING_INFO_STATUS_DISABLED = 0 // ADAPTIVE_CLOCKING_INFO_STATUS_ENABLED as defined in nvml/nvml.h ADAPTIVE_CLOCKING_INFO_STATUS_ENABLED = 1 + // MAX_GPU_UTILIZATIONS as defined in nvml/nvml.h + MAX_GPU_UTILIZATIONS = 8 // FI_DEV_ECC_CURRENT as defined in nvml/nvml.h FI_DEV_ECC_CURRENT = 1 // FI_DEV_ECC_PENDING as defined in nvml/nvml.h @@ -449,8 +479,26 @@ const ( FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L11 = 159 // FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_TOTAL as defined in nvml/nvml.h FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_TOTAL = 160 + // FI_DEV_NVLINK_ERROR_DL_REPLAY as defined in nvml/nvml.h + FI_DEV_NVLINK_ERROR_DL_REPLAY = 161 + // FI_DEV_NVLINK_ERROR_DL_RECOVERY as defined in nvml/nvml.h + FI_DEV_NVLINK_ERROR_DL_RECOVERY = 162 + // FI_DEV_NVLINK_ERROR_DL_CRC as defined in nvml/nvml.h + FI_DEV_NVLINK_ERROR_DL_CRC = 163 + // FI_DEV_NVLINK_GET_SPEED as defined in nvml/nvml.h + FI_DEV_NVLINK_GET_SPEED = 164 + // FI_DEV_NVLINK_GET_STATE as defined in nvml/nvml.h + FI_DEV_NVLINK_GET_STATE = 165 + // FI_DEV_NVLINK_GET_VERSION as defined in nvml/nvml.h + FI_DEV_NVLINK_GET_VERSION = 166 + // FI_DEV_NVLINK_GET_POWER_STATE as defined in nvml/nvml.h + FI_DEV_NVLINK_GET_POWER_STATE = 167 + // FI_DEV_NVLINK_GET_POWER_THRESHOLD as defined in nvml/nvml.h + FI_DEV_NVLINK_GET_POWER_THRESHOLD = 168 + // FI_DEV_PCIE_L0_TO_RECOVERY_COUNTER as defined in nvml/nvml.h + FI_DEV_PCIE_L0_TO_RECOVERY_COUNTER = 169 // FI_MAX as defined in nvml/nvml.h - FI_MAX = 161 + FI_MAX = 170 // EventTypeSingleBitEccError as defined in nvml/nvml.h EventTypeSingleBitEccError = 1 // EventTypeDoubleBitEccError as defined in nvml/nvml.h @@ -503,6 +551,16 @@ const ( NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_INFINITE = 8 // NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_TIMEOUT as defined in nvml/nvml.h NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_TIMEOUT = 16 + // GPU_FABRIC_UUID_LEN as defined in nvml/nvml.h + GPU_FABRIC_UUID_LEN = 16 + // GPU_FABRIC_STATE_NOT_SUPPORTED as defined in nvml/nvml.h + GPU_FABRIC_STATE_NOT_SUPPORTED = 0 + // GPU_FABRIC_STATE_NOT_STARTED as defined in nvml/nvml.h + GPU_FABRIC_STATE_NOT_STARTED = 1 + // GPU_FABRIC_STATE_IN_PROGRESS as defined in nvml/nvml.h + GPU_FABRIC_STATE_IN_PROGRESS = 2 + // GPU_FABRIC_STATE_COMPLETED as defined in nvml/nvml.h + GPU_FABRIC_STATE_COMPLETED = 3 // INIT_FLAG_NO_GPUS as defined in nvml/nvml.h INIT_FLAG_NO_GPUS = 1 // INIT_FLAG_NO_ATTACH as defined in nvml/nvml.h @@ -551,8 +609,12 @@ const ( GPU_INSTANCE_PROFILE_6_SLICE = 6 // GPU_INSTANCE_PROFILE_1_SLICE_REV1 as defined in nvml/nvml.h GPU_INSTANCE_PROFILE_1_SLICE_REV1 = 7 + // GPU_INSTANCE_PROFILE_2_SLICE_REV1 as defined in nvml/nvml.h + GPU_INSTANCE_PROFILE_2_SLICE_REV1 = 8 + // GPU_INSTANCE_PROFILE_1_SLICE_REV2 as defined in nvml/nvml.h + GPU_INSTANCE_PROFILE_1_SLICE_REV2 = 9 // GPU_INSTANCE_PROFILE_COUNT as defined in nvml/nvml.h - GPU_INSTANCE_PROFILE_COUNT = 8 + GPU_INSTANCE_PROFILE_COUNT = 10 // COMPUTE_INSTANCE_PROFILE_1_SLICE as defined in nvml/nvml.h COMPUTE_INSTANCE_PROFILE_1_SLICE = 0 // COMPUTE_INSTANCE_PROFILE_2_SLICE as defined in nvml/nvml.h @@ -567,12 +629,32 @@ const ( COMPUTE_INSTANCE_PROFILE_8_SLICE = 5 // COMPUTE_INSTANCE_PROFILE_6_SLICE as defined in nvml/nvml.h COMPUTE_INSTANCE_PROFILE_6_SLICE = 6 + // COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1 as defined in nvml/nvml.h + COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1 = 7 // COMPUTE_INSTANCE_PROFILE_COUNT as defined in nvml/nvml.h - COMPUTE_INSTANCE_PROFILE_COUNT = 7 + COMPUTE_INSTANCE_PROFILE_COUNT = 8 // COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED as defined in nvml/nvml.h 
COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED = 0 // COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT as defined in nvml/nvml.h COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT = 1 + // GPM_METRICS_GET_VERSION as defined in nvml/nvml.h + GPM_METRICS_GET_VERSION = 1 + // GPM_SUPPORT_VERSION as defined in nvml/nvml.h + GPM_SUPPORT_VERSION = 1 + // COUNTER_COLLECTION_UNIT_STREAM_STATE_DISABLE as defined in nvml/nvml.h + COUNTER_COLLECTION_UNIT_STREAM_STATE_DISABLE = 0 + // COUNTER_COLLECTION_UNIT_STREAM_STATE_ENABLE as defined in nvml/nvml.h + COUNTER_COLLECTION_UNIT_STREAM_STATE_ENABLE = 1 + // NVLINK_POWER_STATE_HIGH_SPEED as defined in nvml/nvml.h + NVLINK_POWER_STATE_HIGH_SPEED = 0 + // NVLINK_POWER_STATE_LOW as defined in nvml/nvml.h + NVLINK_POWER_STATE_LOW = 1 + // NVLINK_LOW_POWER_THRESHOLD_MIN as defined in nvml/nvml.h + NVLINK_LOW_POWER_THRESHOLD_MIN = 1 + // NVLINK_LOW_POWER_THRESHOLD_MAX as defined in nvml/nvml.h + NVLINK_LOW_POWER_THRESHOLD_MAX = 8191 + // NVLINK_LOW_POWER_THRESHOLD_RESET as defined in nvml/nvml.h + NVLINK_LOW_POWER_THRESHOLD_RESET = 4294967295 ) // BridgeChipType as declared in nvml/nvml.h @@ -918,32 +1000,34 @@ type Return int32 // Return enumeration from nvml/nvml.h const ( - SUCCESS Return = iota - ERROR_UNINITIALIZED Return = 1 - ERROR_INVALID_ARGUMENT Return = 2 - ERROR_NOT_SUPPORTED Return = 3 - ERROR_NO_PERMISSION Return = 4 - ERROR_ALREADY_INITIALIZED Return = 5 - ERROR_NOT_FOUND Return = 6 - ERROR_INSUFFICIENT_SIZE Return = 7 - ERROR_INSUFFICIENT_POWER Return = 8 - ERROR_DRIVER_NOT_LOADED Return = 9 - ERROR_TIMEOUT Return = 10 - ERROR_IRQ_ISSUE Return = 11 - ERROR_LIBRARY_NOT_FOUND Return = 12 - ERROR_FUNCTION_NOT_FOUND Return = 13 - ERROR_CORRUPTED_INFOROM Return = 14 - ERROR_GPU_IS_LOST Return = 15 - ERROR_RESET_REQUIRED Return = 16 - ERROR_OPERATING_SYSTEM Return = 17 - ERROR_LIB_RM_VERSION_MISMATCH Return = 18 - ERROR_IN_USE Return = 19 - ERROR_MEMORY Return = 20 - ERROR_NO_DATA Return = 21 - ERROR_VGPU_ECC_NOT_SUPPORTED Return = 22 - ERROR_INSUFFICIENT_RESOURCES Return = 23 - ERROR_FREQ_NOT_SUPPORTED Return = 24 - ERROR_UNKNOWN Return = 999 + SUCCESS Return = iota + ERROR_UNINITIALIZED Return = 1 + ERROR_INVALID_ARGUMENT Return = 2 + ERROR_NOT_SUPPORTED Return = 3 + ERROR_NO_PERMISSION Return = 4 + ERROR_ALREADY_INITIALIZED Return = 5 + ERROR_NOT_FOUND Return = 6 + ERROR_INSUFFICIENT_SIZE Return = 7 + ERROR_INSUFFICIENT_POWER Return = 8 + ERROR_DRIVER_NOT_LOADED Return = 9 + ERROR_TIMEOUT Return = 10 + ERROR_IRQ_ISSUE Return = 11 + ERROR_LIBRARY_NOT_FOUND Return = 12 + ERROR_FUNCTION_NOT_FOUND Return = 13 + ERROR_CORRUPTED_INFOROM Return = 14 + ERROR_GPU_IS_LOST Return = 15 + ERROR_RESET_REQUIRED Return = 16 + ERROR_OPERATING_SYSTEM Return = 17 + ERROR_LIB_RM_VERSION_MISMATCH Return = 18 + ERROR_IN_USE Return = 19 + ERROR_MEMORY Return = 20 + ERROR_NO_DATA Return = 21 + ERROR_VGPU_ECC_NOT_SUPPORTED Return = 22 + ERROR_INSUFFICIENT_RESOURCES Return = 23 + ERROR_FREQ_NOT_SUPPORTED Return = 24 + ERROR_ARGUMENT_VERSION_MISMATCH Return = 25 + ERROR_DEPRECATED Return = 26 + ERROR_UNKNOWN Return = 999 ) // MemoryLocation as declared in nvml/nvml.h @@ -983,18 +1067,6 @@ const ( RESTRICTED_API_COUNT RestrictedAPI = 2 ) -// NvLinkEccLaneErrorCounter as declared in nvml/nvml.h -type NvLinkEccLaneErrorCounter int32 - -// NvLinkEccLaneErrorCounter enumeration from nvml/nvml.h -const ( - NVLINK_ERROR_DL_ECC_LANE0 NvLinkEccLaneErrorCounter = iota - NVLINK_ERROR_DL_ECC_LANE1 NvLinkEccLaneErrorCounter = 1 - NVLINK_ERROR_DL_ECC_LANE2 NvLinkEccLaneErrorCounter = 2 - NVLINK_ERROR_DL_ECC_LANE3 
NvLinkEccLaneErrorCounter = 3 - NVLINK_ERROR_DL_ECC_COUNT NvLinkEccLaneErrorCounter = 4 -) - // GpuVirtualizationMode as declared in nvml/nvml.h type GpuVirtualizationMode int32 @@ -1034,6 +1106,50 @@ const ( VGPU_INSTANCE_GUEST_INFO_STATE_INITIALIZED VgpuGuestInfoState = 1 ) +// VgpuCapability as declared in nvml/nvml.h +type VgpuCapability int32 + +// VgpuCapability enumeration from nvml/nvml.h +const ( + VGPU_CAP_NVLINK_P2P VgpuCapability = iota + VGPU_CAP_GPUDIRECT VgpuCapability = 1 + VGPU_CAP_MULTI_VGPU_EXCLUSIVE VgpuCapability = 2 + VGPU_CAP_EXCLUSIVE_TYPE VgpuCapability = 3 + VGPU_CAP_EXCLUSIVE_SIZE VgpuCapability = 4 + VGPU_CAP_COUNT VgpuCapability = 5 +) + +// VgpuDriverCapability as declared in nvml/nvml.h +type VgpuDriverCapability int32 + +// VgpuDriverCapability enumeration from nvml/nvml.h +const ( + VGPU_DRIVER_CAP_HETEROGENEOUS_MULTI_VGPU VgpuDriverCapability = iota + VGPU_DRIVER_CAP_COUNT VgpuDriverCapability = 1 +) + +// DeviceVgpuCapability as declared in nvml/nvml.h +type DeviceVgpuCapability int32 + +// DeviceVgpuCapability enumeration from nvml/nvml.h +const ( + DEVICE_VGPU_CAP_FRACTIONAL_MULTI_VGPU DeviceVgpuCapability = iota + DEVICE_VGPU_CAP_HETEROGENEOUS_TIMESLICE_PROFILES DeviceVgpuCapability = 1 + DEVICE_VGPU_CAP_HETEROGENEOUS_TIMESLICE_SIZES DeviceVgpuCapability = 2 + DEVICE_VGPU_CAP_COUNT DeviceVgpuCapability = 3 +) + +// GpuUtilizationDomainId as declared in nvml/nvml.h +type GpuUtilizationDomainId int32 + +// GpuUtilizationDomainId enumeration from nvml/nvml.h +const ( + GPU_UTILIZATION_DOMAIN_GPU GpuUtilizationDomainId = iota + GPU_UTILIZATION_DOMAIN_FB GpuUtilizationDomainId = 1 + GPU_UTILIZATION_DOMAIN_VID GpuUtilizationDomainId = 2 + GPU_UTILIZATION_DOMAIN_BUS GpuUtilizationDomainId = 3 +) + // FanState as declared in nvml/nvml.h type FanState int32 @@ -1125,6 +1241,49 @@ const ( VGPU_COMPATIBILITY_LIMIT_OTHER VgpuPgpuCompatibilityLimitCode = -2147483648 ) +// ThermalTarget as declared in nvml/nvml.h +type ThermalTarget int32 + +// ThermalTarget enumeration from nvml/nvml.h +const ( + THERMAL_TARGET_NONE ThermalTarget = iota + THERMAL_TARGET_GPU ThermalTarget = 1 + THERMAL_TARGET_MEMORY ThermalTarget = 2 + THERMAL_TARGET_POWER_SUPPLY ThermalTarget = 4 + THERMAL_TARGET_BOARD ThermalTarget = 8 + THERMAL_TARGET_VCD_BOARD ThermalTarget = 9 + THERMAL_TARGET_VCD_INLET ThermalTarget = 10 + THERMAL_TARGET_VCD_OUTLET ThermalTarget = 11 + THERMAL_TARGET_ALL ThermalTarget = 15 + THERMAL_TARGET_UNKNOWN ThermalTarget = -1 +) + +// ThermalController as declared in nvml/nvml.h +type ThermalController int32 + +// ThermalController enumeration from nvml/nvml.h +const ( + THERMAL_CONTROLLER_NONE ThermalController = iota + THERMAL_CONTROLLER_GPU_INTERNAL ThermalController = 1 + THERMAL_CONTROLLER_ADM1032 ThermalController = 2 + THERMAL_CONTROLLER_ADT7461 ThermalController = 3 + THERMAL_CONTROLLER_MAX6649 ThermalController = 4 + THERMAL_CONTROLLER_MAX1617 ThermalController = 5 + THERMAL_CONTROLLER_LM99 ThermalController = 6 + THERMAL_CONTROLLER_LM89 ThermalController = 7 + THERMAL_CONTROLLER_LM64 ThermalController = 8 + THERMAL_CONTROLLER_G781 ThermalController = 9 + THERMAL_CONTROLLER_ADT7473 ThermalController = 10 + THERMAL_CONTROLLER_SBMAX6649 ThermalController = 11 + THERMAL_CONTROLLER_VBIOSEVT ThermalController = 12 + THERMAL_CONTROLLER_OS ThermalController = 13 + THERMAL_CONTROLLER_NVSYSCON_CANOAS ThermalController = 14 + THERMAL_CONTROLLER_NVSYSCON_E551 ThermalController = 15 + THERMAL_CONTROLLER_MAX6649R ThermalController = 16 + THERMAL_CONTROLLER_ADT7473S 
ThermalController = 17 + THERMAL_CONTROLLER_UNKNOWN ThermalController = -1 +) + // GridLicenseFeatureCode as declared in nvml/nvml.h type GridLicenseFeatureCode int32 @@ -1137,3 +1296,80 @@ const ( GRID_LICENSE_FEATURE_CODE_GAMING GridLicenseFeatureCode = 3 GRID_LICENSE_FEATURE_CODE_COMPUTE GridLicenseFeatureCode = 4 ) + +// GpmMetricId as declared in nvml/nvml.h +type GpmMetricId int32 + +// GpmMetricId enumeration from nvml/nvml.h +const ( + GPM_METRIC_GRAPHICS_UTIL GpmMetricId = 1 + GPM_METRIC_SM_UTIL GpmMetricId = 2 + GPM_METRIC_SM_OCCUPANCY GpmMetricId = 3 + GPM_METRIC_INTEGER_UTIL GpmMetricId = 4 + GPM_METRIC_ANY_TENSOR_UTIL GpmMetricId = 5 + GPM_METRIC_DFMA_TENSOR_UTIL GpmMetricId = 6 + GPM_METRIC_HMMA_TENSOR_UTIL GpmMetricId = 7 + GPM_METRIC_IMMA_TENSOR_UTIL GpmMetricId = 9 + GPM_METRIC_DRAM_BW_UTIL GpmMetricId = 10 + GPM_METRIC_FP64_UTIL GpmMetricId = 11 + GPM_METRIC_FP32_UTIL GpmMetricId = 12 + GPM_METRIC_FP16_UTIL GpmMetricId = 13 + GPM_METRIC_PCIE_TX_PER_SEC GpmMetricId = 20 + GPM_METRIC_PCIE_RX_PER_SEC GpmMetricId = 21 + GPM_METRIC_NVDEC_0_UTIL GpmMetricId = 30 + GPM_METRIC_NVDEC_1_UTIL GpmMetricId = 31 + GPM_METRIC_NVDEC_2_UTIL GpmMetricId = 32 + GPM_METRIC_NVDEC_3_UTIL GpmMetricId = 33 + GPM_METRIC_NVDEC_4_UTIL GpmMetricId = 34 + GPM_METRIC_NVDEC_5_UTIL GpmMetricId = 35 + GPM_METRIC_NVDEC_6_UTIL GpmMetricId = 36 + GPM_METRIC_NVDEC_7_UTIL GpmMetricId = 37 + GPM_METRIC_NVJPG_0_UTIL GpmMetricId = 40 + GPM_METRIC_NVJPG_1_UTIL GpmMetricId = 41 + GPM_METRIC_NVJPG_2_UTIL GpmMetricId = 42 + GPM_METRIC_NVJPG_3_UTIL GpmMetricId = 43 + GPM_METRIC_NVJPG_4_UTIL GpmMetricId = 44 + GPM_METRIC_NVJPG_5_UTIL GpmMetricId = 45 + GPM_METRIC_NVJPG_6_UTIL GpmMetricId = 46 + GPM_METRIC_NVJPG_7_UTIL GpmMetricId = 47 + GPM_METRIC_NVOFA_0_UTIL GpmMetricId = 50 + GPM_METRIC_NVLINK_TOTAL_RX_PER_SEC GpmMetricId = 60 + GPM_METRIC_NVLINK_TOTAL_TX_PER_SEC GpmMetricId = 61 + GPM_METRIC_NVLINK_L0_RX_PER_SEC GpmMetricId = 62 + GPM_METRIC_NVLINK_L0_TX_PER_SEC GpmMetricId = 63 + GPM_METRIC_NVLINK_L1_RX_PER_SEC GpmMetricId = 64 + GPM_METRIC_NVLINK_L1_TX_PER_SEC GpmMetricId = 65 + GPM_METRIC_NVLINK_L2_RX_PER_SEC GpmMetricId = 66 + GPM_METRIC_NVLINK_L2_TX_PER_SEC GpmMetricId = 67 + GPM_METRIC_NVLINK_L3_RX_PER_SEC GpmMetricId = 68 + GPM_METRIC_NVLINK_L3_TX_PER_SEC GpmMetricId = 69 + GPM_METRIC_NVLINK_L4_RX_PER_SEC GpmMetricId = 70 + GPM_METRIC_NVLINK_L4_TX_PER_SEC GpmMetricId = 71 + GPM_METRIC_NVLINK_L5_RX_PER_SEC GpmMetricId = 72 + GPM_METRIC_NVLINK_L5_TX_PER_SEC GpmMetricId = 73 + GPM_METRIC_NVLINK_L6_RX_PER_SEC GpmMetricId = 74 + GPM_METRIC_NVLINK_L6_TX_PER_SEC GpmMetricId = 75 + GPM_METRIC_NVLINK_L7_RX_PER_SEC GpmMetricId = 76 + GPM_METRIC_NVLINK_L7_TX_PER_SEC GpmMetricId = 77 + GPM_METRIC_NVLINK_L8_RX_PER_SEC GpmMetricId = 78 + GPM_METRIC_NVLINK_L8_TX_PER_SEC GpmMetricId = 79 + GPM_METRIC_NVLINK_L9_RX_PER_SEC GpmMetricId = 80 + GPM_METRIC_NVLINK_L9_TX_PER_SEC GpmMetricId = 81 + GPM_METRIC_NVLINK_L10_RX_PER_SEC GpmMetricId = 82 + GPM_METRIC_NVLINK_L10_TX_PER_SEC GpmMetricId = 83 + GPM_METRIC_NVLINK_L11_RX_PER_SEC GpmMetricId = 84 + GPM_METRIC_NVLINK_L11_TX_PER_SEC GpmMetricId = 85 + GPM_METRIC_NVLINK_L12_RX_PER_SEC GpmMetricId = 86 + GPM_METRIC_NVLINK_L12_TX_PER_SEC GpmMetricId = 87 + GPM_METRIC_NVLINK_L13_RX_PER_SEC GpmMetricId = 88 + GPM_METRIC_NVLINK_L13_TX_PER_SEC GpmMetricId = 89 + GPM_METRIC_NVLINK_L14_RX_PER_SEC GpmMetricId = 90 + GPM_METRIC_NVLINK_L14_TX_PER_SEC GpmMetricId = 91 + GPM_METRIC_NVLINK_L15_RX_PER_SEC GpmMetricId = 92 + GPM_METRIC_NVLINK_L15_TX_PER_SEC GpmMetricId = 93 + 
GPM_METRIC_NVLINK_L16_RX_PER_SEC GpmMetricId = 94 + GPM_METRIC_NVLINK_L16_TX_PER_SEC GpmMetricId = 95 + GPM_METRIC_NVLINK_L17_RX_PER_SEC GpmMetricId = 96 + GPM_METRIC_NVLINK_L17_TX_PER_SEC GpmMetricId = 97 + GPM_METRIC_MAX GpmMetricId = 98 +) diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/device.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/device.go index b0eb68d8..f19da394 100644 --- a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/device.go +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/device.go @@ -38,21 +38,21 @@ func DeviceGetHandleByIndex(Index int) (Device, Return) { // nvml.DeviceGetHandleBySerial() func DeviceGetHandleBySerial(Serial string) (Device, Return) { var Device Device - ret := nvmlDeviceGetHandleBySerial(Serial + string(rune(0)), &Device) + ret := nvmlDeviceGetHandleBySerial(Serial+string(rune(0)), &Device) return Device, ret } // nvml.DeviceGetHandleByUUID() func DeviceGetHandleByUUID(Uuid string) (Device, Return) { var Device Device - ret := nvmlDeviceGetHandleByUUID(Uuid + string(rune(0)), &Device) + ret := nvmlDeviceGetHandleByUUID(Uuid+string(rune(0)), &Device) return Device, ret } // nvml.DeviceGetHandleByPciBusId() func DeviceGetHandleByPciBusId(PciBusId string) (Device, Return) { var Device Device - ret := nvmlDeviceGetHandleByPciBusId(PciBusId + string(rune(0)), &Device) + ret := nvmlDeviceGetHandleByPciBusId(PciBusId+string(rune(0)), &Device) return Device, ret } @@ -2286,3 +2286,360 @@ func DeviceGetBusType(Device Device) (BusType, Return) { func (Device Device) GetBusType() (BusType, Return) { return DeviceGetBusType(Device) } + +// nvml.DeviceSetDefaultFanSpeed_v2() +func DeviceSetDefaultFanSpeed_v2(Device Device, Fan int) Return { + return nvmlDeviceSetDefaultFanSpeed_v2(Device, uint32(Fan)) +} + +func (Device Device) SetDefaultFanSpeed_v2(Fan int) Return { + return DeviceSetDefaultFanSpeed_v2(Device, Fan) +} + +// nvml.DeviceGetMinMaxFanSpeed() +func DeviceGetMinMaxFanSpeed(Device Device) (int, int, Return) { + var MinSpeed, MaxSpeed uint32 + ret := nvmlDeviceGetMinMaxFanSpeed(Device, &MinSpeed, &MaxSpeed) + return int(MinSpeed), int(MaxSpeed), ret +} + +func (Device Device) GetMinMaxFanSpeed() (int, int, Return) { + return DeviceGetMinMaxFanSpeed(Device) +} + +// nvml.DeviceGetThermalSettings() +func DeviceGetThermalSettings(Device Device, SensorIndex uint32) (GpuThermalSettings, Return) { + var PThermalSettings GpuThermalSettings + ret := nvmlDeviceGetThermalSettings(Device, SensorIndex, &PThermalSettings) + return PThermalSettings, ret +} + +func (Device Device) GetThermalSettings(SensorIndex uint32) (GpuThermalSettings, Return) { + return DeviceGetThermalSettings(Device, SensorIndex) +} + +// nvml.DeviceGetDefaultEccMode() +func DeviceGetDefaultEccMode(Device Device) (EnableState, Return) { + var DefaultMode EnableState + ret := nvmlDeviceGetDefaultEccMode(Device, &DefaultMode) + return DefaultMode, ret +} + +func (Device Device) GetDefaultEccMode() (EnableState, Return) { + return DeviceGetDefaultEccMode(Device) +} + +// nvml.DeviceGetPcieSpeed() +func DeviceGetPcieSpeed(Device Device) (int, Return) { + var PcieSpeed uint32 + ret := nvmlDeviceGetPcieSpeed(Device, &PcieSpeed) + return int(PcieSpeed), ret +} + +func (Device Device) GetPcieSpeed() (int, Return) { + return DeviceGetPcieSpeed(Device) +} + +// nvml.DeviceGetGspFirmwareVersion() +func DeviceGetGspFirmwareVersion(Device Device) (string, Return) { + Version := make([]byte, GSP_FIRMWARE_VERSION_BUF_SIZE) + ret := nvmlDeviceGetGspFirmwareVersion(Device, &Version[0]) + return 
string(Version[:clen(Version)]), ret +} + +func (Device Device) GetGspFirmwareVersion() (string, Return) { + return DeviceGetGspFirmwareVersion(Device) +} + +// nvml.DeviceGetGspFirmwareMode() +func DeviceGetGspFirmwareMode(Device Device) (bool, bool, Return) { + var IsEnabled, DefaultMode uint32 + ret := nvmlDeviceGetGspFirmwareMode(Device, &IsEnabled, &DefaultMode) + return (IsEnabled != 0), (DefaultMode != 0), ret +} + +func (Device Device) GetGspFirmwareMode() (bool, bool, Return) { + return DeviceGetGspFirmwareMode(Device) +} + +// nvml.DeviceGetDynamicPstatesInfo() +func DeviceGetDynamicPstatesInfo(Device Device) (GpuDynamicPstatesInfo, Return) { + var PDynamicPstatesInfo GpuDynamicPstatesInfo + ret := nvmlDeviceGetDynamicPstatesInfo(Device, &PDynamicPstatesInfo) + return PDynamicPstatesInfo, ret +} + +func (Device Device) GetDynamicPstatesInfo() (GpuDynamicPstatesInfo, Return) { + return DeviceGetDynamicPstatesInfo(Device) +} + +// nvml.DeviceSetFanSpeed_v2() +func DeviceSetFanSpeed_v2(Device Device, Fan int, Speed int) Return { + return nvmlDeviceSetFanSpeed_v2(Device, uint32(Fan), uint32(Speed)) +} + +func (Device Device) SetFanSpeed_v2(Fan int, Speed int) Return { + return DeviceSetFanSpeed_v2(Device, Fan, Speed) +} + +// nvml.DeviceGetGpcClkVfOffset() +func DeviceGetGpcClkVfOffset(Device Device) (int, Return) { + var Offset int32 + ret := nvmlDeviceGetGpcClkVfOffset(Device, &Offset) + return int(Offset), ret +} + +func (Device Device) GetGpcClkVfOffset() (int, Return) { + return DeviceGetGpcClkVfOffset(Device) +} + +// nvml.DeviceSetGpcClkVfOffset() +func DeviceSetGpcClkVfOffset(Device Device, Offset int) Return { + return nvmlDeviceSetGpcClkVfOffset(Device, int32(Offset)) +} + +func (Device Device) SetGpcClkVfOffset(Offset int) Return { + return DeviceSetGpcClkVfOffset(Device, Offset) +} + +// nvml.DeviceGetMinMaxClockOfPState() +func DeviceGetMinMaxClockOfPState(Device Device, _type ClockType, Pstate Pstates) (uint32, uint32, Return) { + var MinClockMHz, MaxClockMHz uint32 + ret := nvmlDeviceGetMinMaxClockOfPState(Device, _type, Pstate, &MinClockMHz, &MaxClockMHz) + return MinClockMHz, MaxClockMHz, ret +} + +func (Device Device) GetMinMaxClockOfPState(_type ClockType, Pstate Pstates) (uint32, uint32, Return) { + return DeviceGetMinMaxClockOfPState(Device, _type, Pstate) +} + +// nvml.DeviceGetSupportedPerformanceStates() +func DeviceGetSupportedPerformanceStates(Device Device) ([]Pstates, Return) { + Pstates := make([]Pstates, MAX_GPU_PERF_PSTATES) + ret := nvmlDeviceGetSupportedPerformanceStates(Device, &Pstates[0], MAX_GPU_PERF_PSTATES) + for i := 0; i < MAX_GPU_PERF_PSTATES; i++ { + if Pstates[i] == PSTATE_UNKNOWN { + return Pstates[0:i], ret + } + } + return Pstates, ret +} + +func (Device Device) GetSupportedPerformanceStates() ([]Pstates, Return) { + return DeviceGetSupportedPerformanceStates(Device) +} + +// nvml.DeviceGetTargetFanSpeed() +func DeviceGetTargetFanSpeed(Device Device, Fan int) (int, Return) { + var TargetSpeed uint32 + ret := nvmlDeviceGetTargetFanSpeed(Device, uint32(Fan), &TargetSpeed) + return int(TargetSpeed), ret +} + +func (Device Device) GetTargetFanSpeed(Fan int) (int, Return) { + return DeviceGetTargetFanSpeed(Device, Fan) +} + +// nvml.DeviceGetMemClkVfOffset() +func DeviceGetMemClkVfOffset(Device Device) (int, Return) { + var Offset int32 + ret := nvmlDeviceGetMemClkVfOffset(Device, &Offset) + return int(Offset), ret +} + +func (Device Device) GetMemClkVfOffset() (int, Return) { + return DeviceGetMemClkVfOffset(Device) +} + +// 
nvml.DeviceSetMemClkVfOffset() +func DeviceSetMemClkVfOffset(Device Device, Offset int) Return { + return nvmlDeviceSetMemClkVfOffset(Device, int32(Offset)) +} + +func (Device Device) SetMemClkVfOffset(Offset int) Return { + return DeviceSetMemClkVfOffset(Device, Offset) +} + +// nvml.DeviceGetGpcClkMinMaxVfOffset() +func DeviceGetGpcClkMinMaxVfOffset(Device Device) (int, int, Return) { + var MinOffset, MaxOffset int32 + ret := nvmlDeviceGetGpcClkMinMaxVfOffset(Device, &MinOffset, &MaxOffset) + return int(MinOffset), int(MaxOffset), ret +} + +func (Device Device) GetGpcClkMinMaxVfOffset() (int, int, Return) { + return DeviceGetGpcClkMinMaxVfOffset(Device) +} + +// nvml.DeviceGetMemClkMinMaxVfOffset() +func DeviceGetMemClkMinMaxVfOffset(Device Device) (int, int, Return) { + var MinOffset, MaxOffset int32 + ret := nvmlDeviceGetMemClkMinMaxVfOffset(Device, &MinOffset, &MaxOffset) + return int(MinOffset), int(MaxOffset), ret +} + +func (Device Device) GetMemClkMinMaxVfOffset() (int, int, Return) { + return DeviceGetMemClkMinMaxVfOffset(Device) +} + +// nvml.DeviceGetGpuMaxPcieLinkGeneration() +func DeviceGetGpuMaxPcieLinkGeneration(Device Device) (int, Return) { + var MaxLinkGenDevice uint32 + ret := nvmlDeviceGetGpuMaxPcieLinkGeneration(Device, &MaxLinkGenDevice) + return int(MaxLinkGenDevice), ret +} + +func (Device Device) GetGpuMaxPcieLinkGeneration() (int, Return) { + return DeviceGetGpuMaxPcieLinkGeneration(Device) +} + +// nvml.DeviceGetFanControlPolicy_v2() +func DeviceGetFanControlPolicy_v2(Device Device, Fan int) (FanControlPolicy, Return) { + var Policy FanControlPolicy + ret := nvmlDeviceGetFanControlPolicy_v2(Device, uint32(Fan), &Policy) + return Policy, ret +} + +func (Device Device) GetFanControlPolicy_v2(Fan int) (FanControlPolicy, Return) { + return DeviceGetFanControlPolicy_v2(Device, Fan) +} + +// nvml.DeviceSetFanControlPolicy() +func DeviceSetFanControlPolicy(Device Device, Fan int, Policy FanControlPolicy) Return { + return nvmlDeviceSetFanControlPolicy(Device, uint32(Fan), Policy) +} + +func (Device Device) SetFanControlPolicy(Fan int, Policy FanControlPolicy) Return { + return DeviceSetFanControlPolicy(Device, Fan, Policy) +} + +// nvml.DeviceClearFieldValues() +func DeviceClearFieldValues(Device Device, Values []FieldValue) Return { + ValuesCount := len(Values) + return nvmlDeviceClearFieldValues(Device, int32(ValuesCount), &Values[0]) +} + +func (Device Device) ClearFieldValues(Values []FieldValue) Return { + return DeviceClearFieldValues(Device, Values) +} + +// nvml.DeviceGetVgpuCapabilities() +func DeviceGetVgpuCapabilities(Device Device, Capability DeviceVgpuCapability) (bool, Return) { + var CapResult uint32 + ret := nvmlDeviceGetVgpuCapabilities(Device, Capability, &CapResult) + return (CapResult != 0), ret +} + +func (Device Device) GetVgpuCapabilities(Capability DeviceVgpuCapability) (bool, Return) { + return DeviceGetVgpuCapabilities(Device, Capability) +} + +// nvml.DeviceGetVgpuSchedulerLog() +func DeviceGetVgpuSchedulerLog(Device Device) (VgpuSchedulerLog, Return) { + var PSchedulerLog VgpuSchedulerLog + ret := nvmlDeviceGetVgpuSchedulerLog(Device, &PSchedulerLog) + return PSchedulerLog, ret +} + +func (Device Device) GetVgpuSchedulerLog() (VgpuSchedulerLog, Return) { + return DeviceGetVgpuSchedulerLog(Device) +} + +// nvml.DeviceGetVgpuSchedulerState() +func DeviceGetVgpuSchedulerState(Device Device) (VgpuSchedulerGetState, Return) { + var PSchedulerState VgpuSchedulerGetState + ret := nvmlDeviceGetVgpuSchedulerState(Device, &PSchedulerState) + return 
PSchedulerState, ret +} + +func (Device Device) GetVgpuSchedulerState() (VgpuSchedulerGetState, Return) { + return DeviceGetVgpuSchedulerState(Device) +} + +// nvml.DeviceSetVgpuSchedulerState() +func DeviceSetVgpuSchedulerState(Device Device, PSchedulerState *VgpuSchedulerSetState) Return { + return nvmlDeviceSetVgpuSchedulerState(Device, PSchedulerState) +} + +func (Device Device) SetVgpuSchedulerState(PSchedulerState *VgpuSchedulerSetState) Return { + return DeviceSetVgpuSchedulerState(Device, PSchedulerState) +} + +// nvml.DeviceGetVgpuSchedulerCapabilities() +func DeviceGetVgpuSchedulerCapabilities(Device Device) (VgpuSchedulerCapabilities, Return) { + var PCapabilities VgpuSchedulerCapabilities + ret := nvmlDeviceGetVgpuSchedulerCapabilities(Device, &PCapabilities) + return PCapabilities, ret +} + +func (Device Device) GetVgpuSchedulerCapabilities() (VgpuSchedulerCapabilities, Return) { + return DeviceGetVgpuSchedulerCapabilities(Device) +} + +// nvml.GpuInstanceGetComputeInstancePossiblePlacements() +func GpuInstanceGetComputeInstancePossiblePlacements(GpuInstance GpuInstance, ProfileId int) ([]ComputeInstancePlacement, Return) { + var Count uint32 + ret := nvmlGpuInstanceGetComputeInstancePossiblePlacements(GpuInstance, uint32(ProfileId), nil, &Count) + if ret != SUCCESS { + return nil, ret + } + if Count == 0 { + return []ComputeInstancePlacement{}, ret + } + PlacementArray := make([]ComputeInstancePlacement, Count) + ret = nvmlGpuInstanceGetComputeInstancePossiblePlacements(GpuInstance, uint32(ProfileId), &PlacementArray[0], &Count) + return PlacementArray, ret +} + +func (GpuInstance GpuInstance) GetComputeInstancePossiblePlacements(ProfileId int) ([]ComputeInstancePlacement, Return) { + return GpuInstanceGetComputeInstancePossiblePlacements(GpuInstance, ProfileId) +} + +// nvml.GpuInstanceCreateComputeInstanceWithPlacement() +func GpuInstanceCreateComputeInstanceWithPlacement(GpuInstance GpuInstance, ProfileId int, Placement *ComputeInstancePlacement, ComputeInstance *ComputeInstance) Return { + return nvmlGpuInstanceCreateComputeInstanceWithPlacement(GpuInstance, uint32(ProfileId), Placement, ComputeInstance) +} + +func (GpuInstance GpuInstance) CreateComputeInstanceWithPlacement(ProfileId int, Placement *ComputeInstancePlacement, ComputeInstance *ComputeInstance) Return { + return GpuInstanceCreateComputeInstanceWithPlacement(GpuInstance, ProfileId, Placement, ComputeInstance) +} + +// nvml.DeviceGetGpuFabricInfo() +func DeviceGetGpuFabricInfo(Device Device) (GpuFabricInfo, Return) { + var GpuFabricInfo GpuFabricInfo + ret := nvmlDeviceGetGpuFabricInfo(Device, &GpuFabricInfo) + return GpuFabricInfo, ret +} + +func (Device Device) GetGpuFabricInfo() (GpuFabricInfo, Return) { + return DeviceGetGpuFabricInfo(Device) +} + +// nvml.DeviceCcuGetStreamState() +func DeviceCcuGetStreamState(Device Device) (int, Return) { + var State uint32 + ret := nvmlDeviceCcuGetStreamState(Device, &State) + return int(State), ret +} + +func (Device Device) CcuGetStreamState() (int, Return) { + return DeviceCcuGetStreamState(Device) +} + +// nvml.DeviceCcuSetStreamState() +func DeviceCcuSetStreamState(Device Device, State int) Return { + return nvmlDeviceCcuSetStreamState(Device, uint32(State)) +} + +func (Device Device) CcuSetStreamState(State int) Return { + return DeviceCcuSetStreamState(Device, State) +} + +// nvml.DeviceSetNvLinkDeviceLowPowerThreshold() +func DeviceSetNvLinkDeviceLowPowerThreshold(Device Device, Info *NvLinkPowerThres) Return { + return 
nvmlDeviceSetNvLinkDeviceLowPowerThreshold(Device, Info) +} + +func (Device Device) SetNvLinkDeviceLowPowerThreshold(Info *NvLinkPowerThres) Return { + return DeviceSetNvLinkDeviceLowPowerThreshold(Device, Info) +} diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/nvml.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/nvml.go index d7dbfbaa..f63dfe8e 100644 --- a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/nvml.go +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/nvml.go @@ -486,6 +486,15 @@ func nvmlDeviceGetMaxPcieLinkGeneration(Device Device, MaxLinkGen *uint32) Retur return __v } +// nvmlDeviceGetGpuMaxPcieLinkGeneration function as declared in nvml/nvml.h +func nvmlDeviceGetGpuMaxPcieLinkGeneration(Device Device, MaxLinkGenDevice *uint32) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cMaxLinkGenDevice, _ := (*C.uint)(unsafe.Pointer(MaxLinkGenDevice)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetGpuMaxPcieLinkGeneration(cDevice, cMaxLinkGenDevice) + __v := (Return)(__ret) + return __v +} + // nvmlDeviceGetMaxPcieLinkWidth function as declared in nvml/nvml.h func nvmlDeviceGetMaxPcieLinkWidth(Device Device, MaxLinkWidth *uint32) Return { cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown @@ -670,6 +679,55 @@ func nvmlDeviceGetFanSpeed_v2(Device Device, Fan uint32, Speed *uint32) Return { return __v } +// nvmlDeviceGetTargetFanSpeed function as declared in nvml/nvml.h +func nvmlDeviceGetTargetFanSpeed(Device Device, Fan uint32, TargetSpeed *uint32) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cFan, _ := (C.uint)(Fan), cgoAllocsUnknown + cTargetSpeed, _ := (*C.uint)(unsafe.Pointer(TargetSpeed)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetTargetFanSpeed(cDevice, cFan, cTargetSpeed) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceSetDefaultFanSpeed_v2 function as declared in nvml/nvml.h +func nvmlDeviceSetDefaultFanSpeed_v2(Device Device, Fan uint32) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cFan, _ := (C.uint)(Fan), cgoAllocsUnknown + __ret := C.nvmlDeviceSetDefaultFanSpeed_v2(cDevice, cFan) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetMinMaxFanSpeed function as declared in nvml/nvml.h +func nvmlDeviceGetMinMaxFanSpeed(Device Device, MinSpeed *uint32, MaxSpeed *uint32) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cMinSpeed, _ := (*C.uint)(unsafe.Pointer(MinSpeed)), cgoAllocsUnknown + cMaxSpeed, _ := (*C.uint)(unsafe.Pointer(MaxSpeed)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetMinMaxFanSpeed(cDevice, cMinSpeed, cMaxSpeed) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetFanControlPolicy_v2 function as declared in nvml/nvml.h +func nvmlDeviceGetFanControlPolicy_v2(Device Device, Fan uint32, Policy *FanControlPolicy) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cFan, _ := (C.uint)(Fan), cgoAllocsUnknown + cPolicy, _ := (*C.nvmlFanControlPolicy_t)(unsafe.Pointer(Policy)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetFanControlPolicy_v2(cDevice, cFan, cPolicy) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceSetFanControlPolicy function as declared in nvml/nvml.h +func nvmlDeviceSetFanControlPolicy(Device Device, Fan uint32, Policy FanControlPolicy) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cFan, _ := (C.uint)(Fan), cgoAllocsUnknown + cPolicy, _ := 
(C.nvmlFanControlPolicy_t)(Policy), cgoAllocsUnknown + __ret := C.nvmlDeviceSetFanControlPolicy(cDevice, cFan, cPolicy) + __v := (Return)(__ret) + return __v +} + // nvmlDeviceGetNumFans function as declared in nvml/nvml.h func nvmlDeviceGetNumFans(Device Device, NumFans *uint32) Return { cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown @@ -709,6 +767,16 @@ func nvmlDeviceSetTemperatureThreshold(Device Device, ThresholdType TemperatureT return __v } +// nvmlDeviceGetThermalSettings function as declared in nvml/nvml.h +func nvmlDeviceGetThermalSettings(Device Device, SensorIndex uint32, PThermalSettings *GpuThermalSettings) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cSensorIndex, _ := (C.uint)(SensorIndex), cgoAllocsUnknown + cPThermalSettings, _ := (*C.nvmlGpuThermalSettings_t)(unsafe.Pointer(PThermalSettings)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetThermalSettings(cDevice, cSensorIndex, cPThermalSettings) + __v := (Return)(__ret) + return __v +} + // nvmlDeviceGetPerformanceState function as declared in nvml/nvml.h func nvmlDeviceGetPerformanceState(Device Device, PState *Pstates) Return { cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown @@ -866,6 +934,15 @@ func nvmlDeviceGetEccMode(Device Device, Current *EnableState, Pending *EnableSt return __v } +// nvmlDeviceGetDefaultEccMode function as declared in nvml/nvml.h +func nvmlDeviceGetDefaultEccMode(Device Device, DefaultMode *EnableState) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cDefaultMode, _ := (*C.nvmlEnableState_t)(unsafe.Pointer(DefaultMode)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetDefaultEccMode(cDevice, cDefaultMode) + __v := (Return)(__ret) + return __v +} + // nvmlDeviceGetBoardId function as declared in nvml/nvml.h func nvmlDeviceGetBoardId(Device Device, BoardId *uint32) Return { cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown @@ -1153,6 +1230,15 @@ func nvmlDeviceGetPcieLinkMaxSpeed(Device Device, MaxSpeed *uint32) Return { return __v } +// nvmlDeviceGetPcieSpeed function as declared in nvml/nvml.h +func nvmlDeviceGetPcieSpeed(Device Device, PcieSpeed *uint32) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cPcieSpeed, _ := (*C.uint)(unsafe.Pointer(PcieSpeed)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetPcieSpeed(cDevice, cPcieSpeed) + __v := (Return)(__ret) + return __v +} + // nvmlDeviceGetAdaptiveClockInfoStatus function as declared in nvml/nvml.h func nvmlDeviceGetAdaptiveClockInfoStatus(Device Device, AdaptiveClockStatus *uint32) Return { cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown @@ -1635,6 +1721,16 @@ func nvmlDeviceGetFieldValues(Device Device, ValuesCount int32, Values *FieldVal return __v } +// nvmlDeviceClearFieldValues function as declared in nvml/nvml.h +func nvmlDeviceClearFieldValues(Device Device, ValuesCount int32, Values *FieldValue) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cValuesCount, _ := (C.int)(ValuesCount), cgoAllocsUnknown + cValues, _ := (*C.nvmlFieldValue_t)(unsafe.Pointer(Values)), cgoAllocsUnknown + __ret := C.nvmlDeviceClearFieldValues(cDevice, cValuesCount, cValues) + __v := (Return)(__ret) + return __v +} + // nvmlDeviceGetVirtualizationMode function as declared in nvml/nvml.h func nvmlDeviceGetVirtualizationMode(Device Device, PVirtualMode *GpuVirtualizationMode) Return { cDevice, _ := 
*(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown @@ -1682,6 +1778,44 @@ func nvmlDeviceGetProcessUtilization(Device Device, Utilization *ProcessUtilizat return __v } +// nvmlDeviceGetGspFirmwareVersion function as declared in nvml/nvml.h +func nvmlDeviceGetGspFirmwareVersion(Device Device, Version *byte) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cVersion, _ := (*C.char)(unsafe.Pointer(Version)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetGspFirmwareVersion(cDevice, cVersion) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetGspFirmwareMode function as declared in nvml/nvml.h +func nvmlDeviceGetGspFirmwareMode(Device Device, IsEnabled *uint32, DefaultMode *uint32) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cIsEnabled, _ := (*C.uint)(unsafe.Pointer(IsEnabled)), cgoAllocsUnknown + cDefaultMode, _ := (*C.uint)(unsafe.Pointer(DefaultMode)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetGspFirmwareMode(cDevice, cIsEnabled, cDefaultMode) + __v := (Return)(__ret) + return __v +} + +// nvmlGetVgpuDriverCapabilities function as declared in nvml/nvml.h +func nvmlGetVgpuDriverCapabilities(Capability VgpuDriverCapability, CapResult *uint32) Return { + cCapability, _ := (C.nvmlVgpuDriverCapability_t)(Capability), cgoAllocsUnknown + cCapResult, _ := (*C.uint)(unsafe.Pointer(CapResult)), cgoAllocsUnknown + __ret := C.nvmlGetVgpuDriverCapabilities(cCapability, cCapResult) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetVgpuCapabilities function as declared in nvml/nvml.h +func nvmlDeviceGetVgpuCapabilities(Device Device, Capability DeviceVgpuCapability, CapResult *uint32) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cCapability, _ := (C.nvmlDeviceVgpuCapability_t)(Capability), cgoAllocsUnknown + cCapResult, _ := (*C.uint)(unsafe.Pointer(CapResult)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetVgpuCapabilities(cDevice, cCapability, cCapResult) + __v := (Return)(__ret) + return __v +} + // nvmlDeviceGetSupportedVgpus function as declared in nvml/nvml.h func nvmlDeviceGetSupportedVgpus(Device Device, VgpuCount *uint32, VgpuTypeIds *VgpuTypeId) Return { cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown @@ -1971,6 +2105,16 @@ func nvmlVgpuInstanceGetGpuPciId(VgpuInstance VgpuInstance, VgpuPciId *byte, Len return __v } +// nvmlVgpuTypeGetCapabilities function as declared in nvml/nvml.h +func nvmlVgpuTypeGetCapabilities(VgpuTypeId VgpuTypeId, Capability VgpuCapability, CapResult *uint32) Return { + cVgpuTypeId, _ := (C.nvmlVgpuTypeId_t)(VgpuTypeId), cgoAllocsUnknown + cCapability, _ := (C.nvmlVgpuCapability_t)(Capability), cgoAllocsUnknown + cCapResult, _ := (*C.uint)(unsafe.Pointer(CapResult)), cgoAllocsUnknown + __ret := C.nvmlVgpuTypeGetCapabilities(cVgpuTypeId, cCapability, cCapResult) + __v := (Return)(__ret) + return __v +} + // nvmlVgpuInstanceGetMetadata function as declared in nvml/nvml.h func nvmlVgpuInstanceGetMetadata(VgpuInstance VgpuInstance, nvmlVgpuMetadata *nvmlVgpuMetadata, BufferSize *uint32) Return { cVgpuInstance, _ := (C.nvmlVgpuInstance_t)(VgpuInstance), cgoAllocsUnknown @@ -2011,6 +2155,42 @@ func nvmlDeviceGetPgpuMetadataString(Device Device, PgpuMetadata *byte, BufferSi return __v } +// nvmlDeviceGetVgpuSchedulerLog function as declared in nvml/nvml.h +func nvmlDeviceGetVgpuSchedulerLog(Device Device, PSchedulerLog *VgpuSchedulerLog) Return { + cDevice, _ := 
*(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cPSchedulerLog, _ := (*C.nvmlVgpuSchedulerLog_t)(unsafe.Pointer(PSchedulerLog)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetVgpuSchedulerLog(cDevice, cPSchedulerLog) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetVgpuSchedulerState function as declared in nvml/nvml.h +func nvmlDeviceGetVgpuSchedulerState(Device Device, PSchedulerState *VgpuSchedulerGetState) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cPSchedulerState, _ := (*C.nvmlVgpuSchedulerGetState_t)(unsafe.Pointer(PSchedulerState)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetVgpuSchedulerState(cDevice, cPSchedulerState) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceSetVgpuSchedulerState function as declared in nvml/nvml.h +func nvmlDeviceSetVgpuSchedulerState(Device Device, PSchedulerState *VgpuSchedulerSetState) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cPSchedulerState, _ := (*C.nvmlVgpuSchedulerSetState_t)(unsafe.Pointer(PSchedulerState)), cgoAllocsUnknown + __ret := C.nvmlDeviceSetVgpuSchedulerState(cDevice, cPSchedulerState) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetVgpuSchedulerCapabilities function as declared in nvml/nvml.h +func nvmlDeviceGetVgpuSchedulerCapabilities(Device Device, PCapabilities *VgpuSchedulerCapabilities) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cPCapabilities, _ := (*C.nvmlVgpuSchedulerCapabilities_t)(unsafe.Pointer(PCapabilities)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetVgpuSchedulerCapabilities(cDevice, cPCapabilities) + __v := (Return)(__ret) + return __v +} + // nvmlGetVgpuVersion function as declared in nvml/nvml.h func nvmlGetVgpuVersion(Supported *VgpuVersion, Current *VgpuVersion) Return { cSupported, _ := (*C.nvmlVgpuVersion_t)(unsafe.Pointer(Supported)), cgoAllocsUnknown @@ -2266,6 +2446,17 @@ func nvmlGpuInstanceGetComputeInstanceRemainingCapacity(GpuInstance GpuInstance, return __v } +// nvmlGpuInstanceGetComputeInstancePossiblePlacements function as declared in nvml/nvml.h +func nvmlGpuInstanceGetComputeInstancePossiblePlacements(GpuInstance GpuInstance, ProfileId uint32, Placements *ComputeInstancePlacement, Count *uint32) Return { + cGpuInstance, _ := *(*C.nvmlGpuInstance_t)(unsafe.Pointer(&GpuInstance)), cgoAllocsUnknown + cProfileId, _ := (C.uint)(ProfileId), cgoAllocsUnknown + cPlacements, _ := (*C.nvmlComputeInstancePlacement_t)(unsafe.Pointer(Placements)), cgoAllocsUnknown + cCount, _ := (*C.uint)(unsafe.Pointer(Count)), cgoAllocsUnknown + __ret := C.nvmlGpuInstanceGetComputeInstancePossiblePlacements(cGpuInstance, cProfileId, cPlacements, cCount) + __v := (Return)(__ret) + return __v +} + // nvmlGpuInstanceCreateComputeInstance function as declared in nvml/nvml.h func nvmlGpuInstanceCreateComputeInstance(GpuInstance GpuInstance, ProfileId uint32, ComputeInstance *ComputeInstance) Return { cGpuInstance, _ := *(*C.nvmlGpuInstance_t)(unsafe.Pointer(&GpuInstance)), cgoAllocsUnknown @@ -2276,6 +2467,17 @@ func nvmlGpuInstanceCreateComputeInstance(GpuInstance GpuInstance, ProfileId uin return __v } +// nvmlGpuInstanceCreateComputeInstanceWithPlacement function as declared in nvml/nvml.h +func nvmlGpuInstanceCreateComputeInstanceWithPlacement(GpuInstance GpuInstance, ProfileId uint32, Placement *ComputeInstancePlacement, ComputeInstance *ComputeInstance) Return { + cGpuInstance, _ := *(*C.nvmlGpuInstance_t)(unsafe.Pointer(&GpuInstance)), 
cgoAllocsUnknown + cProfileId, _ := (C.uint)(ProfileId), cgoAllocsUnknown + cPlacement, _ := (*C.nvmlComputeInstancePlacement_t)(unsafe.Pointer(Placement)), cgoAllocsUnknown + cComputeInstance, _ := (*C.nvmlComputeInstance_t)(unsafe.Pointer(ComputeInstance)), cgoAllocsUnknown + __ret := C.nvmlGpuInstanceCreateComputeInstanceWithPlacement(cGpuInstance, cProfileId, cPlacement, cComputeInstance) + __v := (Return)(__ret) + return __v +} + // nvmlComputeInstanceDestroy function as declared in nvml/nvml.h func nvmlComputeInstanceDestroy(ComputeInstance ComputeInstance) Return { cComputeInstance, _ := *(*C.nvmlComputeInstance_t)(unsafe.Pointer(&ComputeInstance)), cgoAllocsUnknown @@ -2378,6 +2580,191 @@ func nvmlDeviceGetBusType(Device Device, _type *BusType) Return { return __v } +// nvmlDeviceGetDynamicPstatesInfo function as declared in nvml/nvml.h +func nvmlDeviceGetDynamicPstatesInfo(Device Device, PDynamicPstatesInfo *GpuDynamicPstatesInfo) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cPDynamicPstatesInfo, _ := (*C.nvmlGpuDynamicPstatesInfo_t)(unsafe.Pointer(PDynamicPstatesInfo)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetDynamicPstatesInfo(cDevice, cPDynamicPstatesInfo) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceSetFanSpeed_v2 function as declared in nvml/nvml.h +func nvmlDeviceSetFanSpeed_v2(Device Device, Fan uint32, Speed uint32) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cFan, _ := (C.uint)(Fan), cgoAllocsUnknown + cSpeed, _ := (C.uint)(Speed), cgoAllocsUnknown + __ret := C.nvmlDeviceSetFanSpeed_v2(cDevice, cFan, cSpeed) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetGpcClkVfOffset function as declared in nvml/nvml.h +func nvmlDeviceGetGpcClkVfOffset(Device Device, Offset *int32) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cOffset, _ := (*C.int)(unsafe.Pointer(Offset)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetGpcClkVfOffset(cDevice, cOffset) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceSetGpcClkVfOffset function as declared in nvml/nvml.h +func nvmlDeviceSetGpcClkVfOffset(Device Device, Offset int32) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cOffset, _ := (C.int)(Offset), cgoAllocsUnknown + __ret := C.nvmlDeviceSetGpcClkVfOffset(cDevice, cOffset) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetMemClkVfOffset function as declared in nvml/nvml.h +func nvmlDeviceGetMemClkVfOffset(Device Device, Offset *int32) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cOffset, _ := (*C.int)(unsafe.Pointer(Offset)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetMemClkVfOffset(cDevice, cOffset) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceSetMemClkVfOffset function as declared in nvml/nvml.h +func nvmlDeviceSetMemClkVfOffset(Device Device, Offset int32) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cOffset, _ := (C.int)(Offset), cgoAllocsUnknown + __ret := C.nvmlDeviceSetMemClkVfOffset(cDevice, cOffset) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetMinMaxClockOfPState function as declared in nvml/nvml.h +func nvmlDeviceGetMinMaxClockOfPState(Device Device, _type ClockType, Pstate Pstates, MinClockMHz *uint32, MaxClockMHz *uint32) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + c_type, _ := (C.nvmlClockType_t)(_type), 
cgoAllocsUnknown + cPstate, _ := (C.nvmlPstates_t)(Pstate), cgoAllocsUnknown + cMinClockMHz, _ := (*C.uint)(unsafe.Pointer(MinClockMHz)), cgoAllocsUnknown + cMaxClockMHz, _ := (*C.uint)(unsafe.Pointer(MaxClockMHz)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetMinMaxClockOfPState(cDevice, c_type, cPstate, cMinClockMHz, cMaxClockMHz) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetSupportedPerformanceStates function as declared in nvml/nvml.h +func nvmlDeviceGetSupportedPerformanceStates(Device Device, Pstates *Pstates, Size uint32) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cPstates, _ := (*C.nvmlPstates_t)(unsafe.Pointer(Pstates)), cgoAllocsUnknown + cSize, _ := (C.uint)(Size), cgoAllocsUnknown + __ret := C.nvmlDeviceGetSupportedPerformanceStates(cDevice, cPstates, cSize) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetGpcClkMinMaxVfOffset function as declared in nvml/nvml.h +func nvmlDeviceGetGpcClkMinMaxVfOffset(Device Device, MinOffset *int32, MaxOffset *int32) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cMinOffset, _ := (*C.int)(unsafe.Pointer(MinOffset)), cgoAllocsUnknown + cMaxOffset, _ := (*C.int)(unsafe.Pointer(MaxOffset)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetGpcClkMinMaxVfOffset(cDevice, cMinOffset, cMaxOffset) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetMemClkMinMaxVfOffset function as declared in nvml/nvml.h +func nvmlDeviceGetMemClkMinMaxVfOffset(Device Device, MinOffset *int32, MaxOffset *int32) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cMinOffset, _ := (*C.int)(unsafe.Pointer(MinOffset)), cgoAllocsUnknown + cMaxOffset, _ := (*C.int)(unsafe.Pointer(MaxOffset)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetMemClkMinMaxVfOffset(cDevice, cMinOffset, cMaxOffset) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceGetGpuFabricInfo function as declared in nvml/nvml.h +func nvmlDeviceGetGpuFabricInfo(Device Device, GpuFabricInfo *GpuFabricInfo) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cGpuFabricInfo, _ := (*C.nvmlGpuFabricInfo_t)(unsafe.Pointer(GpuFabricInfo)), cgoAllocsUnknown + __ret := C.nvmlDeviceGetGpuFabricInfo(cDevice, cGpuFabricInfo) + __v := (Return)(__ret) + return __v +} + +// nvmlGpmMetricsGet function as declared in nvml/nvml.h +func nvmlGpmMetricsGet(MetricsGet *GpmMetricsGetType) Return { + cMetricsGet, _ := (*C.nvmlGpmMetricsGet_t)(unsafe.Pointer(MetricsGet)), cgoAllocsUnknown + __ret := C.nvmlGpmMetricsGet(cMetricsGet) + __v := (Return)(__ret) + return __v +} + +// nvmlGpmSampleFree function as declared in nvml/nvml.h +func nvmlGpmSampleFree(GpmSample GpmSample) Return { + cGpmSample, _ := *(*C.nvmlGpmSample_t)(unsafe.Pointer(&GpmSample)), cgoAllocsUnknown + __ret := C.nvmlGpmSampleFree(cGpmSample) + __v := (Return)(__ret) + return __v +} + +// nvmlGpmSampleAlloc function as declared in nvml/nvml.h +func nvmlGpmSampleAlloc(GpmSample *GpmSample) Return { + cGpmSample, _ := (*C.nvmlGpmSample_t)(unsafe.Pointer(GpmSample)), cgoAllocsUnknown + __ret := C.nvmlGpmSampleAlloc(cGpmSample) + __v := (Return)(__ret) + return __v +} + +// nvmlGpmSampleGet function as declared in nvml/nvml.h +func nvmlGpmSampleGet(Device Device, GpmSample GpmSample) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cGpmSample, _ := *(*C.nvmlGpmSample_t)(unsafe.Pointer(&GpmSample)), cgoAllocsUnknown + __ret := 
C.nvmlGpmSampleGet(cDevice, cGpmSample) + __v := (Return)(__ret) + return __v +} + +// nvmlGpmMigSampleGet function as declared in nvml/nvml.h +func nvmlGpmMigSampleGet(Device Device, GpuInstanceId uint32, GpmSample GpmSample) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cGpuInstanceId, _ := (C.uint)(GpuInstanceId), cgoAllocsUnknown + cGpmSample, _ := *(*C.nvmlGpmSample_t)(unsafe.Pointer(&GpmSample)), cgoAllocsUnknown + __ret := C.nvmlGpmMigSampleGet(cDevice, cGpuInstanceId, cGpmSample) + __v := (Return)(__ret) + return __v +} + +// nvmlGpmQueryDeviceSupport function as declared in nvml/nvml.h +func nvmlGpmQueryDeviceSupport(Device Device, GpmSupport *GpmSupport) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cGpmSupport, _ := (*C.nvmlGpmSupport_t)(unsafe.Pointer(GpmSupport)), cgoAllocsUnknown + __ret := C.nvmlGpmQueryDeviceSupport(cDevice, cGpmSupport) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceCcuGetStreamState function as declared in nvml/nvml.h +func nvmlDeviceCcuGetStreamState(Device Device, State *uint32) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cState, _ := (*C.uint)(unsafe.Pointer(State)), cgoAllocsUnknown + __ret := C.nvmlDeviceCcuGetStreamState(cDevice, cState) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceCcuSetStreamState function as declared in nvml/nvml.h +func nvmlDeviceCcuSetStreamState(Device Device, State uint32) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cState, _ := (C.uint)(State), cgoAllocsUnknown + __ret := C.nvmlDeviceCcuSetStreamState(cDevice, cState) + __v := (Return)(__ret) + return __v +} + +// nvmlDeviceSetNvLinkDeviceLowPowerThreshold function as declared in nvml/nvml.h +func nvmlDeviceSetNvLinkDeviceLowPowerThreshold(Device Device, Info *NvLinkPowerThres) Return { + cDevice, _ := *(*C.nvmlDevice_t)(unsafe.Pointer(&Device)), cgoAllocsUnknown + cInfo, _ := (*C.nvmlNvLinkPowerThres_t)(unsafe.Pointer(Info)), cgoAllocsUnknown + __ret := C.nvmlDeviceSetNvLinkDeviceLowPowerThreshold(cDevice, cInfo) + __v := (Return)(__ret) + return __v +} + // nvmlInit_v1 function as declared in nvml/nvml.h func nvmlInit_v1() Return { __ret := C.nvmlInit() diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/nvml.h b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/nvml.h index 804db91e..8c71ff8a 100644 --- a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/nvml.h +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/nvml.h @@ -1,7 +1,7 @@ -/*** NVML VERSION: 11.6.55 ***/ -/*** From https://api.anaconda.org/download/nvidia/cuda-nvml-dev/11.6.55/linux-64/cuda-nvml-dev-11.6.55-haa9ef22_0.tar.bz2 ***/ +/*** NVML VERSION: 12.0.76 ***/ +/*** From https://api.anaconda.org/download/nvidia/cuda-nvml-dev/12.0.76/linux-64/cuda-nvml-dev-12.0.76-0.tar.bz2 ***/ /* - * Copyright 1993-2021 NVIDIA Corporation. All rights reserved. + * Copyright 1993-2022 NVIDIA Corporation. All rights reserved. * * NOTICE TO USER: * @@ -284,7 +284,7 @@ typedef struct nvmlProcessInfo_v2_st /** * Information about running compute processes on the GPU - * Version 2 adds versioning for the struct and the conf compute protected memory in output. 
+ * Version 2 adds versioning for the struct */ typedef struct nvmlProcessInfo_st { @@ -338,7 +338,7 @@ typedef enum nvmlBridgeChipType_enum /** * Maximum number of NvLink links supported */ -#define NVML_NVLINK_MAX_LINKS 12 +#define NVML_NVLINK_MAX_LINKS 18 /** * Enum to represent the NvLink utilization counter packet units @@ -553,7 +553,7 @@ typedef union nvmlValue_st typedef struct nvmlSample_st { unsigned long long timeStamp; //!< CPU Timestamp in microseconds - nvmlValue_t sampleValue; //!< Sample Value + nvmlValue_t sampleValue; //!< Sample Value }nvmlSample_t; /** @@ -584,6 +584,61 @@ typedef struct nvmlViolationTime_st unsigned long long violationTime; //!< violationTime in Nanoseconds }nvmlViolationTime_t; +#define NVML_MAX_THERMAL_SENSORS_PER_GPU 3 + +typedef enum +{ + NVML_THERMAL_TARGET_NONE = 0, + NVML_THERMAL_TARGET_GPU = 1, //!< GPU core temperature requires NvPhysicalGpuHandle + NVML_THERMAL_TARGET_MEMORY = 2, //!< GPU memory temperature requires NvPhysicalGpuHandle + NVML_THERMAL_TARGET_POWER_SUPPLY = 4, //!< GPU power supply temperature requires NvPhysicalGpuHandle + NVML_THERMAL_TARGET_BOARD = 8, //!< GPU board ambient temperature requires NvPhysicalGpuHandle + NVML_THERMAL_TARGET_VCD_BOARD = 9, //!< Visual Computing Device Board temperature requires NvVisualComputingDeviceHandle + NVML_THERMAL_TARGET_VCD_INLET = 10, //!< Visual Computing Device Inlet temperature requires NvVisualComputingDeviceHandle + NVML_THERMAL_TARGET_VCD_OUTLET = 11, //!< Visual Computing Device Outlet temperature requires NvVisualComputingDeviceHandle + + NVML_THERMAL_TARGET_ALL = 15, + NVML_THERMAL_TARGET_UNKNOWN = -1, +} nvmlThermalTarget_t; + +typedef enum +{ + NVML_THERMAL_CONTROLLER_NONE = 0, + NVML_THERMAL_CONTROLLER_GPU_INTERNAL, + NVML_THERMAL_CONTROLLER_ADM1032, + NVML_THERMAL_CONTROLLER_ADT7461, + NVML_THERMAL_CONTROLLER_MAX6649, + NVML_THERMAL_CONTROLLER_MAX1617, + NVML_THERMAL_CONTROLLER_LM99, + NVML_THERMAL_CONTROLLER_LM89, + NVML_THERMAL_CONTROLLER_LM64, + NVML_THERMAL_CONTROLLER_G781, + NVML_THERMAL_CONTROLLER_ADT7473, + NVML_THERMAL_CONTROLLER_SBMAX6649, + NVML_THERMAL_CONTROLLER_VBIOSEVT, + NVML_THERMAL_CONTROLLER_OS, + NVML_THERMAL_CONTROLLER_NVSYSCON_CANOAS, + NVML_THERMAL_CONTROLLER_NVSYSCON_E551, + NVML_THERMAL_CONTROLLER_MAX6649R, + NVML_THERMAL_CONTROLLER_ADT7473S, + NVML_THERMAL_CONTROLLER_UNKNOWN = -1, +} nvmlThermalController_t; + +typedef struct { + nvmlThermalController_t controller; + int defaultMinTemp; + int defaultMaxTemp; + int currentTemp; + nvmlThermalTarget_t target; +} nvmlGpuThermalSettingsSensor_t; + +typedef struct +{ + unsigned int count; + nvmlGpuThermalSettingsSensor_t sensor[NVML_MAX_THERMAL_SENSORS_PER_GPU]; + +} nvmlGpuThermalSettings_t; + /** @} */ /***************************************************************************************************/ @@ -826,12 +881,15 @@ typedef enum nvmlClockId_enum * * Windows only. */ + typedef enum nvmlDriverModel_enum { - NVML_DRIVER_WDDM = 0, //!< WDDM driver model -- GPU treated as a display device - NVML_DRIVER_WDM = 1 //!< WDM (TCC) model (recommended) -- GPU treated as a generic device + NVML_DRIVER_WDDM = 0, //!< WDDM driver model -- GPU treated as a display device + NVML_DRIVER_WDM = 1 //!< WDM (TCC) model (recommended) -- GPU treated as a generic device } nvmlDriverModel_t; +#define NVML_MAX_GPU_PERF_PSTATES 16 + /** * Allowed PStates. 
*/ @@ -893,32 +951,34 @@ typedef enum nvmlInforomObject_enum typedef enum nvmlReturn_enum { // cppcheck-suppress * - NVML_SUCCESS = 0, //!< The operation was successful - NVML_ERROR_UNINITIALIZED = 1, //!< NVML was not first initialized with nvmlInit() - NVML_ERROR_INVALID_ARGUMENT = 2, //!< A supplied argument is invalid - NVML_ERROR_NOT_SUPPORTED = 3, //!< The requested operation is not available on target device - NVML_ERROR_NO_PERMISSION = 4, //!< The current user does not have permission for operation - NVML_ERROR_ALREADY_INITIALIZED = 5, //!< Deprecated: Multiple initializations are now allowed through ref counting - NVML_ERROR_NOT_FOUND = 6, //!< A query to find an object was unsuccessful - NVML_ERROR_INSUFFICIENT_SIZE = 7, //!< An input argument is not large enough - NVML_ERROR_INSUFFICIENT_POWER = 8, //!< A device's external power cables are not properly attached - NVML_ERROR_DRIVER_NOT_LOADED = 9, //!< NVIDIA driver is not loaded - NVML_ERROR_TIMEOUT = 10, //!< User provided timeout passed - NVML_ERROR_IRQ_ISSUE = 11, //!< NVIDIA Kernel detected an interrupt issue with a GPU - NVML_ERROR_LIBRARY_NOT_FOUND = 12, //!< NVML Shared Library couldn't be found or loaded - NVML_ERROR_FUNCTION_NOT_FOUND = 13, //!< Local version of NVML doesn't implement this function - NVML_ERROR_CORRUPTED_INFOROM = 14, //!< infoROM is corrupted - NVML_ERROR_GPU_IS_LOST = 15, //!< The GPU has fallen off the bus or has otherwise become inaccessible - NVML_ERROR_RESET_REQUIRED = 16, //!< The GPU requires a reset before it can be used again - NVML_ERROR_OPERATING_SYSTEM = 17, //!< The GPU control device has been blocked by the operating system/cgroups - NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18, //!< RM detects a driver/library version mismatch - NVML_ERROR_IN_USE = 19, //!< An operation cannot be performed because the GPU is currently in use - NVML_ERROR_MEMORY = 20, //!< Insufficient memory - NVML_ERROR_NO_DATA = 21, //!< No data - NVML_ERROR_VGPU_ECC_NOT_SUPPORTED = 22, //!< The requested vgpu operation is not available on target device, becasue ECC is enabled - NVML_ERROR_INSUFFICIENT_RESOURCES = 23, //!< Ran out of critical resources, other than memory - NVML_ERROR_FREQ_NOT_SUPPORTED = 24, //!< Ran out of critical resources, other than memory - NVML_ERROR_UNKNOWN = 999 //!< An internal driver error occurred + NVML_SUCCESS = 0, //!< The operation was successful + NVML_ERROR_UNINITIALIZED = 1, //!< NVML was not first initialized with nvmlInit() + NVML_ERROR_INVALID_ARGUMENT = 2, //!< A supplied argument is invalid + NVML_ERROR_NOT_SUPPORTED = 3, //!< The requested operation is not available on target device + NVML_ERROR_NO_PERMISSION = 4, //!< The current user does not have permission for operation + NVML_ERROR_ALREADY_INITIALIZED = 5, //!< Deprecated: Multiple initializations are now allowed through ref counting + NVML_ERROR_NOT_FOUND = 6, //!< A query to find an object was unsuccessful + NVML_ERROR_INSUFFICIENT_SIZE = 7, //!< An input argument is not large enough + NVML_ERROR_INSUFFICIENT_POWER = 8, //!< A device's external power cables are not properly attached + NVML_ERROR_DRIVER_NOT_LOADED = 9, //!< NVIDIA driver is not loaded + NVML_ERROR_TIMEOUT = 10, //!< User provided timeout passed + NVML_ERROR_IRQ_ISSUE = 11, //!< NVIDIA Kernel detected an interrupt issue with a GPU + NVML_ERROR_LIBRARY_NOT_FOUND = 12, //!< NVML Shared Library couldn't be found or loaded + NVML_ERROR_FUNCTION_NOT_FOUND = 13, //!< Local version of NVML doesn't implement this function + NVML_ERROR_CORRUPTED_INFOROM = 14, //!< 
infoROM is corrupted + NVML_ERROR_GPU_IS_LOST = 15, //!< The GPU has fallen off the bus or has otherwise become inaccessible + NVML_ERROR_RESET_REQUIRED = 16, //!< The GPU requires a reset before it can be used again + NVML_ERROR_OPERATING_SYSTEM = 17, //!< The GPU control device has been blocked by the operating system/cgroups + NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18, //!< RM detects a driver/library version mismatch + NVML_ERROR_IN_USE = 19, //!< An operation cannot be performed because the GPU is currently in use + NVML_ERROR_MEMORY = 20, //!< Insufficient memory + NVML_ERROR_NO_DATA = 21, //!< No data + NVML_ERROR_VGPU_ECC_NOT_SUPPORTED = 22, //!< The requested vgpu operation is not available on target device, becasue ECC is enabled + NVML_ERROR_INSUFFICIENT_RESOURCES = 23, //!< Ran out of critical resources, other than memory + NVML_ERROR_FREQ_NOT_SUPPORTED = 24, //!< Ran out of critical resources, other than memory + NVML_ERROR_ARGUMENT_VERSION_MISMATCH = 25, //!< The provided version is invalid/unsupported + NVML_ERROR_DEPRECATED = 26, //!< The requested functionality has been deprecated + NVML_ERROR_UNKNOWN = 999 //!< An internal driver error occurred } nvmlReturn_t; /** @@ -964,20 +1024,6 @@ typedef enum nvmlRestrictedAPI_enum NVML_RESTRICTED_API_COUNT } nvmlRestrictedAPI_t; -/** - * Enum to represent NvLink ECC per-lane error counts - */ -typedef enum nvmlNvLinkEccLaneErrorCounter_enum -{ - NVML_NVLINK_ERROR_DL_ECC_LANE0 = 0, // Data link receive ECC error counter lane 0 - NVML_NVLINK_ERROR_DL_ECC_LANE1 = 1, // Data link receive ECC error counter lane 1 - NVML_NVLINK_ERROR_DL_ECC_LANE2 = 2, // Data link receive ECC error counter lane 2 - NVML_NVLINK_ERROR_DL_ECC_LANE3 = 3, // Data link receive ECC error counter lane 3 - - // this must be last - NVML_NVLINK_ERROR_DL_ECC_COUNT -} nvmlNvLinkEccLaneErrorCounter_t; - /** @} */ /***************************************************************************************************/ @@ -1019,7 +1065,7 @@ typedef enum nvmlVgpuVmIdType { } nvmlVgpuVmIdType_t; /** - * vGPU GUEST info state. 
+ * vGPU GUEST info state */ typedef enum nvmlVgpuGuestInfoState_enum { @@ -1048,6 +1094,44 @@ typedef enum { #define NVML_GRID_LICENSE_EXPIRY_NOT_APPLICABLE 3 //!< Expiry not applicable #define NVML_GRID_LICENSE_EXPIRY_PERMANENT 4 //!< Permanent expiry +/** + * vGPU queryable capabilities + */ +typedef enum nvmlVgpuCapability_enum +{ + NVML_VGPU_CAP_NVLINK_P2P = 0, //!< P2P over NVLink is supported + NVML_VGPU_CAP_GPUDIRECT = 1, //!< GPUDirect capability is supported + NVML_VGPU_CAP_MULTI_VGPU_EXCLUSIVE = 2, //!< vGPU profile cannot be mixed with other vGPU profiles in same VM + NVML_VGPU_CAP_EXCLUSIVE_TYPE = 3, //!< vGPU profile cannot run on a GPU alongside other profiles of different type + NVML_VGPU_CAP_EXCLUSIVE_SIZE = 4, //!< vGPU profile cannot run on a GPU alongside other profiles of different size + // Keep this last + NVML_VGPU_CAP_COUNT +} nvmlVgpuCapability_t; + + +/** +* vGPU driver queryable capabilities +*/ +typedef enum nvmlVgpuDriverCapability_enum +{ + NVML_VGPU_DRIVER_CAP_HETEROGENEOUS_MULTI_VGPU = 0, //!< Supports mixing of different vGPU profiles within one guest VM + // Keep this last + NVML_VGPU_DRIVER_CAP_COUNT +} nvmlVgpuDriverCapability_t; + + +/** +* Device vGPU queryable capabilities +*/ +typedef enum nvmlDeviceVgpuCapability_enum +{ + NVML_DEVICE_VGPU_CAP_FRACTIONAL_MULTI_VGPU = 0, //!< Fractional vGPU profiles on this GPU can be used in multi-vGPU configurations + NVML_DEVICE_VGPU_CAP_HETEROGENEOUS_TIMESLICE_PROFILES = 1, //!< Supports concurrent execution of timesliced vGPU profiles of differing types + NVML_DEVICE_VGPU_CAP_HETEROGENEOUS_TIMESLICE_SIZES = 2, //!< Supports concurrent execution of timesliced vGPU profiles of differing framebuffer sizes + // Keep this last + NVML_DEVICE_VGPU_CAP_COUNT +} nvmlDeviceVgpuCapability_t; + /** @} */ /***************************************************************************************************/ @@ -1124,6 +1208,119 @@ typedef struct nvmlVgpuProcessUtilizationSample_st unsigned int decUtil; //!< Decoder Util Value } nvmlVgpuProcessUtilizationSample_t; +/** + * vGPU scheduler policies + */ +#define NVML_VGPU_SCHEDULER_POLICY_UNKNOWN 0 +#define NVML_VGPU_SCHEDULER_POLICY_BEST_EFFORT 1 +#define NVML_VGPU_SCHEDULER_POLICY_EQUAL_SHARE 2 +#define NVML_VGPU_SCHEDULER_POLICY_FIXED_SHARE 3 + +#define NVML_SUPPORTED_VGPU_SCHEDULER_POLICY_COUNT 3 + +#define NVML_SCHEDULER_SW_MAX_LOG_ENTRIES 200 + +typedef struct { + unsigned int avgFactor; + unsigned int timeslice; +} nvmlVgpuSchedulerParamsVgpuSchedDataWithARR_t; + +typedef struct { + unsigned int timeslice; +} nvmlVgpuSchedulerParamsVgpuSchedData_t; + +/** + * Union to represent the vGPU Scheduler Parameters + */ +typedef union +{ + nvmlVgpuSchedulerParamsVgpuSchedDataWithARR_t vgpuSchedDataWithARR; + + nvmlVgpuSchedulerParamsVgpuSchedData_t vgpuSchedData; + +} nvmlVgpuSchedulerParams_t; + +/** + * Structure to store the state and logs of a software runlist + */ +typedef struct nvmlVgpuSchedulerLogEntries_st +{ + unsigned long long timestamp; //!< Timestamp in ns when this software runlist was preeempted + unsigned long long timeRunTotal; //!< Total time in ns this software runlist has run + unsigned long long timeRun; //!< Time in ns this software runlist ran before preemption + unsigned int swRunlistId; //!< Software runlist Id + unsigned long long targetTimeSlice; //!< The actual timeslice after deduction + unsigned long long cumulativePreemptionTime; //!< Preemption time in ns for this SW runlist +} nvmlVgpuSchedulerLogEntry_t; + +/** + * Structure to store a vGPU 
software scheduler log + */ +typedef struct nvmlVgpuSchedulerLog_st +{ + unsigned int engineId; //!< Engine whose software runlist log entries are fetched + unsigned int schedulerPolicy; //!< Scheduler policy + unsigned int isEnabledARR; //!< Flag to check Adaptive Round Robin scheduler mode + nvmlVgpuSchedulerParams_t schedulerParams; + unsigned int entriesCount; //!< Count of log entries fetched + nvmlVgpuSchedulerLogEntry_t logEntries[NVML_SCHEDULER_SW_MAX_LOG_ENTRIES]; +} nvmlVgpuSchedulerLog_t; + +/** + * Structure to store the vGPU scheduler state + */ +typedef struct nvmlVgpuSchedulerGetState_st +{ + unsigned int schedulerPolicy; //!< Scheduler policy + unsigned int isEnabledARR; //!< Flag to check Adaptive Round Robin scheduler mode + nvmlVgpuSchedulerParams_t schedulerParams; +} nvmlVgpuSchedulerGetState_t; + +typedef struct { + unsigned int avgFactor; + unsigned int frequency; +} nvmlVgpuSchedulerSetParamsVgpuSchedDataWithARR_t; + +typedef struct { + unsigned int timeslice; +} nvmlVgpuSchedulerSetParamsVgpuSchedData_t; + +/** + * Union to represent the vGPU Scheduler set Parameters + */ +typedef union +{ + nvmlVgpuSchedulerSetParamsVgpuSchedDataWithARR_t vgpuSchedDataWithARR; + + nvmlVgpuSchedulerSetParamsVgpuSchedData_t vgpuSchedData; + +} nvmlVgpuSchedulerSetParams_t; + +/** + * Structure to set the vGPU scheduler state + */ +typedef struct nvmlVgpuSchedulerSetState_st +{ + unsigned int schedulerPolicy; //!< Scheduler policy + unsigned int enableARRMode; //!< Flag to enable/disable Adaptive Round Robin scheduler + nvmlVgpuSchedulerSetParams_t schedulerParams; +} nvmlVgpuSchedulerSetState_t; + +/** + * Structure to store the vGPU scheduler capabilities + */ +typedef struct nvmlVgpuSchedulerCapabilities_st +{ + unsigned int supportedSchedulers[NVML_SUPPORTED_VGPU_SCHEDULER_POLICY_COUNT]; //!< List the supported vGPU schedulers on the device + unsigned int maxTimeslice; //!< Maximum timeslice value in ns + unsigned int minTimeslice; //!< Minimum timeslice value in ns + unsigned int isArrModeSupported; //!< Flag to check Adaptive Round Robin mode enabled/disabled. + unsigned int maxFrequencyForARR; //!< Maximum frequency for Adaptive Round Robin mode + unsigned int minFrequencyForARR; //!< Minimum frequency for Adaptive Round Robin mode + unsigned int maxAvgFactorForARR; //!< Maximum averaging factor for Adaptive Round Robin mode + unsigned int minAvgFactorForARR; //!< Minimum averaging factor for Adaptive Round Robin mode +} nvmlVgpuSchedulerCapabilities_t; + /** * Structure to store the vGPU license expiry details */ @@ -1205,6 +1402,11 @@ typedef struct nvmlGridLicensableFeatures_st nvmlGridLicensableFeature_t gridLicensableFeatures[NVML_GRID_LICENSE_FEATURE_MAX_COUNT]; //!< Array of vGPU software licensable features. 
} nvmlGridLicensableFeatures_t; +/** + * GSP firmware + */ +#define NVML_GSP_FIRMWARE_VERSION_BUF_SIZE 0x40 + /** * Simplified chip architecture */ @@ -1216,6 +1418,10 @@ typedef struct nvmlGridLicensableFeatures_st #define NVML_DEVICE_ARCH_AMPERE 7 // Devices based on the NVIDIA Ampere architecture +#define NVML_DEVICE_ARCH_ADA 8 // Devices based on the NVIDIA Ada architecture + +#define NVML_DEVICE_ARCH_HOPPER 9 // Devices based on the NVIDIA Hopper architecture + #define NVML_DEVICE_ARCH_UNKNOWN 0xffffffff // Anything else, presumably something newer typedef unsigned int nvmlDeviceArchitecture_t; @@ -1231,6 +1437,18 @@ typedef unsigned int nvmlDeviceArchitecture_t; typedef unsigned int nvmlBusType_t; +/** + * Device Power Modes + */ + +/** + * Device Fan control policy + */ +#define NVML_FAN_POLICY_TEMPERATURE_CONTINOUS_SW 0 +#define NVML_FAN_POLICY_MANUAL 1 + +typedef unsigned int nvmlFanControlPolicy_t; + /** * Device Power Source */ @@ -1248,6 +1466,7 @@ typedef unsigned int nvmlPowerSource_t; #define NVML_PCIE_LINK_MAX_SPEED_8000MBPS 0x00000003 #define NVML_PCIE_LINK_MAX_SPEED_16000MBPS 0x00000004 #define NVML_PCIE_LINK_MAX_SPEED_32000MBPS 0x00000005 +#define NVML_PCIE_LINK_MAX_SPEED_64000MBPS 0x00000006 /* * Adaptive clocking status @@ -1255,6 +1474,28 @@ typedef unsigned int nvmlPowerSource_t; #define NVML_ADAPTIVE_CLOCKING_INFO_STATUS_DISABLED 0x00000000 #define NVML_ADAPTIVE_CLOCKING_INFO_STATUS_ENABLED 0x00000001 +#define NVML_MAX_GPU_UTILIZATIONS 8 +typedef enum nvmlGpuUtilizationDomainId_t +{ + NVML_GPU_UTILIZATION_DOMAIN_GPU = 0, //!< Graphics engine domain + NVML_GPU_UTILIZATION_DOMAIN_FB = 1, //!< Frame buffer domain + NVML_GPU_UTILIZATION_DOMAIN_VID = 2, //!< Video engine domain + NVML_GPU_UTILIZATION_DOMAIN_BUS = 3, //!< Bus interface domain +} nvmlGpuUtilizationDomainId_t; + +typedef struct { + unsigned int bIsPresent; + unsigned int percentage; + unsigned int incThreshold; + unsigned int decThreshold; +} nvmlGpuDynamicPstatesInfoUtilization_t; + +typedef struct nvmlGpuDynamicPstatesInfo_st +{ + unsigned int flags; //!< Reserved for future use + nvmlGpuDynamicPstatesInfoUtilization_t utilization[NVML_MAX_GPU_UTILIZATIONS]; +} nvmlGpuDynamicPstatesInfo_t; + /** @} */ /** @} */ @@ -1513,7 +1754,19 @@ typedef unsigned int nvmlPowerSource_t; #define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L11 159 //!< NVLink data ECC Error Counter for Link 11 #define NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_TOTAL 160 //!< NvLink data ECC Error Counter total for all Links -#define NVML_FI_MAX 161 //!< One greater than the largest field ID defined above +#define NVML_FI_DEV_NVLINK_ERROR_DL_REPLAY 161 +#define NVML_FI_DEV_NVLINK_ERROR_DL_RECOVERY 162 +#define NVML_FI_DEV_NVLINK_ERROR_DL_CRC 163 +#define NVML_FI_DEV_NVLINK_GET_SPEED 164 +#define NVML_FI_DEV_NVLINK_GET_STATE 165 +#define NVML_FI_DEV_NVLINK_GET_VERSION 166 + +#define NVML_FI_DEV_NVLINK_GET_POWER_STATE 167 +#define NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD 168 + +#define NVML_FI_DEV_PCIE_L0_TO_RECOVERY_COUNTER 169 + +#define NVML_FI_MAX 170 //!< One greater than the largest field ID defined above /** * Information for a Field Value Sample @@ -1999,6 +2252,23 @@ typedef enum nvmlPcieLinkState_enum /** @} */ +#define NVML_GPU_FABRIC_UUID_LEN 16 + +#define NVML_GPU_FABRIC_STATE_NOT_SUPPORTED 0 +#define NVML_GPU_FABRIC_STATE_NOT_STARTED 1 +#define NVML_GPU_FABRIC_STATE_IN_PROGRESS 2 +#define NVML_GPU_FABRIC_STATE_COMPLETED 3 + +typedef unsigned char nvmlGpuFabricState_t; + +typedef struct { + char clusterUuid[NVML_GPU_FABRIC_UUID_LEN]; 
//!< Uuid of the cluster to which this GPU belongs + nvmlReturn_t status; //!< Error status, if any. Must be checked only if state returns "complete". + unsigned int partitionId; //!< ID of the fabric partition to which this GPU belongs + nvmlGpuFabricState_t state; //!< Current state of GPU registration process +} nvmlGpuFabricInfo_t; +/** @} */ + /***************************************************************************************************/ /** @defgroup nvmlInitializationAndCleanup Initialization and Cleanup * This chapter describes the methods that handle NVML initialization and cleanup. @@ -3252,6 +3522,24 @@ nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo_v3(nvmlDevice_t device, nvmlPciInfo_t */ nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkGeneration(nvmlDevice_t device, unsigned int *maxLinkGen); +/** + * Retrieves the maximum PCIe link generation supported by this device + * + * For Fermi &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param maxLinkGenDevice Reference in which to return the max PCIe link generation + * + * @return + * - \ref NVML_SUCCESS if \a maxLinkGenDevice has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkGenDevice is null + * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpuMaxPcieLinkGeneration(nvmlDevice_t device, unsigned int *maxLinkGenDevice); + /** * Retrieves the maximum PCIe link width possible with this device and system * @@ -3695,6 +3983,116 @@ nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed(nvmlDevice_t device, unsigned int *sp */ nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed_v2(nvmlDevice_t device, unsigned int fan, unsigned int * speed); +/** + * Retrieves the intended target speed of the device's specified fan. + * + * Normally, the driver dynamically adjusts the fan based on + * the needs of the GPU. But when user set fan speed using nvmlDeviceSetFanSpeed_v2, + * the driver will attempt to make the fan achieve the setting in + * nvmlDeviceSetFanSpeed_v2. The actual current speed of the fan + * is reported in nvmlDeviceGetFanSpeed_v2. + * + * For all discrete products with dedicated fans. + * + * The fan speed is expressed as a percentage of the product's maximum noise tolerance fan speed. + * This value may exceed 100% in certain cases. + * + * @param device The identifier of the target device + * @param fan The index of the target fan, zero indexed. + * @param targetSpeed Reference in which to return the fan speed percentage + * + * @return + * - \ref NVML_SUCCESS if \a speed has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a fan is not an acceptable index, or \a speed is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a fan or is newer than Maxwell + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetTargetFanSpeed(nvmlDevice_t device, unsigned int fan, unsigned int *targetSpeed); + +/** + * Sets the speed of the fan control policy to default. 
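A minimal sketch of how the fan-speed queries added above fit together: it reads the driver's target speed and the measured speed for each fan. nvmlDeviceGetTargetFanSpeed and nvmlDeviceGetFanSpeed_v2 are declared in this header; the per-fan loop assumes nvmlDeviceGetNumFans, which is documented further down, and error handling is trimmed for brevity.

#include <stdio.h>
#include <nvml.h>

int main(void)
{
    nvmlDevice_t dev;
    unsigned int numFans = 0;

    if (nvmlInit_v2() != NVML_SUCCESS)
        return 1;
    if (nvmlDeviceGetHandleByIndex(0, &dev) == NVML_SUCCESS &&
        nvmlDeviceGetNumFans(dev, &numFans) == NVML_SUCCESS) {
        for (unsigned int fan = 0; fan < numFans; fan++) {
            unsigned int target = 0, actual = 0;
            /* target: what the driver is trying to reach; actual: measured speed */
            if (nvmlDeviceGetTargetFanSpeed(dev, fan, &target) == NVML_SUCCESS &&
                nvmlDeviceGetFanSpeed_v2(dev, fan, &actual) == NVML_SUCCESS)
                printf("fan %u: target %u%%, actual %u%%\n", fan, target, actual);
        }
    }
    nvmlShutdown();
    return 0;
}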
+ * + * For all cuda-capable discrete products with fans + * + * @param device The identifier of the target device + * @param fan The index of the fan, starting at zero + * + * return + * NVML_SUCCESS if speed has been adjusted + * NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * NVML_ERROR_INVALID_ARGUMENT if device is invalid + * NVML_ERROR_NOT_SUPPORTED if the device does not support this + * (doesn't have fans) + * NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetDefaultFanSpeed_v2(nvmlDevice_t device, unsigned int fan); + +/** + * Retrieves the min and max fan speed that user can set for the GPU fan. + * + * For all cuda-capable discrete products with fans + * + * @param device The identifier of the target device + * @param minSpeed The minimum speed allowed to set + * @param maxSpeed The maximum speed allowed to set + * + * return + * NVML_SUCCESS if speed has been adjusted + * NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * NVML_ERROR_INVALID_ARGUMENT if device is invalid + * NVML_ERROR_NOT_SUPPORTED if the device does not support this + * (doesn't have fans) + * NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMinMaxFanSpeed(nvmlDevice_t device, unsigned int * minSpeed, + unsigned int * maxSpeed); + +/** + * Gets current fan control policy. + * + * For Maxwell &tm; or newer fully supported devices. + * + * For all cuda-capable discrete products with fans + * + * device The identifier of the target \a device + * policy Reference in which to return the fan control \a policy + * + * return + * NVML_SUCCESS if \a policy has been populated + * NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a policy is null or the \a fan given doesn't reference + * a fan that exists. + * NVML_ERROR_NOT_SUPPORTED if the \a device is older than Maxwell + * NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetFanControlPolicy_v2(nvmlDevice_t device, unsigned int fan, + nvmlFanControlPolicy_t *policy); + +/** + * Sets current fan control policy. + * + * For Maxwell &tm; or newer fully supported devices. + * + * Requires privileged user. + * + * For all cuda-capable discrete products with fans + * + * device The identifier of the target \a device + * policy The fan control \a policy to set + * + * return + * NVML_SUCCESS if \a policy has been set + * NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a policy is null or the \a fan given doesn't reference + * a fan that exists. + * NVML_ERROR_NOT_SUPPORTED if the \a device is older than Maxwell + * NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetFanControlPolicy(nvmlDevice_t device, unsigned int fan, + nvmlFanControlPolicy_t policy); + /** * Retrieves the number of fans on the device. * @@ -3774,6 +4172,23 @@ nvmlReturn_t DECLDIR nvmlDeviceGetTemperatureThreshold(nvmlDevice_t device, nvml */ nvmlReturn_t DECLDIR nvmlDeviceSetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, int *temp); +/** + * Used to execute a list of thermal system instructions. 
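Despite the wording of the comment above, nvmlDeviceGetThermalSettings reads the board's thermal sensors. A minimal sketch under stated assumptions: the device handle comes from nvmlDeviceGetHandleByIndex after nvmlInit_v2, and passing sensorIndex 0 is an assumption since the header does not spell out the indexing scheme; the sensor member names come from the nvmlGpuThermalSettings_t definition earlier in this header.

#include <stdio.h>
#include <nvml.h>

static void print_thermals(nvmlDevice_t dev)
{
    nvmlGpuThermalSettings_t settings;
    /* sensorIndex 0 assumed; check the header/driver docs for other values */
    if (nvmlDeviceGetThermalSettings(dev, 0, &settings) != NVML_SUCCESS)
        return;
    for (unsigned int i = 0; i < settings.count && i < NVML_MAX_THERMAL_SENSORS_PER_GPU; i++)
        printf("sensor %u: target=%d current=%d C (default min=%d max=%d)\n",
               i,
               (int)settings.sensor[i].target,
               settings.sensor[i].currentTemp,
               settings.sensor[i].defaultMinTemp,
               settings.sensor[i].defaultMaxTemp);
}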
+ * + * @param device The identifier of the target device + * @param sensorIndex The index of the thermal sensor + * @param pThermalSettings Reference in which to return the thermal sensor information + * + * @return + * - \ref NVML_SUCCESS if \a pThermalSettings has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pThermalSettings is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetThermalSettings(nvmlDevice_t device, unsigned int sensorIndex, nvmlGpuThermalSettings_t *pThermalSettings); + /** * Retrieves the current performance state for the device. * @@ -4151,6 +4566,30 @@ nvmlReturn_t DECLDIR nvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int */ nvmlReturn_t DECLDIR nvmlDeviceGetEccMode(nvmlDevice_t device, nvmlEnableState_t *current, nvmlEnableState_t *pending); +/** + * Retrieves the default ECC modes for the device. + * + * For Fermi &tm; or newer fully supported devices. + * Only applicable to devices with ECC. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher. + * + * See \ref nvmlEnableState_t for details on allowed modes. + * + * @param device The identifier of the target device + * @param defaultMode Reference in which to return the default ECC mode + * + * @return + * - \ref NVML_SUCCESS if \a current and \a pending have been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a default is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceSetEccMode() + */ +nvmlReturn_t DECLDIR nvmlDeviceGetDefaultEccMode(nvmlDevice_t device, nvmlEnableState_t *defaultMode); + /** * Retrieves the device boardId from 0-N. * Devices with the same boardId indicate GPUs connected to the same PLX. 
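A short sketch pairing the new nvmlDeviceGetDefaultEccMode with the existing nvmlDeviceGetEccMode, both declared above, to show whether ECC has been changed away from the factory default. Only ECC-capable boards return NVML_SUCCESS here; the device handle is assumed to come from the usual nvmlInit_v2 / nvmlDeviceGetHandleByIndex sequence.

#include <stdio.h>
#include <nvml.h>

static void print_ecc(nvmlDevice_t dev)
{
    nvmlEnableState_t def, cur, pend;
    if (nvmlDeviceGetDefaultEccMode(dev, &def) == NVML_SUCCESS &&
        nvmlDeviceGetEccMode(dev, &cur, &pend) == NVML_SUCCESS)
        printf("ECC default=%d current=%d pending=%d (1 = enabled)\n",
               (int)def, (int)cur, (int)pend);
}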
Use in conjunction with @@ -4883,7 +5322,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetPowerSource(nvmlDevice_t device, nvmlPowerSour * Gets the device's memory bus width * * @param device The identifier of the target device - * @param maxSpeed The devices's memory bus width + * @param busWidth The devices's memory bus width * * @return * - \ref NVML_SUCCESS if the memory bus width is successfully retrieved @@ -4911,6 +5350,21 @@ nvmlReturn_t DECLDIR nvmlDeviceGetMemoryBusWidth(nvmlDevice_t device, unsigned i */ nvmlReturn_t DECLDIR nvmlDeviceGetPcieLinkMaxSpeed(nvmlDevice_t device, unsigned int *maxSpeed); +/** + * Gets the device's PCIe Link speed in Mbps + * + * @param device The identifier of the target device + * @param pcieSpeed The devices's PCIe Max Link speed in Mbps + * + * @return + * - \ref NVML_SUCCESS if \a pcieSpeed has been retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pcieSpeed is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support PCIe speed getting + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetPcieSpeed(nvmlDevice_t device, unsigned int *pcieSpeed); + /** * Gets the device's Adaptive Clock status * @@ -6305,6 +6759,21 @@ nvmlReturn_t DECLDIR nvmlDeviceDiscoverGpus (nvmlPciInfo_t *pciInfo); */ nvmlReturn_t DECLDIR nvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values); +/** + * Clear values for a list of fields for a device. This API allows multiple fields to be cleared at once. + * + * @param device The device handle of the GPU to request field values for + * @param valuesCount Number of entries in values that should be cleared + * @param values Array of \a valuesCount structures to hold field values. + * Each value's fieldId must be populated prior to this call + * + * @return + * - \ref NVML_SUCCESS if any values in \a values were cleared. Note that you must + * check the nvmlReturn field of each value for each individual + * status + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a values is NULL + */ +nvmlReturn_t DECLDIR nvmlDeviceClearFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values); /** @} */ @@ -6434,6 +6903,40 @@ nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures_v4(nvmlDevice_t device, nvmlReturn_t DECLDIR nvmlDeviceGetProcessUtilization(nvmlDevice_t device, nvmlProcessUtilizationSample_t *utilization, unsigned int *processSamplesCount, unsigned long long lastSeenTimeStamp); +/** + * Retrieve GSP firmware version. + * + * The caller passes in buffer via \a version and corresponding GSP firmware numbered version + * is returned with the same parameter in string format. + * + * @param device Device handle + * @param version The retrieved GSP firmware version + * + * @return + * - \ref NVML_SUCCESS if GSP firmware version is sucessfully retrieved + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or GSP \a version pointer is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if GSP firmware is not enabled for GPU + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGspFirmwareVersion(nvmlDevice_t device, char *version); + +/** + * Retrieve GSP firmware mode. + * + * The caller passes in integer pointers. GSP firmware enablement and default mode information is returned with + * corresponding parameters. 
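One way to exercise the new NVLink field IDs added above is the generic field-value interface, nvmlDeviceGetFieldValues. A minimal sketch: the nvmlFieldValue_t member names (fieldId, nvmlReturn, value.ullVal) come from the unchanged part of this header and should be checked there, and treating the result as an unsigned long long is an assumption.

#include <stdio.h>
#include <nvml.h>

static void print_nvlink_speed(nvmlDevice_t dev)
{
    nvmlFieldValue_t fv = { 0 };
    fv.fieldId = NVML_FI_DEV_NVLINK_GET_SPEED;   /* one of the new field IDs */
    if (nvmlDeviceGetFieldValues(dev, 1, &fv) == NVML_SUCCESS &&
        fv.nvmlReturn == NVML_SUCCESS)           /* per-field status must also be checked */
        printf("NVLink speed: %llu\n", (unsigned long long)fv.value.ullVal);
}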
The return value in \a isEnabled and \a defaultMode should be treated as boolean. + * + * @param device Device handle + * @param isEnabled Pointer to specify if GSP firmware is enabled + * @param defaultMode Pointer to specify if GSP firmware is supported by default on \a device + * + * @return + * - \ref NVML_SUCCESS if GSP firmware mode is sucessfully retrieved + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or any of \a isEnabled or \a defaultMode is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGspFirmwareMode(nvmlDevice_t device, unsigned int *isEnabled, unsigned int *defaultMode); + /** @} */ /***************************************************************************************************/ @@ -6444,6 +6947,49 @@ nvmlReturn_t DECLDIR nvmlDeviceGetProcessUtilization(nvmlDevice_t device, nvmlPr */ /***************************************************************************************************/ +/** + * Retrieve the requested vGPU driver capability. + * + * Refer to the \a nvmlVgpuDriverCapability_t structure for the specific capabilities that can be queried. + * The return value in \a capResult should be treated as a boolean, with a non-zero value indicating that the capability + * is supported. + * + * For Maxwell &tm; or newer fully supported devices. + * + * @param capability Specifies the \a nvmlVgpuDriverCapability_t to be queried + * @param capResult A boolean for the queried capability indicating that feature is supported + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a capability is invalid, or \a capResult is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED the API is not supported in current state or \a devices not in vGPU mode + * - \ref NVML_ERROR_UNKNOWN on any unexpected error +*/ +nvmlReturn_t DECLDIR nvmlGetVgpuDriverCapabilities(nvmlVgpuDriverCapability_t capability, unsigned int *capResult); + +/** + * Retrieve the requested vGPU capability for GPU. + * + * Refer to the \a nvmlDeviceVgpuCapability_t structure for the specific capabilities that can be queried. + * The return value in \a capResult should be treated as a boolean, with a non-zero value indicating that the capability + * is supported. + * + * For Maxwell &tm; or newer fully supported devices. + * + * @param device The identifier of the target device + * @param capability Specifies the \a nvmlDeviceVgpuCapability_t to be queried + * @param capResult A boolean for the queried capability indicating that feature is supported + * + * @return + * - \ref NVML_SUCCESS successful completion + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a capability is invalid, or \a capResult is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED the API is not supported in current state or \a device not in vGPU mode + * - \ref NVML_ERROR_UNKNOWN on any unexpected error +*/ +nvmlReturn_t DECLDIR nvmlDeviceGetVgpuCapabilities(nvmlDevice_t device, nvmlDeviceVgpuCapability_t capability, unsigned int *capResult); + /** * Retrieve the supported vGPU types on a physical GPU (device). 
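A sketch of querying one of the new per-device vGPU capabilities with nvmlDeviceGetVgpuCapabilities, declared above. As documented, capResult behaves like a boolean, and NVML_ERROR_NOT_SUPPORTED is returned when the GPU is not in vGPU mode; the device handle is assumed to come from nvmlDeviceGetHandleByIndex.

#include <stdio.h>
#include <nvml.h>

static void print_vgpu_cap(nvmlDevice_t dev)
{
    unsigned int capResult = 0;
    nvmlReturn_t r = nvmlDeviceGetVgpuCapabilities(
        dev, NVML_DEVICE_VGPU_CAP_HETEROGENEOUS_TIMESLICE_PROFILES, &capResult);
    if (r == NVML_SUCCESS)
        printf("heterogeneous timesliced vGPU profiles: %s\n",
               capResult ? "supported" : "not supported");
    else
        printf("vGPU capability query: %s\n", nvmlErrorString(r));
}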
* @@ -7068,6 +7614,25 @@ nvmlReturn_t DECLDIR nvmlVgpuInstanceGetGpuInstanceId(nvmlVgpuInstance_t vgpuIns */ nvmlReturn_t DECLDIR nvmlVgpuInstanceGetGpuPciId(nvmlVgpuInstance_t vgpuInstance, char *vgpuPciId, unsigned int *length); +/** +* Retrieve the requested capability for a given vGPU type. Refer to the \a nvmlVgpuCapability_t structure +* for the specific capabilities that can be queried. The return value in \a capResult should be treated as +* a boolean, with a non-zero value indicating that the capability is supported. +* +* For Maxwell &tm; or newer fully supported devices. +* +* @param vgpuTypeId Handle to vGPU type +* @param capability Specifies the \a nvmlVgpuCapability_t to be queried +* @param capResult A boolean for the queried capability indicating that feature is supported +* +* @return +* - \ref NVML_SUCCESS successful completion +* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized +* - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a capability is invalid, or \a capResult is NULL +* - \ref NVML_ERROR_UNKNOWN on any unexpected error +*/ +nvmlReturn_t DECLDIR nvmlVgpuTypeGetCapabilities(nvmlVgpuTypeId_t vgpuTypeId, nvmlVgpuCapability_t capability, unsigned int *capResult); + /** @} */ /***************************************************************************************************/ @@ -7246,6 +7811,90 @@ nvmlReturn_t DECLDIR nvmlGetVgpuCompatibility(nvmlVgpuMetadata_t *vgpuMetadata, */ nvmlReturn_t DECLDIR nvmlDeviceGetPgpuMetadataString(nvmlDevice_t device, char *pgpuMetadata, unsigned int *bufferSize); +/** + * Returns the vGPU Software scheduler logs. + * \a pSchedulerLog points to a caller-allocated structure to contain the logs. The number of elements returned will + * never exceed \a NVML_SCHEDULER_SW_MAX_LOG_ENTRIES. + * + * To get the entire logs, call the function atleast 5 times a second. + * + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target \a device + * @param pSchedulerLog Reference in which \a pSchedulerLog is written + * + * @return + * - \ref NVML_SUCCESS vGPU scheduler logs were successfully obtained + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pSchedulerLog is NULL or \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported in current state or \a device not in vGPU host mode + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetVgpuSchedulerLog(nvmlDevice_t device, nvmlVgpuSchedulerLog_t *pSchedulerLog); + +/** + * Returns the vGPU scheduler state. + * + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target \a device + * @param pSchedulerState Reference in which \a pSchedulerState is returned + * + * @return + * - \ref NVML_SUCCESS vGPU scheduler state is successfully obtained + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pSchedulerState is NULL or \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported in current state or \a device not in vGPU host mode + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetVgpuSchedulerState(nvmlDevice_t device, nvmlVgpuSchedulerGetState_t *pSchedulerState); + +/** + * Sets the vGPU scheduler state. + * + * For Pascal &tm; or newer fully supported devices. + * + * The scheduler state change won’t persist across module load/unload. + * Scheduler state and params will be allowed to set only when no VM is running. 
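A sketch of the scheduler get/set pair introduced above: read the current state, then switch to fixed-share scheduling with ARR disabled. The struct members come from the nvmlVgpuSchedulerGetState_t / nvmlVgpuSchedulerSetState_t definitions earlier in this header; expressing the timeslice in nanoseconds is an assumption based on the min/max values reported by nvmlVgpuSchedulerCapabilities_t. This only succeeds on a vGPU host with no vGPU instances running and requires privileges.

#include <stdio.h>
#include <nvml.h>

static void set_fixed_share(nvmlDevice_t dev)
{
    nvmlVgpuSchedulerGetState_t cur;
    if (nvmlDeviceGetVgpuSchedulerState(dev, &cur) == NVML_SUCCESS)
        printf("current policy=%u ARR=%u\n", cur.schedulerPolicy, cur.isEnabledARR);

    nvmlVgpuSchedulerSetState_t next = { 0 };
    next.schedulerPolicy = NVML_VGPU_SCHEDULER_POLICY_FIXED_SHARE;
    next.enableARRMode = 0;                                   /* plain timeslice mode */
    next.schedulerParams.vgpuSchedData.timeslice = 2000000;   /* assumed to be nanoseconds */
    if (nvmlDeviceSetVgpuSchedulerState(dev, &next) != NVML_SUCCESS)
        printf("scheduler state not changed\n");
}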
+ * In \a nvmlVgpuSchedulerSetState_t, IFF enableARRMode=1 then + * provide avgFactorForARR and frequency as input. If enableARRMode is disabled + * then provide timeslice as input. + * + * @param device The identifier of the target \a device + * @param pSchedulerState vGPU \a pSchedulerState to set + * + * @return + * - \ref NVML_SUCCESS vGPU scheduler state has been successfully set + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pSchedulerState is NULL or \a device is invalid + * - \ref NVML_ERROR_RESET_REQUIRED if setting \a pSchedulerState failed with fatal error, + * reboot is required to overcome from this error. + * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported in current state or \a device not in vGPU host mode + * or if any vGPU instance currently exists on the \a device + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetVgpuSchedulerState(nvmlDevice_t device, nvmlVgpuSchedulerSetState_t *pSchedulerState); + +/** + * Returns the vGPU scheduler capabilities. + * The list of supported vGPU schedulers returned in \a nvmlVgpuSchedulerCapabilities_t is from + * the NVML_VGPU_SCHEDULER_POLICY_*. This list enumerates the supported scheduler policies + * if the engine is Graphics type. + * The other values in \a nvmlVgpuSchedulerCapabilities_t are also applicable if the engine is + * Graphics type. For other engine types, it is BEST EFFORT policy. + * If ARR is supported and enabled, scheduling frequency and averaging factor are applicable + * else timeSlice is applicable. + * + * For Pascal &tm; or newer fully supported devices. + * + * @param device The identifier of the target \a device + * @param pCapabilities Reference in which \a pCapabilities is written + * + * @return + * - \ref NVML_SUCCESS vGPU scheduler capabilities were successfully obtained + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pCapabilities is NULL or \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED The API is not supported in current state or \a device not in vGPU host mode + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetVgpuSchedulerCapabilities(nvmlDevice_t device, nvmlVgpuSchedulerCapabilities_t *pCapabilities); + /* * Virtual GPU (vGPU) version * @@ -7616,7 +8265,9 @@ nvmlReturn_t DECLDIR nvmlGetExcludedDeviceInfoByIndex(unsigned int index, nvmlEx #define NVML_GPU_INSTANCE_PROFILE_8_SLICE 0x5 #define NVML_GPU_INSTANCE_PROFILE_6_SLICE 0x6 #define NVML_GPU_INSTANCE_PROFILE_1_SLICE_REV1 0x7 -#define NVML_GPU_INSTANCE_PROFILE_COUNT 0x8 +#define NVML_GPU_INSTANCE_PROFILE_2_SLICE_REV1 0x8 +#define NVML_GPU_INSTANCE_PROFILE_1_SLICE_REV2 0x9 +#define NVML_GPU_INSTANCE_PROFILE_COUNT 0xA typedef struct nvmlGpuInstancePlacement_st { @@ -7698,7 +8349,8 @@ typedef struct #define NVML_COMPUTE_INSTANCE_PROFILE_7_SLICE 0x4 #define NVML_COMPUTE_INSTANCE_PROFILE_8_SLICE 0x5 #define NVML_COMPUTE_INSTANCE_PROFILE_6_SLICE 0x6 -#define NVML_COMPUTE_INSTANCE_PROFILE_COUNT 0x7 +#define NVML_COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1 0x7 +#define NVML_COMPUTE_INSTANCE_PROFILE_COUNT 0x8 #define NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED 0x0 //!< All the engines except multiprocessors would be shared #define NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT 0x1 @@ -8143,6 +8795,37 @@ nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceProfileInfoV(nvmlGpuInstan nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceRemainingCapacity(nvmlGpuInstance_t gpuInstance, unsigned int profileId, unsigned int *count); +/** + * Get compute instance 
placements. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * A placement represents the location of a compute instance within a GPU instance. This API only returns all the possible + * placements for the given profile. + * A created compute instance occupies compute slices described by its placement. Creation of new compute instance will + * fail if there is overlap with the already occupied compute slices. + * + * @param gpuInstance The identifier of the target GPU instance + * @param profileId The compute instance profile ID. See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo + * @param placements Returns placements allowed for the profile. Can be NULL to discover number + * of allowed placements for this profile. If non-NULL must be large enough + * to accommodate the placements supported by the profile. + * @param count Returns number of allowed placemenets for the profile. + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profileId or \a count are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't have MIG mode enabled or \a profileId isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + */ +nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstancePossiblePlacements(nvmlGpuInstance_t gpuInstance, + unsigned int profileId, + nvmlComputeInstancePlacement_t *placements, + unsigned int *count); + /** * Create compute instance. * @@ -8171,6 +8854,36 @@ nvmlReturn_t DECLDIR nvmlGpuInstanceGetComputeInstanceRemainingCapacity(nvmlGpuI nvmlReturn_t DECLDIR nvmlGpuInstanceCreateComputeInstance(nvmlGpuInstance_t gpuInstance, unsigned int profileId, nvmlComputeInstance_t *computeInstance); +/** + * Create compute instance with the specified placement. + * + * For Ampere &tm; or newer fully supported devices. + * Supported on Linux only. + * Requires privileged user. + * + * If the parent device is unbound, reset or the parent GPU instance is destroyed or the compute instance is destroyed + * explicitly, the compute instance handle would become invalid. The compute instance must be recreated to acquire + * a valid handle. + * + * @param gpuInstance The identifier of the target GPU instance + * @param profileId The compute instance profile ID. + * See \ref nvmlGpuInstanceGetComputeInstanceProfileInfo + * @param placement The requested placement. See \ref nvmlGpuInstanceGetComputeInstancePossiblePlacements + * @param computeInstance Returns the compute instance handle + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_UNINITIALIZED If library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT If \a gpuInstance, \a profile, \a profileId or \a computeInstance + * are invalid + * - \ref NVML_ERROR_NOT_SUPPORTED If \a profileId isn't supported + * - \ref NVML_ERROR_NO_PERMISSION If user doesn't have permission to perform the operation + * - \ref NVML_ERROR_INSUFFICIENT_RESOURCES If the requested compute instance could not be created + */ +nvmlReturn_t DECLDIR nvmlGpuInstanceCreateComputeInstanceWithPlacement(nvmlGpuInstance_t gpuInstance, unsigned int profileId, + const nvmlComputeInstancePlacement_t *placement, + nvmlComputeInstance_t *computeInstance); + /** * Destroy compute instance. 
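A sketch of the placement-aware flow added above: look up a compute instance profile, discover its allowed placements, and create the compute instance at the first one. The profile-info helper referenced in the comments (nvmlGpuInstanceGetComputeInstanceProfileInfo) and the placement members start/size are assumptions not shown in this hunk; the gpuInstance handle is assumed to exist already, MIG mode must be enabled, Linux only, privileged user required.

#include <stdio.h>
#include <stdlib.h>
#include <nvml.h>

static void create_ci_at_first_placement(nvmlGpuInstance_t gpuInstance)
{
    nvmlComputeInstanceProfileInfo_t info;
    if (nvmlGpuInstanceGetComputeInstanceProfileInfo(gpuInstance,
            NVML_COMPUTE_INSTANCE_PROFILE_1_SLICE,
            NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED, &info) != NVML_SUCCESS)
        return;

    /* First call with NULL placements to discover how many are allowed. */
    unsigned int count = 0;
    if (nvmlGpuInstanceGetComputeInstancePossiblePlacements(gpuInstance, info.id,
            NULL, &count) != NVML_SUCCESS || count == 0)
        return;

    nvmlComputeInstancePlacement_t *placements = calloc(count, sizeof(*placements));
    if (placements == NULL)
        return;
    if (nvmlGpuInstanceGetComputeInstancePossiblePlacements(gpuInstance, info.id,
            placements, &count) == NVML_SUCCESS && count > 0) {
        nvmlComputeInstance_t ci;
        if (nvmlGpuInstanceCreateComputeInstanceWithPlacement(gpuInstance, info.id,
                &placements[0], &ci) == NVML_SUCCESS)
            printf("created compute instance at slice %u (size %u)\n",
                   placements[0].start, placements[0].size);
    }
    free(placements);
}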
* @@ -8396,8 +9109,523 @@ nvmlReturn_t DECLDIR nvmlDeviceGetDeviceHandleFromMigDeviceHandle(nvmlDevice_t m */ nvmlReturn_t DECLDIR nvmlDeviceGetBusType(nvmlDevice_t device, nvmlBusType_t *type); +/** + * Retrieve performance monitor samples from the associated subdevice. + * + * @param device + * @param pDynamicPstatesInfo + * + * @return + * - \ref NVML_SUCCESS if \a pDynamicPstatesInfo has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pDynamicPstatesInfo is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetDynamicPstatesInfo(nvmlDevice_t device, nvmlGpuDynamicPstatesInfo_t *pDynamicPstatesInfo); + +/** + * Sets the speed of a specified fan. + * + * WARNING: This function changes the fan control policy to manual. It means that YOU have to monitor + * the temperature and adjust the fan speed accordingly. + * If you set the fan speed too low you can burn your GPU! + * Use nvmlDeviceSetDefaultFanSpeed_v2 to restore default control policy. + * + * For all cuda-capable discrete products with fans that are Maxwell or Newer. + * + * device The identifier of the target device + * fan The index of the fan, starting at zero + * speed The target speed of the fan [0-100] in % of max speed + * + * return + * NVML_SUCCESS if the fan speed has been set + * NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * NVML_ERROR_INVALID_ARGUMENT if the device is not valid, or the speed is outside acceptable ranges, + * or if the fan index doesn't reference an actual fan. + * NVML_ERROR_NOT_SUPPORTED if the device is older than Maxwell. + * NVML_ERROR_UNKNOWN if there was an unexpected error. + */ +nvmlReturn_t DECLDIR nvmlDeviceSetFanSpeed_v2(nvmlDevice_t device, unsigned int fan, unsigned int speed); + +/** + * Retrieve the GPCCLK VF offset value + * @param[in] device The identifier of the target device + * @param[out] offset The retrieved GPCCLK VF offset value + * + * @return + * - \ref NVML_SUCCESS if \a offset has been successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpcClkVfOffset(nvmlDevice_t device, int *offset); + +/** + * Set the GPCCLK VF offset value + * @param[in] device The identifier of the target device + * @param[in] offset The GPCCLK VF offset value to set + * + * @return + * - \ref NVML_SUCCESS if \a offset has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetGpcClkVfOffset(nvmlDevice_t device, int offset); + +/** + * Retrieve the MemClk (Memory Clock) VF offset value. 
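A sketch of reading the performance-monitor samples behind the new nvmlDeviceGetDynamicPstatesInfo call declared above, printing only the utilization domains the struct marks as present. The member names come from the nvmlGpuDynamicPstatesInfo_t definition earlier in this header; the device handle is assumed to come from nvmlDeviceGetHandleByIndex.

#include <stdio.h>
#include <nvml.h>

static void print_dynamic_pstates(nvmlDevice_t dev)
{
    nvmlGpuDynamicPstatesInfo_t info;
    if (nvmlDeviceGetDynamicPstatesInfo(dev, &info) != NVML_SUCCESS)
        return;
    for (unsigned int d = 0; d < NVML_MAX_GPU_UTILIZATIONS; d++) {
        if (!info.utilization[d].bIsPresent)
            continue;   /* domain not reported by this GPU */
        printf("domain %u: %u%% (raise threshold %u%%, lower threshold %u%%)\n",
               d,
               info.utilization[d].percentage,
               info.utilization[d].incThreshold,
               info.utilization[d].decThreshold);
    }
}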
+ * @param[in] device The identifier of the target device + * @param[out] offset The retrieved MemClk VF offset value + * + * @return + * - \ref NVML_SUCCESS if \a offset has been successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMemClkVfOffset(nvmlDevice_t device, int *offset); + +/** + * Set the MemClk (Memory Clock) VF offset value. It requires elevated privileges. + * @param[in] device The identifier of the target device + * @param[in] offset The MemClk VF offset value to set + * + * @return + * - \ref NVML_SUCCESS if \a offset has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceSetMemClkVfOffset(nvmlDevice_t device, int offset); + +/** + * Retrieve min and max clocks of some clock domain for a given PState + * + * @param device The identifier of the target device + * @param type Clock domain + * @param pstate PState to query + * @param minClockMHz Reference in which to return min clock frequency + * @param maxClockMHz Reference in which to return max clock frequency + * + * @return + * - \ref NVML_SUCCESS if everything worked + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a type or \a pstate are invalid or both + * \a minClockMHz and \a maxClockMHz are NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMinMaxClockOfPState(nvmlDevice_t device, nvmlClockType_t type, nvmlPstates_t pstate, + unsigned int * minClockMHz, unsigned int * maxClockMHz); + +/** + * Get all supported Performance States (P-States) for the device. + * + * The returned array would contain a contiguous list of valid P-States supported by + * the device. If the number of supported P-States is fewer than the size of the array + * supplied missing elements would contain \a NVML_PSTATE_UNKNOWN. + * + * The number of elements in the returned list will never exceed \a NVML_MAX_GPU_PERF_PSTATES. 
+ * + * @param device The identifier of the target device + * @param pstates Container to return the list of performance states + * supported by device + * @param size Size of the supplied \a pstates array in bytes + * + * @return + * - \ref NVML_SUCCESS if \a pstates array has been retrieved + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if the the container supplied was not large enough to + * hold the resulting list + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a pstates is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support performance state readings + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetSupportedPerformanceStates(nvmlDevice_t device, + nvmlPstates_t *pstates, unsigned int size); + +/** + * Retrieve the GPCCLK min max VF offset value. + * @param[in] device The identifier of the target device + * @param[out] minOffset The retrieved GPCCLK VF min offset value + * @param[out] maxOffset The retrieved GPCCLK VF max offset value + * + * @return + * - \ref NVML_SUCCESS if \a offset has been successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpcClkMinMaxVfOffset(nvmlDevice_t device, + int *minOffset, int *maxOffset); + +/** + * Retrieve the MemClk (Memory Clock) min max VF offset value. + * @param[in] device The identifier of the target device + * @param[out] minOffset The retrieved MemClk VF min offset value + * @param[out] maxOffset The retrieved MemClk VF max offset value + * + * @return + * - \ref NVML_SUCCESS if \a offset has been successfully queried + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a offset is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetMemClkMinMaxVfOffset(nvmlDevice_t device, + int *minOffset, int *maxOffset); + +/** + * Get fabric information associated with the device. + * + * %HOPPER_OR_NEWER% + * + * On Hopper + NVSwitch systems, GPU is registered with the NVIDIA Fabric Manager + * Upon successful registration, the GPU is added to the NVLink fabric to enable + * peer-to-peer communication. + * This API reports the current state of the GPU in the NVLink fabric + * along with other useful information. 
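A sketch of checking whether a GPU has finished registering with the NVLink fabric on Hopper + NVSwitch systems, using the nvmlDeviceGetGpuFabricInfo call documented here. The state and status members come from the nvmlGpuFabricInfo_t definition earlier in this header; on GPUs without fabric support the call is expected to return NVML_ERROR_NOT_SUPPORTED.

#include <stdio.h>
#include <nvml.h>

static void print_fabric_state(nvmlDevice_t dev)
{
    nvmlGpuFabricInfo_t fabric;
    nvmlReturn_t r = nvmlDeviceGetGpuFabricInfo(dev, &fabric);
    if (r != NVML_SUCCESS) {
        printf("fabric info: %s\n", nvmlErrorString(r));
        return;
    }
    if (fabric.state == NVML_GPU_FABRIC_STATE_COMPLETED)
        /* status is only meaningful once registration has completed */
        printf("fabric registration complete, partition %u, status %s\n",
               fabric.partitionId, nvmlErrorString(fabric.status));
    else
        printf("fabric registration state: %u\n", fabric.state);
}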
+ * + * @param device The identifier of the target device + * @param gpuFabricInfo Information about GPU fabric state + * + * @return + * - \ref NVML_SUCCESS Upon success + * - \ref NVML_ERROR_NOT_SUPPORTED If \a device doesn't support gpu fabric + */ +nvmlReturn_t DECLDIR nvmlDeviceGetGpuFabricInfo(nvmlDevice_t device, nvmlGpuFabricInfo_t *gpuFabricInfo); + /** @} */ +/***************************************************************************************************/ +/** @defgroup GPM NVML GPM + * @{ + */ +/***************************************************************************************************/ +/** @defgroup nvmlGpmEnums GPM Enums + * @{ + */ +/***************************************************************************************************/ + +/* GPM Metric Identifiers */ +typedef enum +{ + NVML_GPM_METRIC_GRAPHICS_UTIL = 1, /* Percentage of time any compute/graphics app was active on the GPU. 0.0 - 100.0 */ + NVML_GPM_METRIC_SM_UTIL = 2, /* Percentage of SMs that were busy. 0.0 - 100.0 */ + NVML_GPM_METRIC_SM_OCCUPANCY = 3, /* Percentage of warps that were active vs theoretical maximum. 0.0 - 100.0 */ + NVML_GPM_METRIC_INTEGER_UTIL = 4, /* Percentage of time the GPU's SMs were doing integer operations. 0.0 - 100.0 */ + NVML_GPM_METRIC_ANY_TENSOR_UTIL = 5, /* Percentage of time the GPU's SMs were doing ANY tensor operations. 0.0 - 100.0 */ + NVML_GPM_METRIC_DFMA_TENSOR_UTIL = 6, /* Percentage of time the GPU's SMs were doing DFMA tensor operations. 0.0 - 100.0 */ + NVML_GPM_METRIC_HMMA_TENSOR_UTIL = 7, /* Percentage of time the GPU's SMs were doing HMMA tensor operations. 0.0 - 100.0 */ + NVML_GPM_METRIC_IMMA_TENSOR_UTIL = 9, /* Percentage of time the GPU's SMs were doing IMMA tensor operations. 0.0 - 100.0 */ + NVML_GPM_METRIC_DRAM_BW_UTIL = 10, /* Percentage of DRAM bw used vs theoretical maximum. 0.0 - 100.0 */ + NVML_GPM_METRIC_FP64_UTIL = 11, /* Percentage of time the GPU's SMs were doing non-tensor FP64 math. 0.0 - 100.0 */ + NVML_GPM_METRIC_FP32_UTIL = 12, /* Percentage of time the GPU's SMs were doing non-tensor FP32 math. 0.0 - 100.0 */ + NVML_GPM_METRIC_FP16_UTIL = 13, /* Percentage of time the GPU's SMs were doing non-tensor FP16 math. 0.0 - 100.0 */ + NVML_GPM_METRIC_PCIE_TX_PER_SEC = 20, /* PCIe traffic from this GPU in MiB/sec */ + NVML_GPM_METRIC_PCIE_RX_PER_SEC = 21, /* PCIe traffic to this GPU in MiB/sec */ + NVML_GPM_METRIC_NVDEC_0_UTIL = 30, /* Percent utilization of NVDEC 0. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVDEC_1_UTIL = 31, /* Percent utilization of NVDEC 1. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVDEC_2_UTIL = 32, /* Percent utilization of NVDEC 2. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVDEC_3_UTIL = 33, /* Percent utilization of NVDEC 3. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVDEC_4_UTIL = 34, /* Percent utilization of NVDEC 4. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVDEC_5_UTIL = 35, /* Percent utilization of NVDEC 5. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVDEC_6_UTIL = 36, /* Percent utilization of NVDEC 6. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVDEC_7_UTIL = 37, /* Percent utilization of NVDEC 7. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVJPG_0_UTIL = 40, /* Percent utilization of NVJPG 0. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVJPG_1_UTIL = 41, /* Percent utilization of NVJPG 1. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVJPG_2_UTIL = 42, /* Percent utilization of NVJPG 2. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVJPG_3_UTIL = 43, /* Percent utilization of NVJPG 3. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVJPG_4_UTIL = 44, /* Percent utilization of NVJPG 4. 
0.0 - 100.0 */ + NVML_GPM_METRIC_NVJPG_5_UTIL = 45, /* Percent utilization of NVJPG 5. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVJPG_6_UTIL = 46, /* Percent utilization of NVJPG 6. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVJPG_7_UTIL = 47, /* Percent utilization of NVJPG 7. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVOFA_0_UTIL = 50, /* Percent utilization of NVOFA 0. 0.0 - 100.0 */ + NVML_GPM_METRIC_NVLINK_TOTAL_RX_PER_SEC = 60, /* NvLink read bandwidth for all links in MiB/sec */ + NVML_GPM_METRIC_NVLINK_TOTAL_TX_PER_SEC = 61, /* NvLink write bandwidth for all links in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L0_RX_PER_SEC = 62, /* NvLink read bandwidth for link 0 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L0_TX_PER_SEC = 63, /* NvLink write bandwidth for link 0 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L1_RX_PER_SEC = 64, /* NvLink read bandwidth for link 1 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L1_TX_PER_SEC = 65, /* NvLink write bandwidth for link 1 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L2_RX_PER_SEC = 66, /* NvLink read bandwidth for link 2 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L2_TX_PER_SEC = 67, /* NvLink write bandwidth for link 2 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L3_RX_PER_SEC = 68, /* NvLink read bandwidth for link 3 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L3_TX_PER_SEC = 69, /* NvLink write bandwidth for link 3 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L4_RX_PER_SEC = 70, /* NvLink read bandwidth for link 4 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L4_TX_PER_SEC = 71, /* NvLink write bandwidth for link 4 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L5_RX_PER_SEC = 72, /* NvLink read bandwidth for link 5 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L5_TX_PER_SEC = 73, /* NvLink write bandwidth for link 5 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L6_RX_PER_SEC = 74, /* NvLink read bandwidth for link 6 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L6_TX_PER_SEC = 75, /* NvLink write bandwidth for link 6 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L7_RX_PER_SEC = 76, /* NvLink read bandwidth for link 7 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L7_TX_PER_SEC = 77, /* NvLink write bandwidth for link 7 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L8_RX_PER_SEC = 78, /* NvLink read bandwidth for link 8 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L8_TX_PER_SEC = 79, /* NvLink write bandwidth for link 8 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L9_RX_PER_SEC = 80, /* NvLink read bandwidth for link 9 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L9_TX_PER_SEC = 81, /* NvLink write bandwidth for link 9 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L10_RX_PER_SEC = 82, /* NvLink read bandwidth for link 10 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L10_TX_PER_SEC = 83, /* NvLink write bandwidth for link 10 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L11_RX_PER_SEC = 84, /* NvLink read bandwidth for link 11 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L11_TX_PER_SEC = 85, /* NvLink write bandwidth for link 11 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L12_RX_PER_SEC = 86, /* NvLink read bandwidth for link 12 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L12_TX_PER_SEC = 87, /* NvLink write bandwidth for link 12 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L13_RX_PER_SEC = 88, /* NvLink read bandwidth for link 13 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L13_TX_PER_SEC = 89, /* NvLink write bandwidth for link 13 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L14_RX_PER_SEC = 90, /* NvLink read bandwidth for link 14 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L14_TX_PER_SEC = 91, /* NvLink write bandwidth for link 14 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L15_RX_PER_SEC = 92, /* NvLink read bandwidth for link 15 in MiB/sec */ + 
NVML_GPM_METRIC_NVLINK_L15_TX_PER_SEC = 93, /* NvLink write bandwidth for link 15 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L16_RX_PER_SEC = 94, /* NvLink read bandwidth for link 16 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L16_TX_PER_SEC = 95, /* NvLink write bandwidth for link 16 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L17_RX_PER_SEC = 96, /* NvLink read bandwidth for link 17 in MiB/sec */ + NVML_GPM_METRIC_NVLINK_L17_TX_PER_SEC = 97, /* NvLink write bandwidth for link 17 in MiB/sec */ + NVML_GPM_METRIC_MAX = 98, /* Maximum value above +1. Note that changing this + should also change NVML_GPM_METRICS_GET_VERSION + due to struct size change */ +} nvmlGpmMetricId_t; + +/** @} */ // @defgroup nvmlGpmEnums + + +/***************************************************************************************************/ +/** @defgroup nvmlGpmStructs GPM Structs + * @{ + */ +/***************************************************************************************************/ + +/* Handle to an allocated GPM sample allocated with nvmlGpmSampleAlloc() + Free this with nvmlGpmSampleFree() */ +typedef struct +{ + struct nvmlGpmSample_st* handle; +} nvmlGpmSample_t; + +typedef struct { + char *shortName; + char *longName; + char *unit; +} nvmlGpmMetricMetricInfo_t; + +typedef struct +{ + unsigned int metricId; /* IN: NVML_GPM_METRIC_? #define of which metric to retrieve */ + nvmlReturn_t nvmlReturn; /* OUT: Status of this metric. If this is nonzero, then value is not valid */ + double value; /* OUT: Value of this metric. Is only valid if nvmlReturn is 0 (NVML_SUCCESS) */ + nvmlGpmMetricMetricInfo_t metricInfo; /* OUT: Metric name and unit. Those can be NULL if not defined */ +} nvmlGpmMetric_t; + +typedef struct +{ + unsigned int version; /* IN: Set to NVML_GPM_METRICS_GET_VERSION */ + unsigned int numMetrics; /* IN: How many metrics to retrieve in metrics[] */ + nvmlGpmSample_t sample1; /* IN: Sample buffer */ + nvmlGpmSample_t sample2; /* IN: Sample buffer */ + nvmlGpmMetric_t metrics[NVML_GPM_METRIC_MAX]; /* IN/OUT: Array of metrics. Set metricId on call. + see nvmlReturn and value on return */ +} nvmlGpmMetricsGet_t; + +#define NVML_GPM_METRICS_GET_VERSION 1 + +typedef struct +{ + unsigned int version; /* IN: Set to NVML_GPM_SUPPORT_VERSION */ + unsigned int isSupportedDevice; /* OUT: Indicates device support */ +} nvmlGpmSupport_t; + +#define NVML_GPM_SUPPORT_VERSION 1 + +/** @} */ // @defgroup nvmlGPMStructs + +/***************************************************************************************************/ +/** @defgroup nvmlGpmFunctions GPM Functions + * @{ + */ +/***************************************************************************************************/ + +/** + * Calculate GPM metrics from two samples. + * + * + * @param metricsGet IN/OUT: populated nvmlGpmMetricsGet_t struct + * + * %HOPPER_OR_NEWER% + * + * @return + * - \ref NVML_SUCCESS on success + * - Nonzero NVML_ERROR_? enum on error + */ +nvmlReturn_t DECLDIR nvmlGpmMetricsGet(nvmlGpmMetricsGet_t *metricsGet); + + +/** + * Free an allocated sample buffer that was allocated with \ref nvmlGpmSampleAlloc() + * + * %HOPPER_OR_NEWER% + * + * @param gpmSample Sample to free + * + * @return + * - \ref NVML_SUCCESS on success + * - \ref NVML_ERROR_INVALID_ARGUMENT if an invalid pointer is provided + */ +nvmlReturn_t DECLDIR nvmlGpmSampleFree(nvmlGpmSample_t gpmSample); + + +/** + * Allocate a sample buffer to be used with NVML GPM . 
You will need to allocate
+ * at least two of these buffers to use with the NVML GPM feature
+ *
+ * %HOPPER_OR_NEWER%
+ *
+ * @param gpmSample Where the allocated sample will be stored
+ *
+ * @return
+ * - \ref NVML_SUCCESS on success
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if an invalid pointer is provided
+ * - \ref NVML_ERROR_MEMORY if system memory is insufficient
+ */
+nvmlReturn_t DECLDIR nvmlGpmSampleAlloc(nvmlGpmSample_t *gpmSample);
+
+/**
+ * Read a sample of GPM metrics into the provided \a gpmSample buffer. After
+ * two samples are gathered, you can call nvmlGpmMetricsGet on those samples to
+ * retrieve metrics
+ *
+ * %HOPPER_OR_NEWER%
+ *
+ * @param device Device to get samples for
+ * @param gpmSample Buffer to read samples into
+ *
+ * @return
+ * - \ref NVML_SUCCESS on success
+ * - Nonzero NVML_ERROR_? enum on error
+ */
+nvmlReturn_t DECLDIR nvmlGpmSampleGet(nvmlDevice_t device, nvmlGpmSample_t gpmSample);
+
+/**
+ * Read a sample of GPM metrics into the provided \a gpmSample buffer for a MIG GPU Instance.
+ *
+ * After two samples are gathered, you can call nvmlGpmMetricsGet on those
+ * samples to retrieve metrics
+ *
+ * %HOPPER_OR_NEWER%
+ *
+ * @param device Device to get samples for
+ * @param gpuInstanceId MIG GPU Instance ID
+ * @param gpmSample Buffer to read samples into
+ *
+ * @return
+ * - \ref NVML_SUCCESS on success
+ * - Nonzero NVML_ERROR_? enum on error
+ */
+nvmlReturn_t DECLDIR nvmlGpmMigSampleGet(nvmlDevice_t device, unsigned int gpuInstanceId, nvmlGpmSample_t gpmSample);
+
+/**
+ * Indicate whether the supplied device supports GPM
+ *
+ * @param device NVML device to query for
+ * @param gpmSupport Structure to indicate GPM support. Indicates
+ * GPM support per system for the supplied device
+ *
+ * @return
+ * - NVML_SUCCESS on success
+ * - Nonzero NVML_ERROR_? enum if there is an error in processing the query
+ */
+nvmlReturn_t DECLDIR nvmlGpmQueryDeviceSupport(nvmlDevice_t device, nvmlGpmSupport_t *gpmSupport);
+
+/** @} */ // @defgroup nvmlGpmFunctions
+/** @} */ // @defgroup GPM
+
+/***************************************************************************************************/
+/** @defgroup nvmlDevice definitions related to Counter Collection Unit
+ * @{
+ */
+/***************************************************************************************************/
+
+/* CCU Stream State */
+#define NVML_COUNTER_COLLECTION_UNIT_STREAM_STATE_DISABLE 0
+#define NVML_COUNTER_COLLECTION_UNIT_STREAM_STATE_ENABLE 1
+
+/**
+ * Get counter collection unit stream state.
+ *
+ * %HOPPER_OR_NEWER%
+ * Supported on Linux, Windows TCC.
+ *
+ * @param device The identifier of the target device
+ * @param state Returns counter collection unit stream state
+ * NVML_COUNTER_COLLECTION_UNIT_STREAM_STATE_DISABLE or
+ * NVML_COUNTER_COLLECTION_UNIT_STREAM_STATE_ENABLE
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a current counter collection unit stream state was successfully queried
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a state is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device
+ */
+nvmlReturn_t DECLDIR nvmlDeviceCcuGetStreamState(nvmlDevice_t device, unsigned int *state);
+
+/**
+ * Set counter collection unit stream state.
+ *
+ * %HOPPER_OR_NEWER%
+ * Supported on Linux, Windows TCC.
+ * + * @param device The identifier of the target device + * @param state Counter collection unit stream state, + * NVML_COUNTER_COLLECTION_UNIT_STREAM_STATE_DISABLE or + * NVML_COUNTER_COLLECTION_UNIT_STREAM_STATE_ENABLE + * + * @return + * - \ref NVML_SUCCESS if \a current counter collection unit stream state is successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + */ +nvmlReturn_t DECLDIR nvmlDeviceCcuSetStreamState(nvmlDevice_t device, unsigned int state); + +/** @} */ // @defgroup CCU + +#define NVML_NVLINK_POWER_STATE_HIGH_SPEED 0x0 +#define NVML_NVLINK_POWER_STATE_LOW 0x1 + +#define NVML_NVLINK_LOW_POWER_THRESHOLD_MIN 0x1 +#define NVML_NVLINK_LOW_POWER_THRESHOLD_MAX 0x1FFF +#define NVML_NVLINK_LOW_POWER_THRESHOLD_RESET 0xFFFFFFFF + +/* Structure containing Low Power parameters */ +typedef struct nvmlNvLinkPowerThres_st +{ + unsigned int lowPwrThreshold; //!< Low power threshold (in units of 100us) +} nvmlNvLinkPowerThres_t; + +/** + * Set NvLink Low Power Threshold for device. + * + * %HOPPER_OR_NEWER% + * + * @param device The identifier of the target device + * @param info Reference to \a nvmlNvLinkPowerThres_t struct + * input parameters + * + * @return + * - \ref NVML_SUCCESS if the \a Threshold is successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a Threshold is not within range + * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device + * + **/ +nvmlReturn_t DECLDIR nvmlDeviceSetNvLinkDeviceLowPowerThreshold(nvmlDevice_t device, nvmlNvLinkPowerThres_t *info); + /** * NVML API versioning support */ @@ -8425,6 +9653,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetMPSComputeRunningProcesses(nvmlDevice_t device nvmlReturn_t DECLDIR nvmlDeviceGetMPSComputeRunningProcesses_v2(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_v2_t *infos); nvmlReturn_t DECLDIR nvmlDeviceGetGpuInstancePossiblePlacements(nvmlDevice_t device, unsigned int profileId, nvmlGpuInstancePlacement_t *placements, unsigned int *count); nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseInfo(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuLicenseInfo_t *licenseInfo); + #endif // #ifdef NVML_NO_UNVERSIONED_FUNC_DEFS #if defined(NVML_NO_UNVERSIONED_FUNC_DEFS) @@ -8450,6 +9679,7 @@ nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseInfo(nvmlVgpuInstance_t vgpuInsta #undef nvmlGetBlacklistDeviceInfoByIndex #undef nvmlDeviceGetGpuInstancePossiblePlacements #undef nvmlVgpuInstanceGetLicenseInfo + #endif #ifdef __cplusplus diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/types_gen.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/types_gen.go index 573c9299..396886d6 100644 --- a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/types_gen.go +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/types_gen.go @@ -119,6 +119,19 @@ type ViolationTime struct { ViolationTime uint64 } +type GpuThermalSettingsSensor struct { + Controller int32 + DefaultMinTemp int32 + DefaultMaxTemp int32 + CurrentTemp int32 + Target int32 +} + +type GpuThermalSettings struct { + Count uint32 + Sensor [3]GpuThermalSettingsSensor +} + type ClkMonFaultInfo struct { ClkApiDomain uint32 ClkDomainFaultMask uint32 @@ -154,6 +167,73 @@ type VgpuProcessUtilizationSample struct { DecUtil uint32 } +type 
VgpuSchedulerParamsVgpuSchedDataWithARR struct { + AvgFactor uint32 + Timeslice uint32 +} + +type VgpuSchedulerParamsVgpuSchedData struct { + Timeslice uint32 +} + +const sizeofVgpuSchedulerParams = unsafe.Sizeof([8]byte{}) + +type VgpuSchedulerParams [sizeofVgpuSchedulerParams]byte + +type VgpuSchedulerLogEntry struct { + Timestamp uint64 + TimeRunTotal uint64 + TimeRun uint64 + SwRunlistId uint32 + TargetTimeSlice uint64 + CumulativePreemptionTime uint64 +} + +type VgpuSchedulerLog struct { + EngineId uint32 + SchedulerPolicy uint32 + IsEnabledARR uint32 + SchedulerParams [8]byte + EntriesCount uint32 + LogEntries [200]VgpuSchedulerLogEntry +} + +type VgpuSchedulerGetState struct { + SchedulerPolicy uint32 + IsEnabledARR uint32 + SchedulerParams [8]byte +} + +type VgpuSchedulerSetParamsVgpuSchedDataWithARR struct { + AvgFactor uint32 + Frequency uint32 +} + +type VgpuSchedulerSetParamsVgpuSchedData struct { + Timeslice uint32 +} + +const sizeofVgpuSchedulerSetParams = unsafe.Sizeof([8]byte{}) + +type VgpuSchedulerSetParams [sizeofVgpuSchedulerSetParams]byte + +type VgpuSchedulerSetState struct { + SchedulerPolicy uint32 + EnableARRMode uint32 + SchedulerParams [8]byte +} + +type VgpuSchedulerCapabilities struct { + SupportedSchedulers [3]uint32 + MaxTimeslice uint32 + MinTimeslice uint32 + IsArrModeSupported uint32 + MaxFrequencyForARR uint32 + MinFrequencyForARR uint32 + MaxAvgFactorForARR uint32 + MinAvgFactorForARR uint32 +} + type VgpuLicenseExpiry struct { Year uint32 Month uint16 @@ -210,8 +290,22 @@ type DeviceArchitecture uint32 type BusType uint32 +type FanControlPolicy uint32 + type PowerSource uint32 +type GpuDynamicPstatesInfoUtilization struct { + BIsPresent uint32 + Percentage uint32 + IncThreshold uint32 + DecThreshold uint32 +} + +type GpuDynamicPstatesInfo struct { + Flags uint32 + Utilization [8]GpuDynamicPstatesInfoUtilization +} + type FieldValue struct { FieldId uint32 ScopeId uint32 @@ -314,6 +408,16 @@ type FBCSessionInfo struct { AverageLatency uint32 } +type GpuFabricState byte + +type GpuFabricInfo struct { + ClusterUuid [16]int8 + Status uint32 + PartitionId uint32 + State uint8 + Pad_cgo_0 [3]byte +} + type AffinityScope uint32 type VgpuVersion struct { @@ -443,3 +547,37 @@ type ComputeInstanceInfo struct { type ComputeInstance struct { Handle *_Ctype_struct_nvmlComputeInstance_st } + +type GpmSample struct { + Handle *_Ctype_struct_nvmlGpmSample_st +} + +type GpmMetricMetricInfo struct { + ShortName *int8 + LongName *int8 + Unit *int8 +} + +type GpmMetric struct { + MetricId uint32 + NvmlReturn uint32 + Value float64 + MetricInfo GpmMetricMetricInfo +} + +type GpmMetricsGetType struct { + Version uint32 + NumMetrics uint32 + Sample1 GpmSample + Sample2 GpmSample + Metrics [98]GpmMetric +} + +type GpmSupport struct { + Version uint32 + IsSupportedDevice uint32 +} + +type NvLinkPowerThres struct { + LowPwrThreshold uint32 +} diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/vgpu.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/vgpu.go index a9f9f24c..bbb93e3d 100644 --- a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/vgpu.go +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/vgpu.go @@ -460,3 +460,21 @@ func VgpuInstanceGetMdevUUID(VgpuInstance VgpuInstance) (string, Return) { func (VgpuInstance VgpuInstance) GetMdevUUID() (string, Return) { return VgpuInstanceGetMdevUUID(VgpuInstance) } + +// nvml.VgpuTypeGetCapabilities() +func VgpuTypeGetCapabilities(VgpuTypeId VgpuTypeId, Capability VgpuCapability) (bool, Return) { + var CapResult uint32 + ret := 
nvmlVgpuTypeGetCapabilities(VgpuTypeId, Capability, &CapResult) + return (CapResult != 0), ret +} + +func (VgpuTypeId VgpuTypeId) GetCapabilities(Capability VgpuCapability) (bool, Return) { + return VgpuTypeGetCapabilities(VgpuTypeId, Capability) +} + +// nvml.GetVgpuDriverCapabilities() +func GetVgpuDriverCapabilities(Capability VgpuDriverCapability) (bool, Return) { + var CapResult uint32 + ret := nvmlGetVgpuDriverCapabilities(Capability, &CapResult) + return (CapResult != 0), ret +} diff --git a/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/info/info.go b/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/info/info.go index a901d547..460e852c 100644 --- a/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/info/info.go +++ b/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/info/info.go @@ -27,6 +27,7 @@ import ( // Interface provides the API to the info package type Interface interface { + HasDXCore() (bool, string) HasNvml() (bool, string) IsTegraSystem() (bool, string) } @@ -37,17 +38,26 @@ type infolib struct { var _ Interface = &infolib{} +// HasDXCore returns true if DXCore is detected on the system. +func (i *infolib) HasDXCore() (bool, string) { + const ( + libraryName = "libdxcore.so" + ) + if err := assertHasLibrary(libraryName); err != nil { + return false, fmt.Sprintf("could not load DXCore library: %v", err) + } + + return true, "found DXCore library" +} + // HasNvml returns true if NVML is detected on the system func (i *infolib) HasNvml() (bool, string) { const ( - nvmlLibraryName = "libnvidia-ml.so.1" - nvmlLibraryLoadFlags = dl.RTLD_LAZY + libraryName = "libnvidia-ml.so.1" ) - lib := dl.New(nvmlLibraryName, nvmlLibraryLoadFlags) - if err := lib.Open(); err != nil { - return false, fmt.Sprintf("could not load NVML: %v", err) + if err := assertHasLibrary(libraryName); err != nil { + return false, fmt.Sprintf("could not load NVML library: %v", err) } - defer lib.Close() return true, "found NVML library" } @@ -76,3 +86,17 @@ func (i *infolib) IsTegraSystem() (bool, string) { return false, fmt.Sprintf("%v has no 'tegra' prefix", tegraFamilyFile) } + +// assertHasLibrary returns an error if the specified library cannot be loaded +func assertHasLibrary(libraryName string) error { + const ( + libraryLoadFlags = dl.RTLD_LAZY + ) + lib := dl.New(libraryName, libraryLoadFlags) + if err := lib.Open(); err != nil { + return err + } + defer lib.Close() + + return nil +} diff --git a/vendor/modules.txt b/vendor/modules.txt index 3455ce80..ad7f95c3 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -2,7 +2,7 @@ ## explicit; go 1.16 github.com/BurntSushi/toml github.com/BurntSushi/toml/internal -# github.com/NVIDIA/go-nvml v0.11.6-0.0.20220823120812-7e2082095e82 +# github.com/NVIDIA/go-nvml v0.12.0-0 ## explicit; go 1.15 github.com/NVIDIA/go-nvml/pkg/dl github.com/NVIDIA/go-nvml/pkg/nvml @@ -62,7 +62,7 @@ github.com/syndtr/gocapability/capability github.com/urfave/cli/v2 # github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb ## explicit -# gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230119114711-6fe07bb33342 +# gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230209143738-95328d8c4438 ## explicit; go 1.16 gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/info
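
For context, the GPM declarations vendored above imply a two-sample workflow: allocate two sample buffers with nvmlGpmSampleAlloc(), capture a sample into each with nvmlGpmSampleGet() some interval apart, then compute metrics over that interval with nvmlGpmMetricsGet(). The following is a minimal illustrative C sketch of that flow, not an authoritative usage pattern; it assumes the core NVML entry points nvmlInit(), nvmlDeviceGetHandleByIndex() and nvmlShutdown(), a Hopper-or-newer GPU, and it omits most error handling and the nvmlGpmQueryDeviceSupport() check for brevity.

    #include <stdio.h>
    #include <unistd.h>
    #include <nvml.h>

    int main(void)
    {
        nvmlDevice_t device;
        nvmlGpmSample_t s1, s2;
        nvmlGpmMetricsGet_t mg = { 0 };

        nvmlInit();
        nvmlDeviceGetHandleByIndex(0, &device);

        /* GPM metrics are computed over the interval between two samples,
           so two sample buffers are required. */
        nvmlGpmSampleAlloc(&s1);
        nvmlGpmSampleAlloc(&s2);
        nvmlGpmSampleGet(device, s1);
        sleep(1);
        nvmlGpmSampleGet(device, s2);

        /* Request a single metric (SM utilization) for the sampled interval. */
        mg.version = NVML_GPM_METRICS_GET_VERSION;
        mg.numMetrics = 1;
        mg.sample1 = s1;
        mg.sample2 = s2;
        mg.metrics[0].metricId = NVML_GPM_METRIC_SM_UTIL;
        if (nvmlGpmMetricsGet(&mg) == NVML_SUCCESS &&
            mg.metrics[0].nvmlReturn == NVML_SUCCESS) {
            printf("SM utilization: %.1f%%\n", mg.metrics[0].value);
        }

        nvmlGpmSampleFree(s1);
        nvmlGpmSampleFree(s2);
        nvmlShutdown();
        return 0;
    }

The Go types added to types_gen.go in this patch (GpmSample, GpmMetric, GpmMetricsGetType, GpmSupport) are the generated counterparts of the C structures used in the sketch above.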