From 22672d2444e70964507f93b72df2c40bce169fa3 Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Sun, 17 Mar 2024 19:13:57 +0200 Subject: [PATCH] Improve GPU monitoring --- clearml_agent/definitions.py | 2 + clearml_agent/helper/gpu/gpustat.py | 58 +- clearml_agent/helper/gpu/pynvml.py | 2321 +++++++++++----------- clearml_agent/helper/resource_monitor.py | 195 +- 4 files changed, 1349 insertions(+), 1227 deletions(-) diff --git a/clearml_agent/definitions.py b/clearml_agent/definitions.py index 1d3dbfb..182abf7 100644 --- a/clearml_agent/definitions.py +++ b/clearml_agent/definitions.py @@ -248,6 +248,8 @@ ENV_TEMP_STDOUT_FILE_DIR = EnvironmentConfig("CLEARML_AGENT_TEMP_STDOUT_FILE_DIR ENV_GIT_CLONE_VERBOSE = EnvironmentConfig("CLEARML_AGENT_GIT_CLONE_VERBOSE", type=bool) +ENV_GPU_FRACTIONS = EnvironmentConfig("CLEARML_AGENT_GPU_FRACTIONS") + class FileBuffering(IntEnum): """ diff --git a/clearml_agent/helper/gpu/gpustat.py b/clearml_agent/helper/gpu/gpustat.py index 210fb61..55b254a 100644 --- a/clearml_agent/helper/gpu/gpustat.py +++ b/clearml_agent/helper/gpu/gpustat.py @@ -57,6 +57,21 @@ class GPUStat(object): """ return self.entry['uuid'] + @property + def mig_index(self): + """ + Returns the index of the MIG partition (as in nvidia-smi). + """ + return self.entry.get("mig_index") + + @property + def mig_uuid(self): + """ + Returns the uuid of the MIG partition returned by nvidia-smi when running in MIG mode, + e.g. 
MIG-12345678-abcd-abcd-uuid-123456abcdef + """ + return self.entry.get("mig_uuid") + @property def name(self): """ @@ -161,6 +176,7 @@ class GPUStatCollection(object): _initialized = False _device_count = None _gpu_device_info = {} + _mig_device_info = {} def __init__(self, gpu_list, driver_version=None, driver_cuda_version=None): self.gpus = gpu_list @@ -191,7 +207,7 @@ class GPUStatCollection(object): return b.decode() # for python3, to unicode return b - def get_gpu_info(index, handle): + def get_gpu_info(index, handle, is_mig=False): """Get one GPU information specified by nvml handle""" def get_process_info(nv_process): @@ -227,12 +243,14 @@ class GPUStatCollection(object): pass return process - if not GPUStatCollection._gpu_device_info.get(index): + device_info = GPUStatCollection._mig_device_info if is_mig else GPUStatCollection._gpu_device_info + + if not device_info.get(index): name = _decode(N.nvmlDeviceGetName(handle)) uuid = _decode(N.nvmlDeviceGetUUID(handle)) - GPUStatCollection._gpu_device_info[index] = (name, uuid) + device_info[index] = (name, uuid) - name, uuid = GPUStatCollection._gpu_device_info[index] + name, uuid = device_info[index] try: temperature = N.nvmlDeviceGetTemperature( @@ -328,8 +346,36 @@ class GPUStatCollection(object): for index in range(GPUStatCollection._device_count): handle = N.nvmlDeviceGetHandleByIndex(index) gpu_info = get_gpu_info(index, handle) - gpu_stat = GPUStat(gpu_info) - gpu_list.append(gpu_stat) + mig_cnt = 0 + # noinspection PyBroadException + try: + mig_cnt = N.nvmlDeviceGetMaxMigDeviceCount(handle) + except Exception: + pass + + if mig_cnt <= 0: + gpu_list.append(GPUStat(gpu_info)) + continue + + got_mig_info = False + for mig_index in range(mig_cnt): + try: + mig_handle = N.nvmlDeviceGetMigDeviceHandleByIndex(handle, mig_index) + mig_info = get_gpu_info(mig_index, mig_handle, is_mig=True) + mig_info["mig_name"] = mig_info["name"] + mig_info["name"] = gpu_info["name"] + mig_info["mig_index"] = mig_info["index"] 
+ mig_info["mig_uuid"] = mig_info["uuid"] + mig_info["index"] = gpu_info["index"] + mig_info["uuid"] = gpu_info["uuid"] + mig_info["temperature.gpu"] = gpu_info["temperature.gpu"] + mig_info["fan.speed"] = gpu_info["fan.speed"] + gpu_list.append(GPUStat(mig_info)) + got_mig_info = True + except Exception as e: + pass + if not got_mig_info: + gpu_list.append(GPUStat(gpu_info)) # 2. additional info (driver version, etc). if get_driver_info: diff --git a/clearml_agent/helper/gpu/pynvml.py b/clearml_agent/helper/gpu/pynvml.py index 25d8246..e70590f 100644 --- a/clearml_agent/helper/gpu/pynvml.py +++ b/clearml_agent/helper/gpu/pynvml.py @@ -1,5 +1,5 @@ ##### -# Copyright (c) 2011-2023, NVIDIA Corporation. All rights reserved. +# Copyright (c) 2011-2022, NVIDIA Corporation. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: @@ -25,14 +25,12 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF # THE POSSIBILITY OF SUCH DAMAGE. ##### -# flake8: noqa -# This is only to ignore F405 errors ## # Python bindings for the NVML library ## -from ctypes import * # noqa: F403 -from ctypes.util import find_library # noqa +from ctypes import * +from ctypes.util import find_library from functools import wraps import sys import os @@ -42,50 +40,51 @@ import string ## C Type mappings ## ## Enums _nvmlEnableState_t = c_uint -NVML_FEATURE_DISABLED = 0 -NVML_FEATURE_ENABLED = 1 +NVML_FEATURE_DISABLED = 0 +NVML_FEATURE_ENABLED = 1 _nvmlBrandType_t = c_uint -NVML_BRAND_UNKNOWN = 0 -NVML_BRAND_QUADRO = 1 -NVML_BRAND_TESLA = 2 -NVML_BRAND_NVS = 3 -NVML_BRAND_GRID = 4 # Deprecated from API reporting. Keeping definition for backward compatibility. 
-NVML_BRAND_GEFORCE = 5 -NVML_BRAND_TITAN = 6 -NVML_BRAND_NVIDIA_VAPPS = 7 # NVIDIA Virtual Applications -NVML_BRAND_NVIDIA_VPC = 8 # NVIDIA Virtual PC -NVML_BRAND_NVIDIA_VCS = 9 # NVIDIA Virtual Compute Server -NVML_BRAND_NVIDIA_VWS = 10 # NVIDIA RTX Virtual Workstation +NVML_BRAND_UNKNOWN = 0 +NVML_BRAND_QUADRO = 1 +NVML_BRAND_TESLA = 2 +NVML_BRAND_NVS = 3 +NVML_BRAND_GRID = 4 # Deprecated from API reporting. Keeping definition for backward compatibility. +NVML_BRAND_GEFORCE = 5 +NVML_BRAND_TITAN = 6 +NVML_BRAND_NVIDIA_VAPPS = 7 # NVIDIA Virtual Applications +NVML_BRAND_NVIDIA_VPC = 8 # NVIDIA Virtual PC +NVML_BRAND_NVIDIA_VCS = 9 # NVIDIA Virtual Compute Server +NVML_BRAND_NVIDIA_VWS = 10 # NVIDIA RTX Virtual Workstation NVML_BRAND_NVIDIA_CLOUD_GAMING = 11 # NVIDIA Cloud Gaming -NVML_BRAND_NVIDIA_VGAMING = NVML_BRAND_NVIDIA_CLOUD_GAMING # Deprecated from API reporting. Keeping definition for backward compatibility. -NVML_BRAND_QUADRO_RTX = 12 -NVML_BRAND_NVIDIA_RTX = 13 -NVML_BRAND_NVIDIA = 14 -NVML_BRAND_GEFORCE_RTX = 15 # Unused -NVML_BRAND_TITAN_RTX = 16 # Unused -NVML_BRAND_COUNT = 17 +NVML_BRAND_NVIDIA_VGAMING = NVML_BRAND_NVIDIA_CLOUD_GAMING # Deprecated from API reporting. Keeping definition for backward compatibility. 
+NVML_BRAND_QUADRO_RTX = 12 +NVML_BRAND_NVIDIA_RTX = 13 +NVML_BRAND_NVIDIA = 14 +NVML_BRAND_GEFORCE_RTX = 15 # Unused +NVML_BRAND_TITAN_RTX = 16 # Unused +NVML_BRAND_COUNT = 17 _nvmlTemperatureThresholds_t = c_uint -NVML_TEMPERATURE_THRESHOLD_SHUTDOWN = 0 -NVML_TEMPERATURE_THRESHOLD_SLOWDOWN = 1 -NVML_TEMPERATURE_THRESHOLD_MEM_MAX = 2 -NVML_TEMPERATURE_THRESHOLD_GPU_MAX = 3 -NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_MIN = 4 +NVML_TEMPERATURE_THRESHOLD_SHUTDOWN = 0 +NVML_TEMPERATURE_THRESHOLD_SLOWDOWN = 1 +NVML_TEMPERATURE_THRESHOLD_MEM_MAX = 2 +NVML_TEMPERATURE_THRESHOLD_GPU_MAX = 3 +NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_MIN = 4 NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_CURR = 5 -NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_MAX = 6 -NVML_TEMPERATURE_THRESHOLD_COUNT = 7 +NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_MAX = 6 +NVML_TEMPERATURE_THRESHOLD_COUNT = 7 _nvmlTemperatureSensors_t = c_uint -NVML_TEMPERATURE_GPU = 0 -NVML_TEMPERATURE_COUNT = 1 +NVML_TEMPERATURE_GPU = 0 +NVML_TEMPERATURE_COUNT = 1 + _nvmlComputeMode_t = c_uint -NVML_COMPUTEMODE_DEFAULT = 0 -NVML_COMPUTEMODE_EXCLUSIVE_THREAD = 1 ## Support Removed -NVML_COMPUTEMODE_PROHIBITED = 2 +NVML_COMPUTEMODE_DEFAULT = 0 +NVML_COMPUTEMODE_EXCLUSIVE_THREAD = 1 ## Support Removed +NVML_COMPUTEMODE_PROHIBITED = 2 NVML_COMPUTEMODE_EXCLUSIVE_PROCESS = 3 -NVML_COMPUTEMODE_COUNT = 4 +NVML_COMPUTEMODE_COUNT = 4 _nvmlMemoryLocation_t = c_uint NVML_MEMORY_LOCATION_L1_CACHE = 0 @@ -122,154 +121,155 @@ NVML_NVLINK_ERROR_DL_ECC_COUNT = 5 _nvmlNvLinkCapability_t = c_uint NVML_NVLINK_CAP_P2P_SUPPORTED = 0 NVML_NVLINK_CAP_SYSMEM_ACCESS = 1 -NVML_NVLINK_CAP_P2P_ATOMICS = 2 -NVML_NVLINK_CAP_SYSMEM_ATOMICS = 3 -NVML_NVLINK_CAP_SLI_BRIDGE = 4 -NVML_NVLINK_CAP_VALID = 5 -NVML_NVLINK_CAP_COUNT = 6 +NVML_NVLINK_CAP_P2P_ATOMICS = 2 +NVML_NVLINK_CAP_SYSMEM_ATOMICS= 3 +NVML_NVLINK_CAP_SLI_BRIDGE = 4 +NVML_NVLINK_CAP_VALID = 5 +NVML_NVLINK_CAP_COUNT = 6 _nvmlNvLinkUtilizationCountPktTypes_t = c_uint -NVML_NVLINK_COUNTER_PKTFILTER_NOP = 0x1 
-NVML_NVLINK_COUNTER_PKTFILTER_READ = 0x2 -NVML_NVLINK_COUNTER_PKTFILTER_WRITE = 0x4 -NVML_NVLINK_COUNTER_PKTFILTER_RATOM = 0x8 -NVML_NVLINK_COUNTER_PKTFILTER_NRATOM = 0x10 -NVML_NVLINK_COUNTER_PKTFILTER_FLUSH = 0x20 -NVML_NVLINK_COUNTER_PKTFILTER_RESPDATA = 0x40 +NVML_NVLINK_COUNTER_PKTFILTER_NOP = 0x1 +NVML_NVLINK_COUNTER_PKTFILTER_READ = 0x2 +NVML_NVLINK_COUNTER_PKTFILTER_WRITE = 0x4 +NVML_NVLINK_COUNTER_PKTFILTER_RATOM = 0x8 +NVML_NVLINK_COUNTER_PKTFILTER_NRATOM = 0x10 +NVML_NVLINK_COUNTER_PKTFILTER_FLUSH = 0x20 +NVML_NVLINK_COUNTER_PKTFILTER_RESPDATA = 0x40 NVML_NVLINK_COUNTER_PKTFILTER_RESPNODATA = 0x80 -NVML_NVLINK_COUNTER_PKTFILTER_ALL = 0xFF +NVML_NVLINK_COUNTER_PKTFILTER_ALL = 0xFF _nvmlNvLinkUtilizationCountUnits_t = c_uint -NVML_NVLINK_COUNTER_UNIT_CYCLES = 0 -NVML_NVLINK_COUNTER_UNIT_PACKETS = 1 -NVML_NVLINK_COUNTER_UNIT_BYTES = 2 +NVML_NVLINK_COUNTER_UNIT_CYCLES = 0 +NVML_NVLINK_COUNTER_UNIT_PACKETS = 1 +NVML_NVLINK_COUNTER_UNIT_BYTES = 2 NVML_NVLINK_COUNTER_UNIT_RESERVED = 3 -NVML_NVLINK_COUNTER_UNIT_COUNT = 4 +NVML_NVLINK_COUNTER_UNIT_COUNT = 4 _nvmlNvLinkDeviceType_t = c_uint -NVML_NVLINK_DEVICE_TYPE_GPU = 0x00 -NVML_NVLINK_DEVICE_TYPE_IBMNPU = 0x01 -NVML_NVLINK_DEVICE_TYPE_SWITCH = 0x02 +NVML_NVLINK_DEVICE_TYPE_GPU = 0x00 +NVML_NVLINK_DEVICE_TYPE_IBMNPU = 0x01 +NVML_NVLINK_DEVICE_TYPE_SWITCH = 0x02 NVML_NVLINK_DEVICE_TYPE_UNKNOWN = 0xFF # These are deprecated, instead use _nvmlMemoryErrorType_t _nvmlEccBitType_t = c_uint -NVML_SINGLE_BIT_ECC = 0 -NVML_DOUBLE_BIT_ECC = 1 +NVML_SINGLE_BIT_ECC = 0 +NVML_DOUBLE_BIT_ECC = 1 NVML_ECC_ERROR_TYPE_COUNT = 2 _nvmlEccCounterType_t = c_uint -NVML_VOLATILE_ECC = 0 -NVML_AGGREGATE_ECC = 1 +NVML_VOLATILE_ECC = 0 +NVML_AGGREGATE_ECC = 1 NVML_ECC_COUNTER_TYPE_COUNT = 2 _nvmlMemoryErrorType_t = c_uint -NVML_MEMORY_ERROR_TYPE_CORRECTED = 0 +NVML_MEMORY_ERROR_TYPE_CORRECTED = 0 NVML_MEMORY_ERROR_TYPE_UNCORRECTED = 1 -NVML_MEMORY_ERROR_TYPE_COUNT = 2 +NVML_MEMORY_ERROR_TYPE_COUNT = 2 _nvmlClockType_t = c_uint 
-NVML_CLOCK_GRAPHICS = 0 -NVML_CLOCK_SM = 1 -NVML_CLOCK_MEM = 2 -NVML_CLOCK_VIDEO = 3 -NVML_CLOCK_COUNT = 4 +NVML_CLOCK_GRAPHICS = 0 +NVML_CLOCK_SM = 1 +NVML_CLOCK_MEM = 2 +NVML_CLOCK_VIDEO = 3 +NVML_CLOCK_COUNT = 4 _nvmlClockId_t = c_uint -NVML_CLOCK_ID_CURRENT = 0 -NVML_CLOCK_ID_APP_CLOCK_TARGET = 1 -NVML_CLOCK_ID_APP_CLOCK_DEFAULT = 2 +NVML_CLOCK_ID_CURRENT = 0 +NVML_CLOCK_ID_APP_CLOCK_TARGET = 1 +NVML_CLOCK_ID_APP_CLOCK_DEFAULT = 2 NVML_CLOCK_ID_CUSTOMER_BOOST_MAX = 3 -NVML_CLOCK_ID_COUNT = 4 +NVML_CLOCK_ID_COUNT = 4 _nvmlDriverModel_t = c_uint -NVML_DRIVER_WDDM = 0 -NVML_DRIVER_WDM = 1 -NVML_DRIVER_MCDM = 2 +NVML_DRIVER_WDDM = 0 +NVML_DRIVER_WDM = 1 +NVML_DRIVER_MCDM = 2 NVML_MAX_GPU_PERF_PSTATES = 16 _nvmlPstates_t = c_uint -NVML_PSTATE_0 = 0 -NVML_PSTATE_1 = 1 -NVML_PSTATE_2 = 2 -NVML_PSTATE_3 = 3 -NVML_PSTATE_4 = 4 -NVML_PSTATE_5 = 5 -NVML_PSTATE_6 = 6 -NVML_PSTATE_7 = 7 -NVML_PSTATE_8 = 8 -NVML_PSTATE_9 = 9 -NVML_PSTATE_10 = 10 -NVML_PSTATE_11 = 11 -NVML_PSTATE_12 = 12 -NVML_PSTATE_13 = 13 -NVML_PSTATE_14 = 14 -NVML_PSTATE_15 = 15 -NVML_PSTATE_UNKNOWN = 32 +NVML_PSTATE_0 = 0 +NVML_PSTATE_1 = 1 +NVML_PSTATE_2 = 2 +NVML_PSTATE_3 = 3 +NVML_PSTATE_4 = 4 +NVML_PSTATE_5 = 5 +NVML_PSTATE_6 = 6 +NVML_PSTATE_7 = 7 +NVML_PSTATE_8 = 8 +NVML_PSTATE_9 = 9 +NVML_PSTATE_10 = 10 +NVML_PSTATE_11 = 11 +NVML_PSTATE_12 = 12 +NVML_PSTATE_13 = 13 +NVML_PSTATE_14 = 14 +NVML_PSTATE_15 = 15 +NVML_PSTATE_UNKNOWN = 32 _nvmlInforomObject_t = c_uint -NVML_INFOROM_OEM = 0 -NVML_INFOROM_ECC = 1 -NVML_INFOROM_POWER = 2 -NVML_INFOROM_COUNT = 3 +NVML_INFOROM_OEM = 0 +NVML_INFOROM_ECC = 1 +NVML_INFOROM_POWER = 2 +NVML_INFOROM_COUNT = 3 _nvmlReturn_t = c_uint -NVML_SUCCESS = 0 -NVML_ERROR_UNINITIALIZED = 1 -NVML_ERROR_INVALID_ARGUMENT = 2 -NVML_ERROR_NOT_SUPPORTED = 3 -NVML_ERROR_NO_PERMISSION = 4 -NVML_ERROR_ALREADY_INITIALIZED = 5 -NVML_ERROR_NOT_FOUND = 6 -NVML_ERROR_INSUFFICIENT_SIZE = 7 -NVML_ERROR_INSUFFICIENT_POWER = 8 -NVML_ERROR_DRIVER_NOT_LOADED = 9 -NVML_ERROR_TIMEOUT = 10 
-NVML_ERROR_IRQ_ISSUE = 11 -NVML_ERROR_LIBRARY_NOT_FOUND = 12 -NVML_ERROR_FUNCTION_NOT_FOUND = 13 -NVML_ERROR_CORRUPTED_INFOROM = 14 -NVML_ERROR_GPU_IS_LOST = 15 -NVML_ERROR_RESET_REQUIRED = 16 -NVML_ERROR_OPERATING_SYSTEM = 17 -NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18 -NVML_ERROR_IN_USE = 19 -NVML_ERROR_MEMORY = 20 -NVML_ERROR_NO_DATA = 21 -NVML_ERROR_VGPU_ECC_NOT_SUPPORTED = 22 -NVML_ERROR_INSUFFICIENT_RESOURCES = 23 -NVML_ERROR_FREQ_NOT_SUPPORTED = 24 +NVML_SUCCESS = 0 +NVML_ERROR_UNINITIALIZED = 1 +NVML_ERROR_INVALID_ARGUMENT = 2 +NVML_ERROR_NOT_SUPPORTED = 3 +NVML_ERROR_NO_PERMISSION = 4 +NVML_ERROR_ALREADY_INITIALIZED = 5 +NVML_ERROR_NOT_FOUND = 6 +NVML_ERROR_INSUFFICIENT_SIZE = 7 +NVML_ERROR_INSUFFICIENT_POWER = 8 +NVML_ERROR_DRIVER_NOT_LOADED = 9 +NVML_ERROR_TIMEOUT = 10 +NVML_ERROR_IRQ_ISSUE = 11 +NVML_ERROR_LIBRARY_NOT_FOUND = 12 +NVML_ERROR_FUNCTION_NOT_FOUND = 13 +NVML_ERROR_CORRUPTED_INFOROM = 14 +NVML_ERROR_GPU_IS_LOST = 15 +NVML_ERROR_RESET_REQUIRED = 16 +NVML_ERROR_OPERATING_SYSTEM = 17 +NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18 +NVML_ERROR_IN_USE = 19 +NVML_ERROR_MEMORY = 20 +NVML_ERROR_NO_DATA = 21 +NVML_ERROR_VGPU_ECC_NOT_SUPPORTED = 22 +NVML_ERROR_INSUFFICIENT_RESOURCES = 23 +NVML_ERROR_FREQ_NOT_SUPPORTED = 24 NVML_ERROR_ARGUMENT_VERSION_MISMATCH = 25 -NVML_ERROR_DEPRECATED = 26 -NVML_ERROR_UNKNOWN = 999 +NVML_ERROR_DEPRECATED = 26 +NVML_ERROR_NOT_READY = 27 +NVML_ERROR_UNKNOWN = 999 _nvmlFanState_t = c_uint -NVML_FAN_NORMAL = 0 -NVML_FAN_FAILED = 1 +NVML_FAN_NORMAL = 0 +NVML_FAN_FAILED = 1 _nvmlFanControlPolicy_t = c_uint NVML_FAN_POLICY_TEMPERATURE_CONTINOUS_SW = 0 -NVML_FAN_POLICY_MANUAL = 1 +NVML_FAN_POLICY_MANUAL = 1 _nvmlLedColor_t = c_uint -NVML_LED_COLOR_GREEN = 0 -NVML_LED_COLOR_AMBER = 1 +NVML_LED_COLOR_GREEN = 0 +NVML_LED_COLOR_AMBER = 1 _nvmlGpuOperationMode_t = c_uint -NVML_GOM_ALL_ON = 0 -NVML_GOM_COMPUTE = 1 -NVML_GOM_LOW_DP = 2 +NVML_GOM_ALL_ON = 0 +NVML_GOM_COMPUTE = 1 +NVML_GOM_LOW_DP = 2 _nvmlPageRetirementCause_t = c_uint 
NVML_PAGE_RETIREMENT_CAUSE_MULTIPLE_SINGLE_BIT_ECC_ERRORS = 0 -NVML_PAGE_RETIREMENT_CAUSE_DOUBLE_BIT_ECC_ERROR = 1 -NVML_PAGE_RETIREMENT_CAUSE_COUNT = 2 +NVML_PAGE_RETIREMENT_CAUSE_DOUBLE_BIT_ECC_ERROR = 1 +NVML_PAGE_RETIREMENT_CAUSE_COUNT = 2 _nvmlRestrictedAPI_t = c_uint -NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS = 0 -NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS = 1 -NVML_RESTRICTED_API_COUNT = 2 +NVML_RESTRICTED_API_SET_APPLICATION_CLOCKS = 0 +NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS = 1 +NVML_RESTRICTED_API_COUNT = 2 _nvmlBridgeChipType_t = c_uint NVML_BRIDGE_CHIP_PLX = 0 @@ -282,7 +282,8 @@ NVML_VALUE_TYPE_UNSIGNED_INT = 1 NVML_VALUE_TYPE_UNSIGNED_LONG = 2 NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3 NVML_VALUE_TYPE_SIGNED_LONG_LONG = 4 -NVML_VALUE_TYPE_COUNT = 5 +NVML_VALUE_TYPE_SIGNED_INT = 5 +NVML_VALUE_TYPE_COUNT = 6 _nvmlPerfPolicyType_t = c_uint NVML_PERF_POLICY_POWER = 0 @@ -298,6 +299,8 @@ NVML_PERF_POLICY_COUNT = 12 _nvmlEncoderQueryType_t = c_uint NVML_ENCODER_QUERY_H264 = 0 NVML_ENCODER_QUERY_HEVC = 1 +NVML_ENCODER_QUERY_AV1 = 2 +NVML_ENCODER_QUERY_UNKNOWN = 255 _nvmlFBCSessionType_t = c_uint NVML_FBC_SESSION_TYPE_UNKNOWN = 0 @@ -322,7 +325,8 @@ NVML_ENC_UTILIZATION_SAMPLES = 3 NVML_DEC_UTILIZATION_SAMPLES = 4 NVML_PROCESSOR_CLK_SAMPLES = 5 NVML_MEMORY_CLK_SAMPLES = 6 -NVML_SAMPLINGTYPE_COUNT = 7 +NVML_MODULE_POWER_SAMPLES = 7 +NVML_SAMPLINGTYPE_COUNT = 8 _nvmlPcieUtilCounter_t = c_uint NVML_PCIE_UTIL_TX_BYTES = 0 @@ -339,45 +343,47 @@ NVML_TOPOLOGY_CPU = NVML_TOPOLOGY_NODE NVML_TOPOLOGY_SYSTEM = 50 _nvmlGpuP2PCapsIndex_t = c_uint -NVML_P2P_CAPS_INDEX_READ = 0 +NVML_P2P_CAPS_INDEX_READ = 0 NVML_P2P_CAPS_INDEX_WRITE = 1 -NVML_P2P_CAPS_INDEX_NVLINK = 2 +NVML_P2P_CAPS_INDEX_NVLINK =2 NVML_P2P_CAPS_INDEX_ATOMICS = 3 NVML_P2P_CAPS_INDEX_PROP = 4 NVML_P2P_CAPS_INDEX_LOOPBACK = 5 NVML_P2P_CAPS_INDEX_UNKNOWN = 6 _nvmlGpuP2PStatus_t = c_uint -NVML_P2P_STATUS_OK = 0 +NVML_P2P_STATUS_OK = 0 NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED = 1 
+NVML_P2P_STATUS_CHIPSET_NOT_SUPPORTED = NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED NVML_P2P_STATUS_GPU_NOT_SUPPORTED = 2 -NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED = 3 -NVML_P2P_STATUS_DISABLED_BY_REGKEY = 4 -NVML_P2P_STATUS_NOT_SUPPORTED = 5 -NVML_P2P_STATUS_UNKNOWN = 6 +NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED =3 +NVML_P2P_STATUS_DISABLED_BY_REGKEY =4 +NVML_P2P_STATUS_NOT_SUPPORTED =5 +NVML_P2P_STATUS_UNKNOWN =6 _nvmlDeviceArchitecture_t = c_uint -NVML_DEVICE_ARCH_KEPLER = 2 -NVML_DEVICE_ARCH_MAXWELL = 3 -NVML_DEVICE_ARCH_PASCAL = 4 -NVML_DEVICE_ARCH_VOLTA = 5 -NVML_DEVICE_ARCH_TURING = 6 -NVML_DEVICE_ARCH_AMPERE = 7 -NVML_DEVICE_ARCH_ADA = 8 -NVML_DEVICE_ARCH_HOPPER = 9 -NVML_DEVICE_ARCH_UNKNOWN = 0xffffffff +NVML_DEVICE_ARCH_KEPLER = 2 +NVML_DEVICE_ARCH_MAXWELL = 3 +NVML_DEVICE_ARCH_PASCAL = 4 +NVML_DEVICE_ARCH_VOLTA = 5 +NVML_DEVICE_ARCH_TURING = 6 +NVML_DEVICE_ARCH_AMPERE = 7 +NVML_DEVICE_ARCH_ADA = 8 +NVML_DEVICE_ARCH_HOPPER = 9 +NVML_DEVICE_ARCH_UNKNOWN = 0xffffffff # PCI bus Types _nvmlBusType_t = c_uint NVML_BUS_TYPE_UNKNOWN = 0 -NVML_BUS_TYPE_PCI = 1 -NVML_BUS_TYPE_PCIE = 2 -NVML_BUS_TYPE_FPCI = 3 -NVML_BUS_TYPE_AGP = 4 +NVML_BUS_TYPE_PCI = 1 +NVML_BUS_TYPE_PCIE = 2 +NVML_BUS_TYPE_FPCI = 3 +NVML_BUS_TYPE_AGP = 4 _nvmlPowerSource_t = c_uint -NVML_POWER_SOURCE_AC = 0x00000000 -NVML_POWER_SOURCE_BATTERY = 0x00000001 +NVML_POWER_SOURCE_AC = 0x00000000 +NVML_POWER_SOURCE_BATTERY = 0x00000001 +NVML_POWER_SOURCE_UNDERSIZED = 0x00000002 _nvmlAdaptiveClockInfoStatus_t = c_uint NVML_ADAPTIVE_CLOCKING_INFO_STATUS_DISABLED = 0x00000000 @@ -385,53 +391,54 @@ NVML_ADAPTIVE_CLOCKING_INFO_STATUS_ENABLED = 0x00000001 _nvmlClockLimitId_t = c_uint NVML_CLOCK_LIMIT_ID_RANGE_START = 0xffffff00 -NVML_CLOCK_LIMIT_ID_TDP = 0xffffff01 -NVML_CLOCK_LIMIT_ID_UNLIMITED = 0xffffff02 +NVML_CLOCK_LIMIT_ID_TDP = 0xffffff01 +NVML_CLOCK_LIMIT_ID_UNLIMITED = 0xffffff02 _nvmlPcieLinkMaxSpeed_t = c_uint -NVML_PCIE_LINK_MAX_SPEED_INVALID = 0x00000000 -NVML_PCIE_LINK_MAX_SPEED_2500MBPS = 
0x00000001 -NVML_PCIE_LINK_MAX_SPEED_5000MBPS = 0x00000002 -NVML_PCIE_LINK_MAX_SPEED_8000MBPS = 0x00000003 +NVML_PCIE_LINK_MAX_SPEED_INVALID = 0x00000000 +NVML_PCIE_LINK_MAX_SPEED_2500MBPS = 0x00000001 +NVML_PCIE_LINK_MAX_SPEED_5000MBPS = 0x00000002 +NVML_PCIE_LINK_MAX_SPEED_8000MBPS = 0x00000003 NVML_PCIE_LINK_MAX_SPEED_16000MBPS = 0x00000004 NVML_PCIE_LINK_MAX_SPEED_32000MBPS = 0x00000005 NVML_PCIE_LINK_MAX_SPEED_64000MBPS = 0x00000006 _nvmlAffinityScope_t = c_uint -NVML_AFFINITY_SCOPE_NODE = 0 +NVML_AFFINITY_SCOPE_NODE = 0 NVML_AFFINITY_SCOPE_SOCKET = 1 # C preprocessor defined values -nvmlFlagDefault = 0 -nvmlFlagForce = 1 -NVML_INIT_FLAG_NO_GPUS = 1 -NVML_INIT_FLAG_NO_ATTACH = 2 +nvmlFlagDefault = 0 +nvmlFlagForce = 1 +NVML_INIT_FLAG_NO_GPUS = 1 +NVML_INIT_FLAG_NO_ATTACH = 2 -NVML_MAX_GPC_COUNT = 32 +NVML_MAX_GPC_COUNT = 32 # buffer size -NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE = 16 -NVML_DEVICE_UUID_BUFFER_SIZE = 80 -NVML_DEVICE_UUID_V2_BUFFER_SIZE = 96 -NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE = 80 -NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE = 80 -NVML_DEVICE_NAME_BUFFER_SIZE = 64 -NVML_DEVICE_NAME_V2_BUFFER_SIZE = 96 -NVML_DEVICE_SERIAL_BUFFER_SIZE = 30 -NVML_DEVICE_PART_NUMBER_BUFFER_SIZE = 80 -NVML_DEVICE_GPU_PART_NUMBER_BUFFER_SIZE = 80 -NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE = 32 -NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE = 32 -NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE = 16 -NVML_GRID_LICENSE_BUFFER_SIZE = 128 -NVML_VGPU_NAME_BUFFER_SIZE = 64 -NVML_GRID_LICENSE_FEATURE_MAX_COUNT = 3 -NVML_VGPU_METADATA_OPAQUE_DATA_SIZE = sizeof(c_uint) + 256 -NVML_VGPU_PGPU_METADATA_OPAQUE_DATA_SIZE = 256 +NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE = 16 +NVML_DEVICE_UUID_BUFFER_SIZE = 80 +NVML_DEVICE_UUID_V2_BUFFER_SIZE = 96 +NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE = 80 +NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE = 80 +NVML_DEVICE_NAME_BUFFER_SIZE = 64 +NVML_DEVICE_NAME_V2_BUFFER_SIZE = 96 +NVML_DEVICE_SERIAL_BUFFER_SIZE = 30 +NVML_DEVICE_PART_NUMBER_BUFFER_SIZE = 80 
+NVML_DEVICE_GPU_PART_NUMBER_BUFFER_SIZE = 80 +NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE = 32 +NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE = 32 +NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE = 16 +NVML_GRID_LICENSE_BUFFER_SIZE = 128 +NVML_VGPU_NAME_BUFFER_SIZE = 64 +NVML_GRID_LICENSE_FEATURE_MAX_COUNT = 3 +NVML_VGPU_METADATA_OPAQUE_DATA_SIZE = sizeof(c_uint) + 256 +NVML_VGPU_PGPU_METADATA_OPAQUE_DATA_SIZE = 256 +NVML_DEVICE_GPU_FRU_PART_NUMBER_BUFFER_SIZE = 0x14 # NV2080_GPU_MAX_PRODUCT_PART_NUMBER_LENGTH # Format strings -NVML_DEVICE_PCI_BUS_ID_LEGACY_FMT = "%04X:%02X:%02X.0" -NVML_DEVICE_PCI_BUS_ID_FMT = "%08X:%02X:%02X.0" +NVML_DEVICE_PCI_BUS_ID_LEGACY_FMT = "%04X:%02X:%02X.0" +NVML_DEVICE_PCI_BUS_ID_FMT = "%08X:%02X:%02X.0" NVML_VALUE_NOT_AVAILABLE_ulonglong = c_ulonglong(-1) NVML_VALUE_NOT_AVAILABLE_uint = c_uint(-1) @@ -441,120 +448,120 @@ NVML_VALUE_NOT_AVAILABLE_uint = c_uint(-1) All Identifiers pertain to a device. Each ID is only used once and is guaranteed never to change. ''' -NVML_FI_DEV_ECC_CURRENT = 1 # Current ECC mode. 1=Active. 0=Inactive -NVML_FI_DEV_ECC_PENDING = 2 # Pending ECC mode. 1=Active. 0=Inactive +NVML_FI_DEV_ECC_CURRENT = 1 # Current ECC mode. 1=Active. 0=Inactive +NVML_FI_DEV_ECC_PENDING = 2 # Pending ECC mode. 1=Active. 
0=Inactive -# ECC Count Totals -NVML_FI_DEV_ECC_SBE_VOL_TOTAL = 3 # Total single bit volatile ECC errors -NVML_FI_DEV_ECC_DBE_VOL_TOTAL = 4 # Total double bit volatile ECC errors -NVML_FI_DEV_ECC_SBE_AGG_TOTAL = 5 # Total single bit aggregate (persistent) ECC errors -NVML_FI_DEV_ECC_DBE_AGG_TOTAL = 6 # Total double bit aggregate (persistent) ECC errors -# Individual ECC locations -NVML_FI_DEV_ECC_SBE_VOL_L1 = 7 # L1 cache single bit volatile ECC errors -NVML_FI_DEV_ECC_DBE_VOL_L1 = 8 # L1 cache double bit volatile ECC errors -NVML_FI_DEV_ECC_SBE_VOL_L2 = 9 # L2 cache single bit volatile ECC errors -NVML_FI_DEV_ECC_DBE_VOL_L2 = 10 # L2 cache double bit volatile ECC errors -NVML_FI_DEV_ECC_SBE_VOL_DEV = 11 # Device memory single bit volatile ECC errors -NVML_FI_DEV_ECC_DBE_VOL_DEV = 12 # Device memory double bit volatile ECC errors -NVML_FI_DEV_ECC_SBE_VOL_REG = 13 # Register file single bit volatile ECC errors -NVML_FI_DEV_ECC_DBE_VOL_REG = 14 # Register file double bit volatile ECC errors -NVML_FI_DEV_ECC_SBE_VOL_TEX = 15 # Texture memory single bit volatile ECC errors -NVML_FI_DEV_ECC_DBE_VOL_TEX = 16 # Texture memory double bit volatile ECC errors -NVML_FI_DEV_ECC_DBE_VOL_CBU = 17 # CBU double bit volatile ECC errors -NVML_FI_DEV_ECC_SBE_AGG_L1 = 18 # L1 cache single bit aggregate (persistent) ECC errors -NVML_FI_DEV_ECC_DBE_AGG_L1 = 19 # L1 cache double bit aggregate (persistent) ECC errors -NVML_FI_DEV_ECC_SBE_AGG_L2 = 20 # L2 cache single bit aggregate (persistent) ECC errors -NVML_FI_DEV_ECC_DBE_AGG_L2 = 21 # L2 cache double bit aggregate (persistent) ECC errors -NVML_FI_DEV_ECC_SBE_AGG_DEV = 22 # Device memory single bit aggregate (persistent) ECC errors -NVML_FI_DEV_ECC_DBE_AGG_DEV = 23 # Device memory double bit aggregate (persistent) ECC errors -NVML_FI_DEV_ECC_SBE_AGG_REG = 24 # Register File single bit aggregate (persistent) ECC errors -NVML_FI_DEV_ECC_DBE_AGG_REG = 25 # Register File double bit aggregate (persistent) ECC errors 
-NVML_FI_DEV_ECC_SBE_AGG_TEX = 26 # Texture memory single bit aggregate (persistent) ECC errors -NVML_FI_DEV_ECC_DBE_AGG_TEX = 27 # Texture memory double bit aggregate (persistent) ECC errors -NVML_FI_DEV_ECC_DBE_AGG_CBU = 28 # CBU double bit aggregate ECC errors +#ECC Count Totals +NVML_FI_DEV_ECC_SBE_VOL_TOTAL = 3 # Total single bit volatile ECC errors +NVML_FI_DEV_ECC_DBE_VOL_TOTAL = 4 # Total double bit volatile ECC errors +NVML_FI_DEV_ECC_SBE_AGG_TOTAL = 5 # Total single bit aggregate (persistent) ECC errors +NVML_FI_DEV_ECC_DBE_AGG_TOTAL = 6 # Total double bit aggregate (persistent) ECC errors +#Individual ECC locations +NVML_FI_DEV_ECC_SBE_VOL_L1 = 7 # L1 cache single bit volatile ECC errors +NVML_FI_DEV_ECC_DBE_VOL_L1 = 8 # L1 cache double bit volatile ECC errors +NVML_FI_DEV_ECC_SBE_VOL_L2 = 9 # L2 cache single bit volatile ECC errors +NVML_FI_DEV_ECC_DBE_VOL_L2 = 10 # L2 cache double bit volatile ECC errors +NVML_FI_DEV_ECC_SBE_VOL_DEV = 11 # Device memory single bit volatile ECC errors +NVML_FI_DEV_ECC_DBE_VOL_DEV = 12 # Device memory double bit volatile ECC errors +NVML_FI_DEV_ECC_SBE_VOL_REG = 13 # Register file single bit volatile ECC errors +NVML_FI_DEV_ECC_DBE_VOL_REG = 14 # Register file double bit volatile ECC errors +NVML_FI_DEV_ECC_SBE_VOL_TEX = 15 # Texture memory single bit volatile ECC errors +NVML_FI_DEV_ECC_DBE_VOL_TEX = 16 # Texture memory double bit volatile ECC errors +NVML_FI_DEV_ECC_DBE_VOL_CBU = 17 # CBU double bit volatile ECC errors +NVML_FI_DEV_ECC_SBE_AGG_L1 = 18 # L1 cache single bit aggregate (persistent) ECC errors +NVML_FI_DEV_ECC_DBE_AGG_L1 = 19 # L1 cache double bit aggregate (persistent) ECC errors +NVML_FI_DEV_ECC_SBE_AGG_L2 = 20 # L2 cache single bit aggregate (persistent) ECC errors +NVML_FI_DEV_ECC_DBE_AGG_L2 = 21 # L2 cache double bit aggregate (persistent) ECC errors +NVML_FI_DEV_ECC_SBE_AGG_DEV = 22 # Device memory single bit aggregate (persistent) ECC errors +NVML_FI_DEV_ECC_DBE_AGG_DEV = 23 # Device memory double 
bit aggregate (persistent) ECC errors +NVML_FI_DEV_ECC_SBE_AGG_REG = 24 # Register File single bit aggregate (persistent) ECC errors +NVML_FI_DEV_ECC_DBE_AGG_REG = 25 # Register File double bit aggregate (persistent) ECC errors +NVML_FI_DEV_ECC_SBE_AGG_TEX = 26 # Texture memory single bit aggregate (persistent) ECC errors +NVML_FI_DEV_ECC_DBE_AGG_TEX = 27 # Texture memory double bit aggregate (persistent) ECC errors +NVML_FI_DEV_ECC_DBE_AGG_CBU = 28 # CBU double bit aggregate ECC errors # Page Retirement -NVML_FI_DEV_RETIRED_SBE = 29 # Number of retired pages because of single bit errors -NVML_FI_DEV_RETIRED_DBE = 30 # Number of retired pages because of double bit errors -NVML_FI_DEV_RETIRED_PENDING = 31 # If any pages are pending retirement. 1=yes. 0=no. +NVML_FI_DEV_RETIRED_SBE = 29 # Number of retired pages because of single bit errors +NVML_FI_DEV_RETIRED_DBE = 30 # Number of retired pages because of double bit errors +NVML_FI_DEV_RETIRED_PENDING = 31 # If any pages are pending retirement. 1=yes. 0=no. 
# NvLink Flit Error Counters -NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 = 32 # NVLink flow control CRC Error Counter for Lane 0 -NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 = 33 # NVLink flow control CRC Error Counter for Lane 1 -NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 = 34 # NVLink flow control CRC Error Counter for Lane 2 -NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 = 35 # NVLink flow control CRC Error Counter for Lane 3 -NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 = 36 # NVLink flow control CRC Error Counter for Lane 4 -NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 = 37 # NVLink flow control CRC Error Counter for Lane 5 -NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL = 38 # NVLink flow control CRC Error Counter total for all Lanes +NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 = 32 # NVLink flow control CRC Error Counter for Lane 0 +NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 = 33 # NVLink flow control CRC Error Counter for Lane 1 +NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 = 34 # NVLink flow control CRC Error Counter for Lane 2 +NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 = 35 # NVLink flow control CRC Error Counter for Lane 3 +NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 = 36 # NVLink flow control CRC Error Counter for Lane 4 +NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 = 37 # NVLink flow control CRC Error Counter for Lane 5 +NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL = 38 # NVLink flow control CRC Error Counter total for all Lanes # NvLink CRC Data Error Counters -NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 = 39 # NVLink data CRC Error Counter for Lane 0 -NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 = 40 # NVLink data CRC Error Counter for Lane 1 -NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 = 41 # NVLink data CRC Error Counter for Lane 2 -NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 = 42 # NVLink data CRC Error Counter for Lane 3 -NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 = 43 # NVLink data CRC Error Counter for Lane 4 -NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 = 44 # 
NVLink data CRC Error Counter for Lane 5 -NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL = 45 # NvLink data CRC Error Counter total for all Lanes +NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 = 39 # NVLink data CRC Error Counter for Lane 0 +NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 = 40 # NVLink data CRC Error Counter for Lane 1 +NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 = 41 # NVLink data CRC Error Counter for Lane 2 +NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 = 42 # NVLink data CRC Error Counter for Lane 3 +NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 = 43 # NVLink data CRC Error Counter for Lane 4 +NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 = 44 # NVLink data CRC Error Counter for Lane 5 +NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL = 45 # NvLink data CRC Error Counter total for all Lanes # NvLink Replay Error Counters -NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 = 46 # NVLink Replay Error Counter for Lane 0 -NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 = 47 # NVLink Replay Error Counter for Lane 1 -NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 = 48 # NVLink Replay Error Counter for Lane 2 -NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 = 49 # NVLink Replay Error Counter for Lane 3 -NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 = 50 # NVLink Replay Error Counter for Lane 4 -NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 = 51 # NVLink Replay Error Counter for Lane 5 -NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL = 52 # NVLink Replay Error Counter total for all Lanes +NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 = 46 # NVLink Replay Error Counter for Lane 0 +NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 = 47 # NVLink Replay Error Counter for Lane 1 +NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 = 48 # NVLink Replay Error Counter for Lane 2 +NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 = 49 # NVLink Replay Error Counter for Lane 3 +NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 = 50 # NVLink Replay Error Counter for Lane 4 +NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 = 51 # NVLink Replay Error Counter for Lane 5 
+NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL = 52 # NVLink Replay Error Counter total for all Lanes # NvLink Recovery Error Counters -NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 = 53 # NVLink Recovery Error Counter for Lane 0 -NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 = 54 # NVLink Recovery Error Counter for Lane 1 -NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 = 55 # NVLink Recovery Error Counter for Lane 2 -NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 = 56 # NVLink Recovery Error Counter for Lane 3 -NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 = 57 # NVLink Recovery Error Counter for Lane 4 -NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 = 58 # NVLink Recovery Error Counter for Lane 5 -NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL = 59 # NVLink Recovery Error Counter total for all Lanes +NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 = 53 # NVLink Recovery Error Counter for Lane 0 +NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 = 54 # NVLink Recovery Error Counter for Lane 1 +NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 = 55 # NVLink Recovery Error Counter for Lane 2 +NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 = 56 # NVLink Recovery Error Counter for Lane 3 +NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 = 57 # NVLink Recovery Error Counter for Lane 4 +NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 = 58 # NVLink Recovery Error Counter for Lane 5 +NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL = 59 # NVLink Recovery Error Counter total for all Lanes # NvLink Bandwidth Counters -NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L0 = 60 # NVLink Bandwidth Counter for Counter Set 0, Lane 0 -NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L1 = 61 # NVLink Bandwidth Counter for Counter Set 0, Lane 1 -NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L2 = 62 # NVLink Bandwidth Counter for Counter Set 0, Lane 2 -NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L3 = 63 # NVLink Bandwidth Counter for Counter Set 0, Lane 3 -NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L4 = 64 # NVLink Bandwidth Counter for Counter Set 0, Lane 4 -NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L5 = 65 # NVLink 
Bandwidth Counter for Counter Set 0, Lane 5 -NVML_FI_DEV_NVLINK_BANDWIDTH_C0_TOTAL = 66 # NVLink Bandwidth Counter Total for Counter Set 0, All Lanes +NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L0 = 60 # NVLink Bandwidth Counter for Counter Set 0, Lane 0 +NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L1 = 61 # NVLink Bandwidth Counter for Counter Set 0, Lane 1 +NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L2 = 62 # NVLink Bandwidth Counter for Counter Set 0, Lane 2 +NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L3 = 63 # NVLink Bandwidth Counter for Counter Set 0, Lane 3 +NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L4 = 64 # NVLink Bandwidth Counter for Counter Set 0, Lane 4 +NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L5 = 65 # NVLink Bandwidth Counter for Counter Set 0, Lane 5 +NVML_FI_DEV_NVLINK_BANDWIDTH_C0_TOTAL = 66 # NVLink Bandwidth Counter Total for Counter Set 0, All Lanes # NvLink Bandwidth Counters -NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L0 = 67 # NVLink Bandwidth Counter for Counter Set 1, Lane 0 -NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L1 = 68 # NVLink Bandwidth Counter for Counter Set 1, Lane 1 -NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L2 = 69 # NVLink Bandwidth Counter for Counter Set 1, Lane 2 -NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L3 = 70 # NVLink Bandwidth Counter for Counter Set 1, Lane 3 -NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L4 = 71 # NVLink Bandwidth Counter for Counter Set 1, Lane 4 -NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L5 = 72 # NVLink Bandwidth Counter for Counter Set 1, Lane 5 -NVML_FI_DEV_NVLINK_BANDWIDTH_C1_TOTAL = 73 # NVLink Bandwidth Counter Total for Counter Set 1, All Lanes +NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L0 = 67 # NVLink Bandwidth Counter for Counter Set 1, Lane 0 +NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L1 = 68 # NVLink Bandwidth Counter for Counter Set 1, Lane 1 +NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L2 = 69 # NVLink Bandwidth Counter for Counter Set 1, Lane 2 +NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L3 = 70 # NVLink Bandwidth Counter for Counter Set 1, Lane 3 +NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L4 = 71 # NVLink Bandwidth Counter for Counter Set 1, Lane 4 
+NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L5 = 72 # NVLink Bandwidth Counter for Counter Set 1, Lane 5 +NVML_FI_DEV_NVLINK_BANDWIDTH_C1_TOTAL = 73 # NVLink Bandwidth Counter Total for Counter Set 1, All Lanes # Perf Policy Counters -NVML_FI_DEV_PERF_POLICY_POWER = 74 # Perf Policy Counter for Power Policy -NVML_FI_DEV_PERF_POLICY_THERMAL = 75 # Perf Policy Counter for Thermal Policy -NVML_FI_DEV_PERF_POLICY_SYNC_BOOST = 76 # Perf Policy Counter for Sync boost Policy -NVML_FI_DEV_PERF_POLICY_BOARD_LIMIT = 77 # Perf Policy Counter for Board Limit -NVML_FI_DEV_PERF_POLICY_LOW_UTILIZATION = 78 # Perf Policy Counter for Low GPU Utilization Policy -NVML_FI_DEV_PERF_POLICY_RELIABILITY = 79 # Perf Policy Counter for Reliability Policy -NVML_FI_DEV_PERF_POLICY_TOTAL_APP_CLOCKS = 80 # Perf Policy Counter for Total App Clock Policy -NVML_FI_DEV_PERF_POLICY_TOTAL_BASE_CLOCKS = 81 # Perf Policy Counter for Total Base Clocks Policy +NVML_FI_DEV_PERF_POLICY_POWER = 74 # Perf Policy Counter for Power Policy +NVML_FI_DEV_PERF_POLICY_THERMAL = 75 # Perf Policy Counter for Thermal Policy +NVML_FI_DEV_PERF_POLICY_SYNC_BOOST = 76 # Perf Policy Counter for Sync boost Policy +NVML_FI_DEV_PERF_POLICY_BOARD_LIMIT = 77 # Perf Policy Counter for Board Limit +NVML_FI_DEV_PERF_POLICY_LOW_UTILIZATION = 78 # Perf Policy Counter for Low GPU Utilization Policy +NVML_FI_DEV_PERF_POLICY_RELIABILITY = 79 # Perf Policy Counter for Reliability Policy +NVML_FI_DEV_PERF_POLICY_TOTAL_APP_CLOCKS = 80 # Perf Policy Counter for Total App Clock Policy +NVML_FI_DEV_PERF_POLICY_TOTAL_BASE_CLOCKS = 81 # Perf Policy Counter for Total Base Clocks Policy # Memory temperatures -NVML_FI_DEV_MEMORY_TEMP = 82 # Memory temperature for the device +NVML_FI_DEV_MEMORY_TEMP = 82 # Memory temperature for the device # Energy Counter -NVML_FI_DEV_TOTAL_ENERGY_CONSUMPTION = 83 # Total energy consumption for the GPU in mJ since the driver was last reloaded +NVML_FI_DEV_TOTAL_ENERGY_CONSUMPTION = 83 # Total energy consumption for the GPU 
in mJ since the driver was last reloaded # NVLink Speed -NVML_FI_DEV_NVLINK_SPEED_MBPS_L0 = 84 -NVML_FI_DEV_NVLINK_SPEED_MBPS_L1 = 85 -NVML_FI_DEV_NVLINK_SPEED_MBPS_L2 = 86 -NVML_FI_DEV_NVLINK_SPEED_MBPS_L3 = 87 -NVML_FI_DEV_NVLINK_SPEED_MBPS_L4 = 88 -NVML_FI_DEV_NVLINK_SPEED_MBPS_L5 = 89 +NVML_FI_DEV_NVLINK_SPEED_MBPS_L0 = 84 +NVML_FI_DEV_NVLINK_SPEED_MBPS_L1 = 85 +NVML_FI_DEV_NVLINK_SPEED_MBPS_L2 = 86 +NVML_FI_DEV_NVLINK_SPEED_MBPS_L3 = 87 +NVML_FI_DEV_NVLINK_SPEED_MBPS_L4 = 88 +NVML_FI_DEV_NVLINK_SPEED_MBPS_L5 = 89 NVML_FI_DEV_NVLINK_SPEED_MBPS_COMMON = 90 # NVLink Link Count @@ -569,216 +576,272 @@ NVML_FI_DEV_PCIE_REPLAY_COUNTER = 94 NVML_FI_DEV_PCIE_REPLAY_ROLLOVER_COUNTER = 95 # NvLink Flit Error Counters -NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 = 96 # NVLink flow control CRC Error Counter for Lane 6 -NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 = 97 # NVLink flow control CRC Error Counter for Lane 7 -NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 = 98 # NVLink flow control CRC Error Counter for Lane 8 -NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 = 99 # NVLink flow control CRC Error Counter for Lane 9 -NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 = 100 # NVLink flow control CRC Error Counter for Lane 10 -NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 = 101 # NVLink flow control CRC Error Counter for Lane 11 +NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 = 96 # NVLink flow control CRC Error Counter for Lane 6 +NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 = 97 # NVLink flow control CRC Error Counter for Lane 7 +NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 = 98 # NVLink flow control CRC Error Counter for Lane 8 +NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 = 99 # NVLink flow control CRC Error Counter for Lane 9 +NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 = 100 # NVLink flow control CRC Error Counter for Lane 10 +NVML_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 = 101 # NVLink flow control CRC Error Counter for Lane 11 # NvLink CRC Data Error Counters 
-NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 = 102 # NVLink data CRC Error Counter for Lane 6 -NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 = 103 # NVLink data CRC Error Counter for Lane 7 -NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 = 104 # NVLink data CRC Error Counter for Lane 8 -NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 = 105 # NVLink data CRC Error Counter for Lane 9 -NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 = 106 # NVLink data CRC Error Counter for Lane 10 -NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 = 107 # NVLink data CRC Error Counter for Lane 11 +NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 = 102 # NVLink data CRC Error Counter for Lane 6 +NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 = 103 # NVLink data CRC Error Counter for Lane 7 +NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 = 104 # NVLink data CRC Error Counter for Lane 8 +NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 = 105 # NVLink data CRC Error Counter for Lane 9 +NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 = 106 # NVLink data CRC Error Counter for Lane 10 +NVML_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 = 107 # NVLink data CRC Error Counter for Lane 11 # NvLink Replay Error Counters -NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 = 108 # NVLink Replay Error Counter for Lane 6 -NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 = 109 # NVLink Replay Error Counter for Lane 7 -NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 = 110 # NVLink Replay Error Counter for Lane 8 -NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 = 111 # NVLink Replay Error Counter for Lane 9 -NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 = 112 # NVLink Replay Error Counter for Lane 10 -NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 = 113 # NVLink Replay Error Counter for Lane 11 +NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 = 108 # NVLink Replay Error Counter for Lane 6 +NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 = 109 # NVLink Replay Error Counter for Lane 7 +NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 = 110 # NVLink Replay Error Counter for Lane 8 
+NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 = 111 # NVLink Replay Error Counter for Lane 9 +NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 = 112 # NVLink Replay Error Counter for Lane 10 +NVML_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 = 113 # NVLink Replay Error Counter for Lane 11 # NvLink Recovery Error Counters -NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 = 114 # NVLink Recovery Error Counter for Lane 6 -NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 = 115 # NVLink Recovery Error Counter for Lane 7 -NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 = 116 # NVLink Recovery Error Counter for Lane 8 -NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 = 117 # NVLink Recovery Error Counter for Lane 9 -NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 = 118 # NVLink Recovery Error Counter for Lane 10 -NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 = 119 # NVLink Recovery Error Counter for Lane 11 +NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 = 114 # NVLink Recovery Error Counter for Lane 6 +NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 = 115 # NVLink Recovery Error Counter for Lane 7 +NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 = 116 # NVLink Recovery Error Counter for Lane 8 +NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 = 117 # NVLink Recovery Error Counter for Lane 9 +NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 = 118 # NVLink Recovery Error Counter for Lane 10 +NVML_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 = 119 # NVLink Recovery Error Counter for Lane 11 # NvLink Bandwidth Counters -NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L6 = 120 # NVLink Bandwidth Counter for Counter Set 0, Lane 6 -NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L7 = 121 # NVLink Bandwidth Counter for Counter Set 0, Lane 7 -NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L8 = 122 # NVLink Bandwidth Counter for Counter Set 0, Lane 8 -NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L9 = 123 # NVLink Bandwidth Counter for Counter Set 0, Lane 9 -NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L10 = 124 # NVLink Bandwidth Counter for Counter Set 0, Lane 10 -NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L11 = 125 # NVLink Bandwidth 
Counter for Counter Set 0, Lane 11 +NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L6 = 120 # NVLink Bandwidth Counter for Counter Set 0, Lane 6 +NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L7 = 121 # NVLink Bandwidth Counter for Counter Set 0, Lane 7 +NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L8 = 122 # NVLink Bandwidth Counter for Counter Set 0, Lane 8 +NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L9 = 123 # NVLink Bandwidth Counter for Counter Set 0, Lane 9 +NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L10 = 124 # NVLink Bandwidth Counter for Counter Set 0, Lane 10 +NVML_FI_DEV_NVLINK_BANDWIDTH_C0_L11 = 125 # NVLink Bandwidth Counter for Counter Set 0, Lane 11 # NvLink Bandwidth Counters -NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L6 = 126 # NVLink Bandwidth Counter for Counter Set 1, Lane 6 -NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L7 = 127 # NVLink Bandwidth Counter for Counter Set 1, Lane 7 -NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L8 = 128 # NVLink Bandwidth Counter for Counter Set 1, Lane 8 -NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L9 = 129 # NVLink Bandwidth Counter for Counter Set 1, Lane 9 -NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L10 = 130 # NVLink Bandwidth Counter for Counter Set 1, Lane 10 -NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L11 = 131 # NVLink Bandwidth Counter for Counter Set 1, Lane 11 +NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L6 = 126 # NVLink Bandwidth Counter for Counter Set 1, Lane 6 +NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L7 = 127 # NVLink Bandwidth Counter for Counter Set 1, Lane 7 +NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L8 = 128 # NVLink Bandwidth Counter for Counter Set 1, Lane 8 +NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L9 = 129 # NVLink Bandwidth Counter for Counter Set 1, Lane 9 +NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L10 = 130 # NVLink Bandwidth Counter for Counter Set 1, Lane 10 +NVML_FI_DEV_NVLINK_BANDWIDTH_C1_L11 = 131 # NVLink Bandwidth Counter for Counter Set 1, Lane 11 # NVLink Speed -NVML_FI_DEV_NVLINK_SPEED_MBPS_L6 = 132 -NVML_FI_DEV_NVLINK_SPEED_MBPS_L7 = 133 -NVML_FI_DEV_NVLINK_SPEED_MBPS_L8 = 134 -NVML_FI_DEV_NVLINK_SPEED_MBPS_L9 = 135 -NVML_FI_DEV_NVLINK_SPEED_MBPS_L10 = 
136 -NVML_FI_DEV_NVLINK_SPEED_MBPS_L11 = 137 +NVML_FI_DEV_NVLINK_SPEED_MBPS_L6 = 132 +NVML_FI_DEV_NVLINK_SPEED_MBPS_L7 = 133 +NVML_FI_DEV_NVLINK_SPEED_MBPS_L8 = 134 +NVML_FI_DEV_NVLINK_SPEED_MBPS_L9 = 135 +NVML_FI_DEV_NVLINK_SPEED_MBPS_L10 = 136 +NVML_FI_DEV_NVLINK_SPEED_MBPS_L11 = 137 # NVLink Throughput Counters -NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_TX = 138 # NVLink TX Data throughput in KiB -NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_RX = 139 # NVLink RX Data throughput in KiB -NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX = 140 # NVLink TX Data + protocol overhead in KiB -NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX = 141 # NVLink RX Data + protocol overhead in KiB +NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_TX = 138 # NVLink TX Data throughput in KiB +NVML_FI_DEV_NVLINK_THROUGHPUT_DATA_RX = 139 # NVLink RX Data throughput in KiB +NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_TX = 140 # NVLink TX Data + protocol overhead in KiB +NVML_FI_DEV_NVLINK_THROUGHPUT_RAW_RX = 141 # NVLink RX Data + protocol overhead in KiB # Row Remapper -NVML_FI_DEV_REMAPPED_COR = 142 -NVML_FI_DEV_REMAPPED_UNC = 143 -NVML_FI_DEV_REMAPPED_PENDING = 144 -NVML_FI_DEV_REMAPPED_FAILURE = 145 +NVML_FI_DEV_REMAPPED_COR = 142 +NVML_FI_DEV_REMAPPED_UNC = 143 +NVML_FI_DEV_REMAPPED_PENDING = 144 +NVML_FI_DEV_REMAPPED_FAILURE = 145 -# Remote device NVLink ID +#Remote device NVLink ID NVML_FI_DEV_NVLINK_REMOTE_NVLINK_ID = 146 # Number of NVLinks connected to NVSwitch NVML_FI_DEV_NVSWITCH_CONNECTED_LINK_COUNT = 147 # NvLink ECC Data Error Counters -NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L0 = 148 # < NVLink data ECC Error Counter for Link 0 -NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L1 = 149 # < NVLink data ECC Error Counter for Link 1 -NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L2 = 150 # < NVLink data ECC Error Counter for Link 2 -NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L3 = 151 # < NVLink data ECC Error Counter for Link 3 -NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L4 = 152 # < NVLink data ECC Error Counter for Link 4 
-NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L5 = 153 # < NVLink data ECC Error Counter for Link 5 -NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L6 = 154 # < NVLink data ECC Error Counter for Link 6 -NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L7 = 155 # < NVLink data ECC Error Counter for Link 7 -NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L8 = 156 # < NVLink data ECC Error Counter for Link 8 -NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L9 = 157 # < NVLink data ECC Error Counter for Link 9 -NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L10 = 158 # < NVLink data ECC Error Counter for Link 10 -NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L11 = 159 # < NVLink data ECC Error Counter for Link 11 -NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_TOTAL = 160 # < NvLink data ECC Error Counter total for all Links +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L0 = 148 #< NVLink data ECC Error Counter for Link 0 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L1 = 149 #< NVLink data ECC Error Counter for Link 1 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L2 = 150 #< NVLink data ECC Error Counter for Link 2 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L3 = 151 #< NVLink data ECC Error Counter for Link 3 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L4 = 152 #< NVLink data ECC Error Counter for Link 4 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L5 = 153 #< NVLink data ECC Error Counter for Link 5 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L6 = 154 #< NVLink data ECC Error Counter for Link 6 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L7 = 155 #< NVLink data ECC Error Counter for Link 7 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L8 = 156 #< NVLink data ECC Error Counter for Link 8 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L9 = 157 #< NVLink data ECC Error Counter for Link 9 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L10 = 158 #< NVLink data ECC Error Counter for Link 10 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L11 = 159 #< NVLink data ECC Error Counter for Link 11 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_TOTAL = 160 #< NvLink data ECC Error Counter total 
for all Links -NVML_FI_DEV_NVLINK_ERROR_DL_REPLAY = 161 -NVML_FI_DEV_NVLINK_ERROR_DL_RECOVERY = 162 -NVML_FI_DEV_NVLINK_ERROR_DL_CRC = 163 -NVML_FI_DEV_NVLINK_GET_SPEED = 164 -NVML_FI_DEV_NVLINK_GET_STATE = 165 -NVML_FI_DEV_NVLINK_GET_VERSION = 166 +NVML_FI_DEV_NVLINK_ERROR_DL_REPLAY = 161 +NVML_FI_DEV_NVLINK_ERROR_DL_RECOVERY = 162 +NVML_FI_DEV_NVLINK_ERROR_DL_CRC = 163 +NVML_FI_DEV_NVLINK_GET_SPEED = 164 +NVML_FI_DEV_NVLINK_GET_STATE = 165 +NVML_FI_DEV_NVLINK_GET_VERSION = 166 -NVML_FI_DEV_NVLINK_GET_POWER_STATE = 167 -NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD = 168 +NVML_FI_DEV_NVLINK_GET_POWER_STATE = 167 +NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD = 168 -NVML_FI_DEV_PCIE_L0_TO_RECOVERY_COUNTER = 169 +NVML_FI_DEV_PCIE_L0_TO_RECOVERY_COUNTER = 169 + +NVML_FI_DEV_C2C_LINK_COUNT = 170 +NVML_FI_DEV_C2C_LINK_GET_STATUS = 171 +NVML_FI_DEV_C2C_LINK_GET_MAX_BW = 172 + +NVML_FI_DEV_PCIE_COUNT_CORRECTABLE_ERRORS = 173 +NVML_FI_DEV_PCIE_COUNT_NAKS_RECEIVED = 174 +NVML_FI_DEV_PCIE_COUNT_RECEIVER_ERROR = 175 +NVML_FI_DEV_PCIE_COUNT_BAD_TLP = 176 +NVML_FI_DEV_PCIE_COUNT_NAKS_SENT = 177 +NVML_FI_DEV_PCIE_COUNT_BAD_DLLP = 178 +NVML_FI_DEV_PCIE_COUNT_NON_FATAL_ERROR = 179 +NVML_FI_DEV_PCIE_COUNT_FATAL_ERROR = 180 +NVML_FI_DEV_PCIE_COUNT_UNSUPPORTED_REQ = 181 +NVML_FI_DEV_PCIE_COUNT_LCRC_ERROR = 182 +NVML_FI_DEV_PCIE_COUNT_LANE_ERROR = 183 + +NVML_FI_DEV_IS_RESETLESS_MIG_SUPPORTED = 184 + +NVML_FI_DEV_POWER_AVERAGE = 185 +NVML_FI_DEV_POWER_INSTANT = 186 +NVML_FI_DEV_POWER_MIN_LIMIT = 187 +NVML_FI_DEV_POWER_MAX_LIMIT = 188 +NVML_FI_DEV_POWER_DEFAULT_LIMIT = 189 +NVML_FI_DEV_POWER_CURRENT_LIMIT = 190 +NVML_FI_DEV_ENERGY = 191 +NVML_FI_DEV_POWER_REQUESTED_LIMIT = 192 + +NVML_FI_DEV_TEMPERATURE_SHUTDOWN_TLIMIT = 193 +NVML_FI_DEV_TEMPERATURE_SLOWDOWN_TLIMIT = 194 +NVML_FI_DEV_TEMPERATURE_MEM_MAX_TLIMIT = 195 +NVML_FI_DEV_TEMPERATURE_GPU_MAX_TLIMIT = 196 + +NVML_FI_MAX = 197 # One greater than the largest field ID defined above -NVML_FI_MAX = 170 # One greater than the largest field ID 
defined above ## Enums needed for the method nvmlDeviceGetVirtualizationMode and nvmlDeviceSetVirtualizationMode -NVML_GPU_VIRTUALIZATION_MODE_NONE = 0 # Represents Bare Metal GPU +NVML_GPU_VIRTUALIZATION_MODE_NONE = 0 # Represents Bare Metal GPU NVML_GPU_VIRTUALIZATION_MODE_PASSTHROUGH = 1 # Device is associated with GPU-Passthorugh -NVML_GPU_VIRTUALIZATION_MODE_VGPU = 2 # Device is associated with vGPU inside virtual machine. -NVML_GPU_VIRTUALIZATION_MODE_HOST_VGPU = 3 # Device is associated with VGX hypervisor in vGPU mode -NVML_GPU_VIRTUALIZATION_MODE_HOST_VSGA = 4 # Device is associated with VGX hypervisor in vSGA mode +NVML_GPU_VIRTUALIZATION_MODE_VGPU = 2 # Device is associated with vGPU inside virtual machine. +NVML_GPU_VIRTUALIZATION_MODE_HOST_VGPU = 3 # Device is associated with VGX hypervisor in vGPU mode +NVML_GPU_VIRTUALIZATION_MODE_HOST_VSGA = 4 # Device is associated with VGX hypervisor in vSGA mode ## Lib loading ## nvmlLib = None libLoadLock = threading.Lock() -_nvmlLib_refcount = 0 # Incremented on each nvmlInit and decremented on nvmlShutdown +_nvmlLib_refcount = 0 # Incremented on each nvmlInit and decremented on nvmlShutdown ## vGPU Management -_nvmlVgpuTypeId_t = c_uint +_nvmlVgpuTypeId_t = c_uint _nvmlVgpuInstance_t = c_uint _nvmlVgpuVmIdType_t = c_uint -NVML_VGPU_VM_ID_DOMAIN_ID = 0 -NVML_VGPU_VM_ID_UUID = 1 +NVML_VGPU_VM_ID_DOMAIN_ID = 0 +NVML_VGPU_VM_ID_UUID = 1 _nvmlGridLicenseFeatureCode_t = c_uint -NVML_GRID_LICENSE_FEATURE_CODE_UNKNOWN = 0 -NVML_GRID_LICENSE_FEATURE_CODE_VGPU = 1 -NVML_GRID_LICENSE_FEATURE_CODE_NVIDIA_RTX = 2 -NVML_GRID_LICENSE_FEATURE_CODE_VWORKSTATION = 2 # deprecated, use NVML_GRID_LICENSE_FEATURE_CODE_NVIDIA_RTX. 
-NVML_GRID_LICENSE_FEATURE_CODE_GAMING = 3 -NVML_GRID_LICENSE_FEATURE_CODE_COMPUTE = 4 +NVML_GRID_LICENSE_FEATURE_CODE_UNKNOWN = 0 +NVML_GRID_LICENSE_FEATURE_CODE_VGPU = 1 +NVML_GRID_LICENSE_FEATURE_CODE_NVIDIA_RTX = 2 +NVML_GRID_LICENSE_FEATURE_CODE_VWORKSTATION = 2 # deprecated, use NVML_GRID_LICENSE_FEATURE_CODE_NVIDIA_RTX. +NVML_GRID_LICENSE_FEATURE_CODE_GAMING = 3 +NVML_GRID_LICENSE_FEATURE_CODE_COMPUTE = 4 _nvmlGridLicenseExpiryStatus_t = c_uint8 -NVML_GRID_LICENSE_EXPIRY_NOT_AVAILABLE = 0 # Expiry information not available -NVML_GRID_LICENSE_EXPIRY_INVALID = 1 # Invalid expiry or error fetching expiry -NVML_GRID_LICENSE_EXPIRY_VALID = 2 # Valid expiry -NVML_GRID_LICENSE_EXPIRY_NOT_APPLICABLE = 3 # Expiry not applicable -NVML_GRID_LICENSE_EXPIRY_PERMANENT = 4 # Permanent expiry +NVML_GRID_LICENSE_EXPIRY_NOT_AVAILABLE = 0 # Expiry information not available +NVML_GRID_LICENSE_EXPIRY_INVALID = 1 # Invalid expiry or error fetching expiry +NVML_GRID_LICENSE_EXPIRY_VALID = 2 # Valid expiry +NVML_GRID_LICENSE_EXPIRY_NOT_APPLICABLE = 3 # Expiry not applicable +NVML_GRID_LICENSE_EXPIRY_PERMANENT = 4 # Permanent expiry _nvmlVgpuCapability_t = c_uint -NVML_VGPU_CAP_NVLINK_P2P = 0 # vGPU P2P over NVLink is supported -NVML_VGPU_CAP_GPUDIRECT = 1 # GPUDirect capability is supported -NVML_VGPU_CAP_MULTI_VGPU_EXCLUSIVE = 2 # vGPU profile cannot be mixed with other vGPU profiles in same VM -NVML_VGPU_CAP_EXCLUSIVE_TYPE = 3 # vGPU profile cannot run on a GPU alongside other profiles of different type -NVML_VGPU_CAP_EXCLUSIVE_SIZE = 4 # vGPU profile cannot run on a GPU alongside other profiles of different size -NVML_VGPU_CAP_COUNT = 5 +NVML_VGPU_CAP_NVLINK_P2P = 0 # vGPU P2P over NVLink is supported +NVML_VGPU_CAP_GPUDIRECT = 1 # GPUDirect capability is supported +NVML_VGPU_CAP_MULTI_VGPU_EXCLUSIVE = 2 # vGPU profile cannot be mixed with other vGPU profiles in same VM +NVML_VGPU_CAP_EXCLUSIVE_TYPE = 3 # vGPU profile cannot run on a GPU alongside other profiles of
different type +NVML_VGPU_CAP_EXCLUSIVE_SIZE = 4 # vGPU profile cannot run on a GPU alongside other profiles of different size +NVML_VGPU_CAP_COUNT = 5 _nvmlVgpuDriverCapability_t = c_uint -NVML_VGPU_DRIVER_CAP_HETEROGENEOUS_MULTI_VGPU = 0 # Supports mixing of different vGPU profiles within one guest VM -NVML_VGPU_DRIVER_CAP_COUNT = 1 +NVML_VGPU_DRIVER_CAP_HETEROGENEOUS_MULTI_VGPU = 0 # Supports mixing of different vGPU profiles within one guest VM +NVML_VGPU_DRIVER_CAP_COUNT = 1 _nvmlDeviceVgpuCapability_t = c_uint -NVML_DEVICE_VGPU_CAP_FRACTIONAL_MULTI_VGPU = 0 # Fractional vGPU profiles on this GPU can be used in multi-vGPU configurations -NVML_DEVICE_VGPU_CAP_HETEROGENEOUS_TIMESLICE_PROFILES = 1 # Supports concurrent execution of timesliced vGPU profiles of differing types -NVML_DEVICE_VGPU_CAP_HETEROGENEOUS_TIMESLICE_SIZES = 2 # Supports concurrent execution of timesliced vGPU profiles of differing framebuffer sizes -NVML_DEVICE_VGPU_CAP_READ_DEVICE_BUFFER_BW = 3 # GPU device's read_device_buffer expected bandwidth capacity in megabytes per second -NVML_DEVICE_VGPU_CAP_WRITE_DEVICE_BUFFER_BW = 4 # GPU device's write_device_buffer expected bandwidth capacity in megabytes per second -NVML_DEVICE_VGPU_CAP_COUNT = 5 +NVML_DEVICE_VGPU_CAP_FRACTIONAL_MULTI_VGPU = 0 # Fractional vGPU profiles on this GPU can be used in multi-vGPU configurations +NVML_DEVICE_VGPU_CAP_HETEROGENEOUS_TIMESLICE_PROFILES = 1 # Supports concurrent execution of timesliced vGPU profiles of differing types +NVML_DEVICE_VGPU_CAP_HETEROGENEOUS_TIMESLICE_SIZES = 2 # Supports concurrent execution of timesliced vGPU profiles of differing framebuffer sizes +NVML_DEVICE_VGPU_CAP_READ_DEVICE_BUFFER_BW = 3 # GPU device's read_device_buffer expected bandwidth capacity in megabytes per second +NVML_DEVICE_VGPU_CAP_WRITE_DEVICE_BUFFER_BW = 4 # GPU device's write_device_buffer expected bandwidth capacity in megabytes per second +NVML_DEVICE_VGPU_CAP_COUNT = 5 _nvmlVgpuGuestInfoState_t = c_uint 
NVML_VGPU_INSTANCE_GUEST_INFO_STATE_UNINITIALIZED = 0 -NVML_VGPU_INSTANCE_GUEST_INFO_STATE_INITIALIZED = 1 +NVML_VGPU_INSTANCE_GUEST_INFO_STATE_INITIALIZED = 1 _nvmlVgpuVmCompatibility_t = c_uint -NVML_VGPU_VM_COMPATIBILITY_NONE = 0x0 -NVML_VGPU_VM_COMPATIBILITY_COLD = 0x1 -NVML_VGPU_VM_COMPATIBILITY_HIBERNATE = 0x2 -NVML_VGPU_VM_COMPATIBILITY_SLEEP = 0x4 -NVML_VGPU_VM_COMPATIBILITY_LIVE = 0x8 +NVML_VGPU_VM_COMPATIBILITY_NONE = 0x0 +NVML_VGPU_VM_COMPATIBILITY_COLD = 0x1 +NVML_VGPU_VM_COMPATIBILITY_HIBERNATE = 0x2 +NVML_VGPU_VM_COMPATIBILITY_SLEEP = 0x4 +NVML_VGPU_VM_COMPATIBILITY_LIVE = 0x8 _nvmlVgpuPgpuCompatibilityLimitCode_t = c_uint -NVML_VGPU_COMPATIBILITY_LIMIT_NONE = 0x0 -NVML_VGPU_COMPATIBILITY_LIMIT_HOST_DRIVER = 0x1 -NVML_VGPU_COMPATIBILITY_LIMIT_GUEST_DRIVER = 0x2 -NVML_VGPU_COMPATIBILITY_LIMIT_GPU = 0x4 -NVML_VGPU_COMPATIBILITY_LIMIT_OTHER = 0x80000000 +NVML_VGPU_COMPATIBILITY_LIMIT_NONE = 0x0 +NVML_VGPU_COMPATIBILITY_LIMIT_HOST_DRIVER = 0x1 +NVML_VGPU_COMPATIBILITY_LIMIT_GUEST_DRIVER = 0x2 +NVML_VGPU_COMPATIBILITY_LIMIT_GPU = 0x4 +NVML_VGPU_COMPATIBILITY_LIMIT_OTHER = 0x80000000 _nvmlHostVgpuMode_t = c_uint -NVML_HOST_VGPU_MODE_NON_SRIOV = 0 -NVML_HOST_VGPU_MODE_SRIOV = 1 +NVML_HOST_VGPU_MODE_NON_SRIOV = 0 +NVML_HOST_VGPU_MODE_SRIOV = 1 + +_nvmlConfComputeGpusReadyState_t = c_uint +NVML_CC_ACCEPTING_CLIENT_REQUESTS_FALSE = 0 +NVML_CC_ACCEPTING_CLIENT_REQUESTS_TRUE = 1 + +_nvmlConfComputeGpuCaps_t = c_uint +NVML_CC_SYSTEM_GPUS_CC_NOT_CAPABLE = 0 +NVML_CC_SYSTEM_GPUS_CC_CAPABLE = 1 + +_nvmlConfComputeCpuCaps_t = c_uint +NVML_CC_SYSTEM_CPU_CAPS_NONE = 0 +NVML_CC_SYSTEM_CPU_CAPS_AMD_SEV = 1 +NVML_CC_SYSTEM_CPU_CAPS_INTEL_TDX = 2 + +_nvmlConfComputeDevToolsMode_t = c_uint +NVML_CC_SYSTEM_DEVTOOLS_MODE_OFF = 0 +NVML_CC_SYSTEM_DEVTOOLS_MODE_ON = 1 + +NVML_CC_SYSTEM_ENVIRONMENT_UNAVAILABLE = 0 +NVML_CC_SYSTEM_ENVIRONMENT_SIM = 1 +NVML_CC_SYSTEM_ENVIRONMENT_PROD = 2 + +_nvmlConfComputeCcFeature_t = c_uint +NVML_CC_SYSTEM_FEATURE_DISABLED = 0 
+NVML_CC_SYSTEM_FEATURE_ENABLED = 1 # GSP firmware NVML_GSP_FIRMWARE_VERSION_BUF_SIZE = 0x40 - ## Error Checking ## class NVMLError(Exception): _valClassMapping = dict() # List of currently known error codes _errcode_to_string = { - NVML_ERROR_UNINITIALIZED: "Uninitialized", - NVML_ERROR_INVALID_ARGUMENT: "Invalid Argument", - NVML_ERROR_NOT_SUPPORTED: "Not Supported", - NVML_ERROR_NO_PERMISSION: "Insufficient Permissions", + NVML_ERROR_UNINITIALIZED: "Uninitialized", + NVML_ERROR_INVALID_ARGUMENT: "Invalid Argument", + NVML_ERROR_NOT_SUPPORTED: "Not Supported", + NVML_ERROR_NO_PERMISSION: "Insufficient Permissions", NVML_ERROR_ALREADY_INITIALIZED: "Already Initialized", - NVML_ERROR_NOT_FOUND: "Not Found", - NVML_ERROR_INSUFFICIENT_SIZE: "Insufficient Size", - NVML_ERROR_INSUFFICIENT_POWER: "Insufficient External Power", - NVML_ERROR_DRIVER_NOT_LOADED: "Driver Not Loaded", - NVML_ERROR_TIMEOUT: "Timeout", - NVML_ERROR_IRQ_ISSUE: "Interrupt Request Issue", - NVML_ERROR_LIBRARY_NOT_FOUND: "NVML Shared Library Not Found", - NVML_ERROR_FUNCTION_NOT_FOUND: "Function Not Found", - NVML_ERROR_CORRUPTED_INFOROM: "Corrupted infoROM", - NVML_ERROR_GPU_IS_LOST: "GPU is lost", - NVML_ERROR_RESET_REQUIRED: "GPU requires restart", - NVML_ERROR_OPERATING_SYSTEM: "The operating system has blocked the request.", + NVML_ERROR_NOT_FOUND: "Not Found", + NVML_ERROR_INSUFFICIENT_SIZE: "Insufficient Size", + NVML_ERROR_INSUFFICIENT_POWER: "Insufficient External Power", + NVML_ERROR_DRIVER_NOT_LOADED: "Driver Not Loaded", + NVML_ERROR_TIMEOUT: "Timeout", + NVML_ERROR_IRQ_ISSUE: "Interrupt Request Issue", + NVML_ERROR_LIBRARY_NOT_FOUND: "NVML Shared Library Not Found", + NVML_ERROR_FUNCTION_NOT_FOUND: "Function Not Found", + NVML_ERROR_CORRUPTED_INFOROM: "Corrupted infoROM", + NVML_ERROR_GPU_IS_LOST: "GPU is lost", + NVML_ERROR_RESET_REQUIRED: "GPU requires restart", + NVML_ERROR_OPERATING_SYSTEM: "The operating system has blocked the request.", NVML_ERROR_LIB_RM_VERSION_MISMATCH: "RM has 
detected an NVML/RM version mismatch.", - NVML_ERROR_MEMORY: "Insufficient Memory", - NVML_ERROR_UNKNOWN: "Unknown Error", - } - + NVML_ERROR_MEMORY: "Insufficient Memory", + NVML_ERROR_UNKNOWN: "Unknown Error", + } def __new__(typ, value): ''' Maps value to a proper subclass of NVMLError. @@ -789,7 +852,6 @@ class NVMLError(Exception): obj = Exception.__new__(typ) obj.value = value return obj - def __str__(self): try: if self.value not in NVMLError._errcode_to_string: @@ -797,17 +859,14 @@ class NVMLError(Exception): return NVMLError._errcode_to_string[self.value] except NVMLError: return "NVML Error with code %d" % self.value - def __eq__(self, other): return self.value == other.value - def nvmlExceptionClass(nvmlErrorCode): if nvmlErrorCode not in NVMLError._valClassMapping: raise ValueError('nvmlErrorCode %s is not valid' % nvmlErrorCode) return NVMLError._valClassMapping[nvmlErrorCode] - def _extractNVMLErrorsAsClasses(): ''' Generates a hierarchy of classes on top of NVMLError class. @@ -824,33 +883,24 @@ def _extractNVMLErrorsAsClasses(): # e.g. 
Turn NVML_ERROR_ALREADY_INITIALIZED into NVMLError_AlreadyInitialized class_name = "NVMLError_" + string.capwords(err_name.replace("NVML_ERROR_", ""), "_").replace("_", "") err_val = getattr(this_module, err_name) - def gen_new(val): def new(typ): obj = NVMLError.__new__(typ, val) return obj - return new - new_error_class = type(class_name, (NVMLError,), {'__new__': gen_new(err_val)}) new_error_class.__module__ = __name__ setattr(this_module, class_name, new_error_class) NVMLError._valClassMapping[err_val] = new_error_class - - _extractNVMLErrorsAsClasses() - def _nvmlCheckReturn(ret): if (ret != NVML_SUCCESS): raise NVMLError(ret) return ret - ## Function access ## -_nvmlGetFunctionPointer_cache = dict() # function pointers are cached to prevent unnecessary libLoadLock locking - - +_nvmlGetFunctionPointer_cache = dict() # function pointers are cached to prevent unnecessary libLoadLock locking def _nvmlGetFunctionPointer(name): global nvmlLib @@ -860,7 +910,7 @@ def _nvmlGetFunctionPointer(name): libLoadLock.acquire() try: # ensure library was loaded - if nvmlLib is None: + if (nvmlLib == None): raise NVMLError(NVML_ERROR_UNINITIALIZED) try: _nvmlGetFunctionPointer_cache[name] = getattr(nvmlLib, name) @@ -871,7 +921,6 @@ def _nvmlGetFunctionPointer(name): # lock is always freed libLoadLock.release() - ## Alternative object # Allows the object to be printed # Allows mismatched types to be assigned @@ -880,11 +929,9 @@ class nvmlFriendlyObject(object): def __init__(self, dictionary): for x in dictionary: setattr(self, x, dictionary[x]) - def __str__(self): return self.__dict__.__str__() - def nvmlStructToFriendlyObject(struct): d = {} for x in struct._fields_: @@ -895,7 +942,6 @@ def nvmlStructToFriendlyObject(struct): obj = nvmlFriendlyObject(d) return obj - # pack the object so it can be passed to the NVML library def nvmlFriendlyObjectToStruct(obj, model): for x in model._fields_: @@ -908,15 +954,11 @@ def nvmlFriendlyObjectToStruct(obj, model): setattr(model, 
key, value) return model - ## Unit structures class struct_c_nvmlUnit_t(Structure): - pass # opaque handle - - + pass # opaque handle c_nvmlUnit_t = POINTER(struct_c_nvmlUnit_t) - class _PrintableStructure(Structure): """ Abstract class that produces nicer __str__ output than ctypes.Structure. @@ -937,7 +979,6 @@ class _PrintableStructure(Structure): Exact format of returned str from this class is subject to change in the future. """ _fmt_ = {} - def __str__(self): result = [] for x in self._fields_: @@ -949,7 +990,7 @@ class _PrintableStructure(Structure): elif "" in self._fmt_: fmt = self._fmt_[""] result.append(("%s: " + fmt) % (key, value)) - return self.__class__.__name__ + "(" + ", ".join(result) + ")" + return self.__class__.__name__ + "(" + ", ".join(result) + ")" def __getattribute__(self, name): res = super(_PrintableStructure, self).__getattribute__(name) @@ -970,7 +1011,6 @@ class _PrintableStructure(Structure): value = value.encode() super(_PrintableStructure, self).__setattr__(name, value) - class c_nvmlUnitInfo_t(_PrintableStructure): _fields_ = [ ('name', c_char * 96), @@ -979,14 +1019,12 @@ class c_nvmlUnitInfo_t(_PrintableStructure): ('firmwareVersion', c_char * 96), ] - class c_nvmlLedState_t(_PrintableStructure): _fields_ = [ ('cause', c_char * 256), ('color', _nvmlLedColor_t), ] - class c_nvmlPSUInfo_t(_PrintableStructure): _fields_ = [ ('state', c_char * 256), @@ -995,29 +1033,23 @@ class c_nvmlPSUInfo_t(_PrintableStructure): ('power', c_uint), ] - class c_nvmlUnitFanInfo_t(_PrintableStructure): _fields_ = [ ('speed', c_uint), ('state', _nvmlFanState_t), ] - class c_nvmlUnitFanSpeeds_t(_PrintableStructure): _fields_ = [ ('fans', c_nvmlUnitFanInfo_t * 24), ('count', c_uint) ] - ## Device structures class struct_c_nvmlDevice_t(Structure): - pass # opaque handle - - + pass # opaque handle c_nvmlDevice_t = POINTER(struct_c_nvmlDevice_t) - # Legacy pciInfo used for _v1 and _v2 class nvmlPciInfo_v2_t(_PrintableStructure): _fields_ = [ @@ -1035,13 
+1067,12 @@ class nvmlPciInfo_v2_t(_PrintableStructure): ('reserved3', c_uint), ] _fmt_ = { - 'domain': "0x%04X", - 'bus': "0x%02X", - 'device': "0x%02X", - 'pciDeviceId': "0x%08X", - 'pciSubSystemId': "0x%08X", - } - + 'domain' : "0x%04X", + 'bus' : "0x%02X", + 'device' : "0x%02X", + 'pciDeviceId' : "0x%08X", + 'pciSubSystemId' : "0x%08X", + } class nvmlPciInfo_t(_PrintableStructure): _fields_ = [ @@ -1059,13 +1090,12 @@ class nvmlPciInfo_t(_PrintableStructure): ('busId', c_char * NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE), ] _fmt_ = { - 'domain': "0x%08X", - 'bus': "0x%02X", - 'device': "0x%02X", - 'pciDeviceId': "0x%08X", - 'pciSubSystemId': "0x%08X", - } - + 'domain' : "0x%08X", + 'bus' : "0x%02X", + 'device' : "0x%02X", + 'pciDeviceId' : "0x%08X", + 'pciSubSystemId' : "0x%08X", + } class c_nvmlExcludedDeviceInfo_t(_PrintableStructure): _fields_ = [ @@ -1073,14 +1103,12 @@ class c_nvmlExcludedDeviceInfo_t(_PrintableStructure): ('uuid', c_char * NVML_DEVICE_UUID_BUFFER_SIZE) ] - class nvmlNvLinkUtilizationControl_t(_PrintableStructure): _fields_ = [ ('units', _nvmlNvLinkUtilizationCountUnits_t), ('pktfilter', _nvmlNvLinkUtilizationCountPktTypes_t), ] - class c_nvmlMemory_t(_PrintableStructure): _fields_ = [ ('total', c_ulonglong), @@ -1089,7 +1117,6 @@ class c_nvmlMemory_t(_PrintableStructure): ] _fmt_ = {'': "%d B"} - class c_nvmlMemory_v2_t(_PrintableStructure): _fields_ = [ ('version', c_uint), @@ -1100,10 +1127,8 @@ class c_nvmlMemory_v2_t(_PrintableStructure): ] _fmt_ = {'': "%d B"} - nvmlMemory_v2 = 0x02000028 - class c_nvmlBAR1Memory_t(_PrintableStructure): _fields_ = [ ('bar1Total', c_ulonglong), @@ -1112,19 +1137,16 @@ class c_nvmlBAR1Memory_t(_PrintableStructure): ] _fmt_ = {'': "%d B"} - class nvmlClkMonFaultInfo_t(Structure): _fields_ = [("clkApiDomain", c_uint), ("clkDomainFaultMask", c_uint) - ] - + ] class nvmlClkMonStatus_t(Structure): _fields_ = [("bGlobalStatus", c_uint), ("clkMonListSize", c_uint), ("clkMonList", nvmlClkMonFaultInfo_t) - ] - + ] # On 
Windows with the WDDM driver, usedGpuMemory is reported as None # Code that processes this structure should check for None, I.E. @@ -1137,16 +1159,43 @@ class nvmlClkMonStatus_t(Structure): # endif # # See NVML documentation for more information -class c_nvmlProcessInfo_t(_PrintableStructure): +class c_nvmlProcessInfo_v2_t(_PrintableStructure): _fields_ = [ ('pid', c_uint), ('usedGpuMemory', c_ulonglong), ('gpuInstanceId', c_uint), ('computeInstanceId', c_uint), ] - _fmt_ = {'usedGpuMemory': "%d B", - } + _fmt_ = {'usedGpuMemory': "%d B"} +c_nvmlProcessInfo_t = c_nvmlProcessInfo_v2_t + +_nvmlProcessMode_t = c_uint +NVML_PROCESS_MODE_COMPUTE = 0 +NVML_PROCESS_MODE_GRAPHICS = 1 +NVML_PROCESS_MODE_MPS = 2 + +class c_nvmlProcessDetail_v1_t(Structure): + _fields_ = [ + ('pid', c_uint), + ('usedGpuMemory', c_ulonglong), + ('gpuInstanceId', c_uint), + ('computeInstanceId', c_uint), + ('usedGpuCcProtectedMemory', c_ulonglong), + ] + +class c_nvmlProcessDetailList_v1_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('mode', _nvmlProcessMode_t), + ('numProcArrayEntries', c_uint), + ('procArray', POINTER(c_nvmlProcessDetail_v1_t)), + ] + _fmt_ = {'numProcArrayEntries': "%d B"} + +c_nvmlProcessDetailList_t = c_nvmlProcessDetailList_v1_t + +nvmlProcessDetailList_v1 = 0x1000018 class c_nvmlBridgeChipInfo_t(_PrintableStructure): _fields_ = [ @@ -1154,14 +1203,12 @@ class c_nvmlBridgeChipInfo_t(_PrintableStructure): ('fwVersion', c_uint), ] - class c_nvmlBridgeChipHierarchy_t(_PrintableStructure): _fields_ = [ ('bridgeCount', c_uint), ('bridgeChipInfo', c_nvmlBridgeChipInfo_t * 128), ] - class c_nvmlEccErrorCounts_t(_PrintableStructure): _fields_ = [ ('l1Cache', c_ulonglong), @@ -1170,7 +1217,6 @@ class c_nvmlEccErrorCounts_t(_PrintableStructure): ('registerFile', c_ulonglong), ] - class c_nvmlUtilization_t(_PrintableStructure): _fields_ = [ ('gpu', c_uint), @@ -1178,7 +1224,6 @@ class c_nvmlUtilization_t(_PrintableStructure): ] _fmt_ = {'': "%d %%"} - # Added in 
2.285 class c_nvmlHwbcEntry_t(_PrintableStructure): _fields_ = [ @@ -1186,7 +1231,6 @@ class c_nvmlHwbcEntry_t(_PrintableStructure): ('firmwareVersion', c_char * 32), ] - class c_nvmlValue_t(Union): _fields_ = [ ('dVal', c_double), @@ -1194,23 +1238,21 @@ class c_nvmlValue_t(Union): ('ulVal', c_ulong), ('ullVal', c_ulonglong), ('sllVal', c_longlong), + ('siVal', c_int), ] - class c_nvmlSample_t(_PrintableStructure): _fields_ = [ ('timeStamp', c_ulonglong), ('sampleValue', c_nvmlValue_t), ] - class c_nvmlViolationTime_t(_PrintableStructure): _fields_ = [ ('referenceTime', c_ulonglong), ('violationTime', c_ulonglong), ] - class c_nvmlFieldValue_t(_PrintableStructure): _fields_ = [ ('fieldId', c_uint32), @@ -1222,7 +1264,6 @@ class c_nvmlFieldValue_t(_PrintableStructure): ('value', c_nvmlValue_t) ] - class c_nvmlVgpuInstanceUtilizationSample_t(_PrintableStructure): _fields_ = [ ('vgpuInstance', _nvmlVgpuInstance_t), @@ -1233,7 +1274,6 @@ class c_nvmlVgpuInstanceUtilizationSample_t(_PrintableStructure): ('decUtil', c_nvmlValue_t), ] - class c_nvmlVgpuProcessUtilizationSample_t(_PrintableStructure): _fields_ = [ ('vgpuInstance', _nvmlVgpuInstance_t), @@ -1246,35 +1286,31 @@ class c_nvmlVgpuProcessUtilizationSample_t(_PrintableStructure): ('decUtil', c_uint), ] - class c_nvmlVgpuLicenseExpiry_t(_PrintableStructure): _fields_ = [ - ('year', c_uint32), - ('month', c_uint16), - ('day', c_uint16), - ('hour', c_uint16), - ('min', c_uint16), - ('sec', c_uint16), - ('status', c_uint8), + ('year', c_uint32), + ('month', c_uint16), + ('day', c_uint16), + ('hour', c_uint16), + ('min', c_uint16), + ('sec', c_uint16), + ('status', c_uint8), ] - -NVML_GRID_LICENSE_STATE_UNKNOWN = 0 -NVML_GRID_LICENSE_STATE_UNINITIALIZED = 1 +NVML_GRID_LICENSE_STATE_UNKNOWN = 0 +NVML_GRID_LICENSE_STATE_UNINITIALIZED = 1 NVML_GRID_LICENSE_STATE_UNLICENSED_UNRESTRICTED = 2 -NVML_GRID_LICENSE_STATE_UNLICENSED_RESTRICTED = 3 -NVML_GRID_LICENSE_STATE_UNLICENSED = 4 -NVML_GRID_LICENSE_STATE_LICENSED = 5 - 
+NVML_GRID_LICENSE_STATE_UNLICENSED_RESTRICTED = 3 +NVML_GRID_LICENSE_STATE_UNLICENSED = 4 +NVML_GRID_LICENSE_STATE_LICENSED = 5 class c_nvmlVgpuLicenseInfo_t(_PrintableStructure): _fields_ = [ - ('isLicensed', c_uint8), - ('licenseExpiry', c_nvmlVgpuLicenseExpiry_t), - ('currentState', c_uint), + ('isLicensed', c_uint8), + ('licenseExpiry', c_nvmlVgpuLicenseExpiry_t), + ('currentState', c_uint), ] - class c_nvmlEncoderSession_t(_PrintableStructure): _fields_ = [ ('sessionId', c_uint), @@ -1287,7 +1323,6 @@ class c_nvmlEncoderSession_t(_PrintableStructure): ('encodeLatency', c_uint), ] - class c_nvmlProcessUtilizationSample_t(_PrintableStructure): _fields_ = [ ('pid', c_uint), @@ -1298,38 +1333,34 @@ class c_nvmlProcessUtilizationSample_t(_PrintableStructure): ('decUtil', c_uint), ] - class c_nvmlGridLicenseExpiry_t(_PrintableStructure): _fields_ = [ - ('year', c_uint32), - ('month', c_uint16), - ('day', c_uint16), - ('hour', c_uint16), - ('min', c_uint16), - ('sec', c_uint16), - ('status', c_uint8), + ('year', c_uint32), + ('month', c_uint16), + ('day', c_uint16), + ('hour', c_uint16), + ('min', c_uint16), + ('sec', c_uint16), + ('status', c_uint8), ] - class c_nvmlGridLicensableFeature_v4_t(_PrintableStructure): _fields_ = [ - ('featureCode', _nvmlGridLicenseFeatureCode_t), - ('featureState', c_uint), - ('licenseInfo', c_char * NVML_GRID_LICENSE_BUFFER_SIZE), - ('productName', c_char * NVML_GRID_LICENSE_BUFFER_SIZE), + ('featureCode', _nvmlGridLicenseFeatureCode_t), + ('featureState', c_uint), + ('licenseInfo', c_char * NVML_GRID_LICENSE_BUFFER_SIZE), + ('productName', c_char * NVML_GRID_LICENSE_BUFFER_SIZE), ('featureEnabled', c_uint), - ('licenseExpiry', c_nvmlGridLicenseExpiry_t), + ('licenseExpiry', c_nvmlGridLicenseExpiry_t), ] - class c_nvmlGridLicensableFeatures_v4_t(_PrintableStructure): _fields_ = [ - ('isGridLicenseSupported', c_int), + ('isGridLicenseSupported', c_int), ('licensableFeaturesCount', c_uint), - ('gridLicensableFeatures', 
c_nvmlGridLicensableFeature_v4_t * NVML_GRID_LICENSE_FEATURE_MAX_COUNT), + ('gridLicensableFeatures', c_nvmlGridLicensableFeature_v4_t * NVML_GRID_LICENSE_FEATURE_MAX_COUNT), ] - class c_nvmlGridLicensableFeature_v3_t(_PrintableStructure): _fields_ = [ ('featureCode', _nvmlGridLicenseFeatureCode_t), @@ -1339,7 +1370,6 @@ class c_nvmlGridLicensableFeature_v3_t(_PrintableStructure): ('featureEnabled', c_uint), ] - class c_nvmlGridLicensableFeatures_v3_t(_PrintableStructure): _fields_ = [ ('isGridLicenseSupported', c_int), @@ -1347,7 +1377,6 @@ class c_nvmlGridLicensableFeatures_v3_t(_PrintableStructure): ('gridLicensableFeatures', c_nvmlGridLicensableFeature_v3_t * NVML_GRID_LICENSE_FEATURE_MAX_COUNT), ] - class c_nvmlGridLicensableFeature_v2_t(_PrintableStructure): _fields_ = [ ('featureCode', _nvmlGridLicenseFeatureCode_t), @@ -1356,7 +1385,6 @@ class c_nvmlGridLicensableFeature_v2_t(_PrintableStructure): ('productName', c_char * NVML_GRID_LICENSE_BUFFER_SIZE), ] - class c_nvmlGridLicensableFeatures_v2_t(_PrintableStructure): _fields_ = [ ('isGridLicenseSupported', c_int), @@ -1364,7 +1392,6 @@ class c_nvmlGridLicensableFeatures_v2_t(_PrintableStructure): ('gridLicensableFeatures', c_nvmlGridLicensableFeature_v2_t * NVML_GRID_LICENSE_FEATURE_MAX_COUNT), ] - class c_nvmlGridLicensableFeature_t(_PrintableStructure): _fields_ = [ ('featureCode', _nvmlGridLicenseFeatureCode_t), @@ -1372,7 +1399,6 @@ class c_nvmlGridLicensableFeature_t(_PrintableStructure): ('licenseInfo', c_char * NVML_GRID_LICENSE_BUFFER_SIZE), ] - class c_nvmlGridLicensableFeatures_t(_PrintableStructure): _fields_ = [ ('isGridLicenseSupported', c_int), @@ -1380,58 +1406,79 @@ class c_nvmlGridLicensableFeatures_t(_PrintableStructure): ('gridLicensableFeatures', c_nvmlGridLicensableFeature_t * NVML_GRID_LICENSE_FEATURE_MAX_COUNT), ] - ## Event structures class struct_c_nvmlEventSet_t(Structure): - pass # opaque handle - - + pass # opaque handle c_nvmlEventSet_t = POINTER(struct_c_nvmlEventSet_t) 
-nvmlEventTypeSingleBitEccError = 0x0000000000000001 -nvmlEventTypeDoubleBitEccError = 0x0000000000000002 -nvmlEventTypePState = 0x0000000000000004 -nvmlEventTypeXidCriticalError = 0x0000000000000008 -nvmlEventTypeClock = 0x0000000000000010 -nvmlEventTypePowerSourceChange = 0x0000000000000080 -nvmlEventMigConfigChange = 0x0000000000000100 -nvmlEventTypeNone = 0x0000000000000000 -nvmlEventTypeAll = ( - nvmlEventTypeNone - | nvmlEventTypeSingleBitEccError - | nvmlEventTypeDoubleBitEccError - | nvmlEventTypePState - | nvmlEventTypeClock - | nvmlEventTypePowerSourceChange - | nvmlEventTypeXidCriticalError - | nvmlEventMigConfigChange -) +nvmlEventTypeSingleBitEccError = 0x0000000000000001 +nvmlEventTypeDoubleBitEccError = 0x0000000000000002 +nvmlEventTypePState = 0x0000000000000004 +nvmlEventTypeXidCriticalError = 0x0000000000000008 +nvmlEventTypeClock = 0x0000000000000010 +nvmlEventTypePowerSourceChange = 0x0000000000000080 +nvmlEventMigConfigChange = 0x0000000000000100 +nvmlEventTypeNone = 0x0000000000000000 +nvmlEventTypeAll = ( + nvmlEventTypeNone + | nvmlEventTypeSingleBitEccError + | nvmlEventTypeDoubleBitEccError + | nvmlEventTypePState + | nvmlEventTypeClock + | nvmlEventTypePowerSourceChange + | nvmlEventTypeXidCriticalError + | nvmlEventMigConfigChange + ) -## Clock Throttle Reasons defines -nvmlClocksThrottleReasonGpuIdle = 0x0000000000000001 +## Clock Event Reasons defines +nvmlClocksEventReasonGpuIdle = 0x0000000000000001 +nvmlClocksEventReasonApplicationsClocksSetting = 0x0000000000000002 +nvmlClocksEventReasonUserDefinedClocks = nvmlClocksEventReasonApplicationsClocksSetting # deprecated, use nvmlClocksEventReasonApplicationsClocksSetting +nvmlClocksEventReasonSwPowerCap = 0x0000000000000004 +nvmlClocksEventReasonHwSlowdown = 0x0000000000000008 +nvmlClocksEventReasonSyncBoost = 0x0000000000000010 +nvmlClocksEventReasonSwThermalSlowdown = 0x0000000000000020 +nvmlClocksEventReasonHwThermalSlowdown = 0x0000000000000040 
+nvmlClocksEventReasonHwPowerBrakeSlowdown = 0x0000000000000080 +nvmlClocksEventReasonDisplayClockSetting = 0x0000000000000100 +nvmlClocksEventReasonNone = 0x0000000000000000 +nvmlClocksEventReasonAll = ( + nvmlClocksEventReasonNone | + nvmlClocksEventReasonGpuIdle | + nvmlClocksEventReasonApplicationsClocksSetting | + nvmlClocksEventReasonSwPowerCap | + nvmlClocksEventReasonHwSlowdown | + nvmlClocksEventReasonSyncBoost | + nvmlClocksEventReasonSwThermalSlowdown | + nvmlClocksEventReasonHwThermalSlowdown | + nvmlClocksEventReasonHwPowerBrakeSlowdown | + nvmlClocksEventReasonDisplayClockSetting + ) + +## Following have been deprecated +nvmlClocksThrottleReasonGpuIdle = 0x0000000000000001 nvmlClocksThrottleReasonApplicationsClocksSetting = 0x0000000000000002 -nvmlClocksThrottleReasonUserDefinedClocks = nvmlClocksThrottleReasonApplicationsClocksSetting # deprecated, use nvmlClocksThrottleReasonApplicationsClocksSetting -nvmlClocksThrottleReasonSwPowerCap = 0x0000000000000004 -nvmlClocksThrottleReasonHwSlowdown = 0x0000000000000008 -nvmlClocksThrottleReasonSyncBoost = 0x0000000000000010 -nvmlClocksThrottleReasonSwThermalSlowdown = 0x0000000000000020 -nvmlClocksThrottleReasonHwThermalSlowdown = 0x0000000000000040 +nvmlClocksThrottleReasonUserDefinedClocks = nvmlClocksThrottleReasonApplicationsClocksSetting # deprecated, use nvmlClocksThrottleReasonApplicationsClocksSetting +nvmlClocksThrottleReasonSwPowerCap = 0x0000000000000004 +nvmlClocksThrottleReasonHwSlowdown = 0x0000000000000008 +nvmlClocksThrottleReasonSyncBoost = 0x0000000000000010 +nvmlClocksThrottleReasonSwThermalSlowdown = 0x0000000000000020 +nvmlClocksThrottleReasonHwThermalSlowdown = 0x0000000000000040 nvmlClocksThrottleReasonHwPowerBrakeSlowdown = 0x0000000000000080 -nvmlClocksThrottleReasonDisplayClockSetting = 0x0000000000000100 -nvmlClocksThrottleReasonNone = 0x0000000000000000 -nvmlClocksThrottleReasonAll = ( - nvmlClocksThrottleReasonNone | - nvmlClocksThrottleReasonGpuIdle | - 
nvmlClocksThrottleReasonApplicationsClocksSetting | - nvmlClocksThrottleReasonSwPowerCap | - nvmlClocksThrottleReasonHwSlowdown | - nvmlClocksThrottleReasonSyncBoost | - nvmlClocksThrottleReasonSwThermalSlowdown | - nvmlClocksThrottleReasonHwThermalSlowdown | - nvmlClocksThrottleReasonHwPowerBrakeSlowdown | - nvmlClocksThrottleReasonDisplayClockSetting -) - +nvmlClocksThrottleReasonDisplayClockSetting = 0x0000000000000100 +nvmlClocksThrottleReasonNone = 0x0000000000000000 +nvmlClocksThrottleReasonAll = ( + nvmlClocksThrottleReasonNone | + nvmlClocksThrottleReasonGpuIdle | + nvmlClocksThrottleReasonApplicationsClocksSetting | + nvmlClocksThrottleReasonSwPowerCap | + nvmlClocksThrottleReasonHwSlowdown | + nvmlClocksThrottleReasonSyncBoost | + nvmlClocksThrottleReasonSwThermalSlowdown | + nvmlClocksThrottleReasonHwThermalSlowdown | + nvmlClocksThrottleReasonHwPowerBrakeSlowdown | + nvmlClocksThrottleReasonDisplayClockSetting + ) class c_nvmlEventData_t(_PrintableStructure): _fields_ = [ @@ -1443,7 +1490,6 @@ class c_nvmlEventData_t(_PrintableStructure): ] _fmt_ = {'eventType': "0x%08X"} - class c_nvmlAccountingStats_t(_PrintableStructure): _fields_ = [ ('gpuUtilization', c_uint), @@ -1455,12 +1501,10 @@ class c_nvmlAccountingStats_t(_PrintableStructure): ('reserved', c_uint * 5) ] - class c_nvmlVgpuVersion_t(Structure): _fields_ = [("minVersion", c_uint), ("maxVersion", c_uint) - ] - + ] class c_nvmlVgpuMetadata_t(_PrintableStructure): _fields_ = [("version", c_uint), @@ -1473,8 +1517,7 @@ class c_nvmlVgpuMetadata_t(_PrintableStructure): ("guestVgpuVersion", c_uint), ("opaqueDataSize", c_uint), ("opaqueData", c_char * NVML_VGPU_METADATA_OPAQUE_DATA_SIZE) - ] - + ] class c_nvmlVgpuPgpuMetadata_t(_PrintableStructure): _fields_ = [("version", c_uint), @@ -1485,109 +1528,113 @@ class c_nvmlVgpuPgpuMetadata_t(_PrintableStructure): ("hostSupportedVgpuRange", c_nvmlVgpuVersion_t), ("opaqueDataSize", c_uint), ("opaqueData", c_char * NVML_VGPU_PGPU_METADATA_OPAQUE_DATA_SIZE) - 
] - + ] class c_nvmlVgpuPgpuCompatibility_t(Structure): _fields_ = [("vgpuVmCompatibility", _nvmlVgpuVmCompatibility_t), ("compatibilityLimitCode", _nvmlVgpuPgpuCompatibilityLimitCode_t) - ] - + ] ## vGPU scheduler policy defines -NVML_VGPU_SCHEDULER_POLICY_UNKNOWN = 0 -NVML_VGPU_SCHEDULER_POLICY_BEST_EFFORT = 1 -NVML_VGPU_SCHEDULER_POLICY_EQUAL_SHARE = 2 -NVML_VGPU_SCHEDULER_POLICY_FIXED_SHARE = 3 +NVML_VGPU_SCHEDULER_POLICY_UNKNOWN = 0 +NVML_VGPU_SCHEDULER_POLICY_BEST_EFFORT = 1 +NVML_VGPU_SCHEDULER_POLICY_EQUAL_SHARE = 2 +NVML_VGPU_SCHEDULER_POLICY_FIXED_SHARE = 3 ## Supported vGPU scheduler policy count -NVML_SUPPORTED_VGPU_SCHEDULER_POLICY_COUNT = 3 +NVML_SUPPORTED_VGPU_SCHEDULER_POLICY_COUNT = 3 -NVML_SCHEDULER_SW_MAX_LOG_ENTRIES = 200 +NVML_SCHEDULER_SW_MAX_LOG_ENTRIES = 200 +NVML_VGPU_SCHEDULER_ARR_DEFAULT = 0 +NVML_VGPU_SCHEDULER_ARR_DISABLE = 1 +NVML_VGPU_SCHEDULER_ARR_ENABLE = 2 class c_nvmlVgpuSchedDataWithARR_t(_PrintableStructure): _fields_ = [ - ('avgFactor', c_uint), - ('timeslice', c_uint), + ('avgFactor', c_uint), + ('timeslice', c_uint), ] - class c_nvmlVgpuSchedData_t(_PrintableStructure): _fields_ = [ - ('timeslice', c_uint), + ('timeslice', c_uint), ] - class c_nvmlVgpuSchedulerParams_t(Union): _fields_ = [ ('vgpuSchedDataWithARR', c_nvmlVgpuSchedDataWithARR_t), - ('vgpuSchedData', c_nvmlVgpuSchedData_t), + ('vgpuSchedData', c_nvmlVgpuSchedData_t), ] - class c_nvmlVgpuSchedulerLogEntry_t(_PrintableStructure): _fields_ = [ - ('timestamp', c_ulonglong), - ('timeRunTotal', c_ulonglong), - ('timeRun', c_ulonglong), - ('swRunlistId', c_uint), - ('targetTimeSlice', c_ulonglong), - ('cumulativePreemptionTime', c_ulonglong), + ('timestamp', c_ulonglong), + ('timeRunTotal', c_ulonglong), + ('timeRun', c_ulonglong), + ('swRunlistId', c_uint), + ('targetTimeSlice', c_ulonglong), + ('cumulativePreemptionTime', c_ulonglong), ] - class c_nvmlVgpuSchedulerLog_t(_PrintableStructure): _fields_ = [ - ('engineId', c_uint), + ('engineId', c_uint), 
('schedulerPolicy', c_uint), - ('isEnabledARR', c_uint), + ('arrMode', c_uint), ('schedulerParams', c_nvmlVgpuSchedulerParams_t), - ('entriesCount', c_uint), - ('logEntries', c_nvmlVgpuSchedulerLogEntry_t * NVML_SCHEDULER_SW_MAX_LOG_ENTRIES), + ('entriesCount', c_uint), + ('logEntries', c_nvmlVgpuSchedulerLogEntry_t * NVML_SCHEDULER_SW_MAX_LOG_ENTRIES), ] - class c_nvmlVgpuSchedulerGetState_t(_PrintableStructure): _fields_ = [ ('schedulerPolicy', c_uint), - ('isEnabledARR', c_uint), + ('arrMode', c_uint), ('schedulerParams', c_nvmlVgpuSchedulerParams_t), ] - class c_nvmlVgpuSchedSetDataWithARR_t(_PrintableStructure): _fields_ = [ - ('avgFactor', c_uint), - ('frequency', c_uint), + ('avgFactor', c_uint), + ('frequency', c_uint), ] - class c_nvmlVgpuSchedSetData_t(_PrintableStructure): _fields_ = [ - ('timeslice', c_uint), + ('timeslice', c_uint), ] +class c_nvmlVgpuSchedulerSetParams_t(Union): + _fields_ = [ + ('vgpuSchedDataWithARR', c_nvmlVgpuSchedSetDataWithARR_t), + ('vgpuSchedData', c_nvmlVgpuSchedSetData_t), + ] + +class c_nvmlVgpuSchedulerSetState_t(_PrintableStructure): + _fields_ = [ + ('schedulerPolicy', c_uint), + ('enableARRMode', c_uint), + ('schedulerParams', c_nvmlVgpuSchedulerSetParams_t), + ] class c_nvmlVgpuSchedulerCapabilities_t(_PrintableStructure): _fields_ = [ ('supportedSchedulers', c_uint * NVML_SUPPORTED_VGPU_SCHEDULER_POLICY_COUNT), - ('maxTimeslice', c_uint), - ('minTimeslice', c_uint), - ('isArrModeSupported', c_uint), - ('maxFrequencyForARR', c_uint), - ('minFrequencyForARR', c_uint), - ('maxAvgFactorForARR', c_uint), - ('minAvgFactorForARR', c_uint), + ('maxTimeslice', c_uint), + ('minTimeslice', c_uint), + ('isArrModeSupported', c_uint), + ('maxFrequencyForARR', c_uint), + ('minFrequencyForARR', c_uint), + ('maxAvgFactorForARR', c_uint), + ('minAvgFactorForARR', c_uint), ] - class c_nvmlFBCStats_t(Structure): _fields_ = [("sessionsCount", c_uint), ("averageFPS", c_uint), ("averageLatency", c_uint) - ] - + ] class 
c_nvmlFBCSession_t(_PrintableStructure): _fields_ = [ @@ -1605,28 +1652,25 @@ class c_nvmlFBCSession_t(_PrintableStructure): ('averageLatency', c_uint), ] - NVML_DEVICE_MIG_DISABLE = 0x0 -NVML_DEVICE_MIG_ENABLE = 0x1 +NVML_DEVICE_MIG_ENABLE = 0x1 -NVML_GPU_INSTANCE_PROFILE_1_SLICE = 0x0 -NVML_GPU_INSTANCE_PROFILE_2_SLICE = 0x1 -NVML_GPU_INSTANCE_PROFILE_3_SLICE = 0x2 -NVML_GPU_INSTANCE_PROFILE_4_SLICE = 0x3 -NVML_GPU_INSTANCE_PROFILE_7_SLICE = 0x4 -NVML_GPU_INSTANCE_PROFILE_8_SLICE = 0x5 -NVML_GPU_INSTANCE_PROFILE_6_SLICE = 0x6 +NVML_GPU_INSTANCE_PROFILE_1_SLICE = 0x0 +NVML_GPU_INSTANCE_PROFILE_2_SLICE = 0x1 +NVML_GPU_INSTANCE_PROFILE_3_SLICE = 0x2 +NVML_GPU_INSTANCE_PROFILE_4_SLICE = 0x3 +NVML_GPU_INSTANCE_PROFILE_7_SLICE = 0x4 +NVML_GPU_INSTANCE_PROFILE_8_SLICE = 0x5 +NVML_GPU_INSTANCE_PROFILE_6_SLICE = 0x6 NVML_GPU_INSTANCE_PROFILE_1_SLICE_REV1 = 0x7 NVML_GPU_INSTANCE_PROFILE_2_SLICE_REV1 = 0x8 NVML_GPU_INSTANCE_PROFILE_1_SLICE_REV2 = 0x9 -NVML_GPU_INSTANCE_PROFILE_COUNT = 0xA - +NVML_GPU_INSTANCE_PROFILE_COUNT = 0xA class c_nvmlGpuInstancePlacement_t(Structure): _fields_ = [("start", c_uint), ("size", c_uint) - ] - + ] class c_nvmlGpuInstanceProfileInfo_t(Structure): _fields_ = [("id", c_uint), @@ -1640,12 +1684,10 @@ class c_nvmlGpuInstanceProfileInfo_t(Structure): ("jpegCount", c_uint), ("ofaCount", c_uint), ("memorySizeMB", c_ulonglong), - ] - + ] nvmlGpuInstanceProfileInfo_v2 = 0x02000098 - class c_nvmlGpuInstanceProfileInfo_v2_t(_PrintableStructure): _fields_ = [("version", c_uint), ("id", c_uint), @@ -1660,45 +1702,39 @@ class c_nvmlGpuInstanceProfileInfo_v2_t(_PrintableStructure): ("ofaCount", c_uint), ("memorySizeMB", c_ulonglong), ("name", c_char * NVML_DEVICE_NAME_V2_BUFFER_SIZE) - ] - + ] + def __init__(self): super(c_nvmlGpuInstanceProfileInfo_v2_t, self).__init__(version=nvmlGpuInstanceProfileInfo_v2) - class c_nvmlGpuInstanceInfo_t(Structure): _fields_ = [("device", c_nvmlDevice_t), ("id", c_uint), ("profileId", c_uint), ("placement", 
c_nvmlGpuInstancePlacement_t) - ] - + ] class struct_c_nvmlGpuInstance_t(Structure): - pass # opaque handle - - + pass # opaque handle c_nvmlGpuInstance_t = POINTER(struct_c_nvmlGpuInstance_t) -NVML_COMPUTE_INSTANCE_PROFILE_1_SLICE = 0x0 -NVML_COMPUTE_INSTANCE_PROFILE_2_SLICE = 0x1 -NVML_COMPUTE_INSTANCE_PROFILE_3_SLICE = 0x2 -NVML_COMPUTE_INSTANCE_PROFILE_4_SLICE = 0x3 -NVML_COMPUTE_INSTANCE_PROFILE_7_SLICE = 0x4 -NVML_COMPUTE_INSTANCE_PROFILE_8_SLICE = 0x5 -NVML_COMPUTE_INSTANCE_PROFILE_6_SLICE = 0x6 +NVML_COMPUTE_INSTANCE_PROFILE_1_SLICE = 0x0 +NVML_COMPUTE_INSTANCE_PROFILE_2_SLICE = 0x1 +NVML_COMPUTE_INSTANCE_PROFILE_3_SLICE = 0x2 +NVML_COMPUTE_INSTANCE_PROFILE_4_SLICE = 0x3 +NVML_COMPUTE_INSTANCE_PROFILE_7_SLICE = 0x4 +NVML_COMPUTE_INSTANCE_PROFILE_8_SLICE = 0x5 +NVML_COMPUTE_INSTANCE_PROFILE_6_SLICE = 0x6 NVML_COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1 = 0x7 -NVML_COMPUTE_INSTANCE_PROFILE_COUNT = 0x8 +NVML_COMPUTE_INSTANCE_PROFILE_COUNT = 0x8 NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED = 0x0 NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT = 0x1 - class c_nvmlComputeInstancePlacement_t(Structure): _fields_ = [("start", c_uint), ("size", c_uint) - ] - + ] class c_nvmlComputeInstanceProfileInfo_t(Structure): _fields_ = [("id", c_uint), @@ -1710,12 +1746,10 @@ class c_nvmlComputeInstanceProfileInfo_t(Structure): ("sharedEncoderCount", c_uint), ("sharedJpegCount", c_uint), ("sharedOfaCount", c_uint) - ] - + ] nvmlComputeInstanceProfileInfo_v2 = 0x02000088 - class c_nvmlComputeInstanceProfileInfo_v2_t(_PrintableStructure): _fields_ = [("version", c_uint), ("id", c_uint), @@ -1728,73 +1762,65 @@ class c_nvmlComputeInstanceProfileInfo_v2_t(_PrintableStructure): ("sharedJpegCount", c_uint), ("sharedOfaCount", c_uint), ("name", c_char * NVML_DEVICE_NAME_V2_BUFFER_SIZE) - ] + ] def __init__(self): super(c_nvmlComputeInstanceProfileInfo_v2_t, self).__init__(version=nvmlComputeInstanceProfileInfo_v2) - class c_nvmlComputeInstanceInfo_t(Structure): _fields_ = [("device", 
c_nvmlDevice_t), ("gpuInstance", c_nvmlGpuInstance_t), ("id", c_uint), ("profileId", c_uint), ("placement", c_nvmlComputeInstancePlacement_t) - ] - + ] NVML_MAX_GPU_UTILIZATIONS = 8 -NVML_GPU_UTILIZATION_DOMAIN_GPU = 0 -NVML_GPU_UTILIZATION_DOMAIN_FB = 1 -NVML_GPU_UTILIZATION_DOMAIN_VID = 2 -NVML_GPU_UTILIZATION_DOMAIN_BUS = 3 - - +NVML_GPU_UTILIZATION_DOMAIN_GPU = 0 +NVML_GPU_UTILIZATION_DOMAIN_FB = 1 +NVML_GPU_UTILIZATION_DOMAIN_VID = 2 +NVML_GPU_UTILIZATION_DOMAIN_BUS = 3 class c_nvmlGpuDynamicPstatesUtilization_t(Structure): _fields_ = [("bIsPresent", c_uint, 1), ("percentage", c_uint), ("incThreshold", c_uint), ("decThreshold", c_uint)] - - class c_nvmlGpuDynamicPstatesInfo_t(Structure): _fields_ = [("flags", c_uint), ("utilization", c_nvmlGpuDynamicPstatesUtilization_t * NVML_MAX_GPU_UTILIZATIONS)] - NVML_MAX_THERMAL_SENSORS_PER_GPU = 3 -NVML_THERMAL_TARGET_NONE = 0 -NVML_THERMAL_TARGET_GPU = 1 -NVML_THERMAL_TARGET_MEMORY = 2 -NVML_THERMAL_TARGET_POWER_SUPPLY = 4 -NVML_THERMAL_TARGET_BOARD = 8 -NVML_THERMAL_TARGET_VCD_BOARD = 9 -NVML_THERMAL_TARGET_VCD_INLET = 10 -NVML_THERMAL_TARGET_VCD_OUTLET = 11 -NVML_THERMAL_TARGET_ALL = 15 -NVML_THERMAL_TARGET_UNKNOWN = -1 +NVML_THERMAL_TARGET_NONE = 0 +NVML_THERMAL_TARGET_GPU = 1 +NVML_THERMAL_TARGET_MEMORY = 2 +NVML_THERMAL_TARGET_POWER_SUPPLY = 4 +NVML_THERMAL_TARGET_BOARD = 8 +NVML_THERMAL_TARGET_VCD_BOARD = 9 +NVML_THERMAL_TARGET_VCD_INLET = 10 +NVML_THERMAL_TARGET_VCD_OUTLET = 11 +NVML_THERMAL_TARGET_ALL = 15 +NVML_THERMAL_TARGET_UNKNOWN = -1 -NVML_THERMAL_CONTROLLER_NONE = 0 -NVML_THERMAL_CONTROLLER_GPU_INTERNAL = 1 -NVML_THERMAL_CONTROLLER_ADM1032 = 2 -NVML_THERMAL_CONTROLLER_ADT7461 = 3 -NVML_THERMAL_CONTROLLER_MAX6649 = 4 -NVML_THERMAL_CONTROLLER_MAX1617 = 5 -NVML_THERMAL_CONTROLLER_LM99 = 6 -NVML_THERMAL_CONTROLLER_LM89 = 7 -NVML_THERMAL_CONTROLLER_LM64 = 8 -NVML_THERMAL_CONTROLLER_G781 = 9 -NVML_THERMAL_CONTROLLER_ADT7473 = 10 -NVML_THERMAL_CONTROLLER_SBMAX6649 = 11 -NVML_THERMAL_CONTROLLER_VBIOSEVT = 12 
-NVML_THERMAL_CONTROLLER_OS = 13 +NVML_THERMAL_CONTROLLER_NONE = 0 +NVML_THERMAL_CONTROLLER_GPU_INTERNAL = 1 +NVML_THERMAL_CONTROLLER_ADM1032 = 2 +NVML_THERMAL_CONTROLLER_ADT7461 = 3 +NVML_THERMAL_CONTROLLER_MAX6649 = 4 +NVML_THERMAL_CONTROLLER_MAX1617 = 5 +NVML_THERMAL_CONTROLLER_LM99 = 6 +NVML_THERMAL_CONTROLLER_LM89 = 7 +NVML_THERMAL_CONTROLLER_LM64 = 8 +NVML_THERMAL_CONTROLLER_G781 = 9 +NVML_THERMAL_CONTROLLER_ADT7473 = 10 +NVML_THERMAL_CONTROLLER_SBMAX6649 = 11 +NVML_THERMAL_CONTROLLER_VBIOSEVT = 12 +NVML_THERMAL_CONTROLLER_OS = 13 NVML_THERMAL_CONTROLLER_NVSYSCON_CANOAS = 14 -NVML_THERMAL_CONTROLLER_NVSYSCON_E551 = 15 -NVML_THERMAL_CONTROLLER_MAX6649R = 16 -NVML_THERMAL_CONTROLLER_ADT7473S = 17 -NVML_THERMAL_CONTROLLER_UNKNOWN = -1 - +NVML_THERMAL_CONTROLLER_NVSYSCON_E551 = 15 +NVML_THERMAL_CONTROLLER_MAX6649R = 16 +NVML_THERMAL_CONTROLLER_ADT7473S = 17 +NVML_THERMAL_CONTROLLER_UNKNOWN = -1 class c_nvmlGpuThermalSensor_t(Structure): _fields_ = [("controller", c_int), @@ -1802,20 +1828,14 @@ class c_nvmlGpuThermalSensor_t(Structure): ("defaultMaxTemp", c_int), ("currentTemp", c_int), ("target", c_int)] - - class c_nvmlGpuThermalSettings_t(Structure): _fields_ = [("count", c_uint), ("sensor", c_nvmlGpuThermalSensor_t * NVML_MAX_THERMAL_SENSORS_PER_GPU)] - class struct_c_nvmlComputeInstance_t(Structure): - pass # opaque handle - - + pass # opaque handle c_nvmlComputeInstance_t = POINTER(struct_c_nvmlComputeInstance_t) - class c_nvmlDeviceAttributes(Structure): _fields_ = [("multiprocessorCount", c_uint), ("sharedCopyEngineCount", c_uint), @@ -1826,8 +1846,7 @@ class c_nvmlDeviceAttributes(Structure): ("gpuInstanceSliceCount", c_uint), ("computeInstanceSliceCount", c_uint), ("memorySizeMB", c_ulonglong), - ] - + ] class c_nvmlRowRemapperHistogramValues(Structure): _fields_ = [("max", c_uint), @@ -1835,7 +1854,47 @@ class c_nvmlRowRemapperHistogramValues(Structure): ("partial", c_uint), ("low", c_uint), ("none", c_uint) - ] + ] + +NVML_GPU_CERT_CHAIN_SIZE = 0x1000 
+NVML_GPU_ATTESTATION_CERT_CHAIN_SIZE = 0x1400 +NVML_CC_GPU_CEC_NONCE_SIZE = 0x20 +NVML_CC_GPU_ATTESTATION_REPORT_SIZE = 0x2000 +NVML_CC_GPU_CEC_ATTESTATION_REPORT_SIZE = 0x1000 +NVML_CC_CEC_ATTESTATION_REPORT_NOT_PRESENT = 0 +NVML_CC_CEC_ATTESTATION_REPORT_PRESENT = 1 + +class c_nvmlConfComputeSystemState_t(Structure): + _fields_ = [('environment', c_uint), + ('ccFeature', c_uint), + ('devToolsMode', c_uint), + ] + +class c_nvmlConfComputeSystemCaps_t(Structure): + _fields_ = [('cpuCaps', c_uint), + ('gpusCaps', c_uint), + ] + +class c_nvmlConfComputeMemSizeInfo_t(Structure): + _fields_ = [('protectedMemSizeKib', c_ulonglong), + ('unprotectedMemSizeKib', c_ulonglong), + ] + +class c_nvmlConfComputeGpuCertificate_t(Structure): + _fields_ = [('certChainSize', c_uint), + ('attestationCertChainSize', c_uint), + ('certChain', c_uint8 * NVML_GPU_CERT_CHAIN_SIZE), + ('attestationCertChain', c_uint8 * NVML_GPU_ATTESTATION_CERT_CHAIN_SIZE), + ] + +class c_nvmlConfComputeGpuAttestationReport_t(Structure): + _fields_ = [('isCecAttestationReportPresent', c_uint), + ('attestationReportSize', c_uint), + ('cecAttestationReportSize', c_uint), + ('nonce', c_uint8 * NVML_CC_GPU_CEC_NONCE_SIZE), + ('attestationReport', c_uint8 * NVML_CC_GPU_ATTESTATION_REPORT_SIZE), + ('cecAttestationReport', c_uint8 * NVML_CC_GPU_CEC_ATTESTATION_REPORT_SIZE), + ] ## string/bytes conversion for ease of use @@ -1848,7 +1907,6 @@ def convertStrBytes(func): Returned from function: b'returned string' Returned to caller: 'returned string' ''' - @wraps(func) def wrapper(*args, **kwargs): # encoding a str returns bytes in python 2 and 3 @@ -1867,7 +1925,6 @@ def convertStrBytes(func): return wrapper return func - ## C function wrappers ## def nvmlInitWithFlags(flags): _LoadNvmlLibrary() @@ -1886,25 +1943,23 @@ def nvmlInitWithFlags(flags): libLoadLock.release() return None - def nvmlInit(): nvmlInitWithFlags(0) return None - def _LoadNvmlLibrary(): ''' Load the library if it isn't loaded already ''' global 
nvmlLib - if nvmlLib is None: + if (nvmlLib == None): # lock to ensure only one caller loads the library libLoadLock.acquire() try: # ensure the library still isn't loaded - if nvmlLib is None: + if (nvmlLib == None): try: if (sys.platform[:3] == "win"): # cdecl calling convention @@ -1914,20 +1969,18 @@ def _LoadNvmlLibrary(): except OSError as ose: # If nvml.dll is not found in System32, it should be in ProgramFiles # load nvml.dll from %ProgramFiles%/NVIDIA Corporation/NVSMI/nvml.dll - nvmlLib = CDLL(os.path.join(os.getenv("ProgramFiles", "C:/Program Files"), - "NVIDIA Corporation/NVSMI/nvml.dll")) + nvmlLib = CDLL(os.path.join(os.getenv("ProgramFiles", "C:/Program Files"), "NVIDIA Corporation/NVSMI/nvml.dll")) else: # assume linux nvmlLib = CDLL("libnvidia-ml.so.1") except OSError as ose: _nvmlCheckReturn(NVML_ERROR_LIBRARY_NOT_FOUND) - if nvmlLib is None: + if (nvmlLib == None): _nvmlCheckReturn(NVML_ERROR_LIBRARY_NOT_FOUND) finally: # lock is always freed libLoadLock.release() - def nvmlShutdown(): # # Leave the library loaded, but shutdown the interface @@ -1944,16 +1997,14 @@ def nvmlShutdown(): libLoadLock.release() return None - # Added in 2.285 @convertStrBytes def nvmlErrorString(result): fn = _nvmlGetFunctionPointer("nvmlErrorString") - fn.restype = c_char_p # otherwise return is an int + fn.restype = c_char_p # otherwise return is an int ret = fn(result) return ret - # Added in 2.285 @convertStrBytes def nvmlSystemGetNVMLVersion(): @@ -1963,7 +2014,6 @@ def nvmlSystemGetNVMLVersion(): _nvmlCheckReturn(ret) return c_version.value - def nvmlSystemGetCudaDriverVersion(): c_cuda_version = c_int() fn = _nvmlGetFunctionPointer("nvmlSystemGetCudaDriverVersion") @@ -1971,7 +2021,6 @@ def nvmlSystemGetCudaDriverVersion(): _nvmlCheckReturn(ret) return c_cuda_version.value - def nvmlSystemGetCudaDriverVersion_v2(): c_cuda_version = c_int() fn = _nvmlGetFunctionPointer("nvmlSystemGetCudaDriverVersion_v2") @@ -1979,7 +2028,6 @@ def 
nvmlSystemGetCudaDriverVersion_v2(): _nvmlCheckReturn(ret) return c_cuda_version.value - # Added in 2.285 @convertStrBytes def nvmlSystemGetProcessName(pid): @@ -1989,7 +2037,6 @@ def nvmlSystemGetProcessName(pid): _nvmlCheckReturn(ret) return c_name.value - @convertStrBytes def nvmlSystemGetDriverVersion(): c_version = create_string_buffer(NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE) @@ -1998,7 +2045,6 @@ def nvmlSystemGetDriverVersion(): _nvmlCheckReturn(ret) return c_version.value - # Added in 2.285 def nvmlSystemGetHicVersion(): c_count = c_uint(0) @@ -2010,7 +2056,7 @@ def nvmlSystemGetHicVersion(): # this should only fail with insufficient size if ((ret != NVML_SUCCESS) and - (ret != NVML_ERROR_INSUFFICIENT_SIZE)): + (ret != NVML_ERROR_INSUFFICIENT_SIZE)): raise NVMLError(ret) # If there are no hics @@ -2023,7 +2069,6 @@ def nvmlSystemGetHicVersion(): _nvmlCheckReturn(ret) return hics - ## Unit get functions def nvmlUnitGetCount(): c_count = c_uint() @@ -2032,7 +2077,6 @@ def nvmlUnitGetCount(): _nvmlCheckReturn(ret) return c_count.value - def nvmlUnitGetHandleByIndex(index): c_index = c_uint(index) unit = c_nvmlUnit_t() @@ -2041,7 +2085,6 @@ def nvmlUnitGetHandleByIndex(index): _nvmlCheckReturn(ret) return unit - def nvmlUnitGetUnitInfo(unit): c_info = c_nvmlUnitInfo_t() fn = _nvmlGetFunctionPointer("nvmlUnitGetUnitInfo") @@ -2049,15 +2092,13 @@ def nvmlUnitGetUnitInfo(unit): _nvmlCheckReturn(ret) return c_info - def nvmlUnitGetLedState(unit): - c_state = c_nvmlLedState_t() + c_state = c_nvmlLedState_t() fn = _nvmlGetFunctionPointer("nvmlUnitGetLedState") ret = fn(unit, byref(c_state)) _nvmlCheckReturn(ret) return c_state - def nvmlUnitGetPsuInfo(unit): c_info = c_nvmlPSUInfo_t() fn = _nvmlGetFunctionPointer("nvmlUnitGetPsuInfo") @@ -2065,7 +2106,6 @@ def nvmlUnitGetPsuInfo(unit): _nvmlCheckReturn(ret) return c_info - def nvmlUnitGetTemperature(unit, type): c_temp = c_uint() fn = _nvmlGetFunctionPointer("nvmlUnitGetTemperature") @@ -2073,7 +2113,6 @@ def 
nvmlUnitGetTemperature(unit, type): _nvmlCheckReturn(ret) return c_temp.value - def nvmlUnitGetFanSpeedInfo(unit): c_speeds = c_nvmlUnitFanSpeeds_t() fn = _nvmlGetFunctionPointer("nvmlUnitGetFanSpeedInfo") @@ -2081,7 +2120,6 @@ def nvmlUnitGetFanSpeedInfo(unit): _nvmlCheckReturn(ret) return c_speeds - # added to API def nvmlUnitGetDeviceCount(unit): c_count = c_uint(0) @@ -2093,7 +2131,6 @@ def nvmlUnitGetDeviceCount(unit): _nvmlCheckReturn(ret) return c_count.value - def nvmlUnitGetDevices(unit): c_count = c_uint(nvmlUnitGetDeviceCount(unit)) device_array = c_nvmlDevice_t * c_count.value @@ -2103,7 +2140,6 @@ def nvmlUnitGetDevices(unit): _nvmlCheckReturn(ret) return c_devices - ## Device get functions def nvmlDeviceGetCount(): c_count = c_uint() @@ -2112,7 +2148,6 @@ def nvmlDeviceGetCount(): _nvmlCheckReturn(ret) return c_count.value - def nvmlDeviceGetHandleByIndex(index): c_index = c_uint(index) device = c_nvmlDevice_t() @@ -2121,7 +2156,6 @@ def nvmlDeviceGetHandleByIndex(index): _nvmlCheckReturn(ret) return device - @convertStrBytes def nvmlDeviceGetHandleBySerial(serial): c_serial = c_char_p(serial) @@ -2131,7 +2165,6 @@ def nvmlDeviceGetHandleBySerial(serial): _nvmlCheckReturn(ret) return device - @convertStrBytes def nvmlDeviceGetHandleByUUID(uuid): c_uuid = c_char_p(uuid) @@ -2141,7 +2174,6 @@ def nvmlDeviceGetHandleByUUID(uuid): _nvmlCheckReturn(ret) return device - @convertStrBytes def nvmlDeviceGetHandleByPciBusId(pciBusId): c_busId = c_char_p(pciBusId) @@ -2151,7 +2183,6 @@ def nvmlDeviceGetHandleByPciBusId(pciBusId): _nvmlCheckReturn(ret) return device - @convertStrBytes def nvmlDeviceGetName(handle): c_name = create_string_buffer(NVML_DEVICE_NAME_V2_BUFFER_SIZE) @@ -2160,23 +2191,20 @@ def nvmlDeviceGetName(handle): _nvmlCheckReturn(ret) return c_name.value - def nvmlDeviceGetBoardId(handle): - c_id = c_uint() + c_id = c_uint(); fn = _nvmlGetFunctionPointer("nvmlDeviceGetBoardId") ret = fn(handle, byref(c_id)) _nvmlCheckReturn(ret) return 
c_id.value - def nvmlDeviceGetMultiGpuBoard(handle): - c_multiGpu = c_uint() + c_multiGpu = c_uint(); fn = _nvmlGetFunctionPointer("nvmlDeviceGetMultiGpuBoard") ret = fn(handle, byref(c_multiGpu)) _nvmlCheckReturn(ret) return c_multiGpu.value - def nvmlDeviceGetBrand(handle): c_type = _nvmlBrandType_t() fn = _nvmlGetFunctionPointer("nvmlDeviceGetBrand") @@ -2184,7 +2212,6 @@ def nvmlDeviceGetBrand(handle): _nvmlCheckReturn(ret) return c_type.value - @convertStrBytes def nvmlDeviceGetBoardPartNumber(handle): c_part_number = create_string_buffer(NVML_DEVICE_PART_NUMBER_BUFFER_SIZE) @@ -2193,7 +2220,6 @@ def nvmlDeviceGetBoardPartNumber(handle): _nvmlCheckReturn(ret) return c_part_number.value - @convertStrBytes def nvmlDeviceGetSerial(handle): c_serial = create_string_buffer(NVML_DEVICE_SERIAL_BUFFER_SIZE) @@ -2202,6 +2228,10 @@ def nvmlDeviceGetSerial(handle): _nvmlCheckReturn(ret) return c_serial.value +def nvmlDeviceGetModuleId(handle, moduleId): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetModuleId") + ret = fn(handle, moduleId) + return ret def nvmlDeviceGetMemoryAffinity(handle, nodeSetSize, scope): affinity_array = c_ulonglong * nodeSetSize @@ -2211,7 +2241,6 @@ def nvmlDeviceGetMemoryAffinity(handle, nodeSetSize, scope): _nvmlCheckReturn(ret) return c_affinity - def nvmlDeviceGetCpuAffinityWithinScope(handle, cpuSetSize, scope): affinity_array = c_ulonglong * cpuSetSize c_affinity = affinity_array() @@ -2220,7 +2249,6 @@ def nvmlDeviceGetCpuAffinityWithinScope(handle, cpuSetSize, scope): _nvmlCheckReturn(ret) return c_affinity - def nvmlDeviceGetCpuAffinity(handle, cpuSetSize): affinity_array = c_ulonglong * cpuSetSize c_affinity = affinity_array() @@ -2229,21 +2257,18 @@ def nvmlDeviceGetCpuAffinity(handle, cpuSetSize): _nvmlCheckReturn(ret) return c_affinity - def nvmlDeviceSetCpuAffinity(handle): fn = _nvmlGetFunctionPointer("nvmlDeviceSetCpuAffinity") ret = fn(handle) _nvmlCheckReturn(ret) return None - def nvmlDeviceClearCpuAffinity(handle): fn = 
_nvmlGetFunctionPointer("nvmlDeviceClearCpuAffinity") ret = fn(handle) _nvmlCheckReturn(ret) return None - def nvmlDeviceGetMinorNumber(handle): c_minor_number = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceGetMinorNumber") @@ -2251,7 +2276,6 @@ def nvmlDeviceGetMinorNumber(handle): _nvmlCheckReturn(ret) return c_minor_number.value - @convertStrBytes def nvmlDeviceGetUUID(handle): c_uuid = create_string_buffer(NVML_DEVICE_UUID_V2_BUFFER_SIZE) @@ -2260,17 +2284,15 @@ def nvmlDeviceGetUUID(handle): _nvmlCheckReturn(ret) return c_uuid.value - @convertStrBytes def nvmlDeviceGetInforomVersion(handle, infoRomObject): c_version = create_string_buffer(NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE) fn = _nvmlGetFunctionPointer("nvmlDeviceGetInforomVersion") ret = fn(handle, _nvmlInforomObject_t(infoRomObject), - c_version, c_uint(NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE)) + c_version, c_uint(NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE)) _nvmlCheckReturn(ret) return c_version.value - # Added in 4.304 @convertStrBytes def nvmlDeviceGetInforomImageVersion(handle): @@ -2280,7 +2302,6 @@ def nvmlDeviceGetInforomImageVersion(handle): _nvmlCheckReturn(ret) return c_version.value - # Added in 4.304 def nvmlDeviceGetInforomConfigurationChecksum(handle): c_checksum = c_uint() @@ -2289,7 +2310,6 @@ def nvmlDeviceGetInforomConfigurationChecksum(handle): _nvmlCheckReturn(ret) return c_checksum.value - # Added in 4.304 def nvmlDeviceValidateInforom(handle): fn = _nvmlGetFunctionPointer("nvmlDeviceValidateInforom") @@ -2297,6 +2317,13 @@ def nvmlDeviceValidateInforom(handle): _nvmlCheckReturn(ret) return None +def nvmlDeviceGetLastBBXFlushTime(handle): + c_timestamp = c_ulonglong() + c_durationUs = c_ulong() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetLastBBXFlushTime") + ret = fn(handle, byref(c_timestamp), byref(c_durationUs)) + _nvmlCheckReturn(ret) + return [c_timestamp.value, c_durationUs.value] def nvmlDeviceGetDisplayMode(handle): c_mode = _nvmlEnableState_t() @@ -2305,7 +2332,6 @@ def 
nvmlDeviceGetDisplayMode(handle): _nvmlCheckReturn(ret) return c_mode.value - def nvmlDeviceGetDisplayActive(handle): c_mode = _nvmlEnableState_t() fn = _nvmlGetFunctionPointer("nvmlDeviceGetDisplayActive") @@ -2321,7 +2347,6 @@ def nvmlDeviceGetPersistenceMode(handle): _nvmlCheckReturn(ret) return c_state.value - def nvmlDeviceGetPciInfo_v3(handle): c_info = nvmlPciInfo_t() fn = _nvmlGetFunctionPointer("nvmlDeviceGetPciInfo_v3") @@ -2329,11 +2354,9 @@ def nvmlDeviceGetPciInfo_v3(handle): _nvmlCheckReturn(ret) return c_info - def nvmlDeviceGetPciInfo(handle): return nvmlDeviceGetPciInfo_v3(handle) - def nvmlDeviceGetClockInfo(handle, type): c_clock = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceGetClockInfo") @@ -2341,7 +2364,6 @@ def nvmlDeviceGetClockInfo(handle, type): _nvmlCheckReturn(ret) return c_clock.value - # Added in 2.285 def nvmlDeviceGetMaxClockInfo(handle, type): c_clock = c_uint() @@ -2350,7 +2372,6 @@ def nvmlDeviceGetMaxClockInfo(handle, type): _nvmlCheckReturn(ret) return c_clock.value - # Added in 4.304 def nvmlDeviceGetApplicationsClock(handle, type): c_clock = c_uint() @@ -2359,7 +2380,6 @@ def nvmlDeviceGetApplicationsClock(handle, type): _nvmlCheckReturn(ret) return c_clock.value - def nvmlDeviceGetMaxCustomerBoostClock(handle, type): c_clock = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceGetMaxCustomerBoostClock") @@ -2367,7 +2387,6 @@ def nvmlDeviceGetMaxCustomerBoostClock(handle, type): _nvmlCheckReturn(ret) return c_clock.value - def nvmlDeviceGetClock(handle, type, id): c_clock = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceGetClock") @@ -2375,7 +2394,6 @@ def nvmlDeviceGetClock(handle, type, id): _nvmlCheckReturn(ret) return c_clock.value - # Added in 5.319 def nvmlDeviceGetDefaultApplicationsClock(handle, type): c_clock = c_uint() @@ -2384,7 +2402,6 @@ def nvmlDeviceGetDefaultApplicationsClock(handle, type): _nvmlCheckReturn(ret) return c_clock.value - # Added in 4.304 def nvmlDeviceGetSupportedMemoryClocks(handle): # first 
call to get the size @@ -2413,7 +2430,6 @@ def nvmlDeviceGetSupportedMemoryClocks(handle): # error case raise NVMLError(ret) - # Added in 4.304 def nvmlDeviceGetSupportedGraphicsClocks(handle, memoryClockMHz): # first call to get the size @@ -2442,7 +2458,6 @@ def nvmlDeviceGetSupportedGraphicsClocks(handle, memoryClockMHz): # error case raise NVMLError(ret) - def nvmlDeviceGetFanSpeed(handle): c_speed = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceGetFanSpeed") @@ -2450,7 +2465,6 @@ def nvmlDeviceGetFanSpeed(handle): _nvmlCheckReturn(ret) return c_speed.value - def nvmlDeviceGetFanSpeed_v2(handle, fan): c_speed = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceGetFanSpeed_v2") @@ -2458,7 +2472,6 @@ def nvmlDeviceGetFanSpeed_v2(handle, fan): _nvmlCheckReturn(ret) return c_speed.value - def nvmlDeviceGetTargetFanSpeed(handle, fan): c_speed = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceGetTargetFanSpeed") @@ -2466,7 +2479,6 @@ def nvmlDeviceGetTargetFanSpeed(handle, fan): _nvmlCheckReturn(ret) return c_speed.value - def nvmlDeviceGetNumFans(device): c_numFans = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceGetNumFans") @@ -2474,35 +2486,30 @@ def nvmlDeviceGetNumFans(device): _nvmlCheckReturn(ret) return c_numFans.value - def nvmlDeviceSetDefaultFanSpeed_v2(handle, index): - fn = _nvmlGetFunctionPointer("nvmlDeviceSetDefaultFanSpeed_v2") + fn = _nvmlGetFunctionPointer("nvmlDeviceSetDefaultFanSpeed_v2"); ret = fn(handle, index) _nvmlCheckReturn(ret) return ret - def nvmlDeviceGetMinMaxFanSpeed(handle, minSpeed, maxSpeed): fn = _nvmlGetFunctionPointer("nvmlDeviceGetMinMaxFanSpeed") ret = fn(handle, minSpeed, maxSpeed) _nvmlCheckReturn(ret) return ret - def nvmlDeviceGetFanControlPolicy_v2(handle, fan, fanControlPolicy): fn = _nvmlGetFunctionPointer("nvmlDeviceGetFanControlPolicy_v2") ret = fn(handle, fan, fanControlPolicy) _nvmlCheckReturn(ret) return ret - def nvmlDeviceSetFanControlPolicy(handle, fan, fanControlPolicy): fn = 
_nvmlGetFunctionPointer("nvmlDeviceSetFanControlPolicy") ret = fn(handle, fan, _nvmlFanControlPolicy_t(fanControlPolicy)) _nvmlCheckReturn(ret) return ret - def nvmlDeviceGetTemperature(handle, sensor): c_temp = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceGetTemperature") @@ -2510,7 +2517,6 @@ def nvmlDeviceGetTemperature(handle, sensor): _nvmlCheckReturn(ret) return c_temp.value - def nvmlDeviceGetTemperatureThreshold(handle, threshold): c_temp = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceGetTemperatureThreshold") @@ -2518,7 +2524,6 @@ def nvmlDeviceGetTemperatureThreshold(handle, threshold): _nvmlCheckReturn(ret) return c_temp.value - def nvmlDeviceSetTemperatureThreshold(handle, threshold, temp): c_temp = c_uint() c_temp.value = temp @@ -2527,7 +2532,6 @@ def nvmlDeviceSetTemperatureThreshold(handle, threshold, temp): _nvmlCheckReturn(ret) return None - # DEPRECATED use nvmlDeviceGetPerformanceState def nvmlDeviceGetPowerState(handle): c_pstate = _nvmlPstates_t() @@ -2536,7 +2540,6 @@ def nvmlDeviceGetPowerState(handle): _nvmlCheckReturn(ret) return c_pstate.value - def nvmlDeviceGetPerformanceState(handle): c_pstate = _nvmlPstates_t() fn = _nvmlGetFunctionPointer("nvmlDeviceGetPerformanceState") @@ -2544,7 +2547,6 @@ def nvmlDeviceGetPerformanceState(handle): _nvmlCheckReturn(ret) return c_pstate.value - def nvmlDeviceGetPowerManagementMode(handle): c_pcapMode = _nvmlEnableState_t() fn = _nvmlGetFunctionPointer("nvmlDeviceGetPowerManagementMode") @@ -2552,7 +2554,6 @@ def nvmlDeviceGetPowerManagementMode(handle): _nvmlCheckReturn(ret) return c_pcapMode.value - def nvmlDeviceGetPowerManagementLimit(handle): c_limit = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceGetPowerManagementLimit") @@ -2560,7 +2561,6 @@ def nvmlDeviceGetPowerManagementLimit(handle): _nvmlCheckReturn(ret) return c_limit.value - # Added in 4.304 def nvmlDeviceGetPowerManagementLimitConstraints(handle): c_minLimit = c_uint() @@ -2570,7 +2570,6 @@ def 
nvmlDeviceGetPowerManagementLimitConstraints(handle): _nvmlCheckReturn(ret) return [c_minLimit.value, c_maxLimit.value] - # Added in 4.304 def nvmlDeviceGetPowerManagementDefaultLimit(handle): c_limit = c_uint() @@ -2588,7 +2587,6 @@ def nvmlDeviceGetEnforcedPowerLimit(handle): _nvmlCheckReturn(ret) return c_limit.value - def nvmlDeviceGetPowerUsage(handle): c_watts = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceGetPowerUsage") @@ -2596,7 +2594,6 @@ def nvmlDeviceGetPowerUsage(handle): _nvmlCheckReturn(ret) return c_watts.value - def nvmlDeviceGetTotalEnergyConsumption(handle): c_millijoules = c_uint64() fn = _nvmlGetFunctionPointer("nvmlDeviceGetTotalEnergyConsumption") @@ -2604,7 +2601,6 @@ def nvmlDeviceGetTotalEnergyConsumption(handle): _nvmlCheckReturn(ret) return c_millijoules.value - # Added in 4.304 def nvmlDeviceGetGpuOperationMode(handle): c_currState = _nvmlGpuOperationMode_t() @@ -2614,17 +2610,14 @@ def nvmlDeviceGetGpuOperationMode(handle): _nvmlCheckReturn(ret) return [c_currState.value, c_pendingState.value] - # Added in 4.304 def nvmlDeviceGetCurrentGpuOperationMode(handle): return nvmlDeviceGetGpuOperationMode(handle)[0] - # Added in 4.304 def nvmlDeviceGetPendingGpuOperationMode(handle): return nvmlDeviceGetGpuOperationMode(handle)[1] - def nvmlDeviceGetMemoryInfo(handle, version=None): if not version: c_memory = c_nvmlMemory_t() @@ -2637,7 +2630,6 @@ def nvmlDeviceGetMemoryInfo(handle, version=None): _nvmlCheckReturn(ret) return c_memory - def nvmlDeviceGetBAR1MemoryInfo(handle): c_bar1_memory = c_nvmlBAR1Memory_t() fn = _nvmlGetFunctionPointer("nvmlDeviceGetBAR1MemoryInfo") @@ -2645,7 +2637,6 @@ def nvmlDeviceGetBAR1MemoryInfo(handle): _nvmlCheckReturn(ret) return c_bar1_memory - def nvmlDeviceGetComputeMode(handle): c_mode = _nvmlComputeMode_t() fn = _nvmlGetFunctionPointer("nvmlDeviceGetComputeMode") @@ -2653,7 +2644,6 @@ def nvmlDeviceGetComputeMode(handle): _nvmlCheckReturn(ret) return c_mode.value - def 
nvmlDeviceGetCudaComputeCapability(handle): c_major = c_int() c_minor = c_int() @@ -2662,7 +2652,6 @@ def nvmlDeviceGetCudaComputeCapability(handle): _nvmlCheckReturn(ret) return (c_major.value, c_minor.value) - def nvmlDeviceGetEccMode(handle): c_currState = _nvmlEnableState_t() c_pendingState = _nvmlEnableState_t() @@ -2671,17 +2660,14 @@ def nvmlDeviceGetEccMode(handle): _nvmlCheckReturn(ret) return [c_currState.value, c_pendingState.value] - # added to API def nvmlDeviceGetCurrentEccMode(handle): return nvmlDeviceGetEccMode(handle)[0] - # added to API def nvmlDeviceGetPendingEccMode(handle): return nvmlDeviceGetEccMode(handle)[1] - def nvmlDeviceGetDefaultEccMode(handle): c_defaultState = _nvmlEnableState_t() fn = _nvmlGetFunctionPointer("nvmlDeviceGetDefaultEccMode") @@ -2689,26 +2675,23 @@ def nvmlDeviceGetDefaultEccMode(handle): _nvmlCheckReturn(ret) return [c_defaultState.value] - def nvmlDeviceGetTotalEccErrors(handle, errorType, counterType): c_count = c_ulonglong() fn = _nvmlGetFunctionPointer("nvmlDeviceGetTotalEccErrors") ret = fn(handle, _nvmlMemoryErrorType_t(errorType), - _nvmlEccCounterType_t(counterType), byref(c_count)) + _nvmlEccCounterType_t(counterType), byref(c_count)) _nvmlCheckReturn(ret) return c_count.value - # This is deprecated, instead use nvmlDeviceGetMemoryErrorCounter def nvmlDeviceGetDetailedEccErrors(handle, errorType, counterType): c_counts = c_nvmlEccErrorCounts_t() fn = _nvmlGetFunctionPointer("nvmlDeviceGetDetailedEccErrors") ret = fn(handle, _nvmlMemoryErrorType_t(errorType), - _nvmlEccCounterType_t(counterType), byref(c_counts)) + _nvmlEccCounterType_t(counterType), byref(c_counts)) _nvmlCheckReturn(ret) return c_counts - # Added in 4.304 def nvmlDeviceGetMemoryErrorCounter(handle, errorType, counterType, locationType): c_count = c_ulonglong() @@ -2721,7 +2704,6 @@ def nvmlDeviceGetMemoryErrorCounter(handle, errorType, counterType, locationType _nvmlCheckReturn(ret) return c_count.value - def 
nvmlDeviceGetUtilizationRates(handle): c_util = c_nvmlUtilization_t() fn = _nvmlGetFunctionPointer("nvmlDeviceGetUtilizationRates") @@ -2729,7 +2711,6 @@ def nvmlDeviceGetUtilizationRates(handle): _nvmlCheckReturn(ret) return c_util - def nvmlDeviceGetEncoderUtilization(handle): c_util = c_uint() c_samplingPeriod = c_uint() @@ -2738,7 +2719,6 @@ def nvmlDeviceGetEncoderUtilization(handle): _nvmlCheckReturn(ret) return [c_util.value, c_samplingPeriod.value] - def nvmlDeviceGetDecoderUtilization(handle): c_util = c_uint() c_samplingPeriod = c_uint() @@ -2747,6 +2727,21 @@ def nvmlDeviceGetDecoderUtilization(handle): _nvmlCheckReturn(ret) return [c_util.value, c_samplingPeriod.value] +def nvmlDeviceGetJpgUtilization(handle): + c_util = c_uint() + c_samplingPeriod = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetJpgUtilization") + ret = fn(handle, byref(c_util), byref(c_samplingPeriod)) + _nvmlCheckReturn(ret) + return [c_util.value, c_samplingPeriod.value] + +def nvmlDeviceGetOfaUtilization(handle): + c_util = c_uint() + c_samplingPeriod = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetOfaUtilization") + ret = fn(handle, byref(c_util), byref(c_samplingPeriod)) + _nvmlCheckReturn(ret) + return [c_util.value, c_samplingPeriod.value] def nvmlDeviceGetPcieReplayCounter(handle): c_replay = c_uint() @@ -2755,7 +2750,6 @@ def nvmlDeviceGetPcieReplayCounter(handle): _nvmlCheckReturn(ret) return c_replay.value - def nvmlDeviceGetDriverModel(handle): c_currModel = _nvmlDriverModel_t() c_pendingModel = _nvmlDriverModel_t() @@ -2764,17 +2758,14 @@ def nvmlDeviceGetDriverModel(handle): _nvmlCheckReturn(ret) return [c_currModel.value, c_pendingModel.value] - # added to API def nvmlDeviceGetCurrentDriverModel(handle): return nvmlDeviceGetDriverModel(handle)[0] - # added to API def nvmlDeviceGetPendingDriverModel(handle): return nvmlDeviceGetDriverModel(handle)[1] - # Added in 2.285 @convertStrBytes def nvmlDeviceGetVbiosVersion(handle): @@ -2784,7 +2775,6 @@ def 
nvmlDeviceGetVbiosVersion(handle): _nvmlCheckReturn(ret) return c_version.value - # Added in 2.285 def nvmlDeviceGetComputeRunningProcesses_v3(handle): # first call to get the size @@ -2820,11 +2810,9 @@ def nvmlDeviceGetComputeRunningProcesses_v3(handle): # error case raise NVMLError(ret) - def nvmlDeviceGetComputeRunningProcesses(handle): return nvmlDeviceGetComputeRunningProcesses_v3(handle) - def nvmlDeviceGetGraphicsRunningProcesses_v3(handle): # first call to get the size c_count = c_uint(0) @@ -2859,15 +2847,12 @@ def nvmlDeviceGetGraphicsRunningProcesses_v3(handle): # error case raise NVMLError(ret) - def nvmlDeviceGetGraphicsRunningProcesses(handle): return nvmlDeviceGetGraphicsRunningProcesses_v3(handle) - def nvmlDeviceGetMPSComputeRunningProcesses(handle): return nvmlDeviceGetMPSComputeRunningProcesses_v3(handle) - def nvmlDeviceGetMPSComputeRunningProcesses_v3(handle): # first call to get the size c_count = c_uint(0) @@ -2902,6 +2887,40 @@ def nvmlDeviceGetMPSComputeRunningProcesses_v3(handle): # error case raise NVMLError(ret) +def nvmlDeviceGetRunningProcessDetailList(handle, version, mode): + c_processDetailList = c_nvmlProcessDetailList_t() + c_processDetailList.version = version + c_processDetailList.mode = mode + + fn = _nvmlGetFunctionPointer("nvmlDeviceGetRunningProcessDetailList") + + # first call to get the size + ret = fn(handle, byref(c_processDetailList)) + if (ret == NVML_SUCCESS): + # special case, no running processes + return [] + elif (ret == NVML_ERROR_INSUFFICIENT_SIZE): + c_procs = c_nvmlProcessDetail_v1_t * c_processDetailList.numProcArrayEntries + c_processDetailList.procArray = cast((c_procs)(), POINTER(c_nvmlProcessDetail_v1_t)) + + # make the call again + ret = fn(handle, byref(c_processDetailList)) + _nvmlCheckReturn(ret) + + procs = [] + for i in range(c_processDetailList.numProcArrayEntries): + # use an alternative struct for this object + obj = c_processDetailList.procArray[i] + if (obj.usedGpuMemory == 
NVML_VALUE_NOT_AVAILABLE_ulonglong.value): + obj.usedGpuMemory = None + if (obj.usedGpuCcProtectedMemory == NVML_VALUE_NOT_AVAILABLE_ulonglong.value): + obj.usedGpuCcProtectedMemory = None + procs.append(obj) + + return procs + else: + # error case + raise NVMLError(ret) def nvmlDeviceGetAutoBoostedClocksEnabled(handle): c_isEnabled = _nvmlEnableState_t() @@ -2910,8 +2929,7 @@ def nvmlDeviceGetAutoBoostedClocksEnabled(handle): ret = fn(handle, byref(c_isEnabled), byref(c_defaultIsEnabled)) _nvmlCheckReturn(ret) return [c_isEnabled.value, c_defaultIsEnabled.value] - # Throws NVML_ERROR_NOT_SUPPORTED if hardware doesn't support setting auto boosted clocks - + #Throws NVML_ERROR_NOT_SUPPORTED if hardware doesn't support setting auto boosted clocks ## Set functions def nvmlUnitSetLedState(unit, color): @@ -2920,57 +2938,49 @@ def nvmlUnitSetLedState(unit, color): _nvmlCheckReturn(ret) return None - def nvmlDeviceSetPersistenceMode(handle, mode): fn = _nvmlGetFunctionPointer("nvmlDeviceSetPersistenceMode") ret = fn(handle, _nvmlEnableState_t(mode)) _nvmlCheckReturn(ret) return None - def nvmlDeviceSetComputeMode(handle, mode): fn = _nvmlGetFunctionPointer("nvmlDeviceSetComputeMode") ret = fn(handle, _nvmlComputeMode_t(mode)) _nvmlCheckReturn(ret) return None - def nvmlDeviceSetEccMode(handle, mode): fn = _nvmlGetFunctionPointer("nvmlDeviceSetEccMode") ret = fn(handle, _nvmlEnableState_t(mode)) _nvmlCheckReturn(ret) return None - def nvmlDeviceClearEccErrorCounts(handle, counterType): fn = _nvmlGetFunctionPointer("nvmlDeviceClearEccErrorCounts") ret = fn(handle, _nvmlEccCounterType_t(counterType)) _nvmlCheckReturn(ret) return None - def nvmlDeviceSetDriverModel(handle, model): fn = _nvmlGetFunctionPointer("nvmlDeviceSetDriverModel") ret = fn(handle, _nvmlDriverModel_t(model)) _nvmlCheckReturn(ret) return None - def nvmlDeviceSetAutoBoostedClocksEnabled(handle, enabled): fn = _nvmlGetFunctionPointer("nvmlDeviceSetAutoBoostedClocksEnabled") ret = fn(handle, 
_nvmlEnableState_t(enabled)) _nvmlCheckReturn(ret) return None - # Throws NVML_ERROR_NOT_SUPPORTED if hardware doesn't support setting auto boosted clocks - + #Throws NVML_ERROR_NOT_SUPPORTED if hardware doesn't support setting auto boosted clocks def nvmlDeviceSetDefaultAutoBoostedClocksEnabled(handle, enabled, flags): fn = _nvmlGetFunctionPointer("nvmlDeviceSetDefaultAutoBoostedClocksEnabled") ret = fn(handle, _nvmlEnableState_t(enabled), c_uint(flags)) _nvmlCheckReturn(ret) return None - # Throws NVML_ERROR_NOT_SUPPORTED if hardware doesn't support setting auto boosted clocks - + #Throws NVML_ERROR_NOT_SUPPORTED if hardware doesn't support setting auto boosted clocks def nvmlDeviceSetGpuLockedClocks(handle, minGpuClockMHz, maxGpuClockMHz): fn = _nvmlGetFunctionPointer("nvmlDeviceSetGpuLockedClocks") @@ -2978,34 +2988,29 @@ def nvmlDeviceSetGpuLockedClocks(handle, minGpuClockMHz, maxGpuClockMHz): _nvmlCheckReturn(ret) return None - def nvmlDeviceResetGpuLockedClocks(handle): fn = _nvmlGetFunctionPointer("nvmlDeviceResetGpuLockedClocks") ret = fn(handle) _nvmlCheckReturn(ret) return None - def nvmlDeviceSetMemoryLockedClocks(handle, minMemClockMHz, maxMemClockMHz): fn = _nvmlGetFunctionPointer("nvmlDeviceSetMemoryLockedClocks") ret = fn(handle, c_uint(minMemClockMHz), c_uint(maxMemClockMHz)) _nvmlCheckReturn(ret) return None - def nvmlDeviceResetMemoryLockedClocks(handle): fn = _nvmlGetFunctionPointer("nvmlDeviceResetMemoryLockedClocks") ret = fn(handle) _nvmlCheckReturn(ret) return None - def nvmlDeviceGetClkMonStatus(handle, c_clkMonInfo): fn = _nvmlGetFunctionPointer("nvmlDeviceGetClkMonStatus") ret = fn(handle, c_clkMonInfo) return ret - # Added in 4.304 def nvmlDeviceSetApplicationsClocks(handle, maxMemClockMHz, maxGraphicsClockMHz): fn = _nvmlGetFunctionPointer("nvmlDeviceSetApplicationsClocks") @@ -3013,7 +3018,6 @@ def nvmlDeviceSetApplicationsClocks(handle, maxMemClockMHz, maxGraphicsClockMHz) _nvmlCheckReturn(ret) return None - # Added in 4.304 def 
nvmlDeviceResetApplicationsClocks(handle): fn = _nvmlGetFunctionPointer("nvmlDeviceResetApplicationsClocks") @@ -3021,7 +3025,6 @@ def nvmlDeviceResetApplicationsClocks(handle): _nvmlCheckReturn(ret) return None - # Added in 4.304 def nvmlDeviceSetPowerManagementLimit(handle, limit): fn = _nvmlGetFunctionPointer("nvmlDeviceSetPowerManagementLimit") @@ -3029,7 +3032,6 @@ def nvmlDeviceSetPowerManagementLimit(handle, limit): _nvmlCheckReturn(ret) return None - # Added in 4.304 def nvmlDeviceSetGpuOperationMode(handle, mode): fn = _nvmlGetFunctionPointer("nvmlDeviceSetGpuOperationMode") @@ -3037,7 +3039,6 @@ def nvmlDeviceSetGpuOperationMode(handle, mode): _nvmlCheckReturn(ret) return None - # Added in 2.285 def nvmlEventSetCreate(): fn = _nvmlGetFunctionPointer("nvmlEventSetCreate") @@ -3046,7 +3047,6 @@ def nvmlEventSetCreate(): _nvmlCheckReturn(ret) return eventSet - # Added in 2.285 def nvmlDeviceRegisterEvents(handle, eventTypes, eventSet): fn = _nvmlGetFunctionPointer("nvmlDeviceRegisterEvents") @@ -3054,7 +3054,6 @@ def nvmlDeviceRegisterEvents(handle, eventTypes, eventSet): _nvmlCheckReturn(ret) return None - # Added in 2.285 def nvmlDeviceGetSupportedEventTypes(handle): c_eventTypes = c_ulonglong() @@ -3063,7 +3062,6 @@ def nvmlDeviceGetSupportedEventTypes(handle): _nvmlCheckReturn(ret) return c_eventTypes.value - # raises NVML_ERROR_TIMEOUT exception on timeout def nvmlEventSetWait_v2(eventSet, timeoutms): fn = _nvmlGetFunctionPointer("nvmlEventSetWait_v2") @@ -3072,11 +3070,9 @@ def nvmlEventSetWait_v2(eventSet, timeoutms): _nvmlCheckReturn(ret) return data - def nvmlEventSetWait(eventSet, timeoutms): return nvmlEventSetWait_v2(eventSet, timeoutms) - # Added in 2.285 def nvmlEventSetFree(eventSet): fn = _nvmlGetFunctionPointer("nvmlEventSetFree") @@ -3084,7 +3080,6 @@ def nvmlEventSetFree(eventSet): _nvmlCheckReturn(ret) return None - # Added in 3.295 def nvmlDeviceOnSameBoard(handle1, handle2): fn = _nvmlGetFunctionPointer("nvmlDeviceOnSameBoard") @@ 
-3093,7 +3088,6 @@ def nvmlDeviceOnSameBoard(handle1, handle2): _nvmlCheckReturn(ret) return (onSameBoard.value != 0) - # Added in 3.295 def nvmlDeviceGetCurrPcieLinkGeneration(handle): fn = _nvmlGetFunctionPointer("nvmlDeviceGetCurrPcieLinkGeneration") @@ -3102,7 +3096,6 @@ def nvmlDeviceGetCurrPcieLinkGeneration(handle): _nvmlCheckReturn(ret) return gen.value - # Added in 3.295 def nvmlDeviceGetMaxPcieLinkGeneration(handle): fn = _nvmlGetFunctionPointer("nvmlDeviceGetMaxPcieLinkGeneration") @@ -3111,7 +3104,6 @@ def nvmlDeviceGetMaxPcieLinkGeneration(handle): _nvmlCheckReturn(ret) return gen.value - # Added in 3.295 def nvmlDeviceGetCurrPcieLinkWidth(handle): fn = _nvmlGetFunctionPointer("nvmlDeviceGetCurrPcieLinkWidth") @@ -3120,7 +3112,6 @@ def nvmlDeviceGetCurrPcieLinkWidth(handle): _nvmlCheckReturn(ret) return width.value - # Added in 3.295 def nvmlDeviceGetMaxPcieLinkWidth(handle): fn = _nvmlGetFunctionPointer("nvmlDeviceGetMaxPcieLinkWidth") @@ -3129,7 +3120,6 @@ def nvmlDeviceGetMaxPcieLinkWidth(handle): _nvmlCheckReturn(ret) return width.value - def nvmlDeviceGetGpuMaxPcieLinkGeneration(handle): fn = _nvmlGetFunctionPointer("nvmlDeviceGetGpuMaxPcieLinkGeneration") gen = c_uint() @@ -3137,24 +3127,35 @@ def nvmlDeviceGetGpuMaxPcieLinkGeneration(handle): _nvmlCheckReturn(ret) return gen.value - # Added in 4.304 def nvmlDeviceGetSupportedClocksThrottleReasons(handle): - c_reasons = c_ulonglong() + c_reasons= c_ulonglong() fn = _nvmlGetFunctionPointer("nvmlDeviceGetSupportedClocksThrottleReasons") ret = fn(handle, byref(c_reasons)) _nvmlCheckReturn(ret) return c_reasons.value +def nvmlDeviceGetSupportedClocksEventReasons(handle): + c_reasons= c_ulonglong() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetSupportedClocksEventReasons") + ret = fn(handle, byref(c_reasons)) + _nvmlCheckReturn(ret) + return c_reasons.value # Added in 4.304 def nvmlDeviceGetCurrentClocksThrottleReasons(handle): - c_reasons = c_ulonglong() + c_reasons= c_ulonglong() fn = 
_nvmlGetFunctionPointer("nvmlDeviceGetCurrentClocksThrottleReasons") ret = fn(handle, byref(c_reasons)) _nvmlCheckReturn(ret) return c_reasons.value +def nvmlDeviceGetCurrentClocksEventReasons(handle): + c_reasons= c_ulonglong() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetCurrentClocksEventReasons") + ret = fn(handle, byref(c_reasons)) + _nvmlCheckReturn(ret) + return c_reasons.value # Added in 5.319 def nvmlDeviceGetIndex(handle): @@ -3164,7 +3165,6 @@ def nvmlDeviceGetIndex(handle): _nvmlCheckReturn(ret) return c_index.value - # Added in 5.319 def nvmlDeviceGetAccountingMode(handle): c_mode = _nvmlEnableState_t() @@ -3173,21 +3173,18 @@ def nvmlDeviceGetAccountingMode(handle): _nvmlCheckReturn(ret) return c_mode.value - def nvmlDeviceSetAccountingMode(handle, mode): fn = _nvmlGetFunctionPointer("nvmlDeviceSetAccountingMode") ret = fn(handle, _nvmlEnableState_t(mode)) _nvmlCheckReturn(ret) return None - def nvmlDeviceClearAccountingPids(handle): fn = _nvmlGetFunctionPointer("nvmlDeviceClearAccountingPids") ret = fn(handle) _nvmlCheckReturn(ret) return None - def nvmlDeviceGetAccountingStats(handle, pid): stats = c_nvmlAccountingStats_t() fn = _nvmlGetFunctionPointer("nvmlDeviceGetAccountingStats") @@ -3198,7 +3195,6 @@ def nvmlDeviceGetAccountingStats(handle, pid): stats.maxMemoryUsage = None return stats - def nvmlDeviceGetAccountingPids(handle): count = c_uint(nvmlDeviceGetAccountingBufferSize(handle)) pids = (c_uint * count.value)() @@ -3207,7 +3203,6 @@ def nvmlDeviceGetAccountingPids(handle): _nvmlCheckReturn(ret) return list(map(int, pids[0:count.value])) - def nvmlDeviceGetAccountingBufferSize(handle): bufferSize = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceGetAccountingBufferSize") @@ -3215,7 +3210,6 @@ def nvmlDeviceGetAccountingBufferSize(handle): _nvmlCheckReturn(ret) return int(bufferSize.value) - def nvmlDeviceGetRetiredPages(device, sourceFilter): c_source = _nvmlPageRetirementCause_t(sourceFilter) c_count = c_uint(0) @@ -3226,7 +3220,7 @@ def 
nvmlDeviceGetRetiredPages(device, sourceFilter): # this should only fail with insufficient size if ((ret != NVML_SUCCESS) and - (ret != NVML_ERROR_INSUFFICIENT_SIZE)): + (ret != NVML_ERROR_INSUFFICIENT_SIZE)): raise NVMLError(ret) # call again with a buffer @@ -3239,7 +3233,6 @@ def nvmlDeviceGetRetiredPages(device, sourceFilter): _nvmlCheckReturn(ret) return list(map(int, c_pages[0:c_count.value])) - def nvmlDeviceGetRetiredPages_v2(device, sourceFilter): c_source = _nvmlPageRetirementCause_t(sourceFilter) c_count = c_uint(0) @@ -3250,7 +3243,7 @@ def nvmlDeviceGetRetiredPages_v2(device, sourceFilter): # this should only fail with insufficient size if ((ret != NVML_SUCCESS) and - (ret != NVML_ERROR_INSUFFICIENT_SIZE)): + (ret != NVML_ERROR_INSUFFICIENT_SIZE)): raise NVMLError(ret) # call again with a buffer @@ -3263,8 +3256,7 @@ def nvmlDeviceGetRetiredPages_v2(device, sourceFilter): c_times = times_array() ret = fn(device, c_source, byref(c_count), c_pages, c_times) _nvmlCheckReturn(ret) - return [{'address': int(c_pages[i]), 'timestamp': int(c_times[i])} for i in range(c_count.value)] - + return [ { 'address': int(c_pages[i]), 'timestamp': int(c_times[i]) } for i in range(c_count.value) ]; def nvmlDeviceGetRetiredPagesPendingStatus(device): c_pending = _nvmlEnableState_t() @@ -3273,7 +3265,6 @@ def nvmlDeviceGetRetiredPagesPendingStatus(device): _nvmlCheckReturn(ret) return int(c_pending.value) - def nvmlDeviceGetAPIRestriction(device, apiType): c_permission = _nvmlEnableState_t() fn = _nvmlGetFunctionPointer("nvmlDeviceGetAPIRestriction") @@ -3281,14 +3272,12 @@ def nvmlDeviceGetAPIRestriction(device, apiType): _nvmlCheckReturn(ret) return int(c_permission.value) - def nvmlDeviceSetAPIRestriction(handle, apiType, isRestricted): fn = _nvmlGetFunctionPointer("nvmlDeviceSetAPIRestriction") ret = fn(handle, _nvmlRestrictedAPI_t(apiType), _nvmlEnableState_t(isRestricted)) _nvmlCheckReturn(ret) return None - def nvmlDeviceGetBridgeChipInfo(handle): bridgeHierarchy = 
c_nvmlBridgeChipHierarchy_t() fn = _nvmlGetFunctionPointer("nvmlDeviceGetBridgeChipInfo") @@ -3296,7 +3285,6 @@ def nvmlDeviceGetBridgeChipInfo(handle): _nvmlCheckReturn(ret) return bridgeHierarchy - def nvmlDeviceGetSamples(device, sampling_type, timeStamp): c_sampling_type = _nvmlSamplingType_t(sampling_type) c_time_stamp = c_ulonglong(timeStamp) @@ -3313,11 +3301,10 @@ def nvmlDeviceGetSamples(device, sampling_type, timeStamp): sampleArray = c_sample_count.value * c_nvmlSample_t c_samples = sampleArray() - ret = fn(device, c_sampling_type, c_time_stamp, byref(c_sample_value_type), byref(c_sample_count), c_samples) + ret = fn(device, c_sampling_type, c_time_stamp, byref(c_sample_value_type), byref(c_sample_count), c_samples) _nvmlCheckReturn(ret) return (c_sample_value_type.value, c_samples[0:c_sample_count.value]) - def nvmlDeviceGetViolationStatus(device, perfPolicyType): c_perfPolicy_type = _nvmlPerfPolicyType_t(perfPolicyType) c_violTime = c_nvmlViolationTime_t() @@ -3328,7 +3315,6 @@ def nvmlDeviceGetViolationStatus(device, perfPolicyType): _nvmlCheckReturn(ret) return c_violTime - def nvmlDeviceGetPcieThroughput(device, counter): c_util = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceGetPcieThroughput") @@ -3336,7 +3322,6 @@ def nvmlDeviceGetPcieThroughput(device, counter): _nvmlCheckReturn(ret) return c_util.value - def nvmlSystemGetTopologyGpuSet(cpuNumber): c_count = c_uint(0) fn = _nvmlGetFunctionPointer("nvmlSystemGetTopologyGpuSet") @@ -3353,7 +3338,6 @@ def nvmlSystemGetTopologyGpuSet(cpuNumber): _nvmlCheckReturn(ret) return list(c_devices[0:c_count.value]) - def nvmlDeviceGetTopologyNearestGpus(device, level): c_count = c_uint(0) fn = _nvmlGetFunctionPointer("nvmlDeviceGetTopologyNearestGpus") @@ -3371,7 +3355,6 @@ def nvmlDeviceGetTopologyNearestGpus(device, level): _nvmlCheckReturn(ret) return list(c_devices[0:c_count.value]) - def nvmlDeviceGetTopologyCommonAncestor(device1, device2): c_level = _nvmlGpuTopologyLevel_t() fn = 
_nvmlGetFunctionPointer("nvmlDeviceGetTopologyCommonAncestor") @@ -3379,7 +3362,6 @@ def nvmlDeviceGetTopologyCommonAncestor(device1, device2): _nvmlCheckReturn(ret) return c_level.value - def nvmlDeviceGetNvLinkUtilizationCounter(device, link, counter): c_rxcounter = c_ulonglong() c_txcounter = c_ulonglong() @@ -3388,28 +3370,24 @@ def nvmlDeviceGetNvLinkUtilizationCounter(device, link, counter): _nvmlCheckReturn(ret) return (c_rxcounter.value, c_txcounter.value) - def nvmlDeviceFreezeNvLinkUtilizationCounter(device, link, counter, freeze): fn = _nvmlGetFunctionPointer("nvmlDeviceFreezeNvLinkUtilizationCounter") ret = fn(device, link, counter, freeze) _nvmlCheckReturn(ret) return None - def nvmlDeviceResetNvLinkUtilizationCounter(device, link, counter): fn = _nvmlGetFunctionPointer("nvmlDeviceResetNvLinkUtilizationCounter") ret = fn(device, link, counter) _nvmlCheckReturn(ret) return None - def nvmlDeviceSetNvLinkUtilizationControl(device, link, counter, control, reset): fn = _nvmlGetFunctionPointer("nvmlDeviceSetNvLinkUtilizationControl") ret = fn(device, link, counter, byref(control), reset) _nvmlCheckReturn(ret) return None - def nvmlDeviceGetNvLinkUtilizationControl(device, link, counter): c_control = nvmlNvLinkUtilizationControl_t() fn = _nvmlGetFunctionPointer("nvmlDeviceGetNvLinkUtilizationControl") @@ -3417,7 +3395,6 @@ def nvmlDeviceGetNvLinkUtilizationControl(device, link, counter): _nvmlCheckReturn(ret) return c_control - def nvmlDeviceGetNvLinkCapability(device, link, capability): c_capResult = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceGetNvLinkCapability") @@ -3425,7 +3402,6 @@ def nvmlDeviceGetNvLinkCapability(device, link, capability): _nvmlCheckReturn(ret) return c_capResult.value - def nvmlDeviceGetNvLinkErrorCounter(device, link, counter): c_result = c_ulonglong() fn = _nvmlGetFunctionPointer("nvmlDeviceGetNvLinkErrorCounter") @@ -3433,14 +3409,12 @@ def nvmlDeviceGetNvLinkErrorCounter(device, link, counter): _nvmlCheckReturn(ret) return 
c_result.value - def nvmlDeviceResetNvLinkErrorCounters(device, link): fn = _nvmlGetFunctionPointer("nvmlDeviceResetNvLinkErrorCounters") ret = fn(device, link) _nvmlCheckReturn(ret) return None - def nvmlDeviceGetNvLinkRemotePciInfo(device, link): c_pci = nvmlPciInfo_t() fn = _nvmlGetFunctionPointer("nvmlDeviceGetNvLinkRemotePciInfo_v2") @@ -3448,7 +3422,6 @@ def nvmlDeviceGetNvLinkRemotePciInfo(device, link): _nvmlCheckReturn(ret) return c_pci - def nvmlDeviceGetNvLinkRemoteDeviceType(handle, link): c_type = _nvmlNvLinkDeviceType_t() fn = _nvmlGetFunctionPointer("nvmlDeviceGetNvLinkRemoteDeviceType") @@ -3456,7 +3429,6 @@ def nvmlDeviceGetNvLinkRemoteDeviceType(handle, link): _nvmlCheckReturn(ret) return c_type.value - def nvmlDeviceGetNvLinkState(device, link): c_isActive = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceGetNvLinkState") @@ -3464,7 +3436,6 @@ def nvmlDeviceGetNvLinkState(device, link): _nvmlCheckReturn(ret) return c_isActive.value - def nvmlDeviceGetNvLinkVersion(device, link): c_version = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceGetNvLinkVersion") @@ -3472,14 +3443,12 @@ def nvmlDeviceGetNvLinkVersion(device, link): _nvmlCheckReturn(ret) return c_version.value - def nvmlDeviceModifyDrainState(pciInfo, newState): fn = _nvmlGetFunctionPointer("nvmlDeviceModifyDrainState") ret = fn(pointer(pciInfo), newState) _nvmlCheckReturn(ret) return None - def nvmlDeviceQueryDrainState(pciInfo): c_newState = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceQueryDrainState") @@ -3487,21 +3456,18 @@ def nvmlDeviceQueryDrainState(pciInfo): _nvmlCheckReturn(ret) return c_newState.value - def nvmlDeviceRemoveGpu(pciInfo): fn = _nvmlGetFunctionPointer("nvmlDeviceRemoveGpu") ret = fn(pointer(pciInfo)) _nvmlCheckReturn(ret) return None - def nvmlDeviceDiscoverGpus(pciInfo): fn = _nvmlGetFunctionPointer("nvmlDeviceDiscoverGpus") ret = fn(pointer(pciInfo)) _nvmlCheckReturn(ret) return None - def nvmlDeviceGetFieldValues(handle, fieldIds): values_arr = 
c_nvmlFieldValue_t * len(fieldIds) values = values_arr() @@ -3517,7 +3483,6 @@ def nvmlDeviceGetFieldValues(handle, fieldIds): _nvmlCheckReturn(ret) return values - def nvmlDeviceClearFieldValues(handle, fieldIds): values_arr = c_nvmlFieldValue_t * len(fieldIds) values = values_arr() @@ -3533,7 +3498,6 @@ def nvmlDeviceClearFieldValues(handle, fieldIds): _nvmlCheckReturn(ret) return values - def nvmlDeviceGetVirtualizationMode(handle): c_virtualization_mode = c_ulonglong() fn = _nvmlGetFunctionPointer("nvmlDeviceGetVirtualizationMode") @@ -3541,12 +3505,10 @@ def nvmlDeviceGetVirtualizationMode(handle): _nvmlCheckReturn(ret) return c_virtualization_mode.value - def nvmlDeviceSetVirtualizationMode(handle, virtualization_mode): fn = _nvmlGetFunctionPointer("nvmlDeviceSetVirtualizationMode") return fn(handle, virtualization_mode) - def nvmlGetVgpuDriverCapabilities(capability): c_capResult = c_uint() fn = _nvmlGetFunctionPointer("nvmlGetVgpuDriverCapabilities") @@ -3554,7 +3516,6 @@ def nvmlGetVgpuDriverCapabilities(capability): _nvmlCheckReturn(ret) return c_capResult.value - def nvmlDeviceGetVgpuCapabilities(handle, capability): c_capResult = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceGetVgpuCapabilities") @@ -3562,12 +3523,11 @@ def nvmlDeviceGetVgpuCapabilities(handle, capability): _nvmlCheckReturn(ret) return c_capResult.value - def nvmlDeviceGetSupportedVgpus(handle): # first call to get the size c_vgpu_count = c_uint(0) - fn = _nvmlGetFunctionPointer("nvmlDeviceGetSupportedVgpus") + fn = _nvmlGetFunctionPointer("nvmlDeviceGetSupportedVgpus") ret = fn(handle, byref(c_vgpu_count), None) if (ret == NVML_SUCCESS): @@ -3589,12 +3549,11 @@ def nvmlDeviceGetSupportedVgpus(handle): # error case raise NVMLError(ret) - def nvmlDeviceGetCreatableVgpus(handle): # first call to get the size c_vgpu_count = c_uint(0) - fn = _nvmlGetFunctionPointer("nvmlDeviceGetCreatableVgpus") + fn = _nvmlGetFunctionPointer("nvmlDeviceGetCreatableVgpus") ret = fn(handle, 
byref(c_vgpu_count), None) if (ret == NVML_SUCCESS): @@ -3616,108 +3575,96 @@ def nvmlDeviceGetCreatableVgpus(handle): # error case raise NVMLError(ret) - def nvmlVgpuTypeGetGpuInstanceProfileId(vgpuTypeId): c_profile_id = c_uint(0) - fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetGpuInstanceProfileId") + fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetGpuInstanceProfileId") ret = fn(vgpuTypeId, byref(c_profile_id)) _nvmlCheckReturn(ret) return (c_profile_id.value) - @convertStrBytes def nvmlVgpuTypeGetClass(vgpuTypeId): c_class = create_string_buffer(NVML_DEVICE_NAME_BUFFER_SIZE) c_buffer_size = c_uint(NVML_DEVICE_NAME_BUFFER_SIZE) - fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetClass") + fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetClass") ret = fn(vgpuTypeId, c_class, byref(c_buffer_size)) _nvmlCheckReturn(ret) return c_class.value - @convertStrBytes def nvmlVgpuTypeGetName(vgpuTypeId): c_name = create_string_buffer(NVML_DEVICE_NAME_BUFFER_SIZE) c_buffer_size = c_uint(NVML_DEVICE_NAME_BUFFER_SIZE) - fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetName") + fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetName") ret = fn(vgpuTypeId, c_name, byref(c_buffer_size)) _nvmlCheckReturn(ret) return c_name.value - def nvmlVgpuTypeGetDeviceID(vgpuTypeId): - c_device_id = c_ulonglong(0) + c_device_id = c_ulonglong(0) c_subsystem_id = c_ulonglong(0) - fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetDeviceID") + fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetDeviceID") ret = fn(vgpuTypeId, byref(c_device_id), byref(c_subsystem_id)) _nvmlCheckReturn(ret) return (c_device_id.value, c_subsystem_id.value) - def nvmlVgpuTypeGetFramebufferSize(vgpuTypeId): c_fb_size = c_ulonglong(0) - fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetFramebufferSize") + fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetFramebufferSize") ret = fn(vgpuTypeId, byref(c_fb_size)) _nvmlCheckReturn(ret) return c_fb_size.value - def nvmlVgpuTypeGetNumDisplayHeads(vgpuTypeId): c_num_heads = c_uint(0) - fn = 
_nvmlGetFunctionPointer("nvmlVgpuTypeGetNumDisplayHeads") + fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetNumDisplayHeads") ret = fn(vgpuTypeId, byref(c_num_heads)) _nvmlCheckReturn(ret) return c_num_heads.value - def nvmlVgpuTypeGetResolution(vgpuTypeId): c_xdim = c_uint(0) c_ydim = c_uint(0) - fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetResolution") + fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetResolution") ret = fn(vgpuTypeId, 0, byref(c_xdim), byref(c_ydim)) _nvmlCheckReturn(ret) return (c_xdim.value, c_ydim.value) - @convertStrBytes def nvmlVgpuTypeGetLicense(vgpuTypeId): c_license = create_string_buffer(NVML_GRID_LICENSE_BUFFER_SIZE) c_buffer_size = c_uint(NVML_GRID_LICENSE_BUFFER_SIZE) - fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetLicense") + fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetLicense") ret = fn(vgpuTypeId, c_license, c_buffer_size) _nvmlCheckReturn(ret) return c_license.value - def nvmlVgpuTypeGetFrameRateLimit(vgpuTypeId): c_frl_config = c_uint(0) - fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetFrameRateLimit") + fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetFrameRateLimit") ret = fn(vgpuTypeId, byref(c_frl_config)) _nvmlCheckReturn(ret) return c_frl_config.value - def nvmlVgpuTypeGetMaxInstances(handle, vgpuTypeId): c_max_instances = c_uint(0) - fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetMaxInstances") + fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetMaxInstances") ret = fn(handle, vgpuTypeId, byref(c_max_instances)) _nvmlCheckReturn(ret) return c_max_instances.value - def nvmlVgpuTypeGetMaxInstancesPerVm(vgpuTypeId): c_max_instances_per_vm = c_uint(0) - fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetMaxInstancesPerVm") + fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetMaxInstancesPerVm") ret = fn(vgpuTypeId, byref(c_max_instances_per_vm)) _nvmlCheckReturn(ret) return c_max_instances_per_vm.value - def nvmlDeviceGetActiveVgpus(handle): # first call to get the size c_vgpu_count = c_uint(0) - fn = _nvmlGetFunctionPointer("nvmlDeviceGetActiveVgpus") + 
fn = _nvmlGetFunctionPointer("nvmlDeviceGetActiveVgpus") ret = fn(handle, byref(c_vgpu_count), None) if (ret == NVML_SUCCESS): @@ -3739,76 +3686,67 @@ def nvmlDeviceGetActiveVgpus(handle): # error case raise NVMLError(ret) - @convertStrBytes def nvmlVgpuInstanceGetVmID(vgpuInstance): c_vm_id = create_string_buffer(NVML_DEVICE_UUID_BUFFER_SIZE) c_buffer_size = c_uint(NVML_GRID_LICENSE_BUFFER_SIZE) - c_vm_id_type = c_uint(0) - fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetVmID") + c_vm_id_type = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetVmID") ret = fn(vgpuInstance, byref(c_vm_id), c_buffer_size, byref(c_vm_id_type)) _nvmlCheckReturn(ret) return (c_vm_id.value, c_vm_id_type.value) - @convertStrBytes def nvmlVgpuInstanceGetUUID(vgpuInstance): c_uuid = create_string_buffer(NVML_DEVICE_UUID_BUFFER_SIZE) c_buffer_size = c_uint(NVML_DEVICE_UUID_BUFFER_SIZE) - fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetUUID") + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetUUID") ret = fn(vgpuInstance, byref(c_uuid), c_buffer_size) _nvmlCheckReturn(ret) return c_uuid.value - @convertStrBytes def nvmlVgpuInstanceGetMdevUUID(vgpuInstance): c_uuid = create_string_buffer(NVML_DEVICE_UUID_BUFFER_SIZE) c_buffer_size = c_uint(NVML_DEVICE_UUID_BUFFER_SIZE) - fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetMdevUUID") + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetMdevUUID") ret = fn(vgpuInstance, byref(c_uuid), c_buffer_size) _nvmlCheckReturn(ret) return c_uuid.value - @convertStrBytes def nvmlVgpuInstanceGetVmDriverVersion(vgpuInstance): c_driver_version = create_string_buffer(NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE) c_buffer_size = c_uint(NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE) - fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetVmDriverVersion") + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetVmDriverVersion") ret = fn(vgpuInstance, byref(c_driver_version), c_buffer_size) _nvmlCheckReturn(ret) return c_driver_version.value - def 
nvmlVgpuInstanceGetLicenseStatus(vgpuInstance): c_license_status = c_uint(0) - fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetLicenseStatus") + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetLicenseStatus") ret = fn(vgpuInstance, byref(c_license_status)) _nvmlCheckReturn(ret) return c_license_status.value - def nvmlVgpuInstanceGetLicenseInfo_v2(vgpuInstance): - fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetLicenseInfo_v2") + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetLicenseInfo_v2") c_license_info = c_nvmlVgpuLicenseInfo_t() ret = fn(vgpuInstance, byref(c_license_info)) _nvmlCheckReturn(ret) return c_license_info - def nvmlVgpuInstanceGetLicenseInfo(vgpuInstance): return nvmlVgpuInstanceGetLicenseInfo_v2(vgpuInstance) - def nvmlVgpuInstanceGetFrameRateLimit(vgpuInstance): c_frl = c_uint(0) - fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetFrameRateLimit") + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetFrameRateLimit") ret = fn(vgpuInstance, byref(c_frl)) _nvmlCheckReturn(ret) return c_frl.value - def nvmlVgpuInstanceGetEccMode(vgpuInstance): c_mode = _nvmlEnableState_t() fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetEccMode") @@ -3816,52 +3754,45 @@ def nvmlVgpuInstanceGetEccMode(vgpuInstance): _nvmlCheckReturn(ret) return c_mode.value - def nvmlVgpuInstanceGetType(vgpuInstance): c_vgpu_type = c_uint(0) - fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetType") + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetType") ret = fn(vgpuInstance, byref(c_vgpu_type)) _nvmlCheckReturn(ret) return c_vgpu_type.value - def nvmlVgpuInstanceGetEncoderCapacity(vgpuInstance): c_encoder_capacity = c_ulonglong(0) - fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetEncoderCapacity") + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetEncoderCapacity") ret = fn(vgpuInstance, byref(c_encoder_capacity)) _nvmlCheckReturn(ret) return c_encoder_capacity.value - def nvmlVgpuInstanceSetEncoderCapacity(vgpuInstance, encoder_capacity): - fn = 
_nvmlGetFunctionPointer("nvmlVgpuInstanceSetEncoderCapacity") + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceSetEncoderCapacity") return fn(vgpuInstance, encoder_capacity) - def nvmlVgpuInstanceGetFbUsage(vgpuInstance): c_fb_usage = c_uint(0) - fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetFbUsage") + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetFbUsage") ret = fn(vgpuInstance, byref(c_fb_usage)) _nvmlCheckReturn(ret) return c_fb_usage.value - def nvmlVgpuTypeGetCapabilities(vgpuTypeId, capability): c_cap_result = c_uint(0) - fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetCapabilities") + fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetCapabilities") ret = fn(vgpuTypeId, _nvmlVgpuCapability_t(capability), byref(c_cap_result)) _nvmlCheckReturn(ret) return (c_cap_result.value) - def nvmlVgpuInstanceGetGpuInstanceId(vgpuInstance): c_id = c_uint(0) - fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetGpuInstanceId") + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetGpuInstanceId") ret = fn(vgpuInstance, byref(c_id)) _nvmlCheckReturn(ret) return (c_id.value) - @convertStrBytes def nvmlVgpuInstanceGetGpuPciId(vgpuInstance): c_vgpuPciId = create_string_buffer(NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE) @@ -3870,14 +3801,13 @@ def nvmlVgpuInstanceGetGpuPciId(vgpuInstance): _nvmlCheckReturn(ret) return c_vgpuPciId.value - def nvmlDeviceGetVgpuUtilization(handle, timeStamp): # first call to get the size c_vgpu_count = c_uint(0) c_time_stamp = c_ulonglong(timeStamp) c_sample_value_type = _nvmlValueType_t() - fn = _nvmlGetFunctionPointer("nvmlDeviceGetVgpuUtilization") + fn = _nvmlGetFunctionPointer("nvmlDeviceGetVgpuUtilization") ret = fn(handle, c_time_stamp, byref(c_sample_value_type), byref(c_vgpu_count), None) if (ret == NVML_SUCCESS): @@ -3897,15 +3827,13 @@ def nvmlDeviceGetVgpuUtilization(handle, timeStamp): # error case raise NVMLError(ret) - def nvmlDeviceGetP2PStatus(device1, device2, p2pIndex): c_p2pstatus = _nvmlGpuP2PStatus_t() fn = 
_nvmlGetFunctionPointer("nvmlDeviceGetP2PStatus") - ret = fn(device1, device2, p2pIndex, byref(c_p2pstatus)) + ret = fn(device1, device2,p2pIndex, byref(c_p2pstatus)) _nvmlCheckReturn(ret) return c_p2pstatus.value - def nvmlDeviceGetGridLicensableFeatures_v4(handle): c_get_grid_licensable_features = c_nvmlGridLicensableFeatures_v4_t() fn = _nvmlGetFunctionPointer("nvmlDeviceGetGridLicensableFeatures_v4") @@ -3914,25 +3842,21 @@ def nvmlDeviceGetGridLicensableFeatures_v4(handle): return (c_get_grid_licensable_features) - def nvmlDeviceGetGridLicensableFeatures(handle): return nvmlDeviceGetGridLicensableFeatures_v4(handle) - def nvmlDeviceGetGspFirmwareVersion(handle, version): fn = _nvmlGetFunctionPointer("nvmlDeviceGetGspFirmwareVersion") ret = fn(handle, version) _nvmlCheckReturn(ret) return ret - def nvmlDeviceGetGspFirmwareMode(handle, isEnabled, defaultMode): fn = _nvmlGetFunctionPointer("nvmlDeviceGetGspFirmwareMode") ret = fn(handle, isEnabled, defaultMode) _nvmlCheckReturn(ret) return ret - def nvmlDeviceGetEncoderCapacity(handle, encoderQueryType): c_encoder_capacity = c_ulonglong(0) c_encoderQuery_type = _nvmlEncoderQueryType_t(encoderQueryType) @@ -3942,13 +3866,12 @@ def nvmlDeviceGetEncoderCapacity(handle, encoderQueryType): _nvmlCheckReturn(ret) return c_encoder_capacity.value - def nvmlDeviceGetVgpuProcessUtilization(handle, timeStamp): # first call to get the size c_vgpu_count = c_uint(0) c_time_stamp = c_ulonglong(timeStamp) - fn = _nvmlGetFunctionPointer("nvmlDeviceGetVgpuProcessUtilization") + fn = _nvmlGetFunctionPointer("nvmlDeviceGetVgpuProcessUtilization") ret = fn(handle, c_time_stamp, byref(c_vgpu_count), None) if (ret == NVML_SUCCESS): @@ -3968,7 +3891,6 @@ def nvmlDeviceGetVgpuProcessUtilization(handle, timeStamp): # error case raise NVMLError(ret) - def nvmlDeviceGetEncoderStats(handle): c_encoderCount = c_ulonglong(0) c_encodeFps = c_ulonglong(0) @@ -3978,12 +3900,11 @@ def nvmlDeviceGetEncoderStats(handle): _nvmlCheckReturn(ret) return 
(c_encoderCount.value, c_encodeFps.value, c_encoderLatency.value) - def nvmlDeviceGetEncoderSessions(handle): # first call to get the size c_session_count = c_uint(0) - fn = _nvmlGetFunctionPointer("nvmlDeviceGetEncoderSessions") + fn = _nvmlGetFunctionPointer("nvmlDeviceGetEncoderSessions") ret = fn(handle, byref(c_session_count), None) if (ret == NVML_SUCCESS): @@ -4005,7 +3926,6 @@ def nvmlDeviceGetEncoderSessions(handle): # error case raise NVMLError(ret) - def nvmlDeviceGetFBCStats(handle): c_fbcStats = c_nvmlFBCStats_t() fn = _nvmlGetFunctionPointer("nvmlDeviceGetFBCStats") @@ -4013,12 +3933,11 @@ def nvmlDeviceGetFBCStats(handle): _nvmlCheckReturn(ret) return c_fbcStats - def nvmlDeviceGetFBCSessions(handle): # first call to get the size c_session_count = c_uint(0) - fn = _nvmlGetFunctionPointer("nvmlDeviceGetFBCSessions") + fn = _nvmlGetFunctionPointer("nvmlDeviceGetFBCSessions") ret = fn(handle, byref(c_session_count), None) if (ret == NVML_SUCCESS): @@ -4040,22 +3959,20 @@ def nvmlDeviceGetFBCSessions(handle): # error case raise NVMLError(ret) - def nvmlVgpuInstanceGetEncoderStats(vgpuInstance): - c_encoderCount = c_ulonglong(0) - c_encodeFps = c_ulonglong(0) - c_encoderLatency = c_ulonglong(0) - fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetEncoderStats") + c_encoderCount = c_ulonglong(0) + c_encodeFps = c_ulonglong(0) + c_encoderLatency = c_ulonglong(0) + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetEncoderStats") ret = fn(vgpuInstance, byref(c_encoderCount), byref(c_encodeFps), byref(c_encoderLatency)) _nvmlCheckReturn(ret) return (c_encoderCount.value, c_encodeFps.value, c_encoderLatency.value) - def nvmlVgpuInstanceGetEncoderSessions(vgpuInstance): # first call to get the size c_session_count = c_uint(0) - fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetEncoderSessions") + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetEncoderSessions") ret = fn(vgpuInstance, byref(c_session_count), None) if (ret == NVML_SUCCESS): @@ -4077,7 +3994,6 @@ def 
nvmlVgpuInstanceGetEncoderSessions(vgpuInstance): # error case raise NVMLError(ret) - def nvmlVgpuInstanceGetFBCStats(vgpuInstance): c_fbcStats = c_nvmlFBCStats_t() fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetFBCStats") @@ -4085,12 +4001,11 @@ def nvmlVgpuInstanceGetFBCStats(vgpuInstance): _nvmlCheckReturn(ret) return c_fbcStats - def nvmlVgpuInstanceGetFBCSessions(vgpuInstance): # first call to get the size c_session_count = c_uint(0) - fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetFBCSessions") + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetFBCSessions") ret = fn(vgpuInstance, byref(c_session_count), None) if (ret == NVML_SUCCESS): @@ -4112,13 +4027,12 @@ def nvmlVgpuInstanceGetFBCSessions(vgpuInstance): # error case raise NVMLError(ret) - def nvmlDeviceGetProcessUtilization(handle, timeStamp): # first call to get the size c_count = c_uint(0) c_time_stamp = c_ulonglong(timeStamp) - fn = _nvmlGetFunctionPointer("nvmlDeviceGetProcessUtilization") + fn = _nvmlGetFunctionPointer("nvmlDeviceGetProcessUtilization") ret = fn(handle, None, byref(c_count), c_time_stamp) if (ret == NVML_ERROR_INSUFFICIENT_SIZE): @@ -4135,7 +4049,6 @@ def nvmlDeviceGetProcessUtilization(handle, timeStamp): # error case raise NVMLError(ret) - def nvmlVgpuInstanceGetMetadata(vgpuInstance): fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetMetadata") c_vgpuMetadata = c_nvmlVgpuMetadata_t() @@ -4150,7 +4063,6 @@ def nvmlVgpuInstanceGetMetadata(vgpuInstance): raise NVMLError(ret) return c_vgpuMetadata - def nvmlDeviceGetVgpuMetadata(handle): fn = _nvmlGetFunctionPointer("nvmlDeviceGetVgpuMetadata") c_vgpuPgpuMetadata = c_nvmlVgpuPgpuMetadata_t() @@ -4165,7 +4077,6 @@ def nvmlDeviceGetVgpuMetadata(handle): raise NVMLError(ret) return c_vgpuPgpuMetadata - def nvmlGetVgpuCompatibility(vgpuMetadata, pgpuMetadata): fn = _nvmlGetFunctionPointer("nvmlGetVgpuCompatibility") c_vgpuPgpuCompatibility = c_nvmlVgpuPgpuCompatibility_t() @@ -4173,7 +4084,6 @@ def 
nvmlGetVgpuCompatibility(vgpuMetadata, pgpuMetadata): _nvmlCheckReturn(ret) return c_vgpuPgpuCompatibility - @convertStrBytes def nvmlDeviceGetPgpuMetadataString(handle): fn = _nvmlGetFunctionPointer("nvmlDeviceGetPgpuMetadataString") @@ -4189,7 +4099,6 @@ def nvmlDeviceGetPgpuMetadataString(handle): raise NVMLError(ret) return (c_pgpuMetadata.value, c_bufferSize.value) - def nvmlDeviceGetVgpuSchedulerLog(handle): c_vgpu_sched_log = c_nvmlVgpuSchedulerLog_t() fn = _nvmlGetFunctionPointer("nvmlDeviceGetVgpuSchedulerLog") @@ -4197,7 +4106,6 @@ def nvmlDeviceGetVgpuSchedulerLog(handle): _nvmlCheckReturn(ret) return c_vgpu_sched_log - def nvmlDeviceGetVgpuSchedulerState(handle): c_vgpu_sched_state = c_nvmlVgpuSchedulerGetState_t() fn = _nvmlGetFunctionPointer("nvmlDeviceGetVgpuSchedulerState") @@ -4205,7 +4113,6 @@ def nvmlDeviceGetVgpuSchedulerState(handle): _nvmlCheckReturn(ret) return c_vgpu_sched_state - def nvmlDeviceGetVgpuSchedulerCapabilities(handle): c_vgpu_sched_caps = c_nvmlVgpuSchedulerCapabilities_t() fn = _nvmlGetFunctionPointer("nvmlDeviceGetVgpuSchedulerCapabilities") @@ -4213,6 +4120,11 @@ def nvmlDeviceGetVgpuSchedulerCapabilities(handle): _nvmlCheckReturn(ret) return c_vgpu_sched_caps +def nvmlDeviceSetVgpuSchedulerState(handle, sched_state): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetVgpuSchedulerState") + ret = fn(handle, byref(sched_state)) + _nvmlCheckReturn(ret) + return ret def nvmlSetVgpuVersion(vgpuVersion): fn = _nvmlGetFunctionPointer("nvmlSetVgpuVersion") @@ -4220,14 +4132,12 @@ def nvmlSetVgpuVersion(vgpuVersion): _nvmlCheckReturn(ret) return ret - def nvmlGetVgpuVersion(supported, current): fn = _nvmlGetFunctionPointer("nvmlGetVgpuVersion") ret = fn(byref(supported), byref(current)) _nvmlCheckReturn(ret) return ret - def nvmlVgpuInstanceGetAccountingMode(vgpuInstance): c_mode = _nvmlEnableState_t() fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetAccountingMode") @@ -4235,7 +4145,6 @@ def 
nvmlVgpuInstanceGetAccountingMode(vgpuInstance): _nvmlCheckReturn(ret) return c_mode.value - def nvmlVgpuInstanceGetAccountingPids(vgpuInstance): c_pidCount = c_uint() fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetAccountingPids") @@ -4249,7 +4158,6 @@ def nvmlVgpuInstanceGetAccountingPids(vgpuInstance): raise NVMLError(ret) return (c_pidCount, c_pidArray) - def nvmlVgpuInstanceGetAccountingStats(vgpuInstance, pid): c_accountingStats = c_nvmlAccountingStats_t() fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetAccountingStats") @@ -4257,14 +4165,12 @@ def nvmlVgpuInstanceGetAccountingStats(vgpuInstance, pid): _nvmlCheckReturn(ret) return c_accountingStats - def nvmlVgpuInstanceClearAccountingPids(vgpuInstance): fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceClearAccountingPids") ret = fn(vgpuInstance) _nvmlCheckReturn(ret) return ret - def nvmlGetExcludedDeviceCount(): c_count = c_uint() fn = _nvmlGetFunctionPointer("nvmlGetExcludedDeviceCount") @@ -4272,7 +4178,6 @@ def nvmlGetExcludedDeviceCount(): _nvmlCheckReturn(ret) return c_count.value - def nvmlGetExcludedDeviceInfoByIndex(index): c_index = c_uint(index) info = c_nvmlExcludedDeviceInfo_t() @@ -4281,7 +4186,6 @@ def nvmlGetExcludedDeviceInfoByIndex(index): _nvmlCheckReturn(ret) return info - def nvmlDeviceGetHostVgpuMode(handle): c_host_vgpu_mode = _nvmlHostVgpuMode_t() fn = _nvmlGetFunctionPointer("nvmlDeviceGetHostVgpuMode") @@ -4289,7 +4193,6 @@ def nvmlDeviceGetHostVgpuMode(handle): _nvmlCheckReturn(ret) return c_host_vgpu_mode.value - def nvmlDeviceSetMigMode(device, mode): c_activationStatus = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceSetMigMode") @@ -4297,7 +4200,6 @@ def nvmlDeviceSetMigMode(device, mode): _nvmlCheckReturn(ret) return c_activationStatus.value - def nvmlDeviceGetMigMode(device): c_currentMode = c_uint() c_pendingMode = c_uint() @@ -4306,7 +4208,6 @@ def nvmlDeviceGetMigMode(device): _nvmlCheckReturn(ret) return [c_currentMode.value, c_pendingMode.value] - def 
nvmlDeviceGetGpuInstanceProfileInfo(device, profile, version=2): if version == 2: c_info = c_nvmlGpuInstanceProfileInfo_v2_t() @@ -4320,11 +4221,9 @@ def nvmlDeviceGetGpuInstanceProfileInfo(device, profile, version=2): _nvmlCheckReturn(ret) return c_info - # Define function alias for the API exposed by NVML nvmlDeviceGetGpuInstanceProfileInfoV = nvmlDeviceGetGpuInstanceProfileInfo - def nvmlDeviceGetGpuInstanceRemainingCapacity(device, profileId): c_count = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceGetGpuInstanceRemainingCapacity") @@ -4332,14 +4231,12 @@ def nvmlDeviceGetGpuInstanceRemainingCapacity(device, profileId): _nvmlCheckReturn(ret) return c_count.value - def nvmlDeviceGetGpuInstancePossiblePlacements(device, profileId, placementsRef, countRef): fn = _nvmlGetFunctionPointer("nvmlDeviceGetGpuInstancePossiblePlacements_v2") ret = fn(device, profileId, placementsRef, countRef) _nvmlCheckReturn(ret) return ret - def nvmlDeviceCreateGpuInstance(device, profileId): c_instance = c_nvmlGpuInstance_t() fn = _nvmlGetFunctionPointer("nvmlDeviceCreateGpuInstance") @@ -4347,7 +4244,6 @@ def nvmlDeviceCreateGpuInstance(device, profileId): _nvmlCheckReturn(ret) return c_instance - def nvmlDeviceCreateGpuInstanceWithPlacement(device, profileId, placement): c_instance = c_nvmlGpuInstance_t() fn = _nvmlGetFunctionPointer("nvmlDeviceCreateGpuInstanceWithPlacement") @@ -4355,21 +4251,18 @@ def nvmlDeviceCreateGpuInstanceWithPlacement(device, profileId, placement): _nvmlCheckReturn(ret) return c_instance - def nvmlGpuInstanceDestroy(gpuInstance): fn = _nvmlGetFunctionPointer("nvmlGpuInstanceDestroy") ret = fn(gpuInstance) _nvmlCheckReturn(ret) return ret - def nvmlDeviceGetGpuInstances(device, profileId, gpuInstancesRef, countRef): fn = _nvmlGetFunctionPointer("nvmlDeviceGetGpuInstances") ret = fn(device, profileId, gpuInstancesRef, countRef) _nvmlCheckReturn(ret) return ret - def nvmlDeviceGetGpuInstanceById(device, gpuInstanceId): c_instance = c_nvmlGpuInstance_t() fn 
= _nvmlGetFunctionPointer("nvmlDeviceGetGpuInstanceById") @@ -4377,7 +4270,6 @@ def nvmlDeviceGetGpuInstanceById(device, gpuInstanceId): _nvmlCheckReturn(ret) return c_instance - def nvmlGpuInstanceGetInfo(gpuInstance): c_info = c_nvmlGpuInstanceInfo_t() fn = _nvmlGetFunctionPointer("nvmlGpuInstanceGetInfo") @@ -4385,7 +4277,6 @@ def nvmlGpuInstanceGetInfo(gpuInstance): _nvmlCheckReturn(ret) return c_info - def nvmlGpuInstanceGetComputeInstanceProfileInfo(device, profile, engProfile, version=2): if version == 2: c_info = c_nvmlComputeInstanceProfileInfo_v2_t() @@ -4394,16 +4285,14 @@ def nvmlGpuInstanceGetComputeInstanceProfileInfo(device, profile, engProfile, ve c_info = c_nvmlComputeInstanceProfileInfo_t() fn = _nvmlGetFunctionPointer("nvmlGpuInstanceGetComputeInstanceProfileInfo") else: - raise NVMLError(NVML_ERROR_FUNCTION_NOT_FOUND) + raise NVMLError(NVML_ERROR_FUNCTION_NOT_FOUND) ret = fn(device, profile, engProfile, byref(c_info)) _nvmlCheckReturn(ret) return c_info - # Define function alias for the API exposed by NVML nvmlGpuInstanceGetComputeInstanceProfileInfoV = nvmlGpuInstanceGetComputeInstanceProfileInfo - def nvmlGpuInstanceGetComputeInstanceRemainingCapacity(gpuInstance, profileId): c_count = c_uint() fn = _nvmlGetFunctionPointer("nvmlGpuInstanceGetComputeInstanceRemainingCapacity") @@ -4411,14 +4300,12 @@ def nvmlGpuInstanceGetComputeInstanceRemainingCapacity(gpuInstance, profileId): _nvmlCheckReturn(ret) return c_count.value - def nvmlGpuInstanceGetComputeInstancePossiblePlacements(gpuInstance, profileId, placementsRef, countRef): fn = _nvmlGetFunctionPointer("nvmlGpuInstanceGetComputeInstancePossiblePlacements") ret = fn(gpuInstance, profileId, placementsRef, countRef) _nvmlCheckReturn(ret) return ret - def nvmlGpuInstanceCreateComputeInstance(gpuInstance, profileId): c_instance = c_nvmlComputeInstance_t() fn = _nvmlGetFunctionPointer("nvmlGpuInstanceCreateComputeInstance") @@ -4426,7 +4313,6 @@ def 
nvmlGpuInstanceCreateComputeInstance(gpuInstance, profileId): _nvmlCheckReturn(ret) return c_instance - def nvmlGpuInstanceCreateComputeInstanceWithPlacement(gpuInstance, profileId, placement): c_instance = c_nvmlComputeInstance_t() fn = _nvmlGetFunctionPointer("nvmlGpuInstanceCreateComputeInstanceWithPlacement") @@ -4434,21 +4320,18 @@ def nvmlGpuInstanceCreateComputeInstanceWithPlacement(gpuInstance, profileId, pl _nvmlCheckReturn(ret) return c_instance - def nvmlComputeInstanceDestroy(computeInstance): fn = _nvmlGetFunctionPointer("nvmlComputeInstanceDestroy") ret = fn(computeInstance) _nvmlCheckReturn(ret) return ret - def nvmlGpuInstanceGetComputeInstances(gpuInstance, profileId, computeInstancesRef, countRef): fn = _nvmlGetFunctionPointer("nvmlGpuInstanceGetComputeInstances") ret = fn(gpuInstance, profileId, computeInstancesRef, countRef) _nvmlCheckReturn(ret) return ret - def nvmlGpuInstanceGetComputeInstanceById(gpuInstance, computeInstanceId): c_instance = c_nvmlComputeInstance_t() fn = _nvmlGetFunctionPointer("nvmlGpuInstanceGetComputeInstanceById") @@ -4456,7 +4339,6 @@ def nvmlGpuInstanceGetComputeInstanceById(gpuInstance, computeInstanceId): _nvmlCheckReturn(ret) return c_instance - def nvmlComputeInstanceGetInfo_v2(computeInstance): c_info = c_nvmlComputeInstanceInfo_t() fn = _nvmlGetFunctionPointer("nvmlComputeInstanceGetInfo_v2") @@ -4464,11 +4346,9 @@ def nvmlComputeInstanceGetInfo_v2(computeInstance): _nvmlCheckReturn(ret) return c_info - def nvmlComputeInstanceGetInfo(computeInstance): return nvmlComputeInstanceGetInfo_v2(computeInstance) - def nvmlDeviceIsMigDeviceHandle(device): c_isMigDevice = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceIsMigDeviceHandle") @@ -4476,7 +4356,6 @@ def nvmlDeviceIsMigDeviceHandle(device): _nvmlCheckReturn(ret) return c_isMigDevice - def nvmlDeviceGetGpuInstanceId(device): c_gpuInstanceId = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceGetGpuInstanceId") @@ -4484,7 +4363,6 @@ def 
nvmlDeviceGetGpuInstanceId(device): _nvmlCheckReturn(ret) return c_gpuInstanceId.value - def nvmlDeviceGetComputeInstanceId(device): c_computeInstanceId = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceGetComputeInstanceId") @@ -4492,7 +4370,6 @@ def nvmlDeviceGetComputeInstanceId(device): _nvmlCheckReturn(ret) return c_computeInstanceId.value - def nvmlDeviceGetMaxMigDeviceCount(device): c_count = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceGetMaxMigDeviceCount") @@ -4500,7 +4377,6 @@ def nvmlDeviceGetMaxMigDeviceCount(device): _nvmlCheckReturn(ret) return c_count.value - def nvmlDeviceGetMigDeviceHandleByIndex(device, index): c_index = c_uint(index) migDevice = c_nvmlDevice_t() @@ -4509,7 +4385,6 @@ def nvmlDeviceGetMigDeviceHandleByIndex(device, index): _nvmlCheckReturn(ret) return migDevice - def nvmlDeviceGetDeviceHandleFromMigDeviceHandle(migDevice): device = c_nvmlDevice_t() fn = _nvmlGetFunctionPointer("nvmlDeviceGetDeviceHandleFromMigDeviceHandle") @@ -4517,7 +4392,6 @@ def nvmlDeviceGetDeviceHandleFromMigDeviceHandle(migDevice): _nvmlCheckReturn(ret) return device - def nvmlDeviceGetAttributes_v2(device): c_attrs = c_nvmlDeviceAttributes() fn = _nvmlGetFunctionPointer("nvmlDeviceGetAttributes_v2") @@ -4525,11 +4399,9 @@ def nvmlDeviceGetAttributes_v2(device): _nvmlCheckReturn(ret) return c_attrs - def nvmlDeviceGetAttributes(device): return nvmlDeviceGetAttributes_v2(device) - def nvmlDeviceGetRemappedRows(device): fn = _nvmlGetFunctionPointer("nvmlDeviceGetRemappedRows") c_corr = c_uint() @@ -4540,7 +4412,6 @@ def nvmlDeviceGetRemappedRows(device): _nvmlCheckReturn(ret) return (c_corr.value, c_unc.value, c_bpending.value, c_bfailure.value) - def nvmlDeviceGetRowRemapperHistogram(device): c_vals = c_nvmlRowRemapperHistogramValues() fn = _nvmlGetFunctionPointer("nvmlDeviceGetRowRemapperHistogram") @@ -4548,7 +4419,6 @@ def nvmlDeviceGetRowRemapperHistogram(device): _nvmlCheckReturn(ret) return c_vals - def nvmlDeviceGetArchitecture(device): arch = 
_nvmlDeviceArchitecture_t() fn = _nvmlGetFunctionPointer("nvmlDeviceGetArchitecture") @@ -4556,7 +4426,6 @@ def nvmlDeviceGetArchitecture(device): _nvmlCheckReturn(ret) return arch.value - def nvmlDeviceGetBusType(device): c_busType = _nvmlBusType_t() fn = _nvmlGetFunctionPointer("nvmlDeviceGetBusType") @@ -4564,7 +4433,6 @@ def nvmlDeviceGetBusType(device): _nvmlCheckReturn(ret) return c_busType.value - def nvmlDeviceGetIrqNum(device): c_irqNum = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceGetIrqNum") @@ -4572,7 +4440,6 @@ def nvmlDeviceGetIrqNum(device): _nvmlCheckReturn(ret) return c_irqNum.value - def nvmlDeviceGetNumGpuCores(device): c_numCores = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceGetNumGpuCores") @@ -4580,7 +4447,6 @@ def nvmlDeviceGetNumGpuCores(device): _nvmlCheckReturn(ret) return c_numCores.value - def nvmlDeviceGetPowerSource(device): c_powerSource = _nvmlPowerSource_t() fn = _nvmlGetFunctionPointer("nvmlDeviceGetPowerSource") @@ -4588,7 +4454,6 @@ def nvmlDeviceGetPowerSource(device): _nvmlCheckReturn(ret) return c_powerSource.value - def nvmlDeviceGetMemoryBusWidth(device): c_memBusWidth = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceGetMemoryBusWidth") @@ -4596,7 +4461,6 @@ def nvmlDeviceGetMemoryBusWidth(device): _nvmlCheckReturn(ret) return c_memBusWidth.value - def nvmlDeviceGetPcieLinkMaxSpeed(device): c_speed = _nvmlPcieLinkMaxSpeed_t() fn = _nvmlGetFunctionPointer("nvmlDeviceGetPcieLinkMaxSpeed") @@ -4604,7 +4468,6 @@ def nvmlDeviceGetPcieLinkMaxSpeed(device): _nvmlCheckReturn(ret) return c_speed.value - def nvmlDeviceGetAdaptiveClockInfoStatus(device): c_adaptiveClockInfoStatus = _nvmlAdaptiveClockInfoStatus_t() fn = _nvmlGetFunctionPointer("nvmlDeviceGetAdaptiveClockInfoStatus") @@ -4612,7 +4475,6 @@ def nvmlDeviceGetAdaptiveClockInfoStatus(device): _nvmlCheckReturn(ret) return c_adaptiveClockInfoStatus.value - def nvmlDeviceGetPcieSpeed(device): c_speed = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceGetPcieSpeed") 
@@ -4620,39 +4482,34 @@ def nvmlDeviceGetPcieSpeed(device): _nvmlCheckReturn(ret) return c_speed.value - def nvmlDeviceGetDynamicPstatesInfo(device, c_dynamicpstatesinfo): - fn = _nvmlGetFunctionPointer("nvmlDeviceGetDynamicPstatesInfo") + fn = _nvmlGetFunctionPointer("nvmlDeviceGetDynamicPstatesInfo"); ret = fn(device, c_dynamicpstatesinfo) _nvmlCheckReturn(ret) return ret - def nvmlDeviceSetFanSpeed_v2(handle, index, speed): - fn = _nvmlGetFunctionPointer("nvmlDeviceSetFanSpeed_v2") + fn = _nvmlGetFunctionPointer("nvmlDeviceSetFanSpeed_v2"); ret = fn(handle, index, speed) _nvmlCheckReturn(ret) return ret - def nvmlDeviceGetThermalSettings(device, sensorindex, c_thermalsettings): - fn = _nvmlGetFunctionPointer("nvmlDeviceGetThermalSettings") + fn = _nvmlGetFunctionPointer("nvmlDeviceGetThermalSettings"); ret = fn(device, sensorindex, c_thermalsettings) _nvmlCheckReturn(ret) return ret - def nvmlDeviceGetMinMaxClockOfPState(device, type, pstate, minClockMHz, maxClockMHz): - fn = _nvmlGetFunctionPointer("nvmlDeviceGetMinMaxClockOfPState") + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMinMaxClockOfPState"); ret = fn(device, _nvmlClockType_t(type), _nvmlClockType_t(pstate), minClockMHz, maxClockMHz) _nvmlCheckReturn(ret) return ret - def nvmlDeviceGetSupportedPerformanceStates(device): pstates = [] c_count = c_uint(NVML_MAX_GPU_PERF_PSTATES) - c_size = sizeof(c_uint) * c_count.value + c_size = sizeof(c_uint)*c_count.value # NOTE: use 'c_uint' to represent the size of the nvmlPstate_t enumeration. 
pstates_array = _nvmlPstates_t * c_count.value @@ -4668,7 +4525,6 @@ def nvmlDeviceGetSupportedPerformanceStates(device): return pstates - def nvmlDeviceGetGpcClkVfOffset(device): offset = c_int32() fn = _nvmlGetFunctionPointer("nvmlDeviceGetGpcClkVfOffset") @@ -4676,7 +4532,6 @@ def nvmlDeviceGetGpcClkVfOffset(device): _nvmlCheckReturn(ret) return offset.value - def nvmlDeviceSetGpcClkVfOffset(device, offset): c_offset = c_int32(offset) fn = _nvmlGetFunctionPointer("nvmlDeviceSetGpcClkVfOffset") @@ -4684,14 +4539,12 @@ def nvmlDeviceSetGpcClkVfOffset(device, offset): _nvmlCheckReturn(ret) return ret - def nvmlDeviceGetGpcClkMinMaxVfOffset(device, minOffset, maxOffset): fn = _nvmlGetFunctionPointer("nvmlDeviceGetGpcClkMinMaxVfOffset") ret = fn(device, minOffset, maxOffset) _nvmlCheckReturn(ret) return ret - def nvmlDeviceGetMemClkVfOffset(device): offset = c_int32() fn = _nvmlGetFunctionPointer("nvmlDeviceGetMemClkVfOffset") @@ -4699,7 +4552,6 @@ def nvmlDeviceGetMemClkVfOffset(device): _nvmlCheckReturn(ret) return offset.value - def nvmlDeviceSetMemClkVfOffset(device, offset): c_offset = c_int32(offset) fn = _nvmlGetFunctionPointer("nvmlDeviceSetMemClkVfOffset") @@ -4707,13 +4559,75 @@ def nvmlDeviceSetMemClkVfOffset(device, offset): _nvmlCheckReturn(ret) return ret - def nvmlDeviceGetMemClkMinMaxVfOffset(device, minOffset, maxOffset): fn = _nvmlGetFunctionPointer("nvmlDeviceGetMemClkMinMaxVfOffset") ret = fn(device, minOffset, maxOffset) _nvmlCheckReturn(ret) return ret +def nvmlSystemSetConfComputeGpusReadyState(state): + c_state = c_uint(state) + fn = _nvmlGetFunctionPointer("nvmlSystemSetConfComputeGpusReadyState") + ret = fn(c_state) + _nvmlCheckReturn(ret) + return ret + +def nvmlSystemGetConfComputeGpusReadyState(): + c_state = c_uint() + fn = _nvmlGetFunctionPointer("nvmlSystemGetConfComputeGpusReadyState") + ret = fn(byref(c_state)) + _nvmlCheckReturn(ret) + return c_state.value + +def nvmlSystemGetConfComputeCapabilities(): + c_ccSysCaps = 
c_nvmlConfComputeSystemCaps_t() + fn = _nvmlGetFunctionPointer("nvmlSystemGetConfComputeCapabilities") + ret = fn(byref(c_ccSysCaps)) + _nvmlCheckReturn(ret) + return c_ccSysCaps + +def nvmlSystemGetConfComputeState(): + c_state = c_nvmlConfComputeSystemState_t() + fn = _nvmlGetFunctionPointer("nvmlSystemGetConfComputeState") + ret = fn(byref(c_state)) + _nvmlCheckReturn(ret) + return c_state + +def nvmlDeviceSetConfComputeUnprotectedMemSize(device, c_ccMemSize): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetConfComputeUnprotectedMemSize") + ret = fn(device, c_ccMemSize) + _nvmlCheckReturn(ret) + return ret + +def nvmlDeviceGetConfComputeMemSizeInfo(device): + c_ccMemSize = c_nvmlConfComputeMemSizeInfo_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetConfComputeMemSizeInfo") + ret = fn(device, byref(c_ccMemSize)) + _nvmlCheckReturn(ret) + return c_ccMemSize + +def nvmlDeviceGetConfComputeProtectedMemoryUsage(device): + c_memory = c_nvmlMemory_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetConfComputeProtectedMemoryUsage") + ret = fn(device, byref(c_memory)) + _nvmlCheckReturn(ret) + return c_memory + +def nvmlDeviceGetConfComputeGpuCertificate(device): + c_cert = c_nvmlConfComputeGpuCertificate_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetConfComputeGpuCertificate") + ret = fn(device, byref(c_cert)) + _nvmlCheckReturn(ret) + return c_cert + +def nvmlDeviceGetConfComputeGpuAttestationReport(device, c_nonce): + c_attestReport = c_nvmlConfComputeGpuAttestationReport_t() + c_nonce_arr = (c_uint8 * len(c_nonce))(*(c_nonce)) + setattr(c_attestReport, 'nonce', c_nonce_arr) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetConfComputeGpuAttestationReport") + ret = fn(device, byref(c_attestReport)) + _nvmlCheckReturn(ret) + return c_attestReport ## GPM ## ######### @@ -4721,77 +4635,76 @@ def nvmlDeviceGetMemClkMinMaxVfOffset(device, minOffset, maxOffset): ## Enums/defines #### GPM Metric Identifiers -NVML_GPM_METRIC_GRAPHICS_UTIL = 1 # Percentage of time any 
compute/graphics app was active on the GPU. 0.0 - 100.0 -NVML_GPM_METRIC_SM_UTIL = 2 # Percentage of SMs that were busy. 0.0 - 100.0 -NVML_GPM_METRIC_SM_OCCUPANCY = 3 # Percentage of warps that were active vs theoretical maximum. 0.0 - 100.0 -NVML_GPM_METRIC_INTEGER_UTIL = 4 # Percentage of time the GPU's SMs were doing integer operations. 0.0 - 100.0 -NVML_GPM_METRIC_ANY_TENSOR_UTIL = 5 # Percentage of time the GPU's SMs were doing ANY tensor operations. 0.0 - 100.0 -NVML_GPM_METRIC_DFMA_TENSOR_UTIL = 6 # Percentage of time the GPU's SMs were doing DFMA tensor operations. 0.0 - 100.0 -NVML_GPM_METRIC_HMMA_TENSOR_UTIL = 7 # Percentage of time the GPU's SMs were doing HMMA tensor operations. 0.0 - 100.0 -NVML_GPM_METRIC_IMMA_TENSOR_UTIL = 9 # Percentage of time the GPU's SMs were doing IMMA tensor operations. 0.0 - 100.0 -NVML_GPM_METRIC_DRAM_BW_UTIL = 10 # Percentage of DRAM bw used vs theoretical maximum. 0.0 - 100.0 -NVML_GPM_METRIC_FP64_UTIL = 11 # Percentage of time the GPU's SMs were doing non-tensor FP64 math. 0.0 - 100.0 -NVML_GPM_METRIC_FP32_UTIL = 12 # Percentage of time the GPU's SMs were doing non-tensor FP32 math. 0.0 - 100.0 -NVML_GPM_METRIC_FP16_UTIL = 13 # Percentage of time the GPU's SMs were doing non-tensor FP16 math. 0.0 - 100.0 -NVML_GPM_METRIC_PCIE_TX_PER_SEC = 20 # PCIe traffic from this GPU in MiB/sec -NVML_GPM_METRIC_PCIE_RX_PER_SEC = 21 # PCIe traffic to this GPU in MiB/sec -NVML_GPM_METRIC_NVDEC_0_UTIL = 30 # Percent utilization of NVDEC 0. 0.0 - 100.0 -NVML_GPM_METRIC_NVDEC_1_UTIL = 31 # Percent utilization of NVDEC 1. 0.0 - 100.0 -NVML_GPM_METRIC_NVDEC_2_UTIL = 32 # Percent utilization of NVDEC 2. 0.0 - 100.0 -NVML_GPM_METRIC_NVDEC_3_UTIL = 33 # Percent utilization of NVDEC 3. 0.0 - 100.0 -NVML_GPM_METRIC_NVDEC_4_UTIL = 34 # Percent utilization of NVDEC 4. 0.0 - 100.0 -NVML_GPM_METRIC_NVDEC_5_UTIL = 35 # Percent utilization of NVDEC 5. 0.0 - 100.0 -NVML_GPM_METRIC_NVDEC_6_UTIL = 36 # Percent utilization of NVDEC 6. 
0.0 - 100.0 -NVML_GPM_METRIC_NVDEC_7_UTIL = 37 # Percent utilization of NVDEC 7. 0.0 - 100.0 -NVML_GPM_METRIC_NVJPG_0_UTIL = 40 # Percent utilization of NVJPG 0. 0.0 - 100.0 -NVML_GPM_METRIC_NVJPG_1_UTIL = 41 # Percent utilization of NVJPG 1. 0.0 - 100.0 -NVML_GPM_METRIC_NVJPG_2_UTIL = 42 # Percent utilization of NVJPG 2. 0.0 - 100.0 -NVML_GPM_METRIC_NVJPG_3_UTIL = 43 # Percent utilization of NVJPG 3. 0.0 - 100.0 -NVML_GPM_METRIC_NVJPG_4_UTIL = 44 # Percent utilization of NVJPG 4. 0.0 - 100.0 -NVML_GPM_METRIC_NVJPG_5_UTIL = 45 # Percent utilization of NVJPG 5. 0.0 - 100.0 -NVML_GPM_METRIC_NVJPG_6_UTIL = 46 # Percent utilization of NVJPG 6. 0.0 - 100.0 -NVML_GPM_METRIC_NVJPG_7_UTIL = 47 # Percent utilization of NVJPG 7. 0.0 - 100.0 -NVML_GPM_METRIC_NVOFA_0_UTIL = 50 # Percent utilization of NVOFA 0. 0.0 - 100.0 -NVML_GPM_METRIC_NVLINK_TOTAL_RX_PER_SEC = 60 # NvLink read bandwidth for all links in MiB/sec -NVML_GPM_METRIC_NVLINK_TOTAL_TX_PER_SEC = 61 # NvLink write bandwidth for all links in MiB/sec -NVML_GPM_METRIC_NVLINK_L0_RX_PER_SEC = 62 # NvLink read bandwidth for link 0 in MiB/sec -NVML_GPM_METRIC_NVLINK_L0_TX_PER_SEC = 63 # NvLink write bandwidth for link 0 in MiB/sec -NVML_GPM_METRIC_NVLINK_L1_RX_PER_SEC = 64 # NvLink read bandwidth for link 1 in MiB/sec -NVML_GPM_METRIC_NVLINK_L1_TX_PER_SEC = 65 # NvLink write bandwidth for link 1 in MiB/sec -NVML_GPM_METRIC_NVLINK_L2_RX_PER_SEC = 66 # NvLink read bandwidth for link 2 in MiB/sec -NVML_GPM_METRIC_NVLINK_L2_TX_PER_SEC = 67 # NvLink write bandwidth for link 2 in MiB/sec -NVML_GPM_METRIC_NVLINK_L3_RX_PER_SEC = 68 # NvLink read bandwidth for link 3 in MiB/sec -NVML_GPM_METRIC_NVLINK_L3_TX_PER_SEC = 69 # NvLink write bandwidth for link 3 in MiB/sec -NVML_GPM_METRIC_NVLINK_L4_RX_PER_SEC = 70 # NvLink read bandwidth for link 4 in MiB/sec -NVML_GPM_METRIC_NVLINK_L4_TX_PER_SEC = 71 # NvLink write bandwidth for link 4 in MiB/sec -NVML_GPM_METRIC_NVLINK_L5_RX_PER_SEC = 72 # NvLink read bandwidth for link 5 in MiB/sec 
-NVML_GPM_METRIC_NVLINK_L5_TX_PER_SEC = 73 # NvLink write bandwidth for link 5 in MiB/sec -NVML_GPM_METRIC_NVLINK_L6_RX_PER_SEC = 74 # NvLink read bandwidth for link 6 in MiB/sec -NVML_GPM_METRIC_NVLINK_L6_TX_PER_SEC = 75 # NvLink write bandwidth for link 6 in MiB/sec -NVML_GPM_METRIC_NVLINK_L7_RX_PER_SEC = 76 # NvLink read bandwidth for link 7 in MiB/sec -NVML_GPM_METRIC_NVLINK_L7_TX_PER_SEC = 77 # NvLink write bandwidth for link 7 in MiB/sec -NVML_GPM_METRIC_NVLINK_L8_RX_PER_SEC = 78 # NvLink read bandwidth for link 8 in MiB/sec -NVML_GPM_METRIC_NVLINK_L8_TX_PER_SEC = 79 # NvLink write bandwidth for link 8 in MiB/sec -NVML_GPM_METRIC_NVLINK_L9_RX_PER_SEC = 80 # NvLink read bandwidth for link 9 in MiB/sec -NVML_GPM_METRIC_NVLINK_L9_TX_PER_SEC = 81 # NvLink write bandwidth for link 9 in MiB/sec -NVML_GPM_METRIC_NVLINK_L10_RX_PER_SEC = 82 # NvLink read bandwidth for link 10 in MiB/sec -NVML_GPM_METRIC_NVLINK_L10_TX_PER_SEC = 83 # NvLink write bandwidth for link 10 in MiB/sec -NVML_GPM_METRIC_NVLINK_L11_RX_PER_SEC = 84 # NvLink read bandwidth for link 11 in MiB/sec -NVML_GPM_METRIC_NVLINK_L11_TX_PER_SEC = 85 # NvLink write bandwidth for link 11 in MiB/sec -NVML_GPM_METRIC_NVLINK_L12_RX_PER_SEC = 86 # NvLink read bandwidth for link 12 in MiB/sec -NVML_GPM_METRIC_NVLINK_L12_TX_PER_SEC = 87 # NvLink write bandwidth for link 12 in MiB/sec -NVML_GPM_METRIC_NVLINK_L13_RX_PER_SEC = 88 # NvLink read bandwidth for link 13 in MiB/sec -NVML_GPM_METRIC_NVLINK_L13_TX_PER_SEC = 89 # NvLink write bandwidth for link 13 in MiB/sec -NVML_GPM_METRIC_NVLINK_L14_RX_PER_SEC = 90 # NvLink read bandwidth for link 14 in MiB/sec -NVML_GPM_METRIC_NVLINK_L14_TX_PER_SEC = 91 # NvLink write bandwidth for link 14 in MiB/sec -NVML_GPM_METRIC_NVLINK_L15_RX_PER_SEC = 92 # NvLink read bandwidth for link 15 in MiB/sec -NVML_GPM_METRIC_NVLINK_L15_TX_PER_SEC = 93 # NvLink write bandwidth for link 15 in MiB/sec -NVML_GPM_METRIC_NVLINK_L16_RX_PER_SEC = 94 # NvLink read bandwidth for link 16 in MiB/sec 
-NVML_GPM_METRIC_NVLINK_L16_TX_PER_SEC = 95 # NvLink write bandwidth for link 16 in MiB/sec -NVML_GPM_METRIC_NVLINK_L17_RX_PER_SEC = 96 # NvLink read bandwidth for link 17 in MiB/sec -NVML_GPM_METRIC_NVLINK_L17_TX_PER_SEC = 97 # NvLink write bandwidth for link 17 in MiB/sec -NVML_GPM_METRIC_MAX = 98 - +NVML_GPM_METRIC_GRAPHICS_UTIL = 1 # Percentage of time any compute/graphics app was active on the GPU. 0.0 - 100.0 +NVML_GPM_METRIC_SM_UTIL = 2 # Percentage of SMs that were busy. 0.0 - 100.0 +NVML_GPM_METRIC_SM_OCCUPANCY = 3 # Percentage of warps that were active vs theoretical maximum. 0.0 - 100.0 +NVML_GPM_METRIC_INTEGER_UTIL = 4 # Percentage of time the GPU's SMs were doing integer operations. 0.0 - 100.0 +NVML_GPM_METRIC_ANY_TENSOR_UTIL = 5 # Percentage of time the GPU's SMs were doing ANY tensor operations. 0.0 - 100.0 +NVML_GPM_METRIC_DFMA_TENSOR_UTIL = 6 # Percentage of time the GPU's SMs were doing DFMA tensor operations. 0.0 - 100.0 +NVML_GPM_METRIC_HMMA_TENSOR_UTIL = 7 # Percentage of time the GPU's SMs were doing HMMA tensor operations. 0.0 - 100.0 +NVML_GPM_METRIC_IMMA_TENSOR_UTIL = 9 # Percentage of time the GPU's SMs were doing IMMA tensor operations. 0.0 - 100.0 +NVML_GPM_METRIC_DRAM_BW_UTIL = 10 # Percentage of DRAM bw used vs theoretical maximum. 0.0 - 100.0 +NVML_GPM_METRIC_FP64_UTIL = 11 # Percentage of time the GPU's SMs were doing non-tensor FP64 math. 0.0 - 100.0 +NVML_GPM_METRIC_FP32_UTIL = 12 # Percentage of time the GPU's SMs were doing non-tensor FP32 math. 0.0 - 100.0 +NVML_GPM_METRIC_FP16_UTIL = 13 # Percentage of time the GPU's SMs were doing non-tensor FP16 math. 0.0 - 100.0 +NVML_GPM_METRIC_PCIE_TX_PER_SEC = 20 # PCIe traffic from this GPU in MiB/sec +NVML_GPM_METRIC_PCIE_RX_PER_SEC = 21 # PCIe traffic to this GPU in MiB/sec +NVML_GPM_METRIC_NVDEC_0_UTIL = 30 # Percent utilization of NVDEC 0. 0.0 - 100.0 +NVML_GPM_METRIC_NVDEC_1_UTIL = 31 # Percent utilization of NVDEC 1. 
0.0 - 100.0 +NVML_GPM_METRIC_NVDEC_2_UTIL = 32 # Percent utilization of NVDEC 2. 0.0 - 100.0 +NVML_GPM_METRIC_NVDEC_3_UTIL = 33 # Percent utilization of NVDEC 3. 0.0 - 100.0 +NVML_GPM_METRIC_NVDEC_4_UTIL = 34 # Percent utilization of NVDEC 4. 0.0 - 100.0 +NVML_GPM_METRIC_NVDEC_5_UTIL = 35 # Percent utilization of NVDEC 5. 0.0 - 100.0 +NVML_GPM_METRIC_NVDEC_6_UTIL = 36 # Percent utilization of NVDEC 6. 0.0 - 100.0 +NVML_GPM_METRIC_NVDEC_7_UTIL = 37 # Percent utilization of NVDEC 7. 0.0 - 100.0 +NVML_GPM_METRIC_NVJPG_0_UTIL = 40 # Percent utilization of NVJPG 0. 0.0 - 100.0 +NVML_GPM_METRIC_NVJPG_1_UTIL = 41 # Percent utilization of NVJPG 1. 0.0 - 100.0 +NVML_GPM_METRIC_NVJPG_2_UTIL = 42 # Percent utilization of NVJPG 2. 0.0 - 100.0 +NVML_GPM_METRIC_NVJPG_3_UTIL = 43 # Percent utilization of NVJPG 3. 0.0 - 100.0 +NVML_GPM_METRIC_NVJPG_4_UTIL = 44 # Percent utilization of NVJPG 4. 0.0 - 100.0 +NVML_GPM_METRIC_NVJPG_5_UTIL = 45 # Percent utilization of NVJPG 5. 0.0 - 100.0 +NVML_GPM_METRIC_NVJPG_6_UTIL = 46 # Percent utilization of NVJPG 6. 0.0 - 100.0 +NVML_GPM_METRIC_NVJPG_7_UTIL = 47 # Percent utilization of NVJPG 7. 0.0 - 100.0 +NVML_GPM_METRIC_NVOFA_0_UTIL = 50 # Percent utilization of NVOFA 0. 
0.0 - 100.0 +NVML_GPM_METRIC_NVLINK_TOTAL_RX_PER_SEC = 60 # NvLink read bandwidth for all links in MiB/sec +NVML_GPM_METRIC_NVLINK_TOTAL_TX_PER_SEC = 61 # NvLink write bandwidth for all links in MiB/sec +NVML_GPM_METRIC_NVLINK_L0_RX_PER_SEC = 62 # NvLink read bandwidth for link 0 in MiB/sec +NVML_GPM_METRIC_NVLINK_L0_TX_PER_SEC = 63 # NvLink write bandwidth for link 0 in MiB/sec +NVML_GPM_METRIC_NVLINK_L1_RX_PER_SEC = 64 # NvLink read bandwidth for link 1 in MiB/sec +NVML_GPM_METRIC_NVLINK_L1_TX_PER_SEC = 65 # NvLink write bandwidth for link 1 in MiB/sec +NVML_GPM_METRIC_NVLINK_L2_RX_PER_SEC = 66 # NvLink read bandwidth for link 2 in MiB/sec +NVML_GPM_METRIC_NVLINK_L2_TX_PER_SEC = 67 # NvLink write bandwidth for link 2 in MiB/sec +NVML_GPM_METRIC_NVLINK_L3_RX_PER_SEC = 68 # NvLink read bandwidth for link 3 in MiB/sec +NVML_GPM_METRIC_NVLINK_L3_TX_PER_SEC = 69 # NvLink write bandwidth for link 3 in MiB/sec +NVML_GPM_METRIC_NVLINK_L4_RX_PER_SEC = 70 # NvLink read bandwidth for link 4 in MiB/sec +NVML_GPM_METRIC_NVLINK_L4_TX_PER_SEC = 71 # NvLink write bandwidth for link 4 in MiB/sec +NVML_GPM_METRIC_NVLINK_L5_RX_PER_SEC = 72 # NvLink read bandwidth for link 5 in MiB/sec +NVML_GPM_METRIC_NVLINK_L5_TX_PER_SEC = 73 # NvLink write bandwidth for link 5 in MiB/sec +NVML_GPM_METRIC_NVLINK_L6_RX_PER_SEC = 74 # NvLink read bandwidth for link 6 in MiB/sec +NVML_GPM_METRIC_NVLINK_L6_TX_PER_SEC = 75 # NvLink write bandwidth for link 6 in MiB/sec +NVML_GPM_METRIC_NVLINK_L7_RX_PER_SEC = 76 # NvLink read bandwidth for link 7 in MiB/sec +NVML_GPM_METRIC_NVLINK_L7_TX_PER_SEC = 77 # NvLink write bandwidth for link 7 in MiB/sec +NVML_GPM_METRIC_NVLINK_L8_RX_PER_SEC = 78 # NvLink read bandwidth for link 8 in MiB/sec +NVML_GPM_METRIC_NVLINK_L8_TX_PER_SEC = 79 # NvLink write bandwidth for link 8 in MiB/sec +NVML_GPM_METRIC_NVLINK_L9_RX_PER_SEC = 80 # NvLink read bandwidth for link 9 in MiB/sec +NVML_GPM_METRIC_NVLINK_L9_TX_PER_SEC = 81 # NvLink write bandwidth for link 9 in MiB/sec 
+NVML_GPM_METRIC_NVLINK_L10_RX_PER_SEC = 82 # NvLink read bandwidth for link 10 in MiB/sec +NVML_GPM_METRIC_NVLINK_L10_TX_PER_SEC = 83 # NvLink write bandwidth for link 10 in MiB/sec +NVML_GPM_METRIC_NVLINK_L11_RX_PER_SEC = 84 # NvLink read bandwidth for link 11 in MiB/sec +NVML_GPM_METRIC_NVLINK_L11_TX_PER_SEC = 85 # NvLink write bandwidth for link 11 in MiB/sec +NVML_GPM_METRIC_NVLINK_L12_RX_PER_SEC = 86 # NvLink read bandwidth for link 12 in MiB/sec +NVML_GPM_METRIC_NVLINK_L12_TX_PER_SEC = 87 # NvLink write bandwidth for link 12 in MiB/sec +NVML_GPM_METRIC_NVLINK_L13_RX_PER_SEC = 88 # NvLink read bandwidth for link 13 in MiB/sec +NVML_GPM_METRIC_NVLINK_L13_TX_PER_SEC = 89 # NvLink write bandwidth for link 13 in MiB/sec +NVML_GPM_METRIC_NVLINK_L14_RX_PER_SEC = 90 # NvLink read bandwidth for link 14 in MiB/sec +NVML_GPM_METRIC_NVLINK_L14_TX_PER_SEC = 91 # NvLink write bandwidth for link 14 in MiB/sec +NVML_GPM_METRIC_NVLINK_L15_RX_PER_SEC = 92 # NvLink read bandwidth for link 15 in MiB/sec +NVML_GPM_METRIC_NVLINK_L15_TX_PER_SEC = 93 # NvLink write bandwidth for link 15 in MiB/sec +NVML_GPM_METRIC_NVLINK_L16_RX_PER_SEC = 94 # NvLink read bandwidth for link 16 in MiB/sec +NVML_GPM_METRIC_NVLINK_L16_TX_PER_SEC = 95 # NvLink write bandwidth for link 16 in MiB/sec +NVML_GPM_METRIC_NVLINK_L17_RX_PER_SEC = 96 # NvLink read bandwidth for link 17 in MiB/sec +NVML_GPM_METRIC_NVLINK_L17_TX_PER_SEC = 97 # NvLink write bandwidth for link 17 in MiB/sec +NVML_GPM_METRIC_MAX = 98 ## Structs @@ -4803,14 +4716,10 @@ class c_nvmlUnitInfo_t(_PrintableStructure): ('firmwareVersion', c_char * 96), ] - class struct_c_nvmlGpmSample_t(Structure): - pass # opaque handle - - + pass # opaque handle c_nvmlGpmSample_t = POINTER(struct_c_nvmlGpmSample_t) - class c_metricInfo_t(Structure): _fields_ = [ ("shortName", c_char_p), @@ -4818,7 +4727,6 @@ class c_metricInfo_t(Structure): ("unit", c_char_p), ] - class c_nvmlGpmMetric_t(_PrintableStructure): _fields_ = [ ('metricId', c_uint), @@ -4827,7 
+4735,6 @@ class c_nvmlGpmMetric_t(_PrintableStructure): ('metricInfo', c_metricInfo_t) ] - class c_nvmlGpmMetricsGet_t(_PrintableStructure): _fields_ = [ ('version', c_uint), @@ -4837,20 +4744,16 @@ class c_nvmlGpmMetricsGet_t(_PrintableStructure): ('metrics', c_nvmlGpmMetric_t * NVML_GPM_METRIC_MAX) ] - NVML_GPM_METRICS_GET_VERSION = 1 - class c_nvmlGpmSupport_t(_PrintableStructure): _fields_ = [ ('version', c_uint), ('isSupportedDevice', c_uint), ] - NVML_GPM_SUPPORT_VERSION = 1 - ## Functions def nvmlGpmMetricsGet(metricsGet): @@ -4859,14 +4762,12 @@ def nvmlGpmMetricsGet(metricsGet): _nvmlCheckReturn(ret) return metricsGet - def nvmlGpmSampleFree(gpmSample): fn = _nvmlGetFunctionPointer("nvmlGpmSampleFree") ret = fn(gpmSample) _nvmlCheckReturn(ret) return - def nvmlGpmSampleAlloc(): gpmSample = c_nvmlGpmSample_t() fn = _nvmlGetFunctionPointer("nvmlGpmSampleAlloc") @@ -4874,21 +4775,18 @@ def nvmlGpmSampleAlloc(): _nvmlCheckReturn(ret) return gpmSample - def nvmlGpmSampleGet(device, gpmSample): fn = _nvmlGetFunctionPointer("nvmlGpmSampleGet") ret = fn(device, gpmSample) _nvmlCheckReturn(ret) return gpmSample - def nvmlGpmMigSampleGet(device, gpuInstanceId, gpmSample): fn = _nvmlGetFunctionPointer("nvmlGpmMigSampleGet") ret = fn(device, gpuInstanceId, gpmSample) _nvmlCheckReturn(ret) return gpmSample - def nvmlGpmQueryDeviceSupport(device): gpmSupport = c_nvmlGpmSupport_t() gpmSupport.version = NVML_GPM_SUPPORT_VERSION @@ -4897,35 +4795,20 @@ def nvmlGpmQueryDeviceSupport(device): _nvmlCheckReturn(ret) return gpmSupport - -## CCU ## -######### - -## Enums/defines - -#### CCU Stream State -NVML_COUNTER_COLLECTION_UNIT_STREAM_STATE_DISABLE = 0 -NVML_COUNTER_COLLECTION_UNIT_STREAM_STATE_ENABLE = 1 - - -## Functions - -def nvmlDeviceCcuSetStreamState(device, state): +def nvmlGpmSetStreamingEnabled(device, state): c_state = c_uint(state) - fn = _nvmlGetFunctionPointer("nvmlDeviceCcuSetStreamState") + fn = _nvmlGetFunctionPointer("nvmlGpmSetStreamingEnabled") ret = 
fn(device, c_state) _nvmlCheckReturn(ret) return ret - -def nvmlDeviceCcuGetStreamState(device): +def nvmlGpmQueryIfStreamingEnabled(device): c_state = c_uint() - fn = _nvmlGetFunctionPointer("nvmlDeviceCcuGetStreamState") + fn = _nvmlGetFunctionPointer("nvmlGpmQueryIfStreamingEnabled") ret = fn(device, byref(c_state)) _nvmlCheckReturn(ret) return c_state.value - # Low Power Structure and Function class c_nvmlNvLinkPowerThres_t(Structure): @@ -4933,22 +4816,19 @@ class c_nvmlNvLinkPowerThres_t(Structure): ("lowPwrThreshold", c_uint), ] - def nvmlDeviceSetNvLinkDeviceLowPowerThreshold(device, l1threshold): c_info = c_nvmlNvLinkPowerThres_t() c_info.lowPwrThreshold = l1threshold fn = _nvmlGetFunctionPointer("nvmlDeviceSetNvLinkDeviceLowPowerThreshold") ret = fn(device, byref(c_info)) _nvmlCheckReturn(ret) - return ret - + return ret _nvmlGpuFabricState_t = c_uint NVML_GPU_FABRIC_STATE_NOT_SUPPORTED = 0 -NVML_GPU_FABRIC_STATE_NOT_STARTED = 1 -NVML_GPU_FABRIC_STATE_IN_PROGRESS = 2 -NVML_GPU_FABRIC_STATE_COMPLETED = 3 - +NVML_GPU_FABRIC_STATE_NOT_STARTED = 1 +NVML_GPU_FABRIC_STATE_IN_PROGRESS = 2 +NVML_GPU_FABRIC_STATE_COMPLETED = 3 class c_nvmlGpuFabricInfo_t(_PrintableStructure): _fields_ = [ @@ -4958,9 +4838,56 @@ class c_nvmlGpuFabricInfo_t(_PrintableStructure): ("state", _nvmlGpuFabricState_t) ] - def nvmlDeviceGetGpuFabricInfo(device, gpuFabricInfo): - fn = _nvmlGetFunctionPointer("nvmlDeviceGetGpuFabricInfo") + fn = _nvmlGetFunctionPointer("nvmlDeviceGetGpuFabricInfo"); ret = fn(device, gpuFabricInfo) _nvmlCheckReturn(ret) - return ret \ No newline at end of file + return ret + +###################### +## Enums/defines +#### NVML GPU NVLINK BW MODE +NVML_GPU_NVLINK_BW_MODE_FULL = 0x0 +NVML_GPU_NVLINK_BW_MODE_OFF = 0x1 +NVML_GPU_NVLINK_BW_MODE_MIN = 0x2 +NVML_GPU_NVLINK_BW_MODE_HALF = 0x3 +NVML_GPU_NVLINK_BW_MODE_3QUARTER = 0x4 +NVML_GPU_NVLINK_BW_MODE_COUNT = 0x5 + +def nvmlSystemSetNvlinkBwMode(mode): + fn = _nvmlGetFunctionPointer("nvmlSystemSetNvlinkBwMode") + 
ret = fn(mode) + _nvmlCheckReturn(ret) + return ret + +def nvmlSystemGetNvlinkBwMode(): + mode = c_uint(); + fn = _nvmlGetFunctionPointer("nvmlSystemGetNvlinkBwMode") + ret = fn(byref(mode)) + _nvmlCheckReturn(ret) + return mode.value + +_nvmlPowerScopeType_t = c_uint +NVML_POWER_SCOPE_GPU = 0 +NVML_POWER_SCOPE_MODULE = 1 + +class c_nvmlPowerValue_v2_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('powerScope', _nvmlPowerScopeType_t), + ('powerValueMw', c_uint), + ] + _fmt_ = {'': "%d B"} + +nvmlPowerValue_v2 = 0x0200000C + +def nvmlDeviceSetPowerManagementLimit_v2(device, powerScope, powerLimit, version=nvmlPowerValue_v2): + c_powerScope = _nvmlPowerScopeType_t(powerScope) + c_powerValue = c_nvmlPowerValue_v2_t() + c_powerValue.version = c_uint(version) + c_powerValue.powerScope = c_powerScope + c_powerValue.powerValueMw = c_uint(powerLimit) + fn = _nvmlGetFunctionPointer("nvmlDeviceSetPowerManagementLimit_v2") + ret = fn(device, byref(c_powerValue)) + return ret + diff --git a/clearml_agent/helper/resource_monitor.py b/clearml_agent/helper/resource_monitor.py index 4936a13..9a61866 100644 --- a/clearml_agent/helper/resource_monitor.py +++ b/clearml_agent/helper/resource_monitor.py @@ -1,19 +1,20 @@ from __future__ import unicode_literals, division import logging -import os +import re import shlex from collections import deque from itertools import starmap from threading import Thread, Event from time import time -from typing import Text, Sequence, List, Dict, Optional +from typing import Sequence, List, Union, Dict, Optional import attr import psutil from pathlib2 import Path + +from clearml_agent.definitions import ENV_WORKER_TAGS, ENV_GPU_FRACTIONS from clearml_agent.session import Session -from clearml_agent.definitions import ENV_WORKER_TAGS try: from .gpu import gpustat @@ -87,7 +88,13 @@ class ResourceMonitor(object): self._gpustat_fail = 0 self._gpustat = gpustat self._active_gpus = None + self._default_gpu_utilization = 
session.config.get("agent.resource_monitoring.default_gpu_utilization", 100) + # allow default_gpu_utilization as null in the config, in which case we don't log anything + if self._default_gpu_utilization is not None: + self._default_gpu_utilization = int(self._default_gpu_utilization) + self._gpu_utilization_warning_sent = False self._disk_use_path = str(session.config.get("agent.resource_monitoring.disk_use_path", None) or Path.home()) + self._fractions_handler = GpuFractionsHandler() if session.feature_set != "basic" else None if not worker_tags and ENV_WORKER_TAGS.get(): worker_tags = shlex.split(ENV_WORKER_TAGS.get()) self._worker_tags = worker_tags @@ -237,7 +244,7 @@ class ResourceMonitor(object): try: self._update_readouts() except Exception as ex: - log.warning("failed getting machine stats: %s", report_error(ex)) + log.error("failed getting machine stats: %s", report_error(ex)) self._failure() seconds_since_started += int(round(time() - last_report)) @@ -357,25 +364,47 @@ class ResourceMonitor(object): if self._active_gpus is not False and self._gpustat: try: gpu_stat = self._gpustat.new_query() + report_index = 0 for i, g in enumerate(gpu_stat.gpus): # only monitor the active gpu's, if none were selected, monitor everything if self._active_gpus: uuid = getattr(g, "uuid", None) - if str(i) not in self._active_gpus and (not uuid or uuid not in self._active_gpus): + mig_uuid = getattr(g, "mig_uuid", None) + if ( + str(g.index) not in self._active_gpus + and (not uuid or uuid not in self._active_gpus) + and (not mig_uuid or mig_uuid not in self._active_gpus) + ): continue - stats["gpu_temperature_{:d}".format(i)] = g["temperature.gpu"] - stats["gpu_utilization_{:d}".format(i)] = g["utilization.gpu"] - stats["gpu_mem_usage_{:d}".format(i)] = ( + stats["gpu_temperature_{}".format(report_index)] = g["temperature.gpu"] + + if g["utilization.gpu"] is not None: + stats["gpu_utilization_{}".format(report_index)] = g["utilization.gpu"] + elif 
self._default_gpu_utilization is not None: + stats["gpu_utilization_{}".format(report_index)] = self._default_gpu_utilization + if getattr(g, "mig_index", None) is None and not self._gpu_utilization_warning_sent: + # this shouldn't happen for non-MIGs, warn the user about it + log.error("Failed fetching GPU utilization") + self._gpu_utilization_warning_sent = True + + stats["gpu_mem_usage_{}".format(report_index)] = ( 100.0 * g["memory.used"] / g["memory.total"] ) # already in MBs - stats["gpu_mem_free_{:d}".format(i)] = ( + stats["gpu_mem_free_{}".format(report_index)] = ( g["memory.total"] - g["memory.used"] ) - stats["gpu_mem_used_%d" % i] = g["memory.used"] + + stats["gpu_mem_used_{}".format(report_index)] = g["memory.used"] or 0 + + if self._fractions_handler: + fractions = self._fractions_handler.fractions + stats["gpu_fraction_{}".format(report_index)] = \ + (fractions[i] if i < len(fractions) else fractions[-1]) if fractions else 1.0 + except Exception as ex: # something happened and we can't use gpu stats, - log.warning("failed getting machine stats: %s", report_error(ex)) + log.error("failed getting machine stats: %s", report_error(ex)) self._failure() return stats @@ -388,19 +417,137 @@ class ResourceMonitor(object): ) self._gpustat = None - BACKEND_STAT_MAP = {"cpu_usage_*": "cpu_usage", - "cpu_temperature_*": "cpu_temperature", - "disk_free_percent": "disk_free_home", - "io_read_mbs": "disk_read", - "io_write_mbs": "disk_write", - "network_tx_mbs": "network_tx", - "network_rx_mbs": "network_rx", - "memory_free": "memory_free", - "memory_used": "memory_used", - "gpu_temperature_*": "gpu_temperature", - "gpu_mem_used_*": "gpu_memory_used", - "gpu_mem_free_*": "gpu_memory_free", - "gpu_utilization_*": "gpu_usage"} + BACKEND_STAT_MAP = { + "cpu_usage_*": "cpu_usage", + "cpu_temperature_*": "cpu_temperature", + "disk_free_percent": "disk_free_home", + "io_read_mbs": "disk_read", + "io_write_mbs": "disk_write", + "network_tx_mbs": "network_tx", + 
"network_rx_mbs": "network_rx", + "memory_free": "memory_free", + "memory_used": "memory_used", + "gpu_temperature_*": "gpu_temperature", + "gpu_mem_used_*": "gpu_memory_used", + "gpu_mem_free_*": "gpu_memory_free", + "gpu_utilization_*": "gpu_usage", + "gpu_fraction_*": "gpu_fraction" + } + + +class GpuFractionsHandler: + _number_re = re.compile(r"^clear\.ml/fraction(-\d+)?$") + _mig_re = re.compile(r"^nvidia\.com/mig-(?P[0-9]+)g\.(?P[0-9]+)gb$") + + _gpu_name_to_memory_gb = { + "A30": 24, + "NVIDIA A30": 24, + "A100-SXM4-40GB": 40, + "NVIDIA-A100-40GB-PCIe": 40, + "NVIDIA A100-40GB-PCIe": 40, + "NVIDIA-A100-SXM4-40GB": 40, + "NVIDIA A100-SXM4-40GB": 40, + "NVIDIA-A100-SXM4-80GB": 79, + "NVIDIA A100-SXM4-80GB": 79, + "NVIDIA-A100-80GB-PCIe": 79, + "NVIDIA A100-80GB-PCIe": 79, + } + + def __init__(self): + self._total_memory_gb = [ + self._gpu_name_to_memory_gb.get(name, 0) + for name in (self._get_gpu_names() or []) + ] + self._fractions = self._get_fractions() + + @property + def fractions(self) -> List[float]: + return self._fractions + + def _get_fractions(self) -> List[float]: + if not self._total_memory_gb: + # Can't compute + return [1.0] + + fractions = (ENV_GPU_FRACTIONS.get() or "").strip() + if not fractions: + # No fractions + return [1.0] + + decoded_fractions = self.decode_fractions(fractions) + + if isinstance(decoded_fractions, list): + return decoded_fractions + + totals = [] + for i, (fraction, count) in enumerate(decoded_fractions.items()): + m = self._mig_re.match(fraction) + if not m: + continue + try: + total_gb = self._total_memory_gb[i] if i < len(self._total_memory_gb) else self._total_memory_gb[-1] + if not total_gb: + continue + totals.append((int(m.group("memory")) * count) / total_gb) + except ValueError: + pass + + if not totals: + log.warning("Fractions count is empty for {}".format(fractions)) + return [1.0] + + return totals + + @classmethod + def extract_custom_limits(cls, limits: dict): + for k, v in list(limits.items() or []): + 
if cls._number_re.match(k): + limits.pop(k, None) + + @classmethod + def get_simple_fractions_total(cls, limits: dict) -> float: + try: + if any(cls._number_re.match(x) for x in limits): + return sum(float(v) for k, v in limits.items() if cls._number_re.match(k)) + except Exception as ex: + log.error("Failed summing up fractions from {}: {}".format(limits, ex)) + return 0 + + @classmethod + def encode_fractions(cls, limits: dict) -> str: + if any(cls._number_re.match(x) for x in (limits or {})): + return ",".join(str(v) for k, v in sorted(limits.items()) if cls._number_re.match(k)) + return ",".join(("{}:{}".format(k, v) for k, v in (limits or {}).items() if cls._mig_re.match(k))) + + @staticmethod + def decode_fractions(fractions: str) -> Union[List[float], Dict[str, int]]: + try: + items = [f.strip() for f in fractions.strip().split(",")] + tuples = [(k.strip(), v.strip()) for k, v in (f.partition(":")[::2] for f in items)] + if all(not v for _, v in tuples): + # comma-separated float fractions + return [float(k) for k, _ in tuples] + # comma-separated slice:count items + return { + k.strip(): int(v.strip()) + for k, v in tuples + } + except Exception as ex: + log.error("Failed decoding GPU fractions '{}': {}".format(fractions, ex)) + return {} + + @staticmethod + def _get_gpu_names(): + # noinspection PyBroadException + try: + gpus = gpustat.new_query().gpus + names = [g["name"] for g in gpus] + + print("GPU names: {}".format(names)) + + return names + except Exception as ex: + log.error("Failed getting GPU names: {}".format(ex)) def report_error(ex):