From c45158929894bdbcf11f0963b0542bc90c02a3cd Mon Sep 17 00:00:00 2001 From: pollfly <75068813+pollfly@users.noreply.github.com> Date: Tue, 25 Jul 2023 12:19:24 +0300 Subject: [PATCH 01/21] Edit docstrings (#1084) --- clearml/automation/optimization.py | 14 ++++++------- clearml/automation/parameters.py | 20 +++++++++---------- clearml/model.py | 2 +- docs/logger.md | 8 ++++---- .../jsonargparse/pytorch_lightning_cli.py | 2 +- .../pytorch_lightning_example.py | 2 +- 6 files changed, 24 insertions(+), 24 deletions(-) diff --git a/clearml/automation/optimization.py b/clearml/automation/optimization.py index 75f14628..080142fe 100644 --- a/clearml/automation/optimization.py +++ b/clearml/automation/optimization.py @@ -610,7 +610,7 @@ class SearchStrategy(object): :param int top_k: The number of Tasks (experiments) to return. :param all_metrics: Default False, only return the objective metric on the metrics dictionary. If True, return all scalar metrics of the experiment - :param all_hyper_parameters: Default False. If True, return all the hyper-parameters from all the sections. + :param all_hyper_parameters: Default False. If True, return all the hyperparameters from all the sections. :param only_completed: return only completed Tasks. Default False. :return: A list of dictionaries ({task_id: '', hyper_parameters: {}, metrics: {}}), ordered by performance, @@ -791,7 +791,7 @@ class SearchStrategy(object): def _validate_base_task(self): # type: () -> () """ - Check the base task exists and contains the requested Objective metric and hyper parameters. + Check the base task exists and contains the requested Objective metric and hyperparameters. """ # check if the task exists try: @@ -929,7 +929,7 @@ class SearchStrategy(object): class GridSearch(SearchStrategy): """ - Grid search strategy controller. Full grid sampling of every hyper-parameter combination. + Grid search strategy controller. Full grid sampling of every hyperparameter combination. """ def __init__( @@ -1001,7 +1001,7 @@ class GridSearch(SearchStrategy): class RandomSearch(SearchStrategy): """ - Random search strategy controller. Random uniform sampling of hyper-parameters. + Random search strategy controller. Random uniform sampling of hyperparameters. """ # Number of already chosen random samples before assuming we covered the entire hyper-parameter space @@ -1105,7 +1105,7 @@ class HyperParameterOptimizer(object): ): # type: (...) -> () """ - Create a new hyper-parameter controller. The newly created object will launch and monitor the new experiments. + Create a new hyperparameter controller. The newly created object will launch and monitor the new experiments. :param str base_task_id: The Task ID to be used as template experiment to optimize. :param list hyper_parameters: The list of Parameter objects to optimize over. @@ -1120,7 +1120,7 @@ class HyperParameterOptimizer(object): - ``min_global`` - Minimize the min value of *all* reported values for the specific title/series scalar. - ``max_global`` - Maximize the max value of *all* reported values for the specific title/series scalar. - :param class.SearchStrategy optimizer_class: The SearchStrategy optimizer to use for the hyper-parameter search + :param class.SearchStrategy optimizer_class: The SearchStrategy optimizer to use for the hyperparameter search :param int max_number_of_concurrent_tasks: The maximum number of concurrent Tasks (experiments) running at the same time. :param str execution_queue: The execution queue to use for launching Tasks (experiments). 
@@ -1516,7 +1516,7 @@ class HyperParameterOptimizer(object): :param int top_k: The number of Tasks (experiments) to return. :param all_metrics: Default False, only return the objective metric on the metrics dictionary. If True, return all scalar metrics of the experiment - :param all_hyper_parameters: Default False. If True, return all the hyper-parameters from all the sections. + :param all_hyper_parameters: Default False. If True, return all the hyperparameters from all the sections. :param only_completed: return only completed Tasks. Default False. :return: A list of dictionaries ({task_id: '', hyper_parameters: {}, metrics: {}}), ordered by performance, diff --git a/clearml/automation/parameters.py b/clearml/automation/parameters.py index a32cee74..bf3d0237 100644 --- a/clearml/automation/parameters.py +++ b/clearml/automation/parameters.py @@ -15,7 +15,7 @@ class RandomSeed(object): def set_random_seed(seed=1337): # type: (int) -> () """ - Set global seed for all hyper-parameter strategy random number sampling. + Set global seed for all hyperparameter strategy random number sampling. :param int seed: The random seed. """ @@ -26,7 +26,7 @@ class RandomSeed(object): def get_random_seed(): # type: () -> int """ - Get the global seed for all hyper-parameter strategy random number sampling. + Get the global seed for all hyperparameter strategy random number sampling. :return: The random seed. """ @@ -35,14 +35,14 @@ class RandomSeed(object): class Parameter(RandomSeed): """ - The base hyper-parameter optimization object. + The base hyperparameter optimization object. """ _class_type_serialize_name = 'type' def __init__(self, name): # type: (Optional[str]) -> () """ - Create a new Parameter for hyper-parameter optimization + Create a new Parameter for hyperparameter optimization :param str name: The new Parameter name. This is the parameter name that will be passed to a Task. """ @@ -125,7 +125,7 @@ class UniformParameterRange(Parameter): """ Create a parameter to be sampled by the SearchStrategy - :param str name: The parameter name. Match the Task hyper-parameter name. + :param str name: The parameter name. Match the Task hyperparameter name. :param float min_value: The minimum sample to use for uniform random sampling. :param float max_value: The maximum sample to use for uniform random sampling. :param float step_size: If not ``None``, set step size (quantization) for value sampling. @@ -172,7 +172,7 @@ class UniformParameterRange(Parameter): class LogUniformParameterRange(UniformParameterRange): """ - Logarithmic uniform randomly sampled hyper-parameter object. + Logarithmic uniform randomly sampled hyperparameter object. """ def __init__( @@ -188,7 +188,7 @@ class LogUniformParameterRange(UniformParameterRange): """ Create a parameter to be sampled by the SearchStrategy - :param str name: The parameter name. Match the Task hyper-parameter name. + :param str name: The parameter name. Match the Task hyperparameter name. :param float min_value: The minimum exponent sample to use for uniform random sampling. :param float max_value: The maximum exponent sample to use for uniform random sampling. :param float base: The base used to raise the sampled exponent. @@ -228,7 +228,7 @@ class UniformIntegerParameterRange(Parameter): """ Create a parameter to be sampled by the SearchStrategy. - :param str name: The parameter name. Match the task hyper-parameter name. + :param str name: The parameter name. Match the task hyperparameter name. 
:param int min_value: The minimum sample to use for uniform random sampling. :param int max_value: The maximum sample to use for uniform random sampling. :param int step_size: The default step size is ``1``. @@ -272,7 +272,7 @@ class UniformIntegerParameterRange(Parameter): class DiscreteParameterRange(Parameter): """ - Discrete randomly sampled hyper-parameter object. + Discrete randomly sampled hyperparameter object. """ def __init__(self, name, values=()): @@ -280,7 +280,7 @@ class DiscreteParameterRange(Parameter): """ Uniformly sample values form a list of discrete options. - :param str name: The parameter name. Match the task hyper-parameter name. + :param str name: The parameter name. Match the task hyperparameter name. :param list values: The list/tuple of valid parameter values to sample from. """ super(DiscreteParameterRange, self).__init__(name=name) diff --git a/clearml/model.py b/clearml/model.py index fa51b80a..7b9271c9 100644 --- a/clearml/model.py +++ b/clearml/model.py @@ -2160,7 +2160,7 @@ class OutputModel(BaseModel): # type: (str) -> None """ Set the URI of the storage destination for uploaded model weight files. - Supported storage destinations include S3, Google Cloud Storage), and file locations. + Supported storage destinations include S3, Google Cloud Storage, and file locations. Using this method, file uploads are separate and then a link to each is stored in the model object. diff --git a/docs/logger.md b/docs/logger.md index 58842593..59f8ea9a 100644 --- a/docs/logger.md +++ b/docs/logger.md @@ -12,7 +12,7 @@ Using the **ClearML** [Logger](https://github.com/allegroai/clearml/blob/master/ * [Surface diagrams](#surface-diagrams) * [Images](#images) -* Track hyper-parameters and OS environment variables +* Track hyperparameters and OS environment variables * Logging experiment parameter [dictionaries](#logging-experiment-parameter-dictionaries) * Specifying [environment variables](#specifying-environment-variables-to-track) to track @@ -819,7 +819,7 @@ def report_surface(self, title, series, matrix, iteration, xlabels=None, ylabels ### Images Use to report an image and upload its contents to the bucket specified in the **ClearML** configuration file, -or a [a default upload destination](#set-default-upload-destination), if you set a default. +or a [default upload destination](#set-default-upload-destination), if you set a default. First [get the current logger](#get-the-current-logger) and then use it (see an [example script](https://github.com/allegroai/clearml/blob/master/examples/manual_reporting.py)) with the following method. @@ -925,13 +925,13 @@ def report_image(self, title, series, iteration, local_path=None, matrix=None, m -## Hyper-parameters and Environment Variables +## Hyperparameters and Environment Variables ### Logging Experiment Parameter Dictionaries In order for **ClearML** to log a dictionary of parameters, use the `Task.connect` method. 
-For example, to log the hyper-parameters learning_rate, batch_size, display_step, model_path, n_hidden_1, and n_hidden_2: +For example, to log the hyperparameters learning_rate, batch_size, display_step, model_path, n_hidden_1, and n_hidden_2: ```python # Create a dictionary of parameters diff --git a/examples/frameworks/jsonargparse/pytorch_lightning_cli.py b/examples/frameworks/jsonargparse/pytorch_lightning_cli.py index 751a37a5..31142b6e 100644 --- a/examples/frameworks/jsonargparse/pytorch_lightning_cli.py +++ b/examples/frameworks/jsonargparse/pytorch_lightning_cli.py @@ -3,7 +3,7 @@ try: from lightning.pytorch.demos.boring_classes import DemoModel, BoringDataModule except ImportError: import sys - print("Module 'lightning' not installed (only available for Python 3.8+") + print("Module 'lightning' not installed (only available for Python 3.8+)") sys.exit(0) from clearml import Task diff --git a/examples/frameworks/pytorch-lightning/pytorch_lightning_example.py b/examples/frameworks/pytorch-lightning/pytorch_lightning_example.py index 337d829e..7c85a9e2 100644 --- a/examples/frameworks/pytorch-lightning/pytorch_lightning_example.py +++ b/examples/frameworks/pytorch-lightning/pytorch_lightning_example.py @@ -65,7 +65,7 @@ if __name__ == '__main__': parser = LitClassifier.add_model_specific_args(parser) args = parser.parse_args() - Task.init(project_name="examples-internal", task_name="lightning checkpoint issue and argparser") + Task.init(project_name="examples", task_name="pytorch lightning MNIST") # ------------ # data From b1f99abc9679d9758d07598ad2be9f048c8c7ea2 Mon Sep 17 00:00:00 2001 From: Michael Pilosov <40366263+mathematicalmichael@users.noreply.github.com> Date: Thu, 27 Jul 2023 05:54:19 -0600 Subject: [PATCH 02/21] Raise error if dataset name empty (#1083) * raise error if dataset name empty. hidden dataset project when name is an empty string. if the dataset project happens to match an existing project, it can be quite jarring to have one's history of results suddenly become hidden. if there's an intent behind this behavior, please let me know and I'd gladly change it to a warning instead. 
* move check to top of init, change message * whitespace --- clearml/datasets/dataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/clearml/datasets/dataset.py b/clearml/datasets/dataset.py index 569fb743..fbe49908 100644 --- a/clearml/datasets/dataset.py +++ b/clearml/datasets/dataset.py @@ -159,6 +159,8 @@ class Dataset(object): LoggerRoot.get_base_logger().warning( "Setting non-semantic dataset version '{}'".format(self._dataset_version) ) + if dataset_name == "": + raise ValueError("`dataset_name` cannot be an empty string") if task: self._task_pinger = None self._created_task = False From e467c05fb4d1d824bf95cf973e5b11f4f63b14b0 Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Sun, 30 Jul 2023 13:04:42 +0300 Subject: [PATCH 03/21] Fix S3 uploads fail with `LazyEvalWrapper` type error (#1081) --- clearml/storage/helper.py | 53 +++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/clearml/storage/helper.py b/clearml/storage/helper.py index b3a407a7..f4bcae4b 100644 --- a/clearml/storage/helper.py +++ b/clearml/storage/helper.py @@ -504,12 +504,16 @@ class _Boto3Driver(_Driver): 'ContentType': get_file_mimetype(object_name) } extra_args.update(container.config.extra_args or {}) - container.bucket.upload_fileobj(stream, object_name, Config=boto3.s3.transfer.TransferConfig( - use_threads=container.config.multipart, - max_concurrency=self._max_multipart_concurrency if container.config.multipart else 1, - num_download_attempts=container.config.retries, - multipart_threshold=self._multipart_threshold, - multipart_chunksize=self._multipart_chunksize), + container.bucket.upload_fileobj( + stream, + object_name, + Config=boto3.s3.transfer.TransferConfig( + use_threads=container.config.multipart, + max_concurrency=int(self._max_multipart_concurrency) if container.config.multipart else 1, + num_download_attempts=container.config.retries, + multipart_threshold=int(self._multipart_threshold), + multipart_chunksize=int(self._multipart_chunksize), + ), Callback=callback, ExtraArgs=extra_args, ) @@ -523,8 +527,8 @@ class _Boto3Driver(_Driver): Config=boto3.s3.transfer.TransferConfig( use_threads=False, num_download_attempts=container.config.retries, - multipart_threshold=self._multipart_threshold, - multipart_chunksize=self._multipart_chunksize, + multipart_threshold=int(self._multipart_threshold), + multipart_chunksize=int(self._multipart_chunksize), ), Callback=callback, ExtraArgs=extra_args @@ -545,12 +549,16 @@ class _Boto3Driver(_Driver): 'ContentType': get_file_mimetype(object_name or file_path) } extra_args.update(container.config.extra_args or {}) - container.bucket.upload_file(file_path, object_name, Config=boto3.s3.transfer.TransferConfig( - use_threads=container.config.multipart, - max_concurrency=self._max_multipart_concurrency if container.config.multipart else 1, - num_download_attempts=container.config.retries, - multipart_threshold=self._multipart_threshold, - multipart_chunksize=self._multipart_chunksize), + container.bucket.upload_file( + file_path, + object_name, + Config=boto3.s3.transfer.TransferConfig( + use_threads=container.config.multipart, + max_concurrency=int(self._max_multipart_concurrency) if container.config.multipart else 1, + num_download_attempts=container.config.retries, + multipart_threshold=int(self._multipart_threshold), + multipart_chunksize=int(self._multipart_chunksize), + ), Callback=callback, ExtraArgs=extra_args, ) @@ -564,8 +572,8 @@ class _Boto3Driver(_Driver): 
Config=boto3.s3.transfer.TransferConfig( use_threads=False, num_download_attempts=container.config.retries, - multipart_threshold=self._multipart_threshold, - multipart_chunksize=self._multipart_chunksize + multipart_threshold=int(self._multipart_threshold), + multipart_chunksize=int(self._multipart_chunksize) ), Callback=callback, ExtraArgs=extra_args @@ -617,10 +625,11 @@ class _Boto3Driver(_Driver): container = self._containers[obj.container_name] config = boto3.s3.transfer.TransferConfig( use_threads=container.config.multipart, - max_concurrency=self._max_multipart_concurrency if container.config.multipart else 1, + max_concurrency=int(self._max_multipart_concurrency) if container.config.multipart else 1, num_download_attempts=container.config.retries, - multipart_threshold=self._multipart_threshold, - multipart_chunksize=self._multipart_chunksize) + multipart_threshold=int(self._multipart_threshold), + multipart_chunksize=int(self._multipart_chunksize), + ) total_size_mb = obj.content_length / (1024. * 1024.) remote_path = os.path.join(obj.container_name, obj.key) cb = DownloadProgressReport(total_size_mb, verbose, remote_path, log) @@ -637,10 +646,10 @@ class _Boto3Driver(_Driver): container = self._containers[obj.container_name] Config = boto3.s3.transfer.TransferConfig( use_threads=container.config.multipart, - max_concurrency=self._max_multipart_concurrency if container.config.multipart else 1, + max_concurrency=int(self._max_multipart_concurrency) if container.config.multipart else 1, num_download_attempts=container.config.retries, - multipart_threshold=self._multipart_threshold, - multipart_chunksize=self._multipart_chunksize + multipart_threshold=int(self._multipart_threshold), + multipart_chunksize=int(self._multipart_chunksize) ) obj.download_file(str(p), Callback=callback, Config=Config) From 80ef359f4570246517bdc22b7c84ad772a54f741 Mon Sep 17 00:00:00 2001 From: Alex Burlacu Date: Mon, 31 Jul 2023 21:33:28 +0300 Subject: [PATCH 04/21] Bump version and clarify docs --- clearml/datasets/dataset.py | 3 ++- clearml/task.py | 6 ++++-- clearml/version.py | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/clearml/datasets/dataset.py b/clearml/datasets/dataset.py index fbe49908..8db853ce 100644 --- a/clearml/datasets/dataset.py +++ b/clearml/datasets/dataset.py @@ -1886,7 +1886,8 @@ class Dataset(object): Query list of dataset in the system :param dataset_project: Specify dataset project name - :param partial_name: Specify partial match to a dataset name + :param partial_name: Specify partial match to a dataset name. This method supports regular expressions for name + matching (if you wish to match special characters and avoid any regex behaviour, use re.escape()) :param tags: Specify user tags :param ids: List specific dataset based on IDs list :param only_completed: If False, return datasets that are still in progress (uploading/edited etc.) diff --git a/clearml/task.py b/clearml/task.py index e0110af7..87177745 100644 --- a/clearml/task.py +++ b/clearml/task.py @@ -971,7 +971,8 @@ class Task(_Task): Use a list of strings for multiple optional project names. :param str task_name: The full name or partial name of the Tasks to match within the specified ``project_name`` (or all projects if ``project_name`` is ``None``). - This method supports regular expressions for name matching. (Optional) + This method supports regular expressions for name matching (if you wish to match special characters and + avoid any regex behaviour, use re.escape()). 
(Optional) To match an exact task name (i.e. not partial matching), add ^/$ at the beginning/end of the string, for example: "^exact_task_name_here$" :param list tags: Filter based on the requested list of tags (strings) (Task must have all the listed tags) @@ -1020,7 +1021,8 @@ class Task(_Task): Use a list of strings for multiple optional project names. :param str task_name: The full name or partial name of the Tasks to match within the specified ``project_name`` (or all projects if ``project_name`` is ``None``). - This method supports regular expressions for name matching. (Optional) + This method supports regular expressions for name matching (if you wish to match special characters and + avoid any regex behaviour, use re.escape()). (Optional) :param str project_name: project name (str) the task belongs to (use None for all projects) :param str task_name: task name (str) within the selected project Return any partial match of task_name, regular expressions matching is also supported. diff --git a/clearml/version.py b/clearml/version.py index 666b2f71..fe70fa28 100644 --- a/clearml/version.py +++ b/clearml/version.py @@ -1 +1 @@ -__version__ = '1.12.0' +__version__ = '1.12.1' From 92511efbde5652d4dd110b87772c8195a18499ef Mon Sep 17 00:00:00 2001 From: Alex Burlacu Date: Mon, 31 Jul 2023 21:34:13 +0300 Subject: [PATCH 05/21] Allow setting multiprocessing start method --- clearml/backend_interface/task/repo/scriptinfo.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/clearml/backend_interface/task/repo/scriptinfo.py b/clearml/backend_interface/task/repo/scriptinfo.py index 78bef15e..f9c08a82 100644 --- a/clearml/backend_interface/task/repo/scriptinfo.py +++ b/clearml/backend_interface/task/repo/scriptinfo.py @@ -273,8 +273,8 @@ class ScriptRequirements(object): class _JupyterObserver(object): _thread = None - _exit_event = SafeEvent() - _sync_event = SafeEvent() + _exit_event = None + _sync_event = None _sample_frequency = 30. _first_sample_frequency = 3. _jupyter_history_logger = None @@ -286,6 +286,10 @@ class _JupyterObserver(object): @classmethod def observer(cls, jupyter_notebook_filename, notebook_name=None, log_history=False): + if cls._exit_event is None: + cls._exit_event = SafeEvent() + if cls._sync_event is None: + cls._sync_event = SafeEvent() if cls._thread is not None: # order of signaling is important! cls._exit_event.set() @@ -304,6 +308,8 @@ class _JupyterObserver(object): @classmethod def signal_sync(cls, *_, **__): + if cls._sync_event is None: + return cls._sync_event.set() @classmethod From a8b71de3e40d6d44a02ca65fcb52191f92d819df Mon Sep 17 00:00:00 2001 From: Alex Burlacu Date: Fri, 4 Aug 2023 14:30:10 +0300 Subject: [PATCH 06/21] Adjust a few things in the README --- README.md | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index ab1a7710..c5e29140 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ ### ClearML *Formerly known as Allegro Trains* -ClearML is a ML/DL development and production suite, it contains FIVE main modules: +ClearML is a ML/DL development and production suite. 
It contains FIVE main modules: - [Experiment Manager](#clearml-experiment-manager) - Automagical experiment tracking, environments and results - [MLOps](https://github.com/allegroai/clearml-agent) - Orchestration, Automation & Pipelines solution for ML/DL jobs (K8s / Cloud / bare-metal) @@ -73,7 +73,7 @@ Instrumenting these components is the **ClearML-server**, see [Self-Hosting](htt **Adding only 2 lines to your code gets you the following** * Complete experiment setup log - * Full source control info including non-committed local changes + * Full source control info, including non-committed local changes * Execution environment (including specific packages & versions) * Hyper-parameters * [`argparse`](https://docs.python.org/3/library/argparse.html)/[Click](https://github.com/pallets/click/)/[PythonFire](https://github.com/google/python-fire) for command line parameters with currently used values @@ -122,7 +122,7 @@ below and follow the instructions: task = Task.init(project_name='examples', task_name='hello world') ``` -You are done, everything your process outputs is now automagically logged into ClearML. +And you are done! Everything your process outputs is now automagically logged into ClearML. Next step, automation! **Learn more about ClearML's two-click automation [here](https://clear.ml/docs/latest/docs/getting_started/mlops/mlops_first_steps)**. @@ -130,9 +130,9 @@ Next step, automation! **Learn more about ClearML's two-click automation [here]( The ClearML run-time components: -* The ClearML Python Package for integrating ClearML into your existing scripts by adding just two lines of code, and optionally extending your experiments and other workflows with ClearML's powerful and versatile set of classes and methods. -* The ClearML Server for storing experiment, model, and workflow data, and supporting the Web UI experiment manager, and MLOps automation for reproducibility and tuning. It is available as a hosted service and open source for you to deploy your own ClearML Server. -* The ClearML Agent for MLOps orchestration, experiment and workflow reproducibility, and scalability. +* The ClearML Python Package - for integrating ClearML into your existing scripts by adding just two lines of code, and optionally extending your experiments and other workflows with ClearML's powerful and versatile set of classes and methods. +* The ClearML Server - for storing experiment, model, and workflow data; supporting the Web UI experiment manager and MLOps automation for reproducibility and tuning. It is available as a hosted service and open source for you to deploy your own ClearML Server. +* The ClearML Agent - for MLOps orchestration, experiment and workflow reproducibility, and scalability. clearml-architecture @@ -142,7 +142,7 @@ The ClearML run-time components: - [clearml-task](https://github.com/allegroai/clearml/blob/master/docs/clearml-task.md) - Run any codebase on remote machines with full remote logging of Tensorboard, Matplotlib & Console outputs - [clearml-data](https://github.com/allegroai/clearml/blob/master/docs/datasets.md) - **CLI for managing and versioning your datasets, including creating / uploading / downloading of data from S3/GS/Azure/NAS** - [AWS Auto-Scaler](https://clear.ml/docs/latest/docs/guides/services/aws_autoscaler) - Automatically spin EC2 instances based on your workloads with preconfigured budget! No need for K8s! 
-- [Hyper-Parameter Optimization](https://clear.ml/docs/latest/docs/guides/optimization/hyper-parameter-optimization/examples_hyperparam_opt) - Optimize any code with black-box approach and state of the art Bayesian optimization algorithms +- [Hyper-Parameter Optimization](https://clear.ml/docs/latest/docs/guides/optimization/hyper-parameter-optimization/examples_hyperparam_opt) - Optimize any code with black-box approach and state-of-the-art Bayesian optimization algorithms - [Automation Pipeline](https://clear.ml/docs/latest/docs/guides/pipeline/pipeline_controller) - Build pipelines based on existing experiments / jobs, supports building pipelines of pipelines! - [Slack Integration](https://clear.ml/docs/latest/docs/guides/services/slack_alerts) - Report experiments progress / failure directly to Slack (fully customizable!) @@ -159,11 +159,11 @@ and practices. - Use it on a daily basis to boost collaboration and visibility in your team - Create a remote job from any experiment with a click of a button - Automate processes and create pipelines to collect your experimentation logs, outputs, and data - - Store all you data on any object-storage solution, with the simplest interface possible - - Make you data transparent by cataloging it all on the ClearML platform + - Store all your data on any object-storage solution, with the most straightforward interface possible + - Make your data transparent by cataloging it all on the ClearML platform We believe ClearML is ground-breaking. We wish to establish new standards of true seamless integration between -experiment management, MLOps and data management. +experiment management, MLOps, and data management. ## Who We Are @@ -172,8 +172,7 @@ ClearML is supported by you and the [clear.ml](https://clear.ml) team, which hel We built ClearML to track and control the glorious but messy process of training production-grade deep learning models. We are committed to vigorously supporting and expanding the capabilities of ClearML. -We promise to always be backwardly compatible, making sure all your logs, data and pipelines -will always upgrade with you. +We promise to always be backwardly compatible, making sure all your logs, data, and pipelines will always upgrade with you. ## License @@ -192,7 +191,7 @@ author = {ClearML}, ## Documentation, Community & Support -More information in the [official documentation](https://clear.ml/docs) and [on YouTube](https://www.youtube.com/c/ClearML). +For more information, see the [official documentation](https://clear.ml/docs) and [on YouTube](https://www.youtube.com/c/ClearML). For examples and use cases, check the [examples folder](https://github.com/allegroai/clearml/tree/master/examples) and [corresponding documentation](https://clear.ml/docs/latest/docs/guides). 
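In practice, the "two lines" that the README hunks above keep referring to look roughly like the sketch below (a minimal illustration, not part of this patch series, assuming the `clearml` package is installed and a `clearml.conf` pointing at a server has been configured; the project/task names and parameter values are placeholders):

```python
from clearml import Task

# The two ClearML-specific lines: once Task.init runs, console output,
# argparse values, matplotlib figures, and framework checkpoints are
# captured automatically, as described in the README above.
task = Task.init(project_name="examples", task_name="hello world")

# Optionally connect a parameter dictionary so hyperparameters are tracked
# and editable from the UI (see the docs/logger.md hunk in PATCH 01/21).
params = {"learning_rate": 0.001, "batch_size": 32}
params = task.connect(params)

print("training with lr={}, batch_size={}".format(params["learning_rate"], params["batch_size"]))
```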
From 46c6d2bf0f7f0c01443da199b9925e7e7abfccf2 Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Fri, 4 Aug 2023 19:05:24 +0300
Subject: [PATCH 07/21] Fix proxy object support for lists

---
 clearml/utilities/proxy_object.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/clearml/utilities/proxy_object.py b/clearml/utilities/proxy_object.py
index 43435879..9aadc2f4 100644
--- a/clearml/utilities/proxy_object.py
+++ b/clearml/utilities/proxy_object.py
@@ -303,7 +303,8 @@ class WrapperBase(type):
         '__repr__', '__reversed__', '__rfloorfiv__', '__rlshift__', '__rmod__',
         '__rmul__', '__ror__', '__rpow__', '__rrshift__', '__rshift__', '__rsub__',
         '__rtruediv__', '__rxor__', '__setitem__', '__setslice__', '__sub__',
-        '__truediv__', '__xor__', 'next', '__str__', '__repr__',
+        '__truediv__', '__xor__', 'next', '__str__', '__repr__',
+        '__round__', '__fspath__', '__bytes__', '__index__'
     ]

     def __new__(mcs, classname, bases, attrs):

From 99c7eecbeeb3beb91a72bebc11edd3d044c06e5e Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Fri, 4 Aug 2023 19:06:15 +0300
Subject: [PATCH 08/21] Fix pipeline should always use artifacts hash for
 checking cache hits

---
 clearml/automation/job.py | 48 +++++++++++++++------------------------
 1 file changed, 18 insertions(+), 30 deletions(-)

diff --git a/clearml/automation/job.py b/clearml/automation/job.py
index 3ebd9f7b..7b62f79e 100644
--- a/clearml/automation/job.py
+++ b/clearml/automation/job.py
@@ -384,10 +384,9 @@ class BaseJob(object):
         section_overrides=None,
         params_override=None,
         configurations_override=None,
-        explicit_docker_image=None,
-        account_for_artifacts_hashes=True
+        explicit_docker_image=None
     ):
-        # type: (Task, Optional[dict], Optional[dict], Optional[dict], Optional[str], bool) -> Optional[str]
+        # type: (Task, Optional[dict], Optional[dict], Optional[dict], Optional[str]) -> Optional[str]
         """
         Create Hash (str) representing the state of the Task

@@ -398,8 +397,6 @@ class BaseJob(object):
         :param configurations_override: dictionary of configuration override objects (tasks.ConfigurationItem)
         :param explicit_docker_image: The explicit docker image. Used to invalidate the hash when the docker
             image was explicitly changed
-        :param account_for_artifacts_hashes: Calculate the hash of the task by accounting for the hashes of the
-            artifacts in `kwargs_artifacts` (as opposed of the task ID/artifact name stored in this section)

         :return: str hash of the Task configuration
         """
@@ -420,22 +417,21 @@ class BaseJob(object):
         script.pop("requirements", None)

         hyper_params = deepcopy(task.get_parameters() if params_override is None else params_override)
-        if account_for_artifacts_hashes:
-            hyper_params_to_change = {}
-            task_cache = {}
-            for key, value in hyper_params.items():
-                if key.startswith("kwargs_artifacts/"):
-                    # noinspection PyBroadException
-                    try:
-                        # key format is <task_id>.<artifact_name>
-                        task_id, artifact = value.split(".", 1)
-                        task_ = task_cache.setdefault(task_id, Task.get_task(task_id))
-                        # set the value of the hyper parameter to the hash of the artifact
-                        # because the task ID might differ, but the artifact might be the same
-                        hyper_params_to_change[key] = task_.artifacts[artifact].hash
-                    except Exception:
-                        pass
-            hyper_params.update(hyper_params_to_change)
+        hyper_params_to_change = {}
+        task_cache = {}
+        for key, value in hyper_params.items():
+            if key.startswith("kwargs_artifacts/"):
+                # noinspection PyBroadException
+                try:
+                    # key format is <task_id>.<artifact_name>
+ task_id, artifact = value.split(".", 1) + task_ = task_cache.setdefault(task_id, Task.get_task(task_id)) + # set the value of the hyper parameter to the hash of the artifact + # because the task ID might differ, but the artifact might be the same + hyper_params_to_change[key] = task_.artifacts[artifact].hash + except Exception: + pass + hyper_params.update(hyper_params_to_change) configs = task.get_configuration_objects() if configurations_override is None else configurations_override # currently we do not add the docker image to the hash (only args and setup script), # because default docker image will cause the step to change @@ -604,14 +600,6 @@ class ClearmlJob(BaseJob): if allow_caching: # look for a cached copy of the Task # get parameters + task_overrides + as dict and hash it. - task_hash_legacy = self._create_task_hash( - base_temp_task, - section_overrides=sections, - params_override=task_params, - configurations_override=configuration_overrides or None, - explicit_docker_image=kwargs.get("explicit_docker_image"), - account_for_artifacts_hashes=False - ) task_hash = self._create_task_hash( base_temp_task, section_overrides=sections, @@ -619,7 +607,7 @@ class ClearmlJob(BaseJob): configurations_override=configuration_overrides or None, explicit_docker_image=kwargs.get("explicit_docker_image") ) - task = self._get_cached_task(task_hash_legacy) or self._get_cached_task(task_hash) + task = self._get_cached_task(task_hash) # if we found a task, just use if task: if disable_clone_task and self.task and self.task.status == self.task.TaskStatusEnum.created: From b8467721386e22ccea0e29f06544ea18a5ef40bb Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Fri, 4 Aug 2023 19:07:05 +0300 Subject: [PATCH 09/21] Fix pipeline monitor must be called after pipeline is completed (just in case we missed something) --- clearml/automation/controller.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/clearml/automation/controller.py b/clearml/automation/controller.py index 076c8c7d..06d2277f 100644 --- a/clearml/automation/controller.py +++ b/clearml/automation/controller.py @@ -785,7 +785,8 @@ class PipelineController(object): pass :param tags: A list of tags for the specific pipeline step. - When executing a Pipeline remotely (i.e. launching the pipeline from the UI/enqueuing it), this method has no effect. + When executing a Pipeline remotely + (i.e. launching the pipeline from the UI/enqueuing it), this method has no effect. :return: True if successful """ @@ -1431,7 +1432,8 @@ class PipelineController(object): # type: (Union[Sequence[str], str]) -> None """ Add tags to this pipeline. Old tags are not deleted. - When executing a Pipeline remotely (i.e. launching the pipeline from the UI/enqueuing it), this method has no effect. + When executing a Pipeline remotely + (i.e. launching the pipeline from the UI/enqueuing it), this method has no effect. :param tags: A list of tags for this pipeline. """ @@ -2075,7 +2077,8 @@ class PipelineController(object): pass :param tags: A list of tags for the specific pipeline step. - When executing a Pipeline remotely (i.e. launching the pipeline from the UI/enqueuing it), this method has no effect. + When executing a Pipeline remotely + (i.e. launching the pipeline from the UI/enqueuing it), this method has no effect. 
:return: True if successful """ @@ -3190,7 +3193,8 @@ class PipelineController(object): name=artifact_name, artifact_object=artifact_object, wait_on_upload=True, - extension_name=".pkl" if isinstance(artifact_object, dict) and not self._artifact_serialization_function else None, + extension_name=".pkl" if isinstance(artifact_object, dict) and + not self._artifact_serialization_function else None, serialization_function=self._artifact_serialization_function ) @@ -3468,6 +3472,7 @@ class PipelineDecorator(PipelineController): # visualize pipeline state (plot) self.update_execution_plot() + self._scan_monitored_nodes() if self._stop_event: # noinspection PyBroadException @@ -3803,7 +3808,8 @@ class PipelineDecorator(PipelineController): pass :param tags: A list of tags for the specific pipeline step. - When executing a Pipeline remotely (i.e. launching the pipeline from the UI/enqueuing it), this method has no effect. + When executing a Pipeline remotely + (i.e. launching the pipeline from the UI/enqueuing it), this method has no effect. :return: function wrapper """ @@ -3955,8 +3961,9 @@ class PipelineDecorator(PipelineController): # Note that for the first iteration (when `_node.name == _node_name`) # we always increment the name, as the name is always in `_launched_step_names` while _node.name in cls._singleton._launched_step_names or ( - _node.name in cls._singleton._nodes - and cls._singleton._nodes[_node.name].job_code_section != cls._singleton._nodes[_node_name].job_code_section + _node.name in cls._singleton._nodes and + cls._singleton._nodes[_node.name].job_code_section != + cls._singleton._nodes[_node_name].job_code_section ): _node.name = "{}_{}".format(_node_name, counter) counter += 1 From 696c349488018f5f121f40f7d39dd137b58359be Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Fri, 4 Aug 2023 19:07:35 +0300 Subject: [PATCH 10/21] Improve warning message on task execute_remotely --- clearml/task.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/clearml/task.py b/clearml/task.py index 87177745..0aeb4a24 100644 --- a/clearml/task.py +++ b/clearml/task.py @@ -2778,7 +2778,8 @@ class Task(_Task): # leave this process. 
if exit_process: - LoggerRoot.get_base_logger().warning('Terminating local execution process') + LoggerRoot.get_base_logger().warning( + 'ClearML Terminating local execution process - continuing execution remotely') leave_process(0) return task From 197894735b016d9ca4404988436d7d39d42da7e8 Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Fri, 4 Aug 2023 19:08:09 +0300 Subject: [PATCH 11/21] Make sure we work on a copy of task objects --- clearml/automation/job.py | 1 + 1 file changed, 1 insertion(+) diff --git a/clearml/automation/job.py b/clearml/automation/job.py index 7b62f79e..dde8eb34 100644 --- a/clearml/automation/job.py +++ b/clearml/automation/job.py @@ -414,6 +414,7 @@ class BaseJob(object): return None # we need to ignore `requirements` section because ir might be changing from run to run + script = deepcopy(script) script.pop("requirements", None) hyper_params = deepcopy(task.get_parameters() if params_override is None else params_override) From d26ce48dbe4dd349b5b3bca13beaaf407b182d07 Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Fri, 4 Aug 2023 19:08:51 +0300 Subject: [PATCH 12/21] Improve artifacts serialization: should now be consistent when serializing pandas object into gzip stream (csv.gz) --- clearml/binding/artifacts.py | 134 ++++++++++++++++++++++++++--------- 1 file changed, 101 insertions(+), 33 deletions(-) diff --git a/clearml/binding/artifacts.py b/clearml/binding/artifacts.py index d4794b54..93062ebf 100644 --- a/clearml/binding/artifacts.py +++ b/clearml/binding/artifacts.py @@ -1,3 +1,4 @@ +import gzip import json import yaml import mimetypes @@ -38,7 +39,7 @@ try: except ImportError: np = None try: - from pathlib import Path as pathlib_Path + from pathlib import Path as pathlib_Path # noqa except ImportError: pathlib_Path = None @@ -321,6 +322,7 @@ class Artifacts(object): self._storage_prefix = None self._task_name = None self._project_name = None + self._temp_files_lookup = {} def register_artifact(self, name, artifact, metadata=None, uniqueness_columns=True): # type: (str, DataFrame, Optional[dict], Union[bool, Sequence[str]]) -> () @@ -428,15 +430,15 @@ class Artifacts(object): artifact_type_data.preview = "" override_filename_ext_in_uri = extension_name or "" override_filename_in_uri = name + override_filename_ext_in_uri - fd, local_filename = mkstemp(prefix=quote(name, safe="") + ".", suffix=override_filename_ext_in_uri) - os.close(fd) + local_filename = self._push_temp_file( + prefix=quote(name, safe="") + ".", suffix=override_filename_ext_in_uri) # noinspection PyBroadException try: with open(local_filename, "wb") as f: f.write(serialization_function(artifact_object)) except Exception: # cleanup and raise exception - os.unlink(local_filename) + self._delete_temp_file(local_filename) raise artifact_type_data.content_type = mimetypes.guess_type(local_filename)[0] elif extension_name == ".pkl": @@ -448,8 +450,8 @@ class Artifacts(object): extension_name, [".npz", ".csv.gz"], ".npz", artifact_type ) override_filename_in_uri = name + override_filename_ext_in_uri - fd, local_filename = mkstemp(prefix=quote(name, safe="") + '.', suffix=override_filename_ext_in_uri) - os.close(fd) + local_filename = self._push_temp_file( + prefix=quote(name, safe="") + '.', suffix=override_filename_ext_in_uri) if override_filename_ext_in_uri == ".npz": artifact_type_data.content_type = "application/numpy" np.savez_compressed(local_filename, **{name: artifact_object}) @@ -464,11 +466,10 @@ class Artifacts(object): extension_name, [".csv.gz", ".parquet", ".feather", 
".pickle"], ".csv.gz", artifact_type ) override_filename_in_uri = name - fd, local_filename = mkstemp(prefix=quote(name, safe="") + '.', suffix=override_filename_ext_in_uri) - os.close(fd) + local_filename = self._push_temp_file(prefix=quote(name, safe="") + '.', suffix=override_filename_ext_in_uri) if override_filename_ext_in_uri == ".csv.gz": artifact_type_data.content_type = "text/csv" - artifact_object.to_csv(local_filename, compression=self._compression) + self._store_compressed_pd_csv(artifact_object, local_filename) elif override_filename_ext_in_uri == ".parquet": try: artifact_type_data.content_type = "application/parquet" @@ -480,7 +481,7 @@ class Artifacts(object): ) ) artifact_type_data.content_type = "text/csv" - artifact_object.to_csv(local_filename, compression=self._compression) + self._store_compressed_pd_csv(artifact_object, local_filename) elif override_filename_ext_in_uri == ".feather": try: artifact_type_data.content_type = "application/feather" @@ -492,7 +493,7 @@ class Artifacts(object): ) ) artifact_type_data.content_type = "text/csv" - artifact_object.to_csv(local_filename, compression=self._compression) + self._store_compressed_pd_csv(artifact_object, local_filename) elif override_filename_ext_in_uri == ".pickle": artifact_type_data.content_type = "application/pickle" artifact_object.to_pickle(local_filename) @@ -527,8 +528,8 @@ class Artifacts(object): if guessed_type: artifact_type_data.content_type = guessed_type - fd, local_filename = mkstemp(prefix=quote(name, safe="") + '.', suffix=override_filename_ext_in_uri) - os.close(fd) + local_filename = self._push_temp_file( + prefix=quote(name, safe="") + '.', suffix=override_filename_ext_in_uri) artifact_object.save(local_filename) delete_after_upload = True elif isinstance(artifact_object, dict): @@ -561,8 +562,9 @@ class Artifacts(object): if serialized_text is not None: override_filename_in_uri = name + override_filename_ext_in_uri - fd, local_filename = mkstemp(prefix=quote(name, safe="") + ".", suffix=override_filename_ext_in_uri) - with open(fd, "w") as f: + local_filename = self._push_temp_file( + prefix=quote(name, safe="") + ".", suffix=override_filename_ext_in_uri) + with open(local_filename, "w") as f: f.write(serialized_text) preview = preview or serialized_text if len(preview) < self.max_preview_size_bytes: @@ -599,7 +601,7 @@ class Artifacts(object): files = list(Path(folder).rglob(wildcard)) override_filename_ext_in_uri = '.zip' override_filename_in_uri = folder.parts[-1] + override_filename_ext_in_uri - fd, zip_file = mkstemp( + zip_file = self._push_temp_file( prefix=quote(folder.parts[-1], safe="") + '.', suffix=override_filename_ext_in_uri ) try: @@ -618,8 +620,7 @@ class Artifacts(object): LoggerRoot.get_base_logger().warning('Exception {}\nFailed zipping artifact folder {}'.format( folder, e)) return False - finally: - os.close(fd) + artifact_type_data.preview = preview or archive_preview artifact_object = zip_file artifact_type = 'archive' @@ -647,7 +648,7 @@ class Artifacts(object): override_filename_ext_in_uri = '.zip' override_filename_in_uri = quote(name, safe="") + override_filename_ext_in_uri common_path = get_common_path(list_files) - fd, zip_file = mkstemp( + zip_file = self._push_temp_file( prefix='artifact_folder.', suffix=override_filename_ext_in_uri ) try: @@ -670,8 +671,7 @@ class Artifacts(object): LoggerRoot.get_base_logger().warning('Exception {}\nFailed zipping artifact files {}'.format( artifact_object, e)) return False - finally: - os.close(fd) + artifact_type_data.preview = 
preview or archive_preview artifact_object = zip_file artifact_type = 'archive' @@ -704,15 +704,15 @@ class Artifacts(object): delete_after_upload = True override_filename_ext_in_uri = ".txt" override_filename_in_uri = name + override_filename_ext_in_uri - fd, local_filename = mkstemp(prefix=quote(name, safe="") + ".", suffix=override_filename_ext_in_uri) - os.close(fd) + local_filename = self._push_temp_file( + prefix=quote(name, safe="") + ".", suffix=override_filename_ext_in_uri) # noinspection PyBroadException try: with open(local_filename, "wt") as f: f.write(artifact_object) except Exception: # cleanup and raise exception - os.unlink(local_filename) + self._delete_temp_file(local_filename) raise elif artifact_object is None or (isinstance(artifact_object, str) and artifact_object == ""): artifact_type = '' @@ -736,15 +736,15 @@ class Artifacts(object): delete_after_upload = True override_filename_ext_in_uri = '.pkl' override_filename_in_uri = name + override_filename_ext_in_uri - fd, local_filename = mkstemp(prefix=quote(name, safe="") + '.', suffix=override_filename_ext_in_uri) - os.close(fd) + local_filename = self._push_temp_file( + prefix=quote(name, safe="") + '.', suffix=override_filename_ext_in_uri) # noinspection PyBroadException try: with open(local_filename, 'wb') as f: pickle.dump(artifact_object, f) except Exception: # cleanup and raise exception - os.unlink(local_filename) + self._delete_temp_file(local_filename) raise # verify preview not out of scope: @@ -875,10 +875,10 @@ class Artifacts(object): override_filename_ext_in_uri = self._save_format override_filename_in_uri = name - fd, local_csv = mkstemp(prefix=quote(name, safe="") + '.', suffix=override_filename_ext_in_uri) - os.close(fd) + local_csv = self._push_temp_file( + prefix=quote(name, safe="") + '.', suffix=override_filename_ext_in_uri) local_csv = Path(local_csv) - pd_artifact.to_csv(local_csv.as_posix(), index=False, compression=self._compression) + self._store_compressed_pd_csv(pd_artifact, local_csv.as_posix(), index=False) current_sha2, file_sha2 = sha256sum( local_csv.as_posix(), skip_header=32, block_size=Artifacts._hash_block_size) if name in self._last_artifacts_upload: @@ -887,7 +887,7 @@ class Artifacts(object): # nothing to do, we can skip the upload # noinspection PyBroadException try: - local_csv.unlink() + self._delete_temp_file(local_csv) except Exception: pass return @@ -944,6 +944,8 @@ class Artifacts(object): """ from clearml.storage import StorageManager + local_file = self._pop_temp_file(local_file) + upload_uri = self._task.output_uri or self._task.get_logger().get_default_upload_destination() if not isinstance(local_file, Path): local_file = Path(local_file) @@ -962,7 +964,7 @@ class Artifacts(object): StorageManager.upload_file(local_file.as_posix(), uri, wait_for_upload=True, retries=ev.retries) if delete_after_upload: try: - os.unlink(local_file.as_posix()) + self._delete_temp_file(local_file) except OSError: LoggerRoot.get_base_logger().warning('Failed removing temporary {}'.format(local_file)) else: @@ -1047,9 +1049,75 @@ class Artifacts(object): def _get_storage_uri_prefix(self): # type: () -> str - if not self._storage_prefix or self._task_name != self._task.name or self._project_name != self._task.get_project_name(): + if not self._storage_prefix or self._task_name != self._task.name or \ + self._project_name != self._task.get_project_name(): # noinspection PyProtectedMember self._storage_prefix = self._task._get_output_destination_suffix() self._task_name = self._task.name 
self._project_name = self._task.get_project_name() return self._storage_prefix + + def _store_compressed_pd_csv(self, artifact_object, local_filename, **kwargs): + # bugfix: to make pandas csv.gz consistent file hash we must pass mtime=0 + # (otherwise it is encoded and creates new hash every time) + if self._compression == "gzip": + with gzip.GzipFile(local_filename, 'wb', mtime=0) as gzip_file: + artifact_object.to_csv(gzip_file, **kwargs) + else: + artifact_object.to_csv(local_filename, compression=self._compression) + + def _push_temp_file(self, prefix=None, suffix=None): + """ + Same prefix/suffix as mkstemp uses + :param prefix: Same prefix/suffix as mkstemp uses + :param suffix: Same prefix/suffix as mkstemp uses + :return: consistent temp file inside a single folder that later we rename to a temp file + """ + # we want to make sure our temp naming convention is consistent + # this is important for hashing zip files and gz files, because the name of the internal + # file becomes part of the content and then hash changes + + # temp filename is based on the assumption + + # put a consistent the file into a temp folder because the filename is part of + # the compressed artifact and we want consistency. After that we rename compressed file to temp file and + # delete temp folder + temp_folder = mkdtemp(prefix='artifacts_') + local_filename = Path(temp_folder) / (str(prefix).rstrip(".") + "." + str(suffix).lstrip(".")) + local_filename = local_filename.as_posix() + self._temp_files_lookup[local_filename] = (temp_folder, deepcopy(prefix), deepcopy(suffix)) + return local_filename + + def _pop_temp_file(self, local_filename=None): + """ + Now we need to move the consistent file from the temp folder to the main temp folder, + give it a new temp name, and remove the temp folder + + :param local_filename: local file name inside a temp folder, assumed to be a single file in the temp folder + :return: new temp file inside the main temp folder + """ + # convert to posix if Path + if isinstance(local_filename, Path): + local_filename = local_filename.as_posix() + + # if this is not our temp file, just do nothing + if local_filename not in self._temp_files_lookup: + return local_filename + + # move file out of temp folder + try: + temp_folder, prefix, suffix = self._temp_files_lookup.pop(local_filename) + fd, temp_filename = mkstemp(prefix=prefix, suffix=suffix) + os.close(fd) + os.replace(local_filename, temp_filename) + local_filename = temp_filename + os.rmdir(temp_folder) + except Exception as ex: + raise ValueError("Failed storing temp artifact into {}: error: {}".format(local_filename, ex)) + + return temp_filename + + def _delete_temp_file(self, local_filename): + # cleanup and raise exception + local_filename = self._pop_temp_file(local_filename) + os.unlink(local_filename) From c15f012e1bfdb7f5ef27a7ad0b35987975031a5e Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Fri, 4 Aug 2023 19:54:55 +0300 Subject: [PATCH 13/21] Add PipelineController always_create_from_code=True (False is previous behavior where we deserialize always from backend when running remotely, new flow means the pipeline is always created from code) Fix pipeline decorator does not read the pipeline arguments back from the backend when running remotely --- clearml/automation/controller.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/clearml/automation/controller.py b/clearml/automation/controller.py index 06d2277f..019b0d92 100644 --- a/clearml/automation/controller.py +++ 
b/clearml/automation/controller.py @@ -151,6 +151,7 @@ class PipelineController(object): repo=None, # type: Optional[str] repo_branch=None, # type: Optional[str] repo_commit=None, # type: Optional[str] + always_create_from_code=True, # type: bool artifact_serialization_function=None, # type: Optional[Callable[[Any], Union[bytes, bytearray]]] artifact_deserialization_function=None # type: Optional[Callable[[bytes], Any]] ): @@ -215,6 +216,9 @@ class PipelineController(object): Use empty string ("") to disable any repository auto-detection :param repo_branch: Optional, specify the remote repository branch (Ignored, if local repo path is used) :param repo_commit: Optional, specify the repository commit ID (Ignored, if local repo path is used) + :param always_create_from_code: If True (default) the pipeline is always constructed from code, + if False, pipeline is generated from pipeline configuration section on the pipeline Task itsef. + this allows to edit (also add/remove) pipeline steps without changing the original codebase :param artifact_serialization_function: A serialization function that takes one parameter of any type which is the object to be serialized. The function should return a `bytes` or `bytearray` object, which represents the serialized object. All parameter/return @@ -244,6 +248,7 @@ class PipelineController(object): self._start_time = None self._pipeline_time_limit = None self._default_execution_queue = None + self._always_create_from_code = bool(always_create_from_code) self._version = str(version).strip() if version else None if self._version and not Version.is_valid_version_string(self._version): raise ValueError( @@ -1413,7 +1418,7 @@ class PipelineController(object): pipeline_object._nodes = {} pipeline_object._running_nodes = [] try: - pipeline_object._deserialize(pipeline_task._get_configuration_dict(cls._config_section)) + pipeline_object._deserialize(pipeline_task._get_configuration_dict(cls._config_section), force=True) except Exception: pass return pipeline_object @@ -1715,13 +1720,16 @@ class PipelineController(object): return dag - def _deserialize(self, dag_dict): - # type: (dict) -> () + def _deserialize(self, dag_dict, force=False): + # type: (dict, bool) -> () """ Restore the DAG from a dictionary. This will be used to create the DAG from the dict stored on the Task, when running remotely. :return: """ + # if we always want to load the pipeline DAG from code, we are skipping the deserialization step + if not force and self._always_create_from_code: + return # if we do not clone the Task, only merge the parts we can override. 
for name in list(self._nodes.keys()): @@ -3329,6 +3337,7 @@ class PipelineDecorator(PipelineController): repo=repo, repo_branch=repo_branch, repo_commit=repo_commit, + always_create_from_code=False, artifact_serialization_function=artifact_serialization_function, artifact_deserialization_function=artifact_deserialization_function ) @@ -4310,11 +4319,6 @@ class PipelineDecorator(PipelineController): a_pipeline._task._set_runtime_properties( dict(multi_pipeline_counter=str(cls._multi_pipeline_call_counter))) - # sync arguments back (post deserialization and casting back) - for k in pipeline_kwargs.keys(): - if k in a_pipeline.get_parameters(): - pipeline_kwargs[k] = a_pipeline.get_parameters()[k] - # run the actual pipeline if not start_controller_locally and \ not PipelineDecorator._debug_execute_step_process and pipeline_execution_queue: @@ -4322,8 +4326,14 @@ class PipelineDecorator(PipelineController): a_pipeline._task.execute_remotely(queue_name=pipeline_execution_queue) # when we get here it means we are running remotely + # this will also deserialize the pipeline and arguments a_pipeline._start(wait=False) + # sync arguments back (post deserialization and casting back) + for k in pipeline_kwargs.keys(): + if k in a_pipeline.get_parameters(): + pipeline_kwargs[k] = a_pipeline.get_parameters()[k] + # this time the pipeline is executed only on the remote machine try: pipeline_result = func(**pipeline_kwargs) From 2c44bff461c0b9a9524ee4ae18c88ce951b85faa Mon Sep 17 00:00:00 2001 From: Alex Burlacu Date: Fri, 11 Aug 2023 13:09:19 +0300 Subject: [PATCH 14/21] Fix Hydra support both Hydra section overrides and --- clearml/binding/hydra_bind.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/clearml/binding/hydra_bind.py b/clearml/binding/hydra_bind.py index 394dd79c..d26b4e85 100644 --- a/clearml/binding/hydra_bind.py +++ b/clearml/binding/hydra_bind.py @@ -1,7 +1,7 @@ import io import sys from functools import partial - +import yaml from ..config import running_remotely, get_remote_task_id, DEV_TASK_NO_REUSE from ..debugging.log import LoggerRoot @@ -81,7 +81,14 @@ class PatchHydra(object): stored_config = {k[len(PatchHydra._parameter_section)+1:]: v for k, v in full_parameters.items() if k.startswith(PatchHydra._parameter_section+'/')} stored_config.pop(PatchHydra._parameter_allow_full_edit, None) - overrides = ['{}={}'.format(k, v) for k, v in stored_config.items()] + # noinspection PyBroadException + try: + overrides = yaml.safe_load(full_parameters.get("Args/overrides", "")) or [] + except Exception: + overrides = [] + if overrides and not isinstance(overrides, (list, tuple)): + overrides = [overrides] + overrides += ['{}={}'.format(k, v) for k, v in stored_config.items()] else: # We take care of it inside the _patched_run_job pass @@ -119,7 +126,8 @@ class PatchHydra(object): else: PatchHydra._last_untracked_state['connect'] = dict( mutable=stored_config, name=PatchHydra._parameter_section) - # todo: remove the overrides section from the Args (we have it here) + # Maybe ?! remove the overrides section from the Args (we have it here) + # But when used with a Pipeline this is the only section we get... 
so we leave it here anyhow # PatchHydra._current_task.delete_parameter('Args/overrides') except Exception: pass From d4b11dfa22ddf22773211c4beb9b671dd2a0c71b Mon Sep 17 00:00:00 2001 From: Alex Burlacu Date: Fri, 11 Aug 2023 13:11:58 +0300 Subject: [PATCH 15/21] Create OutputModel base model lazily --- clearml/model.py | 146 +++++++++++++++++++++++++++++------------------ 1 file changed, 90 insertions(+), 56 deletions(-) diff --git a/clearml/model.py b/clearml/model.py index 7b9271c9..c380ee34 100644 --- a/clearml/model.py +++ b/clearml/model.py @@ -357,6 +357,9 @@ class BaseModel(object): self._task = None self._reload_required = False self._reporter = None + self._floating_data = None + self._name = None + self._task_connect_name = None self._set_task(task) def get_weights(self, raise_on_error=False, force_download=False): @@ -1055,6 +1058,7 @@ class BaseModel(object): def _init_reporter(self): if self._reporter: return + self._base_model = self._get_force_base_model() metrics_manager = Metrics( session=_Model._get_default_session(), storage_uri=None, @@ -1126,6 +1130,8 @@ class BaseModel(object): :return: True if the metadata was set and False otherwise """ + if not self._base_model: + self._base_model = self._get_force_base_model() self._reload_required = ( _Model._get_default_session() .send( @@ -1167,6 +1173,8 @@ class BaseModel(object): :return: String representation of the value of the metadata entry or None if the entry was not found """ + if not self._base_model: + self._base_model = self._get_force_base_model() self._reload_if_required() return self.get_all_metadata().get(str(key), {}).get("value") @@ -1180,6 +1188,8 @@ class BaseModel(object): :return: The value of the metadata entry, casted to its type (if not possible, the string representation will be returned) or None if the entry was not found """ + if not self._base_model: + self._base_model = self._get_force_base_model() key = str(key) metadata = self.get_all_metadata() if key not in metadata: @@ -1194,6 +1204,8 @@ class BaseModel(object): :return: Get all metadata as a dictionary of format Dict[key, Dict[value, type]]. The key, value and type entries are all strings. Note that each entry might have an additional 'key' entry, repeating the key """ + if not self._base_model: + self._base_model = self._get_force_base_model() self._reload_if_required() return self._get_model_data().metadata or {} @@ -1204,6 +1216,8 @@ class BaseModel(object): entries are strings. The value is cast to its type if possible. 
Note that each entry might have an additional 'key' entry, repeating the key """ + if not self._base_model: + self._base_model = self._get_force_base_model() self._reload_if_required() result = {} metadata = self.get_all_metadata() @@ -1224,6 +1238,8 @@ class BaseModel(object): :return: True if the metadata was set and False otherwise """ + if not self._base_model: + self._base_model = self._get_force_base_model() metadata_array = [ { "key": str(k), @@ -1249,6 +1265,74 @@ class BaseModel(object): self._get_base_model().reload() self._reload_required = False + def _update_base_model(self, model_name=None, task_model_entry=None): + if not self._task: + return self._base_model + # update the model from the task inputs + labels = self._task.get_labels_enumeration() + # noinspection PyProtectedMember + config_text = self._task._get_model_config_text() + model_name = ( + model_name or self._name or (self._floating_data.name if self._floating_data else None) or self._task.name + ) + # noinspection PyBroadException + try: + task_model_entry = ( + task_model_entry + or self._task_connect_name + or Path(self._get_model_data().uri).stem + ) + except Exception: + pass + parent = self._task.input_models_id.get(task_model_entry) + self._base_model.update( + labels=(self._floating_data.labels if self._floating_data else None) or labels, + design=(self._floating_data.design if self._floating_data else None) or config_text, + task_id=self._task.id, + project_id=self._task.project, + parent_id=parent, + name=model_name, + comment=self._floating_data.comment if self._floating_data else None, + tags=self._floating_data.tags if self._floating_data else None, + framework=self._floating_data.framework if self._floating_data else None, + upload_storage_uri=self._floating_data.upload_storage_uri if self._floating_data else None, + ) + + # remove model floating change set, by now they should have matched the task. 
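The repeated `if not self._base_model` guard above is a lazy-initialization pattern: the backend model is created on first use rather than in the constructor. A minimal self-contained sketch of the idea (illustrative names only, not the ClearML API):

class LazyModel:
    def __init__(self):
        self._base_model = None  # nothing is created up front

    def _get_force_base_model(self):
        # idempotent: create once, then reuse
        if self._base_model:
            return self._base_model
        self._base_model = {"created": True}  # stand-in for the real backend call
        return self._base_model

    def set_metadata(self, key, value):
        # same guard as in the patch: first metadata access triggers creation
        if not self._base_model:
            self._base_model = self._get_force_base_model()
        self._base_model[key] = value

m = LazyModel()
m.set_metadata("framework", "pytorch")  # base model is created here, lazily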
+ self._floating_data = None + + # now we have to update the creator task so it points to us + if str(self._task.status) not in ( + str(self._task.TaskStatusEnum.created), + str(self._task.TaskStatusEnum.in_progress), + ): + self._log.warning( + "Could not update last created model in Task {}, " + "Task status '{}' cannot be updated".format( + self._task.id, self._task.status + ) + ) + elif task_model_entry: + self._base_model.update_for_task( + task_id=self._task.id, + model_id=self.id, + type_="output", + name=task_model_entry, + ) + + return self._base_model + + def _get_force_base_model(self, model_name=None, task_model_entry=None): + if self._base_model: + return self._base_model + if not self._task: + return None + + # create a new model from the task + # noinspection PyProtectedMember + self._base_model = self._task._get_output_model(model_id=None) + return self._update_base_model(model_name=model_name, task_model_entry=task_model_entry) + class Model(BaseModel): """ @@ -2060,6 +2144,7 @@ class OutputModel(BaseModel): self._base_model = None self._base_model_id = None self._task_connect_name = None + self._name = name self._label_enumeration = label_enumeration # noinspection PyProtectedMember self._floating_data = create_dummy_model( @@ -2300,7 +2385,11 @@ class OutputModel(BaseModel): if out_model_file_name else (self._task_connect_name or "Output Model") ) - model = self._get_force_base_model(task_model_entry=name) + if not self._base_model: + model = self._get_force_base_model(task_model_entry=name) + else: + self._update_base_model(task_model_entry=name) + model = self._base_model if not model: raise ValueError("Failed creating internal output model") @@ -2639,61 +2728,6 @@ class OutputModel(BaseModel): ) return weights_filename_offline or register_uri - def _get_force_base_model(self, model_name=None, task_model_entry=None): - if self._base_model: - return self._base_model - - # create a new model from the task - # noinspection PyProtectedMember - self._base_model = self._task._get_output_model(model_id=None) - # update the model from the task inputs - labels = self._task.get_labels_enumeration() - # noinspection PyProtectedMember - config_text = self._task._get_model_config_text() - model_name = model_name or self._floating_data.name or self._task.name - task_model_entry = ( - task_model_entry - or self._task_connect_name - or Path(self._get_model_data().uri).stem - ) - parent = self._task.input_models_id.get(task_model_entry) - self._base_model.update( - labels=self._floating_data.labels or labels, - design=self._floating_data.design or config_text, - task_id=self._task.id, - project_id=self._task.project, - parent_id=parent, - name=model_name, - comment=self._floating_data.comment, - tags=self._floating_data.tags, - framework=self._floating_data.framework, - upload_storage_uri=self._floating_data.upload_storage_uri, - ) - - # remove model floating change set, by now they should have matched the task. 
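From the caller's perspective, the change means that constructing an OutputModel no longer registers a model in the backend immediately; the first real use (weights upload, metadata, reporting) does. A hedged sketch, assuming a configured ClearML environment and a local weights file (all names hypothetical):

from clearml import Task, OutputModel

task = Task.init(project_name="examples", task_name="lazy output model")  # hypothetical
model = OutputModel(task=task, name="my-model")  # no backend model is created yet
model.update_weights(weights_filename="model.bin")  # first use forces base-model creation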
- self._floating_data = None - - # now we have to update the creator task so it points to us - if str(self._task.status) not in ( - str(self._task.TaskStatusEnum.created), - str(self._task.TaskStatusEnum.in_progress), - ): - self._log.warning( - "Could not update last created model in Task {}, " - "Task status '{}' cannot be updated".format( - self._task.id, self._task.status - ) - ) - else: - self._base_model.update_for_task( - task_id=self._task.id, - model_id=self.id, - type_="output", - name=task_model_entry, - ) - - return self._base_model - def _get_base_model(self): if self._floating_data: return self._floating_data From 83a04d438c00974431d10eb685efd01868bd2053 Mon Sep 17 00:00:00 2001 From: Alex Burlacu Date: Fri, 11 Aug 2023 19:56:08 +0300 Subject: [PATCH 16/21] Adjust for pandas < 2.0 --- clearml/binding/artifacts.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/clearml/binding/artifacts.py b/clearml/binding/artifacts.py index 93062ebf..5577bf24 100644 --- a/clearml/binding/artifacts.py +++ b/clearml/binding/artifacts.py @@ -1,4 +1,5 @@ import gzip +import io import json import yaml import mimetypes @@ -1062,7 +1063,16 @@ class Artifacts(object): # (otherwise it is encoded and creates new hash every time) if self._compression == "gzip": with gzip.GzipFile(local_filename, 'wb', mtime=0) as gzip_file: - artifact_object.to_csv(gzip_file, **kwargs) + try: + pd_version = int(pd.__version__.split(".")[0]) + except ValueError: + pd_version = 0 + + if pd_version >= 2: + artifact_object.to_csv(gzip_file, **kwargs) + else: + # old (pandas<2) versions of pandas cannot handle direct gzip stream, so we manually encode it + artifact_object.to_csv(io.TextIOWrapper(gzip_file), **kwargs) else: artifact_object.to_csv(local_filename, compression=self._compression) From afe178b002ea7566459d38847d0b4146db1cd0b2 Mon Sep 17 00:00:00 2001 From: Alex Burlacu Date: Fri, 11 Aug 2023 19:57:07 +0300 Subject: [PATCH 17/21] Bump version to 1.12.2 --- clearml/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clearml/version.py b/clearml/version.py index fe70fa28..96ddfeb7 100644 --- a/clearml/version.py +++ b/clearml/version.py @@ -1 +1 @@ -__version__ = '1.12.1' +__version__ = '1.12.2' From 88d036b367290e8994b01cd5cdb7a0f906386994 Mon Sep 17 00:00:00 2001 From: Alex Burlacu Date: Tue, 15 Aug 2023 13:54:04 +0300 Subject: [PATCH 18/21] Update PyNVML to the latest NVIDIA version --- clearml/utilities/gpu/pynvml.py | 1373 +++++++++++++++++++++++++++++-- 1 file changed, 1301 insertions(+), 72 deletions(-) diff --git a/clearml/utilities/gpu/pynvml.py b/clearml/utilities/gpu/pynvml.py index 4fd7a7d8..25d82469 100644 --- a/clearml/utilities/gpu/pynvml.py +++ b/clearml/utilities/gpu/pynvml.py @@ -1,5 +1,5 @@ ##### -# Copyright (c) 2011-2019, NVIDIA Corporation. All rights reserved. +# Copyright (c) 2011-2023, NVIDIA Corporation. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: @@ -24,7 +24,6 @@ # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF # THE POSSIBILITY OF SUCH DAMAGE. 
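Returning briefly to the pandas compatibility fix in patch 16/21: the version check and text wrapper can be exercised in isolation. A minimal sketch, assuming pandas is installed (the output path is illustrative):

import gzip
import io

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
with gzip.GzipFile("out.csv.gz", "wb", mtime=0) as gzip_file:  # mtime=0 keeps the archive hash stable
    try:
        pd_version = int(pd.__version__.split(".")[0])
    except ValueError:
        pd_version = 0
    if pd_version >= 2:
        df.to_csv(gzip_file)                    # pandas >= 2 accepts a binary handle
    else:
        df.to_csv(io.TextIOWrapper(gzip_file))  # pandas < 2 needs a text-mode wrapper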
-# @ copied from nvidia-ml-py3 11.450.51 ##### # flake8: noqa # This is only to ignore F405 errors @@ -33,13 +32,15 @@ # Python bindings for the NVML library ## from ctypes import * # noqa: F403 +from ctypes.util import find_library # noqa +from functools import wraps import sys import os import threading import string -# C Type mappings # -# Enums +## C Type mappings ## +## Enums _nvmlEnableState_t = c_uint NVML_FEATURE_DISABLED = 0 NVML_FEATURE_ENABLED = 1 @@ -49,17 +50,31 @@ NVML_BRAND_UNKNOWN = 0 NVML_BRAND_QUADRO = 1 NVML_BRAND_TESLA = 2 NVML_BRAND_NVS = 3 -NVML_BRAND_GRID = 4 +NVML_BRAND_GRID = 4 # Deprecated from API reporting. Keeping definition for backward compatibility. NVML_BRAND_GEFORCE = 5 NVML_BRAND_TITAN = 6 -NVML_BRAND_COUNT = 7 +NVML_BRAND_NVIDIA_VAPPS = 7 # NVIDIA Virtual Applications +NVML_BRAND_NVIDIA_VPC = 8 # NVIDIA Virtual PC +NVML_BRAND_NVIDIA_VCS = 9 # NVIDIA Virtual Compute Server +NVML_BRAND_NVIDIA_VWS = 10 # NVIDIA RTX Virtual Workstation +NVML_BRAND_NVIDIA_CLOUD_GAMING = 11 # NVIDIA Cloud Gaming +NVML_BRAND_NVIDIA_VGAMING = NVML_BRAND_NVIDIA_CLOUD_GAMING # Deprecated from API reporting. Keeping definition for backward compatibility. +NVML_BRAND_QUADRO_RTX = 12 +NVML_BRAND_NVIDIA_RTX = 13 +NVML_BRAND_NVIDIA = 14 +NVML_BRAND_GEFORCE_RTX = 15 # Unused +NVML_BRAND_TITAN_RTX = 16 # Unused +NVML_BRAND_COUNT = 17 _nvmlTemperatureThresholds_t = c_uint NVML_TEMPERATURE_THRESHOLD_SHUTDOWN = 0 NVML_TEMPERATURE_THRESHOLD_SLOWDOWN = 1 NVML_TEMPERATURE_THRESHOLD_MEM_MAX = 2 NVML_TEMPERATURE_THRESHOLD_GPU_MAX = 3 -NVML_TEMPERATURE_THRESHOLD_COUNT = 4 +NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_MIN = 4 +NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_CURR = 5 +NVML_TEMPERATURE_THRESHOLD_ACOUSTIC_MAX = 6 +NVML_TEMPERATURE_THRESHOLD_COUNT = 7 _nvmlTemperatureSensors_t = c_uint NVML_TEMPERATURE_GPU = 0 @@ -84,7 +99,7 @@ NVML_MEMORY_LOCATION_CBU = 6 NVML_MEMORY_LOCATION_SRAM = 7 NVML_MEMORY_LOCATION_COUNT = 8 -NVML_NVLINK_MAX_LINKS = 12 +NVML_NVLINK_MAX_LINKS = 18 # For backwards compatibility, maintain the incorrectly-named "LANES" define NVML_NVLINK_MAX_LANES = NVML_NVLINK_MAX_LINKS @@ -94,7 +109,15 @@ NVML_NVLINK_ERROR_DL_REPLAY = 0 NVML_NVLINK_ERROR_DL_RECOVERY = 1 NVML_NVLINK_ERROR_DL_CRC_FLIT = 2 NVML_NVLINK_ERROR_DL_CRC_DATA = 3 -NVML_NVLINK_ERROR_COUNT = 4 +NVML_NVLINK_ERROR_DL_ECC_DATA = 4 +NVML_NVLINK_ERROR_COUNT = 5 + +_nvmlNvLinkEccLaneErrorCounter_t = c_uint +NVML_NVLINK_ERROR_DL_ECC_LANE0 = 0 +NVML_NVLINK_ERROR_DL_ECC_LANE1 = 1 +NVML_NVLINK_ERROR_DL_ECC_LANE2 = 2 +NVML_NVLINK_ERROR_DL_ECC_LANE3 = 3 +NVML_NVLINK_ERROR_DL_ECC_COUNT = 5 _nvmlNvLinkCapability_t = c_uint NVML_NVLINK_CAP_P2P_SUPPORTED = 0 @@ -123,6 +146,12 @@ NVML_NVLINK_COUNTER_UNIT_BYTES = 2 NVML_NVLINK_COUNTER_UNIT_RESERVED = 3 NVML_NVLINK_COUNTER_UNIT_COUNT = 4 +_nvmlNvLinkDeviceType_t = c_uint +NVML_NVLINK_DEVICE_TYPE_GPU = 0x00 +NVML_NVLINK_DEVICE_TYPE_IBMNPU = 0x01 +NVML_NVLINK_DEVICE_TYPE_SWITCH = 0x02 +NVML_NVLINK_DEVICE_TYPE_UNKNOWN = 0xFF + # These are deprecated, instead use _nvmlMemoryErrorType_t _nvmlEccBitType_t = c_uint NVML_SINGLE_BIT_ECC = 0 @@ -156,6 +185,9 @@ NVML_CLOCK_ID_COUNT = 4 _nvmlDriverModel_t = c_uint NVML_DRIVER_WDDM = 0 NVML_DRIVER_WDM = 1 +NVML_DRIVER_MCDM = 2 + +NVML_MAX_GPU_PERF_PSTATES = 16 _nvmlPstates_t = c_uint NVML_PSTATE_0 = 0 @@ -207,12 +239,19 @@ NVML_ERROR_MEMORY = 20 NVML_ERROR_NO_DATA = 21 NVML_ERROR_VGPU_ECC_NOT_SUPPORTED = 22 NVML_ERROR_INSUFFICIENT_RESOURCES = 23 +NVML_ERROR_FREQ_NOT_SUPPORTED = 24 +NVML_ERROR_ARGUMENT_VERSION_MISMATCH = 25 +NVML_ERROR_DEPRECATED = 26 
NVML_ERROR_UNKNOWN = 999 _nvmlFanState_t = c_uint NVML_FAN_NORMAL = 0 NVML_FAN_FAILED = 1 +_nvmlFanControlPolicy_t = c_uint +NVML_FAN_POLICY_TEMPERATURE_CONTINOUS_SW = 0 +NVML_FAN_POLICY_MANUAL = 1 + _nvmlLedColor_t = c_uint NVML_LED_COLOR_GREEN = 0 NVML_LED_COLOR_AMBER = 1 @@ -300,7 +339,7 @@ NVML_TOPOLOGY_CPU = NVML_TOPOLOGY_NODE NVML_TOPOLOGY_SYSTEM = 50 _nvmlGpuP2PCapsIndex_t = c_uint -NVML_P2P_CAPS_INDEX_READ = 0, +NVML_P2P_CAPS_INDEX_READ = 0 NVML_P2P_CAPS_INDEX_WRITE = 1 NVML_P2P_CAPS_INDEX_NVLINK = 2 NVML_P2P_CAPS_INDEX_ATOMICS = 3 @@ -324,13 +363,40 @@ NVML_DEVICE_ARCH_PASCAL = 4 NVML_DEVICE_ARCH_VOLTA = 5 NVML_DEVICE_ARCH_TURING = 6 NVML_DEVICE_ARCH_AMPERE = 7 +NVML_DEVICE_ARCH_ADA = 8 +NVML_DEVICE_ARCH_HOPPER = 9 NVML_DEVICE_ARCH_UNKNOWN = 0xffffffff +# PCI bus Types +_nvmlBusType_t = c_uint +NVML_BUS_TYPE_UNKNOWN = 0 +NVML_BUS_TYPE_PCI = 1 +NVML_BUS_TYPE_PCIE = 2 +NVML_BUS_TYPE_FPCI = 3 +NVML_BUS_TYPE_AGP = 4 + +_nvmlPowerSource_t = c_uint +NVML_POWER_SOURCE_AC = 0x00000000 +NVML_POWER_SOURCE_BATTERY = 0x00000001 + +_nvmlAdaptiveClockInfoStatus_t = c_uint +NVML_ADAPTIVE_CLOCKING_INFO_STATUS_DISABLED = 0x00000000 +NVML_ADAPTIVE_CLOCKING_INFO_STATUS_ENABLED = 0x00000001 + _nvmlClockLimitId_t = c_uint NVML_CLOCK_LIMIT_ID_RANGE_START = 0xffffff00 NVML_CLOCK_LIMIT_ID_TDP = 0xffffff01 NVML_CLOCK_LIMIT_ID_UNLIMITED = 0xffffff02 +_nvmlPcieLinkMaxSpeed_t = c_uint +NVML_PCIE_LINK_MAX_SPEED_INVALID = 0x00000000 +NVML_PCIE_LINK_MAX_SPEED_2500MBPS = 0x00000001 +NVML_PCIE_LINK_MAX_SPEED_5000MBPS = 0x00000002 +NVML_PCIE_LINK_MAX_SPEED_8000MBPS = 0x00000003 +NVML_PCIE_LINK_MAX_SPEED_16000MBPS = 0x00000004 +NVML_PCIE_LINK_MAX_SPEED_32000MBPS = 0x00000005 +NVML_PCIE_LINK_MAX_SPEED_64000MBPS = 0x00000006 + _nvmlAffinityScope_t = c_uint NVML_AFFINITY_SCOPE_NODE = 0 NVML_AFFINITY_SCOPE_SOCKET = 1 @@ -350,8 +416,10 @@ NVML_DEVICE_UUID_V2_BUFFER_SIZE = 96 NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE = 80 NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE = 80 NVML_DEVICE_NAME_BUFFER_SIZE = 64 +NVML_DEVICE_NAME_V2_BUFFER_SIZE = 96 NVML_DEVICE_SERIAL_BUFFER_SIZE = 30 NVML_DEVICE_PART_NUMBER_BUFFER_SIZE = 80 +NVML_DEVICE_GPU_PART_NUMBER_BUFFER_SIZE = 80 NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE = 32 NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE = 32 NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE = 16 @@ -478,8 +546,7 @@ NVML_FI_DEV_PERF_POLICY_TOTAL_BASE_CLOCKS = 81 # Perf Policy Counter for Total NVML_FI_DEV_MEMORY_TEMP = 82 # Memory temperature for the device # Energy Counter -NVML_FI_DEV_TOTAL_ENERGY_CONSUMPTION = 83 # Total energy consumption for the GPU in mJ since the driver was last -# reloaded +NVML_FI_DEV_TOTAL_ENERGY_CONSUMPTION = 83 # Total energy consumption for the GPU in mJ since the driver was last reloaded # NVLink Speed NVML_FI_DEV_NVLINK_SPEED_MBPS_L0 = 84 @@ -569,7 +636,40 @@ NVML_FI_DEV_REMAPPED_UNC = 143 NVML_FI_DEV_REMAPPED_PENDING = 144 NVML_FI_DEV_REMAPPED_FAILURE = 145 -NVML_FI_MAX = 146 # One greater than the largest field ID defined above +# Remote device NVLink ID +NVML_FI_DEV_NVLINK_REMOTE_NVLINK_ID = 146 + +# Number of NVLinks connected to NVSwitch +NVML_FI_DEV_NVSWITCH_CONNECTED_LINK_COUNT = 147 + +# NvLink ECC Data Error Counters +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L0 = 148 # < NVLink data ECC Error Counter for Link 0 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L1 = 149 # < NVLink data ECC Error Counter for Link 1 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L2 = 150 # < NVLink data ECC Error Counter for Link 2 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L3 = 151 # < NVLink data ECC Error Counter for Link 3 
+NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L4 = 152 # < NVLink data ECC Error Counter for Link 4 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L5 = 153 # < NVLink data ECC Error Counter for Link 5 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L6 = 154 # < NVLink data ECC Error Counter for Link 6 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L7 = 155 # < NVLink data ECC Error Counter for Link 7 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L8 = 156 # < NVLink data ECC Error Counter for Link 8 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L9 = 157 # < NVLink data ECC Error Counter for Link 9 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L10 = 158 # < NVLink data ECC Error Counter for Link 10 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_L11 = 159 # < NVLink data ECC Error Counter for Link 11 +NVML_FI_DEV_NVLINK_ECC_DATA_ERROR_COUNT_TOTAL = 160 # < NvLink data ECC Error Counter total for all Links + +NVML_FI_DEV_NVLINK_ERROR_DL_REPLAY = 161 +NVML_FI_DEV_NVLINK_ERROR_DL_RECOVERY = 162 +NVML_FI_DEV_NVLINK_ERROR_DL_CRC = 163 +NVML_FI_DEV_NVLINK_GET_SPEED = 164 +NVML_FI_DEV_NVLINK_GET_STATE = 165 +NVML_FI_DEV_NVLINK_GET_VERSION = 166 + +NVML_FI_DEV_NVLINK_GET_POWER_STATE = 167 +NVML_FI_DEV_NVLINK_GET_POWER_THRESHOLD = 168 + +NVML_FI_DEV_PCIE_L0_TO_RECOVERY_COUNTER = 169 + +NVML_FI_MAX = 170 # One greater than the largest field ID defined above ## Enums needed for the method nvmlDeviceGetVirtualizationMode and nvmlDeviceSetVirtualizationMode NVML_GPU_VIRTUALIZATION_MODE_NONE = 0 # Represents Bare Metal GPU @@ -592,8 +692,39 @@ NVML_VGPU_VM_ID_DOMAIN_ID = 0 NVML_VGPU_VM_ID_UUID = 1 _nvmlGridLicenseFeatureCode_t = c_uint +NVML_GRID_LICENSE_FEATURE_CODE_UNKNOWN = 0 NVML_GRID_LICENSE_FEATURE_CODE_VGPU = 1 -NVML_GRID_LICENSE_FEATURE_CODE_VWORKSTATION = 2 +NVML_GRID_LICENSE_FEATURE_CODE_NVIDIA_RTX = 2 +NVML_GRID_LICENSE_FEATURE_CODE_VWORKSTATION = 2 # deprecated, use NVML_GRID_LICENSE_FEATURE_CODE_NVIDIA_RTX. 
+NVML_GRID_LICENSE_FEATURE_CODE_GAMING = 3 +NVML_GRID_LICENSE_FEATURE_CODE_COMPUTE = 4 + +_nvmlGridLicenseExpiryStatus_t = c_uint8 +NVML_GRID_LICENSE_EXPIRY_NOT_AVAILABLE = 0 # Expiry information not available +NVML_GRID_LICENSE_EXPIRY_INVALID = 1 # Invalid expiry or error fetching expiry +NVML_GRID_LICENSE_EXPIRY_VALID = 2 # Valid expiry +NVML_GRID_LICENSE_EXPIRY_NOT_APPLICABLE = 3 # Expiry not applicable +NVML_GRID_LICENSE_EXPIRY_PERMANENT = 4 # Permanent expiry + +_nvmlVgpuCapability_t = c_uint +NVML_VGPU_CAP_NVLINK_P2P = 0 # vGPU P2P over NVLink is supported +NVML_VGPU_CAP_GPUDIRECT = 1 # GPUDirect capability is supported +NVML_VGPU_CAP_MULTI_VGPU_EXCLUSIVE = 2 # vGPU profile cannot be mixed with other vGPU profiles in same VM +NVML_VGPU_CAP_EXCLUSIVE_TYPE = 3 # vGPU profile cannot run on a GPU alongside other profiles of different type +NVML_VGPU_CAP_EXCLUSIVE_SIZE = 4 # vGPU profile cannot run on a GPU alongside other profiles of different size +NVML_VGPU_CAP_COUNT = 5 + +_nvmlVgpuDriverCapability_t = c_uint +NVML_VGPU_DRIVER_CAP_HETEROGENEOUS_MULTI_VGPU = 0 # Supports mixing of different vGPU profiles within one guest VM +NVML_VGPU_DRIVER_CAP_COUNT = 1 + +_nvmlDeviceVgpuCapability_t = c_uint +NVML_DEVICE_VGPU_CAP_FRACTIONAL_MULTI_VGPU = 0 # Fractional vGPU profiles on this GPU can be used in multi-vGPU configurations +NVML_DEVICE_VGPU_CAP_HETEROGENEOUS_TIMESLICE_PROFILES = 1 # Supports concurrent execution of timesliced vGPU profiles of differing types +NVML_DEVICE_VGPU_CAP_HETEROGENEOUS_TIMESLICE_SIZES = 2 # Supports concurrent execution of timesliced vGPU profiles of differing framebuffer sizes +NVML_DEVICE_VGPU_CAP_READ_DEVICE_BUFFER_BW = 3 # GPU device's read_device_buffer expected bandwidth capacity in megabytes per second +NVML_DEVICE_VGPU_CAP_WRITE_DEVICE_BUFFER_BW = 4 # GPU device's write_device_buffer expected bandwidth capacity in megabytes per second +NVML_DEVICE_VGPU_CAP_COUNT = 5 _nvmlVgpuGuestInfoState_t = c_uint NVML_VGPU_INSTANCE_GUEST_INFO_STATE_UNINITIALIZED = 0 @@ -617,6 +748,9 @@ _nvmlHostVgpuMode_t = c_uint NVML_HOST_VGPU_MODE_NON_SRIOV = 0 NVML_HOST_VGPU_MODE_SRIOV = 1 +# GSP firmware +NVML_GSP_FIRMWARE_VERSION_BUF_SIZE = 0x40 + ## Error Checking ## class NVMLError(Exception): @@ -661,7 +795,7 @@ class NVMLError(Exception): if self.value not in NVMLError._errcode_to_string: NVMLError._errcode_to_string[self.value] = str(nvmlErrorString(self.value)) return NVMLError._errcode_to_string[self.value] - except NVMLError: # NVMLError_Uninitialized: + except NVMLError: return "NVML Error with code %d" % self.value def __eq__(self, other): @@ -713,7 +847,7 @@ def _nvmlCheckReturn(ret): return ret -# Function access # +## Function access ## _nvmlGetFunctionPointer_cache = dict() # function pointers are cached to prevent unnecessary libLoadLock locking @@ -738,7 +872,7 @@ def _nvmlGetFunctionPointer(name): libLoadLock.release() -# Alternative object +## Alternative object # Allows the object to be printed # Allows mismatched types to be assigned # - like None when the Structure variant requires c_uint @@ -756,7 +890,8 @@ def nvmlStructToFriendlyObject(struct): for x in struct._fields_: key = x[0] value = getattr(struct, key) - d[key] = value + # only need to convert from bytes if bytes, no need to check python version. 
+ d[key] = value.decode() if isinstance(value, bytes) else value obj = nvmlFriendlyObject(d) return obj @@ -766,11 +901,15 @@ def nvmlFriendlyObjectToStruct(obj, model): for x in model._fields_: key = x[0] value = obj.__dict__[key] - setattr(model, key, value) + # any c_char_p in python3 needs to be bytes, default encoding works fine. + if sys.version_info >= (3,): + setattr(model, key, value.encode()) + else: + setattr(model, key, value) return model -# Unit structures +## Unit structures class struct_c_nvmlUnit_t(Structure): pass # opaque handle @@ -812,6 +951,25 @@ class _PrintableStructure(Structure): result.append(("%s: " + fmt) % (key, value)) return self.__class__.__name__ + "(" + ", ".join(result) + ")" + def __getattribute__(self, name): + res = super(_PrintableStructure, self).__getattribute__(name) + # need to convert bytes to unicode for python3 don't need to for python2 + # Python 2 strings are of both str and bytes + # Python 3 strings are not of type bytes + # ctypes should convert everything to the correct values otherwise + if isinstance(res, bytes): + if isinstance(res, str): + return res + return res.decode() + return res + + def __setattr__(self, name, value): + if isinstance(value, str): + # encoding a python2 string returns the same value, since python2 strings are bytes already + # bytes passed in python3 will be ignored. + value = value.encode() + super(_PrintableStructure, self).__setattr__(name, value) + class c_nvmlUnitInfo_t(_PrintableStructure): _fields_ = [ @@ -852,7 +1010,7 @@ class c_nvmlUnitFanSpeeds_t(_PrintableStructure): ] -# Device structures +## Device structures class struct_c_nvmlDevice_t(Structure): pass # opaque handle @@ -909,7 +1067,7 @@ class nvmlPciInfo_t(_PrintableStructure): } -class c_nvmlBlacklistDeviceInfo_t(_PrintableStructure): +class c_nvmlExcludedDeviceInfo_t(_PrintableStructure): _fields_ = [ ('pci', nvmlPciInfo_t), ('uuid', c_char * NVML_DEVICE_UUID_BUFFER_SIZE) @@ -932,6 +1090,20 @@ class c_nvmlMemory_t(_PrintableStructure): _fmt_ = {'': "%d B"} +class c_nvmlMemory_v2_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('total', c_ulonglong), + ('reserved', c_ulonglong), + ('free', c_ulonglong), + ('used', c_ulonglong), + ] + _fmt_ = {'': "%d B"} + + +nvmlMemory_v2 = 0x02000028 + + class c_nvmlBAR1Memory_t(_PrintableStructure): _fields_ = [ ('bar1Total', c_ulonglong), @@ -941,6 +1113,19 @@ class c_nvmlBAR1Memory_t(_PrintableStructure): _fmt_ = {'': "%d B"} +class nvmlClkMonFaultInfo_t(Structure): + _fields_ = [("clkApiDomain", c_uint), + ("clkDomainFaultMask", c_uint) + ] + + +class nvmlClkMonStatus_t(Structure): + _fields_ = [("bGlobalStatus", c_uint), + ("clkMonListSize", c_uint), + ("clkMonList", nvmlClkMonFaultInfo_t) + ] + + # On Windows with the WDDM driver, usedGpuMemory is reported as None # Code that processes this structure should check for None, I.E. 
# @@ -956,8 +1141,11 @@ class c_nvmlProcessInfo_t(_PrintableStructure): _fields_ = [ ('pid', c_uint), ('usedGpuMemory', c_ulonglong), + ('gpuInstanceId', c_uint), + ('computeInstanceId', c_uint), ] - _fmt_ = {'usedGpuMemory': "%d B"} + _fmt_ = {'usedGpuMemory': "%d B", + } class c_nvmlBridgeChipInfo_t(_PrintableStructure): @@ -1059,6 +1247,34 @@ class c_nvmlVgpuProcessUtilizationSample_t(_PrintableStructure): ] +class c_nvmlVgpuLicenseExpiry_t(_PrintableStructure): + _fields_ = [ + ('year', c_uint32), + ('month', c_uint16), + ('day', c_uint16), + ('hour', c_uint16), + ('min', c_uint16), + ('sec', c_uint16), + ('status', c_uint8), + ] + + +NVML_GRID_LICENSE_STATE_UNKNOWN = 0 +NVML_GRID_LICENSE_STATE_UNINITIALIZED = 1 +NVML_GRID_LICENSE_STATE_UNLICENSED_UNRESTRICTED = 2 +NVML_GRID_LICENSE_STATE_UNLICENSED_RESTRICTED = 3 +NVML_GRID_LICENSE_STATE_UNLICENSED = 4 +NVML_GRID_LICENSE_STATE_LICENSED = 5 + + +class c_nvmlVgpuLicenseInfo_t(_PrintableStructure): + _fields_ = [ + ('isLicensed', c_uint8), + ('licenseExpiry', c_nvmlVgpuLicenseExpiry_t), + ('currentState', c_uint), + ] + + class c_nvmlEncoderSession_t(_PrintableStructure): _fields_ = [ ('sessionId', c_uint), @@ -1083,6 +1299,37 @@ class c_nvmlProcessUtilizationSample_t(_PrintableStructure): ] +class c_nvmlGridLicenseExpiry_t(_PrintableStructure): + _fields_ = [ + ('year', c_uint32), + ('month', c_uint16), + ('day', c_uint16), + ('hour', c_uint16), + ('min', c_uint16), + ('sec', c_uint16), + ('status', c_uint8), + ] + + +class c_nvmlGridLicensableFeature_v4_t(_PrintableStructure): + _fields_ = [ + ('featureCode', _nvmlGridLicenseFeatureCode_t), + ('featureState', c_uint), + ('licenseInfo', c_char * NVML_GRID_LICENSE_BUFFER_SIZE), + ('productName', c_char * NVML_GRID_LICENSE_BUFFER_SIZE), + ('featureEnabled', c_uint), + ('licenseExpiry', c_nvmlGridLicenseExpiry_t), + ] + + +class c_nvmlGridLicensableFeatures_v4_t(_PrintableStructure): + _fields_ = [ + ('isGridLicenseSupported', c_int), + ('licensableFeaturesCount', c_uint), + ('gridLicensableFeatures', c_nvmlGridLicensableFeature_v4_t * NVML_GRID_LICENSE_FEATURE_MAX_COUNT), + ] + + class c_nvmlGridLicensableFeature_v3_t(_PrintableStructure): _fields_ = [ ('featureCode', _nvmlGridLicenseFeatureCode_t), @@ -1163,8 +1410,7 @@ nvmlEventTypeAll = ( ## Clock Throttle Reasons defines nvmlClocksThrottleReasonGpuIdle = 0x0000000000000001 nvmlClocksThrottleReasonApplicationsClocksSetting = 0x0000000000000002 -nvmlClocksThrottleReasonUserDefinedClocks = nvmlClocksThrottleReasonApplicationsClocksSetting # deprecated, -# use nvmlClocksThrottleReasonApplicationsClocksSetting +nvmlClocksThrottleReasonUserDefinedClocks = nvmlClocksThrottleReasonApplicationsClocksSetting # deprecated, use nvmlClocksThrottleReasonApplicationsClocksSetting nvmlClocksThrottleReasonSwPowerCap = 0x0000000000000004 nvmlClocksThrottleReasonHwSlowdown = 0x0000000000000008 nvmlClocksThrottleReasonSyncBoost = 0x0000000000000010 @@ -1216,7 +1462,7 @@ class c_nvmlVgpuVersion_t(Structure): ] -class c_nvmlVgpuMetadata_t(Structure): +class c_nvmlVgpuMetadata_t(_PrintableStructure): _fields_ = [("version", c_uint), ("revision", c_uint), ("guestInfoState", _nvmlVgpuGuestInfoState_t), @@ -1230,7 +1476,7 @@ class c_nvmlVgpuMetadata_t(Structure): ] -class c_nvmlVgpuPgpuMetadata_t(Structure): +class c_nvmlVgpuPgpuMetadata_t(_PrintableStructure): _fields_ = [("version", c_uint), ("revision", c_uint), ("hostDriverVersion", c_char * NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE), @@ -1248,6 +1494,94 @@ class c_nvmlVgpuPgpuCompatibility_t(Structure): ] 
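As noted above, usedGpuMemory can come back as None under WDDM, and the v3 process structures add gpuInstanceId/computeInstanceId fields. A hedged consumption sketch, assuming an NVIDIA driver is present and this vendored module is importable as shown:

from clearml.utilities.gpu import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
for proc in pynvml.nvmlDeviceGetComputeRunningProcesses(handle):
    used = proc.usedGpuMemory  # None on Windows/WDDM, so guard before formatting
    print(proc.pid, "n/a" if used is None else "%d B" % used)
pynvml.nvmlShutdown()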
+## vGPU scheduler policy defines +NVML_VGPU_SCHEDULER_POLICY_UNKNOWN = 0 +NVML_VGPU_SCHEDULER_POLICY_BEST_EFFORT = 1 +NVML_VGPU_SCHEDULER_POLICY_EQUAL_SHARE = 2 +NVML_VGPU_SCHEDULER_POLICY_FIXED_SHARE = 3 + +## Supported vGPU scheduler policy count +NVML_SUPPORTED_VGPU_SCHEDULER_POLICY_COUNT = 3 + +NVML_SCHEDULER_SW_MAX_LOG_ENTRIES = 200 + + +class c_nvmlVgpuSchedDataWithARR_t(_PrintableStructure): + _fields_ = [ + ('avgFactor', c_uint), + ('timeslice', c_uint), + ] + + +class c_nvmlVgpuSchedData_t(_PrintableStructure): + _fields_ = [ + ('timeslice', c_uint), + ] + + +class c_nvmlVgpuSchedulerParams_t(Union): + _fields_ = [ + ('vgpuSchedDataWithARR', c_nvmlVgpuSchedDataWithARR_t), + ('vgpuSchedData', c_nvmlVgpuSchedData_t), + ] + + +class c_nvmlVgpuSchedulerLogEntry_t(_PrintableStructure): + _fields_ = [ + ('timestamp', c_ulonglong), + ('timeRunTotal', c_ulonglong), + ('timeRun', c_ulonglong), + ('swRunlistId', c_uint), + ('targetTimeSlice', c_ulonglong), + ('cumulativePreemptionTime', c_ulonglong), + ] + + +class c_nvmlVgpuSchedulerLog_t(_PrintableStructure): + _fields_ = [ + ('engineId', c_uint), + ('schedulerPolicy', c_uint), + ('isEnabledARR', c_uint), + ('schedulerParams', c_nvmlVgpuSchedulerParams_t), + ('entriesCount', c_uint), + ('logEntries', c_nvmlVgpuSchedulerLogEntry_t * NVML_SCHEDULER_SW_MAX_LOG_ENTRIES), + ] + + +class c_nvmlVgpuSchedulerGetState_t(_PrintableStructure): + _fields_ = [ + ('schedulerPolicy', c_uint), + ('isEnabledARR', c_uint), + ('schedulerParams', c_nvmlVgpuSchedulerParams_t), + ] + + +class c_nvmlVgpuSchedSetDataWithARR_t(_PrintableStructure): + _fields_ = [ + ('avgFactor', c_uint), + ('frequency', c_uint), + ] + + +class c_nvmlVgpuSchedSetData_t(_PrintableStructure): + _fields_ = [ + ('timeslice', c_uint), + ] + + +class c_nvmlVgpuSchedulerCapabilities_t(_PrintableStructure): + _fields_ = [ + ('supportedSchedulers', c_uint * NVML_SUPPORTED_VGPU_SCHEDULER_POLICY_COUNT), + ('maxTimeslice', c_uint), + ('minTimeslice', c_uint), + ('isArrModeSupported', c_uint), + ('maxFrequencyForARR', c_uint), + ('minFrequencyForARR', c_uint), + ('maxAvgFactorForARR', c_uint), + ('minAvgFactorForARR', c_uint), + ] + + class c_nvmlFBCStats_t(Structure): _fields_ = [("sessionsCount", c_uint), ("averageFPS", c_uint), @@ -1280,7 +1614,12 @@ NVML_GPU_INSTANCE_PROFILE_2_SLICE = 0x1 NVML_GPU_INSTANCE_PROFILE_3_SLICE = 0x2 NVML_GPU_INSTANCE_PROFILE_4_SLICE = 0x3 NVML_GPU_INSTANCE_PROFILE_7_SLICE = 0x4 -NVML_GPU_INSTANCE_PROFILE_COUNT = 0x5 +NVML_GPU_INSTANCE_PROFILE_8_SLICE = 0x5 +NVML_GPU_INSTANCE_PROFILE_6_SLICE = 0x6 +NVML_GPU_INSTANCE_PROFILE_1_SLICE_REV1 = 0x7 +NVML_GPU_INSTANCE_PROFILE_2_SLICE_REV1 = 0x8 +NVML_GPU_INSTANCE_PROFILE_1_SLICE_REV2 = 0x9 +NVML_GPU_INSTANCE_PROFILE_COUNT = 0xA class c_nvmlGpuInstancePlacement_t(Structure): @@ -1304,6 +1643,29 @@ class c_nvmlGpuInstanceProfileInfo_t(Structure): ] +nvmlGpuInstanceProfileInfo_v2 = 0x02000098 + + +class c_nvmlGpuInstanceProfileInfo_v2_t(_PrintableStructure): + _fields_ = [("version", c_uint), + ("id", c_uint), + ("isP2pSupported", c_uint), + ("sliceCount", c_uint), + ("instanceCount", c_uint), + ("multiprocessorCount", c_uint), + ("copyEngineCount", c_uint), + ("decoderCount", c_uint), + ("encoderCount", c_uint), + ("jpegCount", c_uint), + ("ofaCount", c_uint), + ("memorySizeMB", c_ulonglong), + ("name", c_char * NVML_DEVICE_NAME_V2_BUFFER_SIZE) + ] + + def __init__(self): + super(c_nvmlGpuInstanceProfileInfo_v2_t, self).__init__(version=nvmlGpuInstanceProfileInfo_v2) + + class c_nvmlGpuInstanceInfo_t(Structure): 
_fields_ = [("device", c_nvmlDevice_t), ("id", c_uint), @@ -1323,12 +1685,21 @@ NVML_COMPUTE_INSTANCE_PROFILE_2_SLICE = 0x1 NVML_COMPUTE_INSTANCE_PROFILE_3_SLICE = 0x2 NVML_COMPUTE_INSTANCE_PROFILE_4_SLICE = 0x3 NVML_COMPUTE_INSTANCE_PROFILE_7_SLICE = 0x4 -NVML_COMPUTE_INSTANCE_PROFILE_COUNT = 0x5 +NVML_COMPUTE_INSTANCE_PROFILE_8_SLICE = 0x5 +NVML_COMPUTE_INSTANCE_PROFILE_6_SLICE = 0x6 +NVML_COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1 = 0x7 +NVML_COMPUTE_INSTANCE_PROFILE_COUNT = 0x8 NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED = 0x0 NVML_COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT = 0x1 +class c_nvmlComputeInstancePlacement_t(Structure): + _fields_ = [("start", c_uint), + ("size", c_uint) + ] + + class c_nvmlComputeInstanceProfileInfo_t(Structure): _fields_ = [("id", c_uint), ("sliceCount", c_uint), @@ -1342,14 +1713,102 @@ class c_nvmlComputeInstanceProfileInfo_t(Structure): ] +nvmlComputeInstanceProfileInfo_v2 = 0x02000088 + + +class c_nvmlComputeInstanceProfileInfo_v2_t(_PrintableStructure): + _fields_ = [("version", c_uint), + ("id", c_uint), + ("sliceCount", c_uint), + ("instanceCount", c_uint), + ("multiprocessorCount", c_uint), + ("sharedCopyEngineCount", c_uint), + ("sharedDecoderCount", c_uint), + ("sharedEncoderCount", c_uint), + ("sharedJpegCount", c_uint), + ("sharedOfaCount", c_uint), + ("name", c_char * NVML_DEVICE_NAME_V2_BUFFER_SIZE) + ] + + def __init__(self): + super(c_nvmlComputeInstanceProfileInfo_v2_t, self).__init__(version=nvmlComputeInstanceProfileInfo_v2) + + class c_nvmlComputeInstanceInfo_t(Structure): _fields_ = [("device", c_nvmlDevice_t), ("gpuInstance", c_nvmlGpuInstance_t), ("id", c_uint), - ("profileId", c_uint) + ("profileId", c_uint), + ("placement", c_nvmlComputeInstancePlacement_t) ] +NVML_MAX_GPU_UTILIZATIONS = 8 +NVML_GPU_UTILIZATION_DOMAIN_GPU = 0 +NVML_GPU_UTILIZATION_DOMAIN_FB = 1 +NVML_GPU_UTILIZATION_DOMAIN_VID = 2 +NVML_GPU_UTILIZATION_DOMAIN_BUS = 3 + + +class c_nvmlGpuDynamicPstatesUtilization_t(Structure): + _fields_ = [("bIsPresent", c_uint, 1), + ("percentage", c_uint), + ("incThreshold", c_uint), + ("decThreshold", c_uint)] + + +class c_nvmlGpuDynamicPstatesInfo_t(Structure): + _fields_ = [("flags", c_uint), + ("utilization", c_nvmlGpuDynamicPstatesUtilization_t * NVML_MAX_GPU_UTILIZATIONS)] + + +NVML_MAX_THERMAL_SENSORS_PER_GPU = 3 + +NVML_THERMAL_TARGET_NONE = 0 +NVML_THERMAL_TARGET_GPU = 1 +NVML_THERMAL_TARGET_MEMORY = 2 +NVML_THERMAL_TARGET_POWER_SUPPLY = 4 +NVML_THERMAL_TARGET_BOARD = 8 +NVML_THERMAL_TARGET_VCD_BOARD = 9 +NVML_THERMAL_TARGET_VCD_INLET = 10 +NVML_THERMAL_TARGET_VCD_OUTLET = 11 +NVML_THERMAL_TARGET_ALL = 15 +NVML_THERMAL_TARGET_UNKNOWN = -1 + +NVML_THERMAL_CONTROLLER_NONE = 0 +NVML_THERMAL_CONTROLLER_GPU_INTERNAL = 1 +NVML_THERMAL_CONTROLLER_ADM1032 = 2 +NVML_THERMAL_CONTROLLER_ADT7461 = 3 +NVML_THERMAL_CONTROLLER_MAX6649 = 4 +NVML_THERMAL_CONTROLLER_MAX1617 = 5 +NVML_THERMAL_CONTROLLER_LM99 = 6 +NVML_THERMAL_CONTROLLER_LM89 = 7 +NVML_THERMAL_CONTROLLER_LM64 = 8 +NVML_THERMAL_CONTROLLER_G781 = 9 +NVML_THERMAL_CONTROLLER_ADT7473 = 10 +NVML_THERMAL_CONTROLLER_SBMAX6649 = 11 +NVML_THERMAL_CONTROLLER_VBIOSEVT = 12 +NVML_THERMAL_CONTROLLER_OS = 13 +NVML_THERMAL_CONTROLLER_NVSYSCON_CANOAS = 14 +NVML_THERMAL_CONTROLLER_NVSYSCON_E551 = 15 +NVML_THERMAL_CONTROLLER_MAX6649R = 16 +NVML_THERMAL_CONTROLLER_ADT7473S = 17 +NVML_THERMAL_CONTROLLER_UNKNOWN = -1 + + +class c_nvmlGpuThermalSensor_t(Structure): + _fields_ = [("controller", c_int), + ("defaultMinTemp", c_int), + ("defaultMaxTemp", c_int), + ("currentTemp", c_int), + ("target", c_int)] + 
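The *_v2_t profile structures above follow a version-stamped ABI pattern: the struct carries a version field that __init__ fills in, so callers never set it by hand. A minimal standalone sketch (the constant and names here are illustrative, not NVML's):

from ctypes import Structure, c_uint

EXAMPLE_INFO_V2 = 0x02000008  # illustrative version constant

class ExampleInfoV2(Structure):
    _fields_ = [("version", c_uint), ("value", c_uint)]

    def __init__(self):
        # stamp the expected layout version so the C side can validate it
        super(ExampleInfoV2, self).__init__(version=EXAMPLE_INFO_V2)

info = ExampleInfoV2()
assert info.version == EXAMPLE_INFO_V2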
+ +class c_nvmlGpuThermalSettings_t(Structure): + _fields_ = [("count", c_uint), + ("sensor", c_nvmlGpuThermalSensor_t * NVML_MAX_THERMAL_SENSORS_PER_GPU)] + + class struct_c_nvmlComputeInstance_t(Structure): pass # opaque handle @@ -1379,6 +1838,36 @@ class c_nvmlRowRemapperHistogramValues(Structure): ] +## string/bytes conversion for ease of use +def convertStrBytes(func): + ''' + In python 3, strings are unicode instead of bytes, and need to be converted for ctypes + Args from caller: (1, 'string', <__main__.c_nvmlDevice_t at 0xFFFFFFFF>) + Args passed to function: (1, b'string', <__main__.c_nvmlDevice_t at 0xFFFFFFFF)> + ---- + Returned from function: b'returned string' + Returned to caller: 'returned string' + ''' + + @wraps(func) + def wrapper(*args, **kwargs): + # encoding a str returns bytes in python 2 and 3 + args = [arg.encode() if isinstance(arg, str) else arg for arg in args] + res = func(*args, **kwargs) + # In python 2, str and bytes are the same + # In python 3, str is unicode and should be decoded. + # Ctypes handles most conversions, this only effects c_char and char arrays. + if isinstance(res, bytes): + if isinstance(res, str): + return res + return res.decode() + return res + + if sys.version_info >= (3,): + return wrapper + return func + + ## C function wrappers ## def nvmlInitWithFlags(flags): _LoadNvmlLibrary() @@ -1418,21 +1907,19 @@ def _LoadNvmlLibrary(): if nvmlLib is None: try: if (sys.platform[:3] == "win"): - searchPaths = [ - os.path.join(os.getenv("ProgramFiles", r"C:\Program Files"), - r"NVIDIA Corporation\NVSMI\nvml.dll"), - os.path.join(os.getenv("WinDir", r"C:\Windows"), r"System32\nvml.dll"), - ] - nvmlPath = next((x for x in searchPaths if os.path.isfile(x)), None) - if nvmlPath is None: - _nvmlCheckReturn(NVML_ERROR_LIBRARY_NOT_FOUND) - else: - # cdecl calling convention - nvmlLib = CDLL(nvmlPath) + # cdecl calling convention + try: + # Check for nvml.dll in System32 first for DCH drivers + nvmlLib = CDLL(os.path.join(os.getenv("WINDIR", "C:/Windows"), "System32/nvml.dll")) + except OSError as ose: + # If nvml.dll is not found in System32, it should be in ProgramFiles + # load nvml.dll from %ProgramFiles%/NVIDIA Corporation/NVSMI/nvml.dll + nvmlLib = CDLL(os.path.join(os.getenv("ProgramFiles", "C:/Program Files"), + "NVIDIA Corporation/NVSMI/nvml.dll")) else: # assume linux nvmlLib = CDLL("libnvidia-ml.so.1") - except OSError: + except OSError as ose: _nvmlCheckReturn(NVML_ERROR_LIBRARY_NOT_FOUND) if nvmlLib is None: _nvmlCheckReturn(NVML_ERROR_LIBRARY_NOT_FOUND) @@ -1459,6 +1946,7 @@ def nvmlShutdown(): # Added in 2.285 +@convertStrBytes def nvmlErrorString(result): fn = _nvmlGetFunctionPointer("nvmlErrorString") fn.restype = c_char_p # otherwise return is an int @@ -1467,6 +1955,7 @@ def nvmlErrorString(result): # Added in 2.285 +@convertStrBytes def nvmlSystemGetNVMLVersion(): c_version = create_string_buffer(NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE) fn = _nvmlGetFunctionPointer("nvmlSystemGetNVMLVersion") @@ -1492,6 +1981,7 @@ def nvmlSystemGetCudaDriverVersion_v2(): # Added in 2.285 +@convertStrBytes def nvmlSystemGetProcessName(pid): c_name = create_string_buffer(1024) fn = _nvmlGetFunctionPointer("nvmlSystemGetProcessName") @@ -1500,6 +1990,7 @@ def nvmlSystemGetProcessName(pid): return c_name.value +@convertStrBytes def nvmlSystemGetDriverVersion(): c_version = create_string_buffer(NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE) fn = _nvmlGetFunctionPointer("nvmlSystemGetDriverVersion") @@ -1522,7 +2013,7 @@ def nvmlSystemGetHicVersion(): (ret != 
NVML_ERROR_INSUFFICIENT_SIZE)): raise NVMLError(ret) - # if there are no hics + # If there are no hics if (c_count.value == 0): return [] @@ -1533,7 +2024,7 @@ def nvmlSystemGetHicVersion(): return hics -# Unit get functions +## Unit get functions def nvmlUnitGetCount(): c_count = c_uint() fn = _nvmlGetFunctionPointer("nvmlUnitGetCount") @@ -1613,7 +2104,7 @@ def nvmlUnitGetDevices(unit): return c_devices -# Device get functions +## Device get functions def nvmlDeviceGetCount(): c_count = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceGetCount_v2") @@ -1631,6 +2122,7 @@ def nvmlDeviceGetHandleByIndex(index): return device +@convertStrBytes def nvmlDeviceGetHandleBySerial(serial): c_serial = c_char_p(serial) device = c_nvmlDevice_t() @@ -1640,6 +2132,7 @@ def nvmlDeviceGetHandleBySerial(serial): return device +@convertStrBytes def nvmlDeviceGetHandleByUUID(uuid): c_uuid = c_char_p(uuid) device = c_nvmlDevice_t() @@ -1649,6 +2142,7 @@ def nvmlDeviceGetHandleByUUID(uuid): return device +@convertStrBytes def nvmlDeviceGetHandleByPciBusId(pciBusId): c_busId = c_char_p(pciBusId) device = c_nvmlDevice_t() @@ -1658,10 +2152,11 @@ def nvmlDeviceGetHandleByPciBusId(pciBusId): return device +@convertStrBytes def nvmlDeviceGetName(handle): - c_name = create_string_buffer(NVML_DEVICE_NAME_BUFFER_SIZE) + c_name = create_string_buffer(NVML_DEVICE_NAME_V2_BUFFER_SIZE) fn = _nvmlGetFunctionPointer("nvmlDeviceGetName") - ret = fn(handle, c_name, c_uint(NVML_DEVICE_NAME_BUFFER_SIZE)) + ret = fn(handle, c_name, c_uint(NVML_DEVICE_NAME_V2_BUFFER_SIZE)) _nvmlCheckReturn(ret) return c_name.value @@ -1690,6 +2185,7 @@ def nvmlDeviceGetBrand(handle): return c_type.value +@convertStrBytes def nvmlDeviceGetBoardPartNumber(handle): c_part_number = create_string_buffer(NVML_DEVICE_PART_NUMBER_BUFFER_SIZE) fn = _nvmlGetFunctionPointer("nvmlDeviceGetBoardPartNumber") @@ -1698,6 +2194,7 @@ def nvmlDeviceGetBoardPartNumber(handle): return c_part_number.value +@convertStrBytes def nvmlDeviceGetSerial(handle): c_serial = create_string_buffer(NVML_DEVICE_SERIAL_BUFFER_SIZE) fn = _nvmlGetFunctionPointer("nvmlDeviceGetSerial") @@ -1755,6 +2252,7 @@ def nvmlDeviceGetMinorNumber(handle): return c_minor_number.value +@convertStrBytes def nvmlDeviceGetUUID(handle): c_uuid = create_string_buffer(NVML_DEVICE_UUID_V2_BUFFER_SIZE) fn = _nvmlGetFunctionPointer("nvmlDeviceGetUUID") @@ -1763,6 +2261,7 @@ def nvmlDeviceGetUUID(handle): return c_uuid.value +@convertStrBytes def nvmlDeviceGetInforomVersion(handle, infoRomObject): c_version = create_string_buffer(NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE) fn = _nvmlGetFunctionPointer("nvmlDeviceGetInforomVersion") @@ -1773,6 +2272,7 @@ def nvmlDeviceGetInforomVersion(handle, infoRomObject): # Added in 4.304 +@convertStrBytes def nvmlDeviceGetInforomImageVersion(handle): c_version = create_string_buffer(NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE) fn = _nvmlGetFunctionPointer("nvmlDeviceGetInforomImageVersion") @@ -1959,6 +2459,50 @@ def nvmlDeviceGetFanSpeed_v2(handle, fan): return c_speed.value +def nvmlDeviceGetTargetFanSpeed(handle, fan): + c_speed = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetTargetFanSpeed") + ret = fn(handle, fan, byref(c_speed)) + _nvmlCheckReturn(ret) + return c_speed.value + + +def nvmlDeviceGetNumFans(device): + c_numFans = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetNumFans") + ret = fn(device, byref(c_numFans)) + _nvmlCheckReturn(ret) + return c_numFans.value + + +def nvmlDeviceSetDefaultFanSpeed_v2(handle, index): + fn = 
_nvmlGetFunctionPointer("nvmlDeviceSetDefaultFanSpeed_v2") + ret = fn(handle, index) + _nvmlCheckReturn(ret) + return ret + + +def nvmlDeviceGetMinMaxFanSpeed(handle, minSpeed, maxSpeed): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMinMaxFanSpeed") + ret = fn(handle, minSpeed, maxSpeed) + _nvmlCheckReturn(ret) + return ret + + +def nvmlDeviceGetFanControlPolicy_v2(handle, fan, fanControlPolicy): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetFanControlPolicy_v2") + ret = fn(handle, fan, fanControlPolicy) + _nvmlCheckReturn(ret) + return ret + + +def nvmlDeviceSetFanControlPolicy(handle, fan, fanControlPolicy): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetFanControlPolicy") + ret = fn(handle, fan, _nvmlFanControlPolicy_t(fanControlPolicy)) + _nvmlCheckReturn(ret) + return ret + + def nvmlDeviceGetTemperature(handle, sensor): c_temp = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceGetTemperature") @@ -1975,6 +2519,15 @@ def nvmlDeviceGetTemperatureThreshold(handle, threshold): return c_temp.value +def nvmlDeviceSetTemperatureThreshold(handle, threshold, temp): + c_temp = c_uint() + c_temp.value = temp + fn = _nvmlGetFunctionPointer("nvmlDeviceSetTemperatureThreshold") + ret = fn(handle, _nvmlTemperatureThresholds_t(threshold), byref(c_temp)) + _nvmlCheckReturn(ret) + return None + + # DEPRECATED use nvmlDeviceGetPerformanceState def nvmlDeviceGetPowerState(handle): c_pstate = _nvmlPstates_t() @@ -2072,9 +2625,14 @@ def nvmlDeviceGetPendingGpuOperationMode(handle): return nvmlDeviceGetGpuOperationMode(handle)[1] -def nvmlDeviceGetMemoryInfo(handle): - c_memory = c_nvmlMemory_t() - fn = _nvmlGetFunctionPointer("nvmlDeviceGetMemoryInfo") +def nvmlDeviceGetMemoryInfo(handle, version=None): + if not version: + c_memory = c_nvmlMemory_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMemoryInfo") + else: + c_memory = c_nvmlMemory_v2_t() + c_memory.version = version + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMemoryInfo_v2") ret = fn(handle, byref(c_memory)) _nvmlCheckReturn(ret) return c_memory @@ -2124,6 +2682,14 @@ def nvmlDeviceGetPendingEccMode(handle): return nvmlDeviceGetEccMode(handle)[1] +def nvmlDeviceGetDefaultEccMode(handle): + c_defaultState = _nvmlEnableState_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetDefaultEccMode") + ret = fn(handle, byref(c_defaultState)) + _nvmlCheckReturn(ret) + return [c_defaultState.value] + + def nvmlDeviceGetTotalEccErrors(handle, errorType, counterType): c_count = c_ulonglong() fn = _nvmlGetFunctionPointer("nvmlDeviceGetTotalEccErrors") @@ -2210,6 +2776,7 @@ def nvmlDeviceGetPendingDriverModel(handle): # Added in 2.285 +@convertStrBytes def nvmlDeviceGetVbiosVersion(handle): c_version = create_string_buffer(NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE) fn = _nvmlGetFunctionPointer("nvmlDeviceGetVbiosVersion") @@ -2219,10 +2786,49 @@ def nvmlDeviceGetVbiosVersion(handle): # Added in 2.285 -def nvmlDeviceGetComputeRunningProcesses(handle): +def nvmlDeviceGetComputeRunningProcesses_v3(handle): # first call to get the size c_count = c_uint(0) - fn = _nvmlGetFunctionPointer("nvmlDeviceGetComputeRunningProcesses") + fn = _nvmlGetFunctionPointer("nvmlDeviceGetComputeRunningProcesses_v3") + ret = fn(handle, byref(c_count), None) + + if (ret == NVML_SUCCESS): + # special case, no running processes + return [] + elif (ret == NVML_ERROR_INSUFFICIENT_SIZE): + # typical case + # oversize the array incase more processes are created + c_count.value = c_count.value * 2 + 5 + proc_array = c_nvmlProcessInfo_t * c_count.value + c_procs = proc_array() + + # make the call again 
+ ret = fn(handle, byref(c_count), c_procs) + _nvmlCheckReturn(ret) + + procs = [] + for i in range(c_count.value): + # use an alternative struct for this object + obj = nvmlStructToFriendlyObject(c_procs[i]) + if (obj.usedGpuMemory == NVML_VALUE_NOT_AVAILABLE_ulonglong.value): + # special case for WDDM on Windows, see comment above + obj.usedGpuMemory = None + procs.append(obj) + + return procs + else: + # error case + raise NVMLError(ret) + + +def nvmlDeviceGetComputeRunningProcesses(handle): + return nvmlDeviceGetComputeRunningProcesses_v3(handle) + + +def nvmlDeviceGetGraphicsRunningProcesses_v3(handle): + # first call to get the size + c_count = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetGraphicsRunningProcesses_v3") ret = fn(handle, byref(c_count), None) if (ret == NVML_SUCCESS): @@ -2255,9 +2861,17 @@ def nvmlDeviceGetComputeRunningProcesses(handle): def nvmlDeviceGetGraphicsRunningProcesses(handle): + return nvmlDeviceGetGraphicsRunningProcesses_v3(handle) + + +def nvmlDeviceGetMPSComputeRunningProcesses(handle): + return nvmlDeviceGetMPSComputeRunningProcesses_v3(handle) + + +def nvmlDeviceGetMPSComputeRunningProcesses_v3(handle): # first call to get the size c_count = c_uint(0) - fn = _nvmlGetFunctionPointer("nvmlDeviceGetGraphicsRunningProcesses") + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMPSComputeRunningProcesses_v3") ret = fn(handle, byref(c_count), None) if (ret == NVML_SUCCESS): @@ -2372,6 +2986,26 @@ def nvmlDeviceResetGpuLockedClocks(handle): return None +def nvmlDeviceSetMemoryLockedClocks(handle, minMemClockMHz, maxMemClockMHz): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetMemoryLockedClocks") + ret = fn(handle, c_uint(minMemClockMHz), c_uint(maxMemClockMHz)) + _nvmlCheckReturn(ret) + return None + + +def nvmlDeviceResetMemoryLockedClocks(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceResetMemoryLockedClocks") + ret = fn(handle) + _nvmlCheckReturn(ret) + return None + + +def nvmlDeviceGetClkMonStatus(handle, c_clkMonInfo): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetClkMonStatus") + ret = fn(handle, c_clkMonInfo) + return ret + + # Added in 4.304 def nvmlDeviceSetApplicationsClocks(handle, maxMemClockMHz, maxGraphicsClockMHz): fn = _nvmlGetFunctionPointer("nvmlDeviceSetApplicationsClocks") @@ -2496,6 +3130,14 @@ def nvmlDeviceGetMaxPcieLinkWidth(handle): return width.value +def nvmlDeviceGetGpuMaxPcieLinkGeneration(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetGpuMaxPcieLinkGeneration") + gen = c_uint() + ret = fn(handle, byref(gen)) + _nvmlCheckReturn(ret) + return gen.value + + # Added in 4.304 def nvmlDeviceGetSupportedClocksThrottleReasons(handle): c_reasons = c_ulonglong() @@ -2621,7 +3263,7 @@ def nvmlDeviceGetRetiredPages_v2(device, sourceFilter): c_times = times_array() ret = fn(device, c_source, byref(c_count), c_pages, c_times) _nvmlCheckReturn(ret) - return [{'address': int(c_pages[i]), 'timestamp': int(c_times[i])} for i in range(c_count.value)]; + return [{'address': int(c_pages[i]), 'timestamp': int(c_times[i])} for i in range(c_count.value)] def nvmlDeviceGetRetiredPagesPendingStatus(device): @@ -2807,6 +3449,14 @@ def nvmlDeviceGetNvLinkRemotePciInfo(device, link): return c_pci +def nvmlDeviceGetNvLinkRemoteDeviceType(handle, link): + c_type = _nvmlNvLinkDeviceType_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetNvLinkRemoteDeviceType") + ret = fn(handle, link, byref(c_type)) + _nvmlCheckReturn(ret) + return c_type.value + + def nvmlDeviceGetNvLinkState(device, link): c_isActive = c_uint() fn = 
_nvmlGetFunctionPointer("nvmlDeviceGetNvLinkState") @@ -2868,6 +3518,22 @@ def nvmlDeviceGetFieldValues(handle, fieldIds): return values +def nvmlDeviceClearFieldValues(handle, fieldIds): + values_arr = c_nvmlFieldValue_t * len(fieldIds) + values = values_arr() + fn = _nvmlGetFunctionPointer("nvmlDeviceClearFieldValues") + + for i, fieldId in enumerate(fieldIds): + try: + (values[i].fieldId, values[i].scopeId) = fieldId + except TypeError: + values[i].fieldId = fieldId + + ret = fn(handle, c_int32(len(fieldIds)), byref(values)) + _nvmlCheckReturn(ret) + return values + + def nvmlDeviceGetVirtualizationMode(handle): c_virtualization_mode = c_ulonglong() fn = _nvmlGetFunctionPointer("nvmlDeviceGetVirtualizationMode") @@ -2881,6 +3547,22 @@ def nvmlDeviceSetVirtualizationMode(handle, virtualization_mode): return fn(handle, virtualization_mode) +def nvmlGetVgpuDriverCapabilities(capability): + c_capResult = c_uint() + fn = _nvmlGetFunctionPointer("nvmlGetVgpuDriverCapabilities") + ret = fn(_nvmlVgpuDriverCapability_t(capability), byref(c_capResult)) + _nvmlCheckReturn(ret) + return c_capResult.value + + +def nvmlDeviceGetVgpuCapabilities(handle, capability): + c_capResult = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetVgpuCapabilities") + ret = fn(handle, _nvmlDeviceVgpuCapability_t(capability), byref(c_capResult)) + _nvmlCheckReturn(ret) + return c_capResult.value + + def nvmlDeviceGetSupportedVgpus(handle): # first call to get the size c_vgpu_count = c_uint(0) @@ -2935,6 +3617,15 @@ def nvmlDeviceGetCreatableVgpus(handle): raise NVMLError(ret) +def nvmlVgpuTypeGetGpuInstanceProfileId(vgpuTypeId): + c_profile_id = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetGpuInstanceProfileId") + ret = fn(vgpuTypeId, byref(c_profile_id)) + _nvmlCheckReturn(ret) + return (c_profile_id.value) + + +@convertStrBytes def nvmlVgpuTypeGetClass(vgpuTypeId): c_class = create_string_buffer(NVML_DEVICE_NAME_BUFFER_SIZE) c_buffer_size = c_uint(NVML_DEVICE_NAME_BUFFER_SIZE) @@ -2944,6 +3635,7 @@ def nvmlVgpuTypeGetClass(vgpuTypeId): return c_class.value +@convertStrBytes def nvmlVgpuTypeGetName(vgpuTypeId): c_name = create_string_buffer(NVML_DEVICE_NAME_BUFFER_SIZE) c_buffer_size = c_uint(NVML_DEVICE_NAME_BUFFER_SIZE) @@ -2987,6 +3679,7 @@ def nvmlVgpuTypeGetResolution(vgpuTypeId): return (c_xdim.value, c_ydim.value) +@convertStrBytes def nvmlVgpuTypeGetLicense(vgpuTypeId): c_license = create_string_buffer(NVML_GRID_LICENSE_BUFFER_SIZE) c_buffer_size = c_uint(NVML_GRID_LICENSE_BUFFER_SIZE) @@ -3047,6 +3740,7 @@ def nvmlDeviceGetActiveVgpus(handle): raise NVMLError(ret) +@convertStrBytes def nvmlVgpuInstanceGetVmID(vgpuInstance): c_vm_id = create_string_buffer(NVML_DEVICE_UUID_BUFFER_SIZE) c_buffer_size = c_uint(NVML_GRID_LICENSE_BUFFER_SIZE) @@ -3057,6 +3751,7 @@ def nvmlVgpuInstanceGetVmID(vgpuInstance): return (c_vm_id.value, c_vm_id_type.value) +@convertStrBytes def nvmlVgpuInstanceGetUUID(vgpuInstance): c_uuid = create_string_buffer(NVML_DEVICE_UUID_BUFFER_SIZE) c_buffer_size = c_uint(NVML_DEVICE_UUID_BUFFER_SIZE) @@ -3066,6 +3761,7 @@ def nvmlVgpuInstanceGetUUID(vgpuInstance): return c_uuid.value +@convertStrBytes def nvmlVgpuInstanceGetMdevUUID(vgpuInstance): c_uuid = create_string_buffer(NVML_DEVICE_UUID_BUFFER_SIZE) c_buffer_size = c_uint(NVML_DEVICE_UUID_BUFFER_SIZE) @@ -3075,6 +3771,7 @@ def nvmlVgpuInstanceGetMdevUUID(vgpuInstance): return c_uuid.value +@convertStrBytes def nvmlVgpuInstanceGetVmDriverVersion(vgpuInstance): c_driver_version = 
create_string_buffer(NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE) c_buffer_size = c_uint(NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE) @@ -3092,6 +3789,18 @@ def nvmlVgpuInstanceGetLicenseStatus(vgpuInstance): return c_license_status.value +def nvmlVgpuInstanceGetLicenseInfo_v2(vgpuInstance): + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetLicenseInfo_v2") + c_license_info = c_nvmlVgpuLicenseInfo_t() + ret = fn(vgpuInstance, byref(c_license_info)) + _nvmlCheckReturn(ret) + return c_license_info + + +def nvmlVgpuInstanceGetLicenseInfo(vgpuInstance): + return nvmlVgpuInstanceGetLicenseInfo_v2(vgpuInstance) + + def nvmlVgpuInstanceGetFrameRateLimit(vgpuInstance): c_frl = c_uint(0) fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetFrameRateLimit") @@ -3137,6 +3846,31 @@ def nvmlVgpuInstanceGetFbUsage(vgpuInstance): return c_fb_usage.value +def nvmlVgpuTypeGetCapabilities(vgpuTypeId, capability): + c_cap_result = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlVgpuTypeGetCapabilities") + ret = fn(vgpuTypeId, _nvmlVgpuCapability_t(capability), byref(c_cap_result)) + _nvmlCheckReturn(ret) + return (c_cap_result.value) + + +def nvmlVgpuInstanceGetGpuInstanceId(vgpuInstance): + c_id = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetGpuInstanceId") + ret = fn(vgpuInstance, byref(c_id)) + _nvmlCheckReturn(ret) + return (c_id.value) + + +@convertStrBytes +def nvmlVgpuInstanceGetGpuPciId(vgpuInstance): + c_vgpuPciId = create_string_buffer(NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlVgpuInstanceGetGpuPciId") + ret = fn(vgpuInstance, c_vgpuPciId, byref(c_uint(NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE))) + _nvmlCheckReturn(ret) + return c_vgpuPciId.value + + def nvmlDeviceGetVgpuUtilization(handle, timeStamp): # first call to get the size c_vgpu_count = c_uint(0) @@ -3172,9 +3906,9 @@ def nvmlDeviceGetP2PStatus(device1, device2, p2pIndex): return c_p2pstatus.value -def nvmlDeviceGetGridLicensableFeatures_v3(handle): - c_get_grid_licensable_features = c_nvmlGridLicensableFeatures_v3_t() - fn = _nvmlGetFunctionPointer("nvmlDeviceGetGridLicensableFeatures_v3") +def nvmlDeviceGetGridLicensableFeatures_v4(handle): + c_get_grid_licensable_features = c_nvmlGridLicensableFeatures_v4_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetGridLicensableFeatures_v4") ret = fn(handle, byref(c_get_grid_licensable_features)) _nvmlCheckReturn(ret) @@ -3182,7 +3916,21 @@ def nvmlDeviceGetGridLicensableFeatures_v3(handle): def nvmlDeviceGetGridLicensableFeatures(handle): - return nvmlDeviceGetGridLicensableFeatures_v3(handle) + return nvmlDeviceGetGridLicensableFeatures_v4(handle) + + +def nvmlDeviceGetGspFirmwareVersion(handle, version): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetGspFirmwareVersion") + ret = fn(handle, version) + _nvmlCheckReturn(ret) + return ret + + +def nvmlDeviceGetGspFirmwareMode(handle, isEnabled, defaultMode): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetGspFirmwareMode") + ret = fn(handle, isEnabled, defaultMode) + _nvmlCheckReturn(ret) + return ret def nvmlDeviceGetEncoderCapacity(handle, encoderQueryType): @@ -3426,6 +4174,7 @@ def nvmlGetVgpuCompatibility(vgpuMetadata, pgpuMetadata): return c_vgpuPgpuCompatibility +@convertStrBytes def nvmlDeviceGetPgpuMetadataString(handle): fn = _nvmlGetFunctionPointer("nvmlDeviceGetPgpuMetadataString") c_pgpuMetadata = create_string_buffer(NVML_VGPU_PGPU_METADATA_OPAQUE_DATA_SIZE) @@ -3441,6 +4190,30 @@ def nvmlDeviceGetPgpuMetadataString(handle): return (c_pgpuMetadata.value, c_bufferSize.value) +def 
nvmlDeviceGetVgpuSchedulerLog(handle): + c_vgpu_sched_log = c_nvmlVgpuSchedulerLog_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetVgpuSchedulerLog") + ret = fn(handle, byref(c_vgpu_sched_log)) + _nvmlCheckReturn(ret) + return c_vgpu_sched_log + + +def nvmlDeviceGetVgpuSchedulerState(handle): + c_vgpu_sched_state = c_nvmlVgpuSchedulerGetState_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetVgpuSchedulerState") + ret = fn(handle, byref(c_vgpu_sched_state)) + _nvmlCheckReturn(ret) + return c_vgpu_sched_state + + +def nvmlDeviceGetVgpuSchedulerCapabilities(handle): + c_vgpu_sched_caps = c_nvmlVgpuSchedulerCapabilities_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetVgpuSchedulerCapabilities") + ret = fn(handle, byref(c_vgpu_sched_caps)) + _nvmlCheckReturn(ret) + return c_vgpu_sched_caps + + def nvmlSetVgpuVersion(vgpuVersion): fn = _nvmlGetFunctionPointer("nvmlSetVgpuVersion") ret = fn(byref(vgpuVersion)) @@ -3492,18 +4265,18 @@ def nvmlVgpuInstanceClearAccountingPids(vgpuInstance): return ret -def nvmlGetBlacklistDeviceCount(): +def nvmlGetExcludedDeviceCount(): c_count = c_uint() - fn = _nvmlGetFunctionPointer("nvmlGetBlacklistDeviceCount") + fn = _nvmlGetFunctionPointer("nvmlGetExcludedDeviceCount") ret = fn(byref(c_count)) _nvmlCheckReturn(ret) return c_count.value -def nvmlGetBlacklistDeviceInfoByIndex(index): +def nvmlGetExcludedDeviceInfoByIndex(index): c_index = c_uint(index) - info = c_nvmlBlacklistDeviceInfo_t() - fn = _nvmlGetFunctionPointer("nvmlGetBlacklistDeviceInfoByIndex") + info = c_nvmlExcludedDeviceInfo_t() + fn = _nvmlGetFunctionPointer("nvmlGetExcludedDeviceInfoByIndex") ret = fn(c_index, byref(info)) _nvmlCheckReturn(ret) return info @@ -3534,14 +4307,24 @@ def nvmlDeviceGetMigMode(device): return [c_currentMode.value, c_pendingMode.value] -def nvmlDeviceGetGpuInstanceProfileInfo(device, profile): - c_info = c_nvmlGpuInstanceProfileInfo_t() - fn = _nvmlGetFunctionPointer("nvmlDeviceGetGpuInstanceProfileInfo") +def nvmlDeviceGetGpuInstanceProfileInfo(device, profile, version=2): + if version == 2: + c_info = c_nvmlGpuInstanceProfileInfo_v2_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetGpuInstanceProfileInfoV") + elif version == 1: + c_info = c_nvmlGpuInstanceProfileInfo_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetGpuInstanceProfileInfo") + else: + raise NVMLError(NVML_ERROR_FUNCTION_NOT_FOUND) ret = fn(device, profile, byref(c_info)) _nvmlCheckReturn(ret) return c_info +# Define function alias for the API exposed by NVML +nvmlDeviceGetGpuInstanceProfileInfoV = nvmlDeviceGetGpuInstanceProfileInfo + + def nvmlDeviceGetGpuInstanceRemainingCapacity(device, profileId): c_count = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceGetGpuInstanceRemainingCapacity") @@ -3551,7 +4334,7 @@ def nvmlDeviceGetGpuInstanceRemainingCapacity(device, profileId): def nvmlDeviceGetGpuInstancePossiblePlacements(device, profileId, placementsRef, countRef): - fn = _nvmlGetFunctionPointer("nvmlDeviceGetGpuInstancePossiblePlacements") + fn = _nvmlGetFunctionPointer("nvmlDeviceGetGpuInstancePossiblePlacements_v2") ret = fn(device, profileId, placementsRef, countRef) _nvmlCheckReturn(ret) return ret @@ -3565,6 +4348,14 @@ def nvmlDeviceCreateGpuInstance(device, profileId): return c_instance +def nvmlDeviceCreateGpuInstanceWithPlacement(device, profileId, placement): + c_instance = c_nvmlGpuInstance_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceCreateGpuInstanceWithPlacement") + ret = fn(device, profileId, placement, byref(c_instance)) + _nvmlCheckReturn(ret) + return c_instance + + def 
nvmlGpuInstanceDestroy(gpuInstance): fn = _nvmlGetFunctionPointer("nvmlGpuInstanceDestroy") ret = fn(gpuInstance) @@ -3595,14 +4386,24 @@ def nvmlGpuInstanceGetInfo(gpuInstance): return c_info -def nvmlGpuInstanceGetComputeInstanceProfileInfo(device, profile, engProfile): - c_info = c_nvmlComputeInstanceProfileInfo_t() - fn = _nvmlGetFunctionPointer("nvmlGpuInstanceGetComputeInstanceProfileInfo") +def nvmlGpuInstanceGetComputeInstanceProfileInfo(device, profile, engProfile, version=2): + if version == 2: + c_info = c_nvmlComputeInstanceProfileInfo_v2_t() + fn = _nvmlGetFunctionPointer("nvmlGpuInstanceGetComputeInstanceProfileInfoV") + elif version == 1: + c_info = c_nvmlComputeInstanceProfileInfo_t() + fn = _nvmlGetFunctionPointer("nvmlGpuInstanceGetComputeInstanceProfileInfo") + else: + raise NVMLError(NVML_ERROR_FUNCTION_NOT_FOUND) ret = fn(device, profile, engProfile, byref(c_info)) _nvmlCheckReturn(ret) return c_info +# Define function alias for the API exposed by NVML +nvmlGpuInstanceGetComputeInstanceProfileInfoV = nvmlGpuInstanceGetComputeInstanceProfileInfo + + def nvmlGpuInstanceGetComputeInstanceRemainingCapacity(gpuInstance, profileId): c_count = c_uint() fn = _nvmlGetFunctionPointer("nvmlGpuInstanceGetComputeInstanceRemainingCapacity") @@ -3611,6 +4412,13 @@ def nvmlGpuInstanceGetComputeInstanceRemainingCapacity(gpuInstance, profileId): return c_count.value +def nvmlGpuInstanceGetComputeInstancePossiblePlacements(gpuInstance, profileId, placementsRef, countRef): + fn = _nvmlGetFunctionPointer("nvmlGpuInstanceGetComputeInstancePossiblePlacements") + ret = fn(gpuInstance, profileId, placementsRef, countRef) + _nvmlCheckReturn(ret) + return ret + + def nvmlGpuInstanceCreateComputeInstance(gpuInstance, profileId): c_instance = c_nvmlComputeInstance_t() fn = _nvmlGetFunctionPointer("nvmlGpuInstanceCreateComputeInstance") @@ -3619,6 +4427,14 @@ def nvmlGpuInstanceCreateComputeInstance(gpuInstance, profileId): return c_instance +def nvmlGpuInstanceCreateComputeInstanceWithPlacement(gpuInstance, profileId, placement): + c_instance = c_nvmlComputeInstance_t() + fn = _nvmlGetFunctionPointer("nvmlGpuInstanceCreateComputeInstanceWithPlacement") + ret = fn(gpuInstance, profileId, placement, byref(c_instance)) + _nvmlCheckReturn(ret) + return c_instance + + def nvmlComputeInstanceDestroy(computeInstance): fn = _nvmlGetFunctionPointer("nvmlComputeInstanceDestroy") ret = fn(computeInstance) @@ -3641,14 +4457,18 @@ def nvmlGpuInstanceGetComputeInstanceById(gpuInstance, computeInstanceId): return c_instance -def nvmlComputeInstanceGetInfo(computeInstance): +def nvmlComputeInstanceGetInfo_v2(computeInstance): c_info = c_nvmlComputeInstanceInfo_t() - fn = _nvmlGetFunctionPointer("nvmlComputeInstanceGetInfo") + fn = _nvmlGetFunctionPointer("nvmlComputeInstanceGetInfo_v2") ret = fn(computeInstance, byref(c_info)) _nvmlCheckReturn(ret) return c_info +def nvmlComputeInstanceGetInfo(computeInstance): + return nvmlComputeInstanceGetInfo_v2(computeInstance) + + def nvmlDeviceIsMigDeviceHandle(device): c_isMigDevice = c_uint() fn = _nvmlGetFunctionPointer("nvmlDeviceIsMigDeviceHandle") @@ -3735,3 +4555,412 @@ def nvmlDeviceGetArchitecture(device): ret = fn(device, byref(arch)) _nvmlCheckReturn(ret) return arch.value + + +def nvmlDeviceGetBusType(device): + c_busType = _nvmlBusType_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetBusType") + ret = fn(device, byref(c_busType)) + _nvmlCheckReturn(ret) + return c_busType.value + + +def nvmlDeviceGetIrqNum(device): + c_irqNum = c_uint() + fn = 
_nvmlGetFunctionPointer("nvmlDeviceGetIrqNum") + ret = fn(device, byref(c_irqNum)) + _nvmlCheckReturn(ret) + return c_irqNum.value + + +def nvmlDeviceGetNumGpuCores(device): + c_numCores = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetNumGpuCores") + ret = fn(device, byref(c_numCores)) + _nvmlCheckReturn(ret) + return c_numCores.value + + +def nvmlDeviceGetPowerSource(device): + c_powerSource = _nvmlPowerSource_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPowerSource") + ret = fn(device, byref(c_powerSource)) + _nvmlCheckReturn(ret) + return c_powerSource.value + + +def nvmlDeviceGetMemoryBusWidth(device): + c_memBusWidth = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMemoryBusWidth") + ret = fn(device, byref(c_memBusWidth)) + _nvmlCheckReturn(ret) + return c_memBusWidth.value + + +def nvmlDeviceGetPcieLinkMaxSpeed(device): + c_speed = _nvmlPcieLinkMaxSpeed_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPcieLinkMaxSpeed") + ret = fn(device, byref(c_speed)) + _nvmlCheckReturn(ret) + return c_speed.value + + +def nvmlDeviceGetAdaptiveClockInfoStatus(device): + c_adaptiveClockInfoStatus = _nvmlAdaptiveClockInfoStatus_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetAdaptiveClockInfoStatus") + ret = fn(device, byref(c_adaptiveClockInfoStatus)) + _nvmlCheckReturn(ret) + return c_adaptiveClockInfoStatus.value + + +def nvmlDeviceGetPcieSpeed(device): + c_speed = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPcieSpeed") + ret = fn(device, byref(c_speed)) + _nvmlCheckReturn(ret) + return c_speed.value + + +def nvmlDeviceGetDynamicPstatesInfo(device, c_dynamicpstatesinfo): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetDynamicPstatesInfo") + ret = fn(device, c_dynamicpstatesinfo) + _nvmlCheckReturn(ret) + return ret + + +def nvmlDeviceSetFanSpeed_v2(handle, index, speed): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetFanSpeed_v2") + ret = fn(handle, index, speed) + _nvmlCheckReturn(ret) + return ret + + +def nvmlDeviceGetThermalSettings(device, sensorindex, c_thermalsettings): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetThermalSettings") + ret = fn(device, sensorindex, c_thermalsettings) + _nvmlCheckReturn(ret) + return ret + + +def nvmlDeviceGetMinMaxClockOfPState(device, type, pstate, minClockMHz, maxClockMHz): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMinMaxClockOfPState") + ret = fn(device, _nvmlClockType_t(type), _nvmlClockType_t(pstate), minClockMHz, maxClockMHz) + _nvmlCheckReturn(ret) + return ret + + +def nvmlDeviceGetSupportedPerformanceStates(device): + pstates = [] + c_count = c_uint(NVML_MAX_GPU_PERF_PSTATES) + c_size = sizeof(c_uint) * c_count.value + + # NOTE: use 'c_uint' to represent the size of the nvmlPstate_t enumeration. 
+ pstates_array = _nvmlPstates_t * c_count.value + c_pstates = pstates_array() + + fn = _nvmlGetFunctionPointer("nvmlDeviceGetSupportedPerformanceStates") + ret = fn(device, c_pstates, c_size) + _nvmlCheckReturn(ret) + + for value in c_pstates: + if value != NVML_PSTATE_UNKNOWN: + pstates.append(value) + + return pstates + + +def nvmlDeviceGetGpcClkVfOffset(device): + offset = c_int32() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetGpcClkVfOffset") + ret = fn(device, byref(offset)) + _nvmlCheckReturn(ret) + return offset.value + + +def nvmlDeviceSetGpcClkVfOffset(device, offset): + c_offset = c_int32(offset) + fn = _nvmlGetFunctionPointer("nvmlDeviceSetGpcClkVfOffset") + ret = fn(device, c_offset) + _nvmlCheckReturn(ret) + return ret + + +def nvmlDeviceGetGpcClkMinMaxVfOffset(device, minOffset, maxOffset): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetGpcClkMinMaxVfOffset") + ret = fn(device, minOffset, maxOffset) + _nvmlCheckReturn(ret) + return ret + + +def nvmlDeviceGetMemClkVfOffset(device): + offset = c_int32() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMemClkVfOffset") + ret = fn(device, byref(offset)) + _nvmlCheckReturn(ret) + return offset.value + + +def nvmlDeviceSetMemClkVfOffset(device, offset): + c_offset = c_int32(offset) + fn = _nvmlGetFunctionPointer("nvmlDeviceSetMemClkVfOffset") + ret = fn(device, c_offset) + _nvmlCheckReturn(ret) + return ret + + +def nvmlDeviceGetMemClkMinMaxVfOffset(device, minOffset, maxOffset): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMemClkMinMaxVfOffset") + ret = fn(device, minOffset, maxOffset) + _nvmlCheckReturn(ret) + return ret + + +## GPM ## +######### + +## Enums/defines + +#### GPM Metric Identifiers +NVML_GPM_METRIC_GRAPHICS_UTIL = 1 # Percentage of time any compute/graphics app was active on the GPU. 0.0 - 100.0 +NVML_GPM_METRIC_SM_UTIL = 2 # Percentage of SMs that were busy. 0.0 - 100.0 +NVML_GPM_METRIC_SM_OCCUPANCY = 3 # Percentage of warps that were active vs theoretical maximum. 0.0 - 100.0 +NVML_GPM_METRIC_INTEGER_UTIL = 4 # Percentage of time the GPU's SMs were doing integer operations. 0.0 - 100.0 +NVML_GPM_METRIC_ANY_TENSOR_UTIL = 5 # Percentage of time the GPU's SMs were doing ANY tensor operations. 0.0 - 100.0 +NVML_GPM_METRIC_DFMA_TENSOR_UTIL = 6 # Percentage of time the GPU's SMs were doing DFMA tensor operations. 0.0 - 100.0 +NVML_GPM_METRIC_HMMA_TENSOR_UTIL = 7 # Percentage of time the GPU's SMs were doing HMMA tensor operations. 0.0 - 100.0 +NVML_GPM_METRIC_IMMA_TENSOR_UTIL = 9 # Percentage of time the GPU's SMs were doing IMMA tensor operations. 0.0 - 100.0 +NVML_GPM_METRIC_DRAM_BW_UTIL = 10 # Percentage of DRAM bw used vs theoretical maximum. 0.0 - 100.0 +NVML_GPM_METRIC_FP64_UTIL = 11 # Percentage of time the GPU's SMs were doing non-tensor FP64 math. 0.0 - 100.0 +NVML_GPM_METRIC_FP32_UTIL = 12 # Percentage of time the GPU's SMs were doing non-tensor FP32 math. 0.0 - 100.0 +NVML_GPM_METRIC_FP16_UTIL = 13 # Percentage of time the GPU's SMs were doing non-tensor FP16 math. 0.0 - 100.0 +NVML_GPM_METRIC_PCIE_TX_PER_SEC = 20 # PCIe traffic from this GPU in MiB/sec +NVML_GPM_METRIC_PCIE_RX_PER_SEC = 21 # PCIe traffic to this GPU in MiB/sec +NVML_GPM_METRIC_NVDEC_0_UTIL = 30 # Percent utilization of NVDEC 0. 0.0 - 100.0 +NVML_GPM_METRIC_NVDEC_1_UTIL = 31 # Percent utilization of NVDEC 1. 0.0 - 100.0 +NVML_GPM_METRIC_NVDEC_2_UTIL = 32 # Percent utilization of NVDEC 2. 0.0 - 100.0 +NVML_GPM_METRIC_NVDEC_3_UTIL = 33 # Percent utilization of NVDEC 3. 0.0 - 100.0 +NVML_GPM_METRIC_NVDEC_4_UTIL = 34 # Percent utilization of NVDEC 4. 
0.0 - 100.0 +NVML_GPM_METRIC_NVDEC_5_UTIL = 35 # Percent utilization of NVDEC 5. 0.0 - 100.0 +NVML_GPM_METRIC_NVDEC_6_UTIL = 36 # Percent utilization of NVDEC 6. 0.0 - 100.0 +NVML_GPM_METRIC_NVDEC_7_UTIL = 37 # Percent utilization of NVDEC 7. 0.0 - 100.0 +NVML_GPM_METRIC_NVJPG_0_UTIL = 40 # Percent utilization of NVJPG 0. 0.0 - 100.0 +NVML_GPM_METRIC_NVJPG_1_UTIL = 41 # Percent utilization of NVJPG 1. 0.0 - 100.0 +NVML_GPM_METRIC_NVJPG_2_UTIL = 42 # Percent utilization of NVJPG 2. 0.0 - 100.0 +NVML_GPM_METRIC_NVJPG_3_UTIL = 43 # Percent utilization of NVJPG 3. 0.0 - 100.0 +NVML_GPM_METRIC_NVJPG_4_UTIL = 44 # Percent utilization of NVJPG 4. 0.0 - 100.0 +NVML_GPM_METRIC_NVJPG_5_UTIL = 45 # Percent utilization of NVJPG 5. 0.0 - 100.0 +NVML_GPM_METRIC_NVJPG_6_UTIL = 46 # Percent utilization of NVJPG 6. 0.0 - 100.0 +NVML_GPM_METRIC_NVJPG_7_UTIL = 47 # Percent utilization of NVJPG 7. 0.0 - 100.0 +NVML_GPM_METRIC_NVOFA_0_UTIL = 50 # Percent utilization of NVOFA 0. 0.0 - 100.0 +NVML_GPM_METRIC_NVLINK_TOTAL_RX_PER_SEC = 60 # NvLink read bandwidth for all links in MiB/sec +NVML_GPM_METRIC_NVLINK_TOTAL_TX_PER_SEC = 61 # NvLink write bandwidth for all links in MiB/sec +NVML_GPM_METRIC_NVLINK_L0_RX_PER_SEC = 62 # NvLink read bandwidth for link 0 in MiB/sec +NVML_GPM_METRIC_NVLINK_L0_TX_PER_SEC = 63 # NvLink write bandwidth for link 0 in MiB/sec +NVML_GPM_METRIC_NVLINK_L1_RX_PER_SEC = 64 # NvLink read bandwidth for link 1 in MiB/sec +NVML_GPM_METRIC_NVLINK_L1_TX_PER_SEC = 65 # NvLink write bandwidth for link 1 in MiB/sec +NVML_GPM_METRIC_NVLINK_L2_RX_PER_SEC = 66 # NvLink read bandwidth for link 2 in MiB/sec +NVML_GPM_METRIC_NVLINK_L2_TX_PER_SEC = 67 # NvLink write bandwidth for link 2 in MiB/sec +NVML_GPM_METRIC_NVLINK_L3_RX_PER_SEC = 68 # NvLink read bandwidth for link 3 in MiB/sec +NVML_GPM_METRIC_NVLINK_L3_TX_PER_SEC = 69 # NvLink write bandwidth for link 3 in MiB/sec +NVML_GPM_METRIC_NVLINK_L4_RX_PER_SEC = 70 # NvLink read bandwidth for link 4 in MiB/sec +NVML_GPM_METRIC_NVLINK_L4_TX_PER_SEC = 71 # NvLink write bandwidth for link 4 in MiB/sec +NVML_GPM_METRIC_NVLINK_L5_RX_PER_SEC = 72 # NvLink read bandwidth for link 5 in MiB/sec +NVML_GPM_METRIC_NVLINK_L5_TX_PER_SEC = 73 # NvLink write bandwidth for link 5 in MiB/sec +NVML_GPM_METRIC_NVLINK_L6_RX_PER_SEC = 74 # NvLink read bandwidth for link 6 in MiB/sec +NVML_GPM_METRIC_NVLINK_L6_TX_PER_SEC = 75 # NvLink write bandwidth for link 6 in MiB/sec +NVML_GPM_METRIC_NVLINK_L7_RX_PER_SEC = 76 # NvLink read bandwidth for link 7 in MiB/sec +NVML_GPM_METRIC_NVLINK_L7_TX_PER_SEC = 77 # NvLink write bandwidth for link 7 in MiB/sec +NVML_GPM_METRIC_NVLINK_L8_RX_PER_SEC = 78 # NvLink read bandwidth for link 8 in MiB/sec +NVML_GPM_METRIC_NVLINK_L8_TX_PER_SEC = 79 # NvLink write bandwidth for link 8 in MiB/sec +NVML_GPM_METRIC_NVLINK_L9_RX_PER_SEC = 80 # NvLink read bandwidth for link 9 in MiB/sec +NVML_GPM_METRIC_NVLINK_L9_TX_PER_SEC = 81 # NvLink write bandwidth for link 9 in MiB/sec +NVML_GPM_METRIC_NVLINK_L10_RX_PER_SEC = 82 # NvLink read bandwidth for link 10 in MiB/sec +NVML_GPM_METRIC_NVLINK_L10_TX_PER_SEC = 83 # NvLink write bandwidth for link 10 in MiB/sec +NVML_GPM_METRIC_NVLINK_L11_RX_PER_SEC = 84 # NvLink read bandwidth for link 11 in MiB/sec +NVML_GPM_METRIC_NVLINK_L11_TX_PER_SEC = 85 # NvLink write bandwidth for link 11 in MiB/sec +NVML_GPM_METRIC_NVLINK_L12_RX_PER_SEC = 86 # NvLink read bandwidth for link 12 in MiB/sec +NVML_GPM_METRIC_NVLINK_L12_TX_PER_SEC = 87 # NvLink write bandwidth for link 12 in MiB/sec +NVML_GPM_METRIC_NVLINK_L13_RX_PER_SEC = 
88 # NvLink read bandwidth for link 13 in MiB/sec +NVML_GPM_METRIC_NVLINK_L13_TX_PER_SEC = 89 # NvLink write bandwidth for link 13 in MiB/sec +NVML_GPM_METRIC_NVLINK_L14_RX_PER_SEC = 90 # NvLink read bandwidth for link 14 in MiB/sec +NVML_GPM_METRIC_NVLINK_L14_TX_PER_SEC = 91 # NvLink write bandwidth for link 14 in MiB/sec +NVML_GPM_METRIC_NVLINK_L15_RX_PER_SEC = 92 # NvLink read bandwidth for link 15 in MiB/sec +NVML_GPM_METRIC_NVLINK_L15_TX_PER_SEC = 93 # NvLink write bandwidth for link 15 in MiB/sec +NVML_GPM_METRIC_NVLINK_L16_RX_PER_SEC = 94 # NvLink read bandwidth for link 16 in MiB/sec +NVML_GPM_METRIC_NVLINK_L16_TX_PER_SEC = 95 # NvLink write bandwidth for link 16 in MiB/sec +NVML_GPM_METRIC_NVLINK_L17_RX_PER_SEC = 96 # NvLink read bandwidth for link 17 in MiB/sec +NVML_GPM_METRIC_NVLINK_L17_TX_PER_SEC = 97 # NvLink write bandwidth for link 17 in MiB/sec +NVML_GPM_METRIC_MAX = 98 + + +## Structs + +class c_nvmlUnitInfo_t(_PrintableStructure): + _fields_ = [ + ('name', c_char * 96), + ('id', c_char * 96), + ('serial', c_char * 96), + ('firmwareVersion', c_char * 96), + ] + + +class struct_c_nvmlGpmSample_t(Structure): + pass # opaque handle + + +c_nvmlGpmSample_t = POINTER(struct_c_nvmlGpmSample_t) + + +class c_metricInfo_t(Structure): + _fields_ = [ + ("shortName", c_char_p), + ("longName", c_char_p), + ("unit", c_char_p), + ] + + +class c_nvmlGpmMetric_t(_PrintableStructure): + _fields_ = [ + ('metricId', c_uint), + ('nvmlReturn', _nvmlReturn_t), + ('value', c_double), + ('metricInfo', c_metricInfo_t) + ] + + +class c_nvmlGpmMetricsGet_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('numMetrics', c_uint), + ('sample1', c_nvmlGpmSample_t), + ('sample2', c_nvmlGpmSample_t), + ('metrics', c_nvmlGpmMetric_t * NVML_GPM_METRIC_MAX) + ] + + +NVML_GPM_METRICS_GET_VERSION = 1 + + +class c_nvmlGpmSupport_t(_PrintableStructure): + _fields_ = [ + ('version', c_uint), + ('isSupportedDevice', c_uint), + ] + + +NVML_GPM_SUPPORT_VERSION = 1 + + +## Functions + +def nvmlGpmMetricsGet(metricsGet): + fn = _nvmlGetFunctionPointer("nvmlGpmMetricsGet") + ret = fn(byref(metricsGet)) + _nvmlCheckReturn(ret) + return metricsGet + + +def nvmlGpmSampleFree(gpmSample): + fn = _nvmlGetFunctionPointer("nvmlGpmSampleFree") + ret = fn(gpmSample) + _nvmlCheckReturn(ret) + return + + +def nvmlGpmSampleAlloc(): + gpmSample = c_nvmlGpmSample_t() + fn = _nvmlGetFunctionPointer("nvmlGpmSampleAlloc") + ret = fn(byref(gpmSample)) + _nvmlCheckReturn(ret) + return gpmSample + + +def nvmlGpmSampleGet(device, gpmSample): + fn = _nvmlGetFunctionPointer("nvmlGpmSampleGet") + ret = fn(device, gpmSample) + _nvmlCheckReturn(ret) + return gpmSample + + +def nvmlGpmMigSampleGet(device, gpuInstanceId, gpmSample): + fn = _nvmlGetFunctionPointer("nvmlGpmMigSampleGet") + ret = fn(device, gpuInstanceId, gpmSample) + _nvmlCheckReturn(ret) + return gpmSample + + +def nvmlGpmQueryDeviceSupport(device): + gpmSupport = c_nvmlGpmSupport_t() + gpmSupport.version = NVML_GPM_SUPPORT_VERSION + fn = _nvmlGetFunctionPointer("nvmlGpmQueryDeviceSupport") + ret = fn(device, byref(gpmSupport)) + _nvmlCheckReturn(ret) + return gpmSupport + + +## CCU ## +######### + +## Enums/defines + +#### CCU Stream State +NVML_COUNTER_COLLECTION_UNIT_STREAM_STATE_DISABLE = 0 +NVML_COUNTER_COLLECTION_UNIT_STREAM_STATE_ENABLE = 1 + + +## Functions + +def nvmlDeviceCcuSetStreamState(device, state): + c_state = c_uint(state) + fn = _nvmlGetFunctionPointer("nvmlDeviceCcuSetStreamState") + ret = fn(device, c_state) + _nvmlCheckReturn(ret) + return ret + + 
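[Reviewer note] Since the GPM additions above span several structs and entry points, here is a minimal end-to-end usage sketch (illustrative only, not part of the patch). It assumes `nvmlInit()` has already been called and `handle` was obtained via `nvmlDeviceGetHandleByIndex(0)`; every other name is one of the bindings defined in this module. GPM metrics are computed from the delta between two samples:

    import time

    if nvmlGpmQueryDeviceSupport(handle).isSupportedDevice:
        sample1 = nvmlGpmSampleAlloc()
        sample2 = nvmlGpmSampleAlloc()
        nvmlGpmSampleGet(handle, sample1)
        time.sleep(1.0)  # metrics are averaged over the window between the two samples
        nvmlGpmSampleGet(handle, sample2)

        metrics_get = c_nvmlGpmMetricsGet_t()
        metrics_get.version = NVML_GPM_METRICS_GET_VERSION
        metrics_get.numMetrics = 1
        metrics_get.sample1 = sample1
        metrics_get.sample2 = sample2
        metrics_get.metrics[0].metricId = NVML_GPM_METRIC_SM_UTIL
        nvmlGpmMetricsGet(metrics_get)
        print(metrics_get.metrics[0].value)  # SM utilization, 0.0 - 100.0

        nvmlGpmSampleFree(sample1)
        nvmlGpmSampleFree(sample2)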
+def nvmlDeviceCcuGetStreamState(device): + c_state = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceCcuGetStreamState") + ret = fn(device, byref(c_state)) + _nvmlCheckReturn(ret) + return c_state.value + + +# Low Power Structure and Function + +class c_nvmlNvLinkPowerThres_t(Structure): + _fields_ = [ + ("lowPwrThreshold", c_uint), + ] + + +def nvmlDeviceSetNvLinkDeviceLowPowerThreshold(device, l1threshold): + c_info = c_nvmlNvLinkPowerThres_t() + c_info.lowPwrThreshold = l1threshold + fn = _nvmlGetFunctionPointer("nvmlDeviceSetNvLinkDeviceLowPowerThreshold") + ret = fn(device, byref(c_info)) + _nvmlCheckReturn(ret) + return ret + + +_nvmlGpuFabricState_t = c_uint +NVML_GPU_FABRIC_STATE_NOT_SUPPORTED = 0 +NVML_GPU_FABRIC_STATE_NOT_STARTED = 1 +NVML_GPU_FABRIC_STATE_IN_PROGRESS = 2 +NVML_GPU_FABRIC_STATE_COMPLETED = 3 + + +class c_nvmlGpuFabricInfo_t(_PrintableStructure): + _fields_ = [ + ("clusterUuid", c_char * NVML_DEVICE_UUID_BUFFER_SIZE), + ("status", _nvmlReturn_t), + ("partitionId", c_uint32), + ("state", _nvmlGpuFabricState_t) + ] + + +def nvmlDeviceGetGpuFabricInfo(device, gpuFabricInfo): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetGpuFabricInfo") + ret = fn(device, gpuFabricInfo) + _nvmlCheckReturn(ret) + return ret \ No newline at end of file From f6ad5e6c064ab61ff7cd85d7e59fcadd7fdeace3 Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Thu, 17 Aug 2023 23:28:03 +0300 Subject: [PATCH 19/21] Fix PyPI Downloads badge --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c5e29140..9145295d 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@
Experiment Manager, MLOps and Data-Management** [![GitHub license](https://img.shields.io/github/license/allegroai/clearml.svg)](https://img.shields.io/github/license/allegroai/clearml.svg) [![PyPI pyversions](https://img.shields.io/pypi/pyversions/clearml.svg)](https://img.shields.io/pypi/pyversions/clearml.svg) [![PyPI version shields.io](https://img.shields.io/pypi/v/clearml.svg)](https://pypi.org/project/clearml/) [![Conda version shields.io](https://img.shields.io/conda/v/clearml/clearml)](https://anaconda.org/clearml/clearml) [![Optuna](https://img.shields.io/badge/Optuna-integrated-blue)](https://optuna.org)
-[![PyPI Downloads](https://pepy.tech/badge/clearml/month)](https://pypi.org/project/clearml/) [![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/allegroai)](https://artifacthub.io/packages/search?repo=allegroai) [![Youtube](https://img.shields.io/badge/ClearML-DD0000?logo=youtube&logoColor=white)](https://www.youtube.com/c/clearml) [![Slack Channel](https://img.shields.io/badge/slack-%23clearml--community-blueviolet?logo=slack)](https://joinslack.clear.ml) [![Signup](https://img.shields.io/badge/Clear%7CML-Signup-brightgreen)](https://app.clear.ml)
+[![PyPI Downloads](https://static.pepy.tech/badge/clearml/month)](https://pypi.org/project/clearml/) [![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/allegroai)](https://artifacthub.io/packages/search?repo=allegroai) [![Youtube](https://img.shields.io/badge/ClearML-DD0000?logo=youtube&logoColor=white)](https://www.youtube.com/c/clearml) [![Slack Channel](https://img.shields.io/badge/slack-%23clearml--community-blueviolet?logo=slack)](https://joinslack.clear.ml) [![Signup](https://img.shields.io/badge/Clear%7CML-Signup-brightgreen)](https://app.clear.ml)

From 0b521b00a68a298e0cbe0adb52e3908c03e3aa93 Mon Sep 17 00:00:00 2001
From: Alex Burlacu
Date: Mon, 21 Aug 2023 14:06:59 +0300
Subject: [PATCH 20/21] Use os.register_at_fork instead of monkey patching fork for python > 3.6

---
 clearml/binding/environ_bind.py | 114 +++++++++++++++++++-------------
 1 file changed, 69 insertions(+), 45 deletions(-)

diff --git a/clearml/binding/environ_bind.py b/clearml/binding/environ_bind.py
index 605c2949..e5167368 100644
--- a/clearml/binding/environ_bind.py
+++ b/clearml/binding/environ_bind.py
@@ -89,6 +89,7 @@ class SimpleQueueWrapper(object):
 
 class PatchOsFork(object):
     _original_fork = None
+    _registered_fork_callbacks = False
     _current_task = None
     _original_process_run = None
 
@@ -104,13 +105,20 @@ class PatchOsFork(object):
         # noinspection PyBroadException
         try:
             # only once
-            if cls._original_fork:
+            if cls._registered_fork_callbacks or cls._original_fork:
                 return
-            if six.PY2:
-                cls._original_fork = staticmethod(os.fork)
-            else:
-                cls._original_fork = os.fork
-            os.fork = cls._patched_fork
+            try:
+                os.register_at_fork(before=PatchOsFork._fork_callback_before,
+                                    after_in_child=PatchOsFork._fork_callback_after_child)
+                cls._registered_fork_callbacks = True
+            except Exception:
+                # python < 3.7: os.register_at_fork is not available, fall back to patching os.fork
+                if six.PY2:
+                    cls._original_fork = staticmethod(os.fork)
+                else:
+                    cls._original_fork = os.fork
+                os.fork = cls._patched_fork
+
         except Exception:
             pass

@@ -182,10 +190,9 @@ class PatchOsFork(object):
             pass
 
     @staticmethod
-    def _patched_fork(*args, **kwargs):
+    def _fork_callback_before():
        if not PatchOsFork._current_task:
-            return PatchOsFork._original_fork(*args, **kwargs)
-
+            return
        from ..task import Task

        # ensure deferred is done, but never try to generate a Task object
@@ -195,46 +202,63 @@ class PatchOsFork(object):
        # noinspection PyProtectedMember
        Task._wait_for_deferred(task)

+    @staticmethod
+    def _fork_callback_after_child():
+        if not PatchOsFork._current_task:
+            return
+
+        from ..task import Task
+
+        # force creating a Task
+        task = Task.current_task()
+        if not task:
+            return
+
+        PatchOsFork._current_task = task
+        # # Hack: now make sure we set up the reporter threads (Log+Reporter)
+        # noinspection PyProtectedMember
+        if not bool(task._report_subprocess_enabled):
+            BackgroundMonitor.start_all(task=task)
+
+        # The signal handler method is not enough; for the time being, we have 
both
+        # even though it makes little sense
+        # # if we got here patch the os._exit of our instance to call us
+        def _at_exit_callback(*a_args, **a_kwargs):
+            # just make sure we flush the internal state (the at-exit hook caught by the external signal does the rest);
+            # in theory we should not have to do any of that, but for some reason if we do not,
+            # the signal is never caught by the signal callbacks, not sure why...
+            sleep(0.1)
+            # Since at-exit handlers do not work on forked processes, we have to manually call them here
+            if task:
+                try:
+                    # not to worry, there is a double _at_exit protection implemented inside task._at_exit()
+                    # noinspection PyProtectedMember
+                    task._at_exit()
+                except:  # noqa
+                    pass
+
+            # noinspection PyProtectedMember, PyUnresolvedReferences
+            return os._org_exit(*a_args, **a_kwargs)
+
+        if not hasattr(os, '_org_exit'):
+            # noinspection PyProtectedMember, PyUnresolvedReferences
+            os._org_exit = os._exit
+
+        os._exit = _at_exit_callback
+
+
+    @staticmethod
+    def _patched_fork(*args, **kwargs):
+        if not PatchOsFork._current_task:
+            return PatchOsFork._original_fork(*args, **kwargs)
+
+        PatchOsFork._fork_callback_before()
+
         ret = PatchOsFork._original_fork(*args, **kwargs)
         if not PatchOsFork._current_task:
             return ret
 
         # Make sure the new process stdout is logged
         if not ret:
-            # force creating a Task
-            task = Task.current_task()
-            if not task:
-                return ret
-
-            PatchOsFork._current_task = task
-            # # Hack: now make sure we setup the reporter threads (Log+Reporter)
-            # noinspection PyProtectedMember
-            if not bool(task._report_subprocess_enabled):
-                BackgroundMonitor.start_all(task=task)
-
-            # The signal handler method is Not enough, for the time being, we have both
-            # even though it makes little sense
-            # # if we got here patch the os._exit of our instance to call us
-            def _at_exit_callback(*a_args, **a_kwargs):
-                # just make sure we flush the internal state (the at exist caught by the external signal does the rest
-                # in theory we should not have to do any of that, but for some reason if we do not
-                # the signal is never caught by the signal call backs, not sure why....
-                sleep(0.1)
-                # Since at_exist handlers do not work on forked processes, we have to manually call them here
-                if task:
-                    try:
-                        # not to worry there is a double _at_exit protection implemented inside task._at_exit()
-                        # noinspection PyProtectedMember
-                        task._at_exit()
-                    except:  # noqa
-                        pass
-
-                # noinspection PyProtectedMember, PyUnresolvedReferences
-                return os._org_exit(*a_args, **a_kwargs)
-
-            if not hasattr(os, '_org_exit'):
-                # noinspection PyProtectedMember, PyUnresolvedReferences
-                os._org_exit = os._exit
-
-            os._exit = _at_exit_callback
+            PatchOsFork._fork_callback_after_child()
         return ret

From 095997492008f332a063ccc0086b4aac000c6218 Mon Sep 17 00:00:00 2001
From: Alex Burlacu
Date: Thu, 24 Aug 2023 13:55:05 +0300
Subject: [PATCH 21/21] Fix fastAI binding not reporting scalars if tensorboard is installed

---
 clearml/binding/frameworks/fastai_bind.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/clearml/binding/frameworks/fastai_bind.py b/clearml/binding/frameworks/fastai_bind.py
index 82bb2bd0..9f113e6b 100644
--- a/clearml/binding/frameworks/fastai_bind.py
+++ b/clearml/binding/frameworks/fastai_bind.py
@@ -8,6 +8,7 @@ from . 
import _patched_call from .tensorflow_bind import WeightsGradientHistHelper from ..import_bind import PostImportHookPatching from ...debugging.log import LoggerRoot +from .tensorflow_bind import IsTensorboardInit try: import fastai @@ -51,7 +52,7 @@ class PatchFastaiV1(object): @staticmethod def patch_model_callback(): # if you have tensorboard, we assume you use TensorboardLogger, which we catch, so no need to patch. - if "tensorboard" in sys.modules: + if "tensorboard" in sys.modules and IsTensorboardInit.tensorboard_used(): return try: @@ -191,7 +192,7 @@ class PatchFastaiV2(object): @staticmethod def patch_model_callback(): - if "tensorboard" in sys.modules: + if "tensorboard" in sys.modules and IsTensorboardInit.tensorboard_used(): return # noinspection PyBroadException
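
[Reviewer note] A hypothetical illustration of the failure mode this last patch fixes: having `tensorboard` merely importable (for example, pulled in transitively by another package) used to disable the fastai binding even when no TensorBoard logger was ever created, so no scalars were reported. The added `IsTensorboardInit.tensorboard_used()` check tells the two situations apart:

    import sys
    import tensorboard  # noqa -- present on the system but never actually used
    from clearml.binding.frameworks.tensorflow_bind import IsTensorboardInit

    # Old guard: True as soon as the package is imported, so patching was skipped
    print("tensorboard" in sys.modules)

    # New guard: stays False until a TensorBoard writer is actually initialized,
    # so the fastai callback is still patched and scalars are reported
    print("tensorboard" in sys.modules and IsTensorboardInit.tensorboard_used())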