diff --git a/README.md b/README.md index 8a891232..2f6f441c 100644 --- a/README.md +++ b/README.md @@ -4,8 +4,8 @@ Clear|ML

-**[ClearML](https://clear.ml) - Auto-Magical Suite of tools to streamline your ML workflow -
Experiment Manager, MLOps and Data-Management** +**[ClearML](https://clear.ml) - Auto-Magical Suite of tools to streamline your AI workflow +
Experiment Manager, MLOps/LLMOps and Data-Management** [![GitHub license](https://img.shields.io/github/license/allegroai/clearml.svg)](https://img.shields.io/github/license/allegroai/clearml.svg) [![PyPI pyversions](https://img.shields.io/pypi/pyversions/clearml.svg)](https://img.shields.io/pypi/pyversions/clearml.svg) [![PyPI version shields.io](https://img.shields.io/pypi/v/clearml.svg)](https://pypi.org/project/clearml/) [![Conda version shields.io](https://img.shields.io/conda/v/clearml/clearml)](https://anaconda.org/clearml/clearml) [![Optuna](https://img.shields.io/badge/Optuna-integrated-blue)](https://optuna.org)
[![PyPI Downloads](https://static.pepy.tech/badge/clearml/month)](https://pypi.org/project/clearml/) [![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/allegroai)](https://artifacthub.io/packages/search?repo=allegroai) [![Youtube](https://img.shields.io/badge/ClearML-DD0000?logo=youtube&logoColor=white)](https://www.youtube.com/c/clearml) [![Slack Channel](https://img.shields.io/badge/slack-%23clearml--community-blueviolet?logo=slack)](https://joinslack.clear.ml) [![Signup](https://img.shields.io/badge/Clear%7CML-Signup-brightgreen)](https://app.clear.ml) @@ -19,14 +19,15 @@ ClearML is a ML/DL development and production suite. It contains FIVE main modules: - [Experiment Manager](#clearml-experiment-manager) - Automagical experiment tracking, environments and results -- [MLOps](https://github.com/allegroai/clearml-agent) - Orchestration, Automation & Pipelines solution for ML/DL jobs (K8s / Cloud / bare-metal) +- [MLOps / LLMOps](https://github.com/allegroai/clearml-agent) - Orchestration, Automation & Pipelines solution for ML/DL/GenAI jobs (Kubernetes / Cloud / bare-metal) - [Data-Management](https://github.com/allegroai/clearml/blob/master/docs/datasets.md) - Fully differentiable data management & version control solution on top of object-storage (S3 / GS / Azure / NAS) - [Model-Serving](https://github.com/allegroai/clearml-serving) - *cloud-ready* Scalable model serving solution! - **Deploy new model endpoints in under 5 minutes** - Includes optimized GPU serving support backed by Nvidia-Triton - **with out-of-the-box Model Monitoring** -- **NEW** :fire: [Reports](https://clear.ml/docs/latest/docs/webapp/webapp_reports) - Create and share rich MarkDown documents supporting embeddable online content +- [Reports](https://clear.ml/docs/latest/docs/webapp/webapp_reports) - Create and share rich MarkDown documents supporting embeddable online content +- **NEW** :fire: [Orchestration Dashboard](https://clear.ml/docs/latest/docs/webapp/webapp_orchestration_dash/) - Live rich dashboard for your entire compute cluster (Cloud / Kubernetes / On-Prem) Instrumenting these components is the **ClearML-server**, see [Self-Hosting](https://clear.ml/docs/latest/docs/deploying_clearml/clearml_server) & [Free tier Hosting](https://app.clear.ml) @@ -141,7 +142,7 @@ The ClearML run-time components: - [clearml-session](https://github.com/allegroai/clearml-session) - **Launch remote JupyterLab / VSCode-server inside any docker, on Cloud/On-Prem machines** - [clearml-task](https://github.com/allegroai/clearml/blob/master/docs/clearml-task.md) - Run any codebase on remote machines with full remote logging of Tensorboard, Matplotlib & Console outputs - [clearml-data](https://github.com/allegroai/clearml/blob/master/docs/datasets.md) - **CLI for managing and versioning your datasets, including creating / uploading / downloading of data from S3/GS/Azure/NAS** -- [AWS Auto-Scaler](https://clear.ml/docs/latest/docs/guides/services/aws_autoscaler) - Automatically spin EC2 instances based on your workloads with preconfigured budget! No need for K8s! +- [AWS Auto-Scaler](https://clear.ml/docs/latest/docs/guides/services/aws_autoscaler) - Automatically spin EC2 instances based on your workloads with preconfigured budget! No need for Kubernetes!
- [Hyper-Parameter Optimization](https://clear.ml/docs/latest/docs/guides/optimization/hyper-parameter-optimization/examples_hyperparam_opt) - Optimize any code with black-box approach and state-of-the-art Bayesian optimization algorithms - [Automation Pipeline](https://clear.ml/docs/latest/docs/guides/pipeline/pipeline_controller) - Build pipelines based on existing experiments / jobs, supports building pipelines of pipelines! - [Slack Integration](https://clear.ml/docs/latest/docs/guides/services/slack_alerts) - Report experiments progress / failure directly to Slack (fully customizable!) diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 00000000..1ac0d525 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,5 @@ +## Reporting Security Issues + +Thanks for taking the time to make ClearML more secure! + +To keep the discussion secure, please send your report to [security@clear.ml](mailto:security@clear.ml). \ No newline at end of file diff --git a/clearml/automation/controller.py b/clearml/automation/controller.py index 26cb9516..8326b399 100644 --- a/clearml/automation/controller.py +++ b/clearml/automation/controller.py @@ -142,7 +142,7 @@ class PipelineController(object): try: self.job.task.reload() self.job_ended = self.job_started + self.job.task.data.active_duration - except Exception as e: + except Exception: pass def set_job_started(self): @@ -154,7 +154,6 @@ class PipelineController(object): except Exception: pass - def __init__( self, name, # type: str @@ -558,7 +557,7 @@ class PipelineController(object): previous_status # type: str ): pass - 	:param output_uri: The storage / output url for this step. This is the default location for output models and other artifacts. Check Task.init reference docs for more info (output_uri is a parameter).
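The hunk above documents a per-step `output_uri` that mirrors the `Task.init` argument. A minimal sketch of a step-level override, assuming `add_function_step` accepts the documented `output_uri` parameter (pipeline, step, and bucket names here are illustrative, not taken from this diff):

```python
from clearml import PipelineController


def train_step(dataset_id):
    # Placeholder body; each function step is executed as its own Task.
    return dataset_id


pipe = PipelineController(name="demo-pipeline", project="examples", version="1.0.0")
# Models and artifacts produced by this step are uploaded to the given
# storage target instead of the pipeline-wide default.
pipe.add_function_step(
    name="train",
    function=train_step,
    function_kwargs=dict(dataset_id="my-dataset-id"),  # illustrative value
    output_uri="s3://my-bucket/pipeline-artifacts",  # assumed bucket
)
pipe.start_locally(run_pipeline_steps_locally=True)
```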
@@ -1631,7 +1630,7 @@ class PipelineController(object): 'target_project': self._target_project, } pipeline_dag = self._serialize() - + # serialize pipeline state if self._task and self._auto_connect_task: # check if we are either running locally or that we are running remotely, @@ -2733,7 +2732,7 @@ class PipelineController(object): if node_failed and self._abort_running_steps_on_failure and not node.continue_on_fail: nodes_failed_stop_pipeline.append(node.name) elif node.timeout: - node.set_job_started() + started = node.job.task.data.started if (datetime.now().astimezone(started.tzinfo) - started).total_seconds() > node.timeout: node.job.abort() completed_jobs.append(j) diff --git a/clearml/backend_api/api_proxy.py b/clearml/backend_api/api_proxy.py index 22281cfd..888a27b0 100644 --- a/clearml/backend_api/api_proxy.py +++ b/clearml/backend_api/api_proxy.py @@ -32,7 +32,10 @@ class ApiServiceProxy(object): ) # get the most advanced service version that supports our api - version = [str(v) for v in ApiServiceProxy._available_versions if Session.check_min_api_version(v)][-1] + version = [ + str(v) for v in ApiServiceProxy._available_versions + if Session.check_min_api_version(v) + ][-1] Session.api_version = version self.__dict__["__wrapped_version__"] = Session.api_version name = ".v{}.{}".format( diff --git a/clearml/backend_api/services/v2_23/queues.py b/clearml/backend_api/services/v2_23/queues.py index c22ccd4f..1df26606 100644 --- a/clearml/backend_api/services/v2_23/queues.py +++ b/clearml/backend_api/services/v2_23/queues.py @@ -490,7 +490,7 @@ class Queue(NonStrictDataModel): self._property_metadata = None return - self.assert_isinstance(value, "metadata", (dict,), is_array=True) + self.assert_isinstance(value, "metadata", (dict,)) self._property_metadata = value diff --git a/clearml/backend_api/session/defs.py b/clearml/backend_api/session/defs.py index cdbbcf3c..a477ebd9 100644 --- a/clearml/backend_api/session/defs.py +++ b/clearml/backend_api/session/defs.py @@ -40,7 +40,6 @@ for a very long time for a non-responding or mis-configured server """ ENV_API_EXTRA_RETRY_CODES = EnvEntry("CLEARML_API_EXTRA_RETRY_CODES") - ENV_FORCE_MAX_API_VERSION = EnvEntry("CLEARML_FORCE_MAX_API_VERSION", type=str) diff --git a/clearml/backend_api/session/request.py b/clearml/backend_api/session/request.py index 2a85cb0c..9004127a 100644 --- a/clearml/backend_api/session/request.py +++ b/clearml/backend_api/session/request.py @@ -1,7 +1,14 @@ import abc -import jsonschema import six +from jsonschema.exceptions import FormatError, SchemaError, ValidationError + +try: + # Since ``referencing`` only supports Python >= 3.8, this try-except block maintains + # support for earlier Python versions.
+ from referencing.exceptions import Unresolvable +except ImportError: + from jsonschema.exceptions import RefResolutionError as Unresolvable from .apimodel import ApiModel from .datamodel import DataModel @@ -39,8 +46,7 @@ class BatchRequest(Request): _batched_request_cls = abc.abstractproperty() - _schema_errors = (jsonschema.SchemaError, jsonschema.ValidationError, jsonschema.FormatError, - jsonschema.RefResolutionError) + _schema_errors = (SchemaError, ValidationError, FormatError, Unresolvable) def __init__(self, requests, validate_requests=False, allow_raw_requests=True, **kwargs): super(BatchRequest, self).__init__(**kwargs) @@ -70,8 +76,7 @@ class BatchRequest(Request): for i, req in enumerate(self.requests): try: req.validate() - except (jsonschema.SchemaError, jsonschema.ValidationError, - jsonschema.FormatError, jsonschema.RefResolutionError) as e: + except (SchemaError, ValidationError, FormatError, Unresolvable) as e: raise Exception('Validation error in batch item #%d: %s' % (i, str(e))) def get_json(self): diff --git a/clearml/backend_api/session/session.py b/clearml/backend_api/session/session.py index 73403619..efdd697f 100644 --- a/clearml/backend_api/session/session.py +++ b/clearml/backend_api/session/session.py @@ -12,6 +12,7 @@ import jwt import requests import six from requests.auth import HTTPBasicAuth +from requests.exceptions import ChunkedEncodingError, ContentDecodingError, StreamConsumedError from six.moves.urllib.parse import urlparse, urlunparse from typing import List, Optional @@ -31,6 +32,7 @@ from .defs import ( ENV_API_EXTRA_RETRY_CODES, ENV_API_DEFAULT_REQ_METHOD, ENV_FORCE_MAX_API_VERSION, + ENV_IGNORE_MISSING_CONFIG, MissingConfigError ) from .request import Request, BatchRequest # noqa: F401 @@ -417,6 +419,13 @@ class Session(TokenManager): (self._logger or get_logger()).warning("SSLError Retrying {}".format(ex)) sleep(0.1) continue + except (ChunkedEncodingError, ContentDecodingError, StreamConsumedError) as ex: + retry_counter += 1 + # we should retry + if retry_counter >= self._ssl_error_count_verbosity: + (self._logger or get_logger()).warning("Network decoding error Retrying {}".format(ex)) + sleep(0.1) + continue if ( refresh_token_if_unauthorized @@ -736,7 +745,7 @@ class Session(TokenManager): return urlunparse(parsed) @classmethod - def check_min_api_version(cls, min_api_version): + def check_min_api_version(cls, min_api_version, raise_error=False): """ Return True if Session.api_version is greater or equal >= to min_api_version """ @@ -764,18 +773,24 @@ class Session(TokenManager): # noinspection PyBroadException try: cls() + except MissingConfigError: + if raise_error and not ENV_IGNORE_MISSING_CONFIG.get(): + raise + except LoginError: + if raise_error: + raise except Exception: pass return cls._version_tuple(cls.api_version) >= cls._version_tuple(str(min_api_version)) @classmethod - def check_min_api_server_version(cls, min_api_version): + def check_min_api_server_version(cls, min_api_version, raise_error=False): """ Return True if Session.max_api_version is greater or equal >= to min_api_version Notice this is the api version server reported, not the current SDK max supported api version """ - if cls.check_min_api_version(min_api_version): + if cls.check_min_api_version(min_api_version, raise_error=raise_error): return True return cls._version_tuple(cls.max_api_version) >= cls._version_tuple(str(min_api_version)) diff --git a/clearml/backend_interface/base.py b/clearml/backend_interface/base.py index afbe67a7..8fe54b4b 100644 --- 
a/clearml/backend_interface/base.py +++ b/clearml/backend_interface/base.py @@ -178,8 +178,9 @@ class IdObjectBase(InterfaceBase): # noinspection PyBroadException try: self._data = self._reload() - except Exception: - self.log.error("Failed reloading task {}".format(self.id)) + except Exception as ex: + self.log.error("Failed reloading {} {}".format(type(self).__name__.lower(), self.id)) + self.log.debug("Failed reloading {} {}: {}".format(type(self).__name__.lower(), self.id, ex)) @classmethod def normalize_id(cls, id): diff --git a/clearml/backend_interface/model.py b/clearml/backend_interface/model.py index bc14258f..3d870e80 100644 --- a/clearml/backend_interface/model.py +++ b/clearml/backend_interface/model.py @@ -79,7 +79,7 @@ class Model(IdObjectBase, AsyncManagerMixin, _StorageUriMixin): self.reload() def archive(self): - if Session.check_min_api_server_version("2.13"): + if Session.check_min_api_server_version("2.13", raise_error=True): self.send(models.ArchiveManyRequest(ids=[self.id])) self.reload() else: @@ -90,7 +90,7 @@ class Model(IdObjectBase, AsyncManagerMixin, _StorageUriMixin): ) def unarchive(self): - if Session.check_min_api_server_version("2.13"): + if Session.check_min_api_server_version("2.13", raise_error=True): self.send(models.UnarchiveManyRequest(ids=[self.id])) self.reload() else: diff --git a/clearml/backend_interface/task/log.py b/clearml/backend_interface/task/log.py index 9f348cdc..9e67cd38 100644 --- a/clearml/backend_interface/task/log.py +++ b/clearml/backend_interface/task/log.py @@ -263,8 +263,8 @@ class TaskHandler(BufferingHandler): # flush pending logs if not self._task_id: return - # avoid deadlocks just skip the lock, we are shutting down anyway - self.lock = None + # Never null the lock, it might be used by internal Python at some point + # self.lock = None self._task_id = None # shut down the TaskHandler, from this point onwards. No events will be logged diff --git a/clearml/backend_interface/task/task.py b/clearml/backend_interface/task/task.py index fc00233c..30c098a3 100644 --- a/clearml/backend_interface/task/task.py +++ b/clearml/backend_interface/task/task.py @@ -1088,12 +1088,13 @@ class Task(IdObjectBase, AccessMixin, SetupUploadMixin): if not model.ready: # raise ValueError('Model %s is not published (not ready)' % model_id) self.log.debug('Model %s [%s] is not published yet (not ready)' % (model_id, model.uri)) - name = name or Path(model.uri).stem else: # clear the input model model = None model_id = '' - name = name or 'Input Model' + from ...model import InputModel + # noinspection PyProtectedMember + name = name or InputModel._get_connect_name(model) with self._edit_lock: self.reload() @@ -1345,7 +1346,7 @@ class Task(IdObjectBase, AccessMixin, SetupUploadMixin): return params.get(name, default) def delete_parameter(self, name, force=False): - # type: (str) -> bool + # type: (str, bool) -> bool """ Delete a parameter by its full name Section/name. 
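The `raise_error` flag used by `Model.archive()` / `unarchive()` above changes failure behavior: a missing configuration or a failed login now raises instead of silently reading as "server too old". A rough calling sketch, assuming only the import paths already shown in this diff's `session.py` hunk:

```python
from clearml.backend_api import Session
from clearml.backend_api.session.defs import MissingConfigError

try:
    # raise_error=True surfaces configuration/login problems as exceptions
    # rather than swallowing them and returning False.
    supported = Session.check_min_api_server_version("2.13", raise_error=True)
except MissingConfigError:
    supported = False
    print("No ClearML configuration found - cannot query the server version")

print("models.archive_many supported: {}".format(supported))
```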
diff --git a/clearml/binding/hydra_bind.py b/clearml/binding/hydra_bind.py index b89f1dbc..6e58139a 100644 --- a/clearml/binding/hydra_bind.py +++ b/clearml/binding/hydra_bind.py @@ -1,7 +1,6 @@ import io import sys from functools import partial -import yaml from ..config import running_remotely, get_remote_task_id, DEV_TASK_NO_REUSE from ..debugging.log import LoggerRoot @@ -17,6 +16,9 @@ class PatchHydra(object): _parameter_allow_full_edit = '_allow_omegaconf_edit_' _should_delete_overrides = False _overrides_section = "Args/overrides" + _default_hydra_context = None + _overrides_parser = None + _config_group_warning_sent = False @classmethod def patch_hydra(cls): @@ -83,6 +85,7 @@ class PatchHydra(object): @staticmethod def _patched_hydra_run(self, config_name, task_function, overrides, *args, **kwargs): + PatchHydra._default_hydra_context = self PatchHydra._allow_omegaconf_edit = False if not running_remotely(): return PatchHydra._original_hydra_run(self, config_name, task_function, overrides, *args, **kwargs) @@ -104,12 +107,11 @@ class PatchHydra(object): if k.startswith(PatchHydra._parameter_section+'/')} stored_config.pop(PatchHydra._parameter_allow_full_edit, None) for override_k, override_v in stored_config.items(): - if override_k.startswith("~"): - new_override = override_k - else: - new_override = "++" + override_k.lstrip("+") + new_override = override_k if override_v is not None and override_v != "": new_override += "=" + override_v + if not new_override.startswith("~") and not PatchHydra._is_group(self, new_override): + new_override = "++" + new_override.lstrip("+") overrides.append(new_override) PatchHydra._should_delete_overrides = True except Exception: @@ -117,6 +119,28 @@ class PatchHydra(object): return PatchHydra._original_hydra_run(self, config_name, task_function, overrides, *args, **kwargs) + @staticmethod + def _parse_override(override): + if PatchHydra._overrides_parser is None: + from hydra.core.override_parser.overrides_parser import OverridesParser + PatchHydra._overrides_parser = OverridesParser.create() + return PatchHydra._overrides_parser.parse_overrides(overrides=[override])[0] + + @staticmethod + def _is_group(hydra_context, override): + # noinspection PyBroadException + try: + override = PatchHydra._parse_override(override) + group_exists = hydra_context.config_loader.repository.group_exists(override.key_or_group) + return group_exists + except Exception: + if not PatchHydra._config_group_warning_sent: + LoggerRoot.get_base_logger().warning( + "Could not determine if Hydra is overriding a Config Group" + ) + PatchHydra._config_group_warning_sent = True + return False + @staticmethod def _patched_run_job(config, task_function, *args, **kwargs): # noinspection PyBroadException @@ -125,11 +149,12 @@ class PatchHydra(object): failed_status = JobStatus.FAILED except Exception: - LoggerRoot.get_base_logger(PatchHydra).warning( + LoggerRoot.get_base_logger().warning( "Could not import JobStatus from Hydra. 
Failed tasks will be marked as completed" ) failed_status = None + hydra_context = kwargs.get("hydra_context", PatchHydra._default_hydra_context) # store the config # noinspection PyBroadException try: @@ -139,7 +164,8 @@ class PatchHydra(object): overrides = config.hydra.overrides.task stored_config = {} for arg in overrides: - arg = arg.lstrip("+") + if not PatchHydra._is_group(hydra_context, arg): + arg = arg.lstrip("+") if "=" in arg: k, v = arg.split("=", 1) stored_config[k] = v diff --git a/clearml/datasets/dataset.py b/clearml/datasets/dataset.py index a56cdaa0..d57b158e 100644 --- a/clearml/datasets/dataset.py +++ b/clearml/datasets/dataset.py @@ -1241,7 +1241,7 @@ class Dataset(object): :return: Newly created Dataset object """ - if not Dataset.is_offline() and not Session.check_min_api_server_version("2.13"): + if not Dataset.is_offline() and not Session.check_min_api_server_version("2.13", raise_error=True): raise NotImplementedError("Datasets are not supported with your current ClearML server version. Please update your server.") parent_datasets = [cls.get(dataset_id=p) if not isinstance(p, Dataset) else p for p in (parent_datasets or [])] @@ -1531,7 +1531,7 @@ class Dataset(object): """ if Dataset.is_offline(): raise ValueError("Cannot rename dataset in offline mode") - if not bool(Session.check_min_api_server_version(cls.__min_api_version)): + if not bool(Session.check_min_api_server_version(cls.__min_api_version, raise_error=True)): LoggerRoot.get_base_logger().warning( "Could not rename dataset because API version < {}".format(cls.__min_api_version) ) @@ -1578,7 +1578,7 @@ class Dataset(object): """ if cls.is_offline(): raise ValueError("Cannot move dataset project in offlime mode") - if not bool(Session.check_min_api_server_version(cls.__min_api_version)): + if not bool(Session.check_min_api_server_version(cls.__min_api_version, raise_error=True)): LoggerRoot.get_base_logger().warning( "Could not move dataset to another project because API version < {}".format(cls.__min_api_version) ) @@ -3221,14 +3221,14 @@ class Dataset(object): # type: (Path, Union[FileEntry, LinkEntry], Optional[int], Optional[dict]) -> bool # check if we need the file for the requested dataset part - if ds_part is not None: - f_parts = ds_chunk_selection.get(file_entry.parent_dataset_id, []) + if part is not None: + f_parts = chunk_selection.get(file_entry.parent_dataset_id, []) # file is not in requested dataset part, no need to check it. 
if self._get_chunk_idx_from_artifact_name(file_entry.artifact_name) not in f_parts: return True # check if the local size and the stored size match (faster than comparing hash) - if (base_folder / file_entry.relative_path).stat().st_size != file_entry.size: + if (target_base_folder / file_entry.relative_path).stat().st_size != file_entry.size: return False return True @@ -3240,23 +3240,19 @@ class Dataset(object): tp = None try: futures_ = [] - tp = ThreadPoolExecutor(max_workers=max_workers) - for f in self._dataset_file_entries.values(): - future = tp.submit(__verify_file_or_link, target_base_folder, f, part, chunk_selection) - futures_.append(future) + with ThreadPoolExecutor(max_workers=max_workers) as tp: + for f in self._dataset_file_entries.values(): + future = tp.submit(__verify_file_or_link, target_base_folder, f, part, chunk_selection) + futures_.append(future) - for f in self._dataset_link_entries.values(): - # don't check whether link is in dataset part, hence None for part and chunk_selection - future = tp.submit(__verify_file_or_link, target_base_folder, f, None, None) - futures_.append(future) + for f in self._dataset_link_entries.values(): + # don't check whether link is in dataset part, hence None for part and chunk_selection + future = tp.submit(__verify_file_or_link, target_base_folder, f, None, None) + futures_.append(future) - verified = all(f.result() for f in futures_) + verified = all(f.result() for f in futures_) except Exception: verified = False - finally: - if tp is not None: - # we already have our result, close all pending checks (improves performance when verified==False) - tp.shutdown(cancel_futures=True) return verified diff --git a/clearml/model.py b/clearml/model.py index 7f529bda..146f1404 100644 --- a/clearml/model.py +++ b/clearml/model.py @@ -1,9 +1,8 @@ import abc import os -import tarfile import zipfile import shutil -from tempfile import mkdtemp, mkstemp +from tempfile import mkstemp import six import math @@ -1427,12 +1426,31 @@ class Model(BaseModel): :param project_name: Optional, filter based project name string, if not given query models from all projects :param model_name: Optional Model name as shown in the model artifactory - :param tags: Filter based on the requested list of tags (strings) - To exclude a tag add "-" prefix to the tag. Example: ['production', 'verified', '-qa'] - To include All tags (instead of the default Any behaviour) use "__$all" as the first string, example: - ["__$all", "best", "model", "ever"] - To combine All tags and exclude a list of tags use "__$not" before the excluded tags, example: - ["__$all", "best", "model", "ever", "__$not", "internal", "__$not", "test"] + :param tags: Filter based on the requested list of tags (strings). + To exclude a tag add "-" prefix to the tag. Example: ``["production", "verified", "-qa"]``. + The default behaviour is to join all tags with a logical "OR" operator. + To join all tags with a logical "AND" operator instead, use "__$all" as the first string, for example: + + .. code-block:: py + + ["__$all", "best", "model", "ever"] + + To join all tags with AND, but exclude a tag use "__$not" before the excluded tag, for example: + + .. code-block:: py + + ["__$all", "best", "model", "ever", "__$not", "internal", "__$not", "test"] + + The "OR" and "AND" operators apply to all tags that follow them until another operator is specified. + The NOT operator applies only to the immediately following tag. + For example: + + .. 
code-block:: py + + ["__$all", "a", "b", "c", "__$or", "d", "__$not", "e", "__$and", "__$or", "f", "g"] + + This example means ("a" AND "b" AND "c" AND ("d" OR NOT "e") AND ("f" OR "g")). + See https://clear.ml/docs/latest/docs/clearml_sdk/model_sdk#tag-filters for details. :param only_published: If True, only return published models. :param include_archived: If True, return archived models. :param max_results: Optional return the last X models, @@ -1596,6 +1614,7 @@ class InputModel(Model): # noinspection PyProtectedMember _EMPTY_MODEL_ID = _Model._EMPTY_MODEL_ID + _WARNING_CONNECTED_NAMES = {} @classmethod def import_model( @@ -1933,9 +1952,11 @@ :param object task: A Task object. :param str name: The model name to be stored on the Task - (default the filename, of the model weights, without the file extension) + (defaults to the filename of the model weights, without the file extension, or to `Input Model` if that is not found) """ self._set_task(task) + name = name or InputModel._get_connect_name(self) + InputModel._warn_on_same_name_connect(name) model_id = None # noinspection PyProtectedMember @@ -1967,6 +1988,28 @@ if not self._task.get_labels_enumeration() and model.data.labels: task.set_model_label_enumeration(model.data.labels) + @classmethod + def _warn_on_same_name_connect(cls, name): + if name not in cls._WARNING_CONNECTED_NAMES: + cls._WARNING_CONNECTED_NAMES[name] = False + return + if cls._WARNING_CONNECTED_NAMES[name]: + return + get_logger().warning("Connecting multiple input models with the same name: `{}`. This might result in the wrong model being used when executing remotely".format(name)) + cls._WARNING_CONNECTED_NAMES[name] = True + + @staticmethod + def _get_connect_name(model): + default_name = "Input Model" + if model is None: + return default_name + # noinspection PyBroadException + try: + model_uri = getattr(model, "url", getattr(model, "uri", None)) + return Path(model_uri).stem + except Exception: + return default_name + class OutputModel(BaseModel): """ diff --git a/clearml/storage/helper.py b/clearml/storage/helper.py index 1d62cc68..8fca9c85 100644 --- a/clearml/storage/helper.py +++ b/clearml/storage/helper.py @@ -791,10 +791,21 @@ class _GoogleCloudStorageDriver(_Driver): self.name = name[len(_GoogleCloudStorageDriver.scheme_prefix):] if cfg.credentials_json: + # noinspection PyBroadException try: credentials = service_account.Credentials.from_service_account_file(cfg.credentials_json) - except ValueError: + except Exception: credentials = None + + if not credentials: + # noinspection PyBroadException + try: + # Try parsing this as json to support actual json content and not a file path + credentials = service_account.Credentials.from_service_account_info( + json.loads(cfg.credentials_json) + ) + except Exception: + pass else: credentials = None diff --git a/clearml/task.py b/clearml/task.py index ac896798..aa7b2853 100644 --- a/clearml/task.py +++ b/clearml/task.py @@ -331,7 +331,9 @@ class Task(_Task): :param str output_uri: The default location for output models and other artifacts. If True, the default files_server will be used for model storage. In the default location, ClearML creates a subfolder for the - output. The subfolder structure is the following: `<output destination name> / <project name> / <task name>.<Task ID>`. + output. If set to False, local runs will not upload output models and artifacts, + and remote runs will not use any default values provided using ``default_output_uri``. + The subfolder structure is the following: `<output destination name> / <project name> / <task name>.<Task ID>`.
Note that for cloud storage, you must install the **ClearML** package for your cloud storage type, and then configure your storage credentials. For detailed information, see "Storage" in the ClearML Documentation. @@ -606,10 +608,14 @@ class Task(_Task): task_id=get_remote_task_id(), log_to_backend=False, ) - if task.get_project_object().default_output_destination and not task.output_uri: - task.output_uri = task.get_project_object().default_output_destination - if cls.__default_output_uri and not task.output_uri: - task.output_uri = cls.__default_output_uri + if output_uri is False and not task.output_uri: + # Setting the output_uri=False argument disables using any default when running remotely + pass + else: + if task.get_project_object().default_output_destination and not task.output_uri: + task.output_uri = task.get_project_object().default_output_destination + if cls.__default_output_uri and not task.output_uri: + task.output_uri = cls.__default_output_uri # store new task ID cls.__update_master_pid_task(task=task) # make sure we are started @@ -931,8 +937,31 @@ class Task(_Task): If specified, ``project_name`` and ``task_name`` are ignored. :param str project_name: The project name of the Task to get. :param str task_name: The name of the Task within ``project_name`` to get. - :param list tags: Filter based on the requested list of tags (strings) (Task must have at least one of the - listed tags). To exclude a tag add "-" prefix to the tag. Example: ["best", "-debug"] + :param list tags: Filter based on the requested list of tags (strings). To exclude a tag add "-" prefix to the + tag. Example: ``["best", "-debug"]``. + The default behaviour is to join all tags with a logical "OR" operator.
+ To join all tags with a logical "AND" operator instead, use "__$all" as the first string, for example: + + .. code-block:: py + + ["__$all", "best", "experiment", "ever"] + + To join all tags with AND, but exclude a tag use "__$not" before the excluded tag, for example: + + .. code-block:: py + + ["__$all", "best", "experiment", "ever", "__$not", "internal", "__$not", "test"] + + The "OR" and "AND" operators apply to all tags that follow them until another operator is specified. + The NOT operator applies only to the immediately following tag. + For example: + + .. code-block:: py + + ["__$all", "a", "b", "c", "__$or", "d", "__$not", "e", "__$and", "__$or", "f", "g"] + + This example means ("a" AND "b" AND "c" AND ("d" OR NOT "e") AND ("f" OR "g")). + See https://clear.ml/docs/latest/docs/clearml_sdk/task_sdk/#tag-filters for more information. :param bool allow_archived: Only applicable if *not* using specific ``task_id``, If True (default), allow to return archived Tasks, if False filter out archived Tasks :param bool task_filter: Only applicable if *not* using specific ``task_id``, @@ -980,8 +1009,31 @@ avoid any regex behaviour, use re.escape()). (Optional) To match an exact task name (i.e. not partial matching), add ^/$ at the beginning/end of the string, for example: "^exact_task_name_here$" - :param list tags: Filter based on the requested list of tags (strings) (Task must have all the listed tags) - To exclude a tag add "-" prefix to the tag. Example: ["best", "-debug"] + :param list tags: Filter based on the requested list of tags (strings). To exclude a tag add "-" prefix to the + tag. Example: ``["best", "-debug"]``. + The default behaviour is to join all tags with a logical "OR" operator. + To join all tags with a logical "AND" operator instead, use "__$all" as the first string, for example: + + .. code-block:: py + + ["__$all", "best", "experiment", "ever"] + + To join all tags with AND, but exclude a tag use "__$not" before the excluded tag, for example: + + .. code-block:: py + + ["__$all", "best", "experiment", "ever", "__$not", "internal", "__$not", "test"] + + The "OR" and "AND" operators apply to all tags that follow them until another operator is specified. + The NOT operator applies only to the immediately following tag. + For example: + + .. code-block:: py + + ["__$all", "a", "b", "c", "__$or", "d", "__$not", "e", "__$and", "__$or", "f", "g"] + + This example means ("a" AND "b" AND "c" AND ("d" OR NOT "e") AND ("f" OR "g")). + See https://clear.ml/docs/latest/docs/clearml_sdk/task_sdk/#tag-filters for more information. :param bool allow_archived: If True (default), allow to return archived Tasks, if False filter out archived Tasks :param dict task_filter: filter and order Tasks. See :class:`.backend_api.service.v?.tasks.GetAllRequest` for details; the ? needs to be replaced by the appropriate version. @@ -1032,17 +1084,30 @@ :param str task_name: task name (str) within the selected project Return any partial match of task_name, regular expressions matching is also supported. If None is passed, returns all tasks within the project - :param list tags: Filter based on the requested list of tags (strings) - To exclude a tag add "-" prefix to the tag. Example: ["best", "-debug"] + :param list tags: Filter based on the requested list of tags (strings). + To exclude a tag add "-" prefix to the tag. Example: ``["best", "-debug"]``. The default behaviour is to join all tags with a logical "OR" operator. To join all tags with a logical "AND" operator instead, use "__$all" as the first string, for example: - ["__$all", "best", "experiment", "ever"] + + .. code-block:: py + + ["__$all", "best", "experiment", "ever"] + To join all tags with AND, but exclude a tag use "__$not" before the excluded tag, for example: - ["__$all", "best", "experiment", "ever", "__$not", "internal", "__$not", "test"] + + .. code-block:: py + + ["__$all", "best", "experiment", "ever", "__$not", "internal", "__$not", "test"] + The "OR" and "AND" operators apply to all tags that follow them until another operator is specified. The NOT operator applies only to the immediately following tag. - For example, ["__$all", "a", "b", "c", "__$or", "d", "__$not", "e", "__$and", "__$or" "f", "g"] - means ("a" AND "b" AND "c" AND ("d" OR NOT "e") AND ("f" OR "g")). + For example: + + .. code-block:: py + + ["__$all", "a", "b", "c", "__$or", "d", "__$not", "e", "__$and", "__$or", "f", "g"] + + This example means ("a" AND "b" AND "c" AND ("d" OR NOT "e") AND ("f" OR "g")). See https://clear.ml/docs/latest/docs/clearml_sdk/task_sdk/#tag-filters for more information. :param list additional_return_fields: Optional, if not provided return a list of Task IDs. If provided return dict per Task with the additional requested fields.
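The operator semantics spelled out in these docstrings compose left to right. A small sketch of a combined filter (project and tag names are illustrative):

```python
from clearml import Task

# Per the docstring above, this filter reads:
# "best" AND "2024" AND ("prod" OR NOT "debug")
tasks = Task.get_tasks(
    project_name="examples",  # illustrative
    tags=["__$all", "best", "2024", "__$or", "prod", "__$not", "debug"],
)
for task in tasks:
    print(task.id, task.name)
```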
@@ -1330,7 +1395,7 @@ class Task(_Task): :return: The number of tasks enqueued in the given queue """ - if not Session.check_min_api_server_version("2.20"): + if not Session.check_min_api_server_version("2.20", raise_error=True): raise ValueError("You version of clearml-server does not support the 'queues.get_num_entries' endpoint") mutually_exclusive(queue_name=queue_name, queue_id=queue_id) session = cls._get_default_session() @@ -1524,6 +1589,7 @@ class Task(_Task): specified, then a path to a local configuration file is returned. Configuration object. """ pathlib_Path = None # noqa + cast_Path = Path if not isinstance(configuration, (dict, list, Path, six.string_types)): try: from pathlib import Path as pathlib_Path # noqa @@ -1532,6 +1598,8 @@ class Task(_Task): if not pathlib_Path or not isinstance(configuration, pathlib_Path): raise ValueError("connect_configuration supports `dict`, `str` and 'Path' types, " "{} is not supported".format(type(configuration))) + if pathlib_Path and isinstance(configuration, pathlib_Path): + cast_Path = pathlib_Path multi_config_support = Session.check_min_api_version('2.9') if multi_config_support and not name: @@ -1599,7 +1667,7 @@ class Task(_Task): # it is a path to a local file if not running_remotely() or not (self.is_main_task() or self._is_remote_main_task()): # check if not absolute path - configuration_path = Path(configuration) + configuration_path = cast_Path(configuration) if not configuration_path.is_file(): ValueError("Configuration file does not exist") try: @@ -1626,7 +1694,7 @@ class Task(_Task): "Using default configuration: {}".format(name, str(configuration))) # update back configuration section if multi_config_support: - configuration_path = Path(configuration) + configuration_path = cast_Path(configuration) if configuration_path.is_file(): with open(configuration_path.as_posix(), 'rt') as f: configuration_text = f.read() @@ -1638,15 +1706,13 @@ class Task(_Task): config_text=configuration_text) return configuration - configuration_path = Path(configuration) + configuration_path = cast_Path(configuration) fd, local_filename = mkstemp(prefix='clearml_task_config_', suffix=configuration_path.suffixes[-1] if configuration_path.suffixes else '.txt') with open(fd, "w") as f: f.write(configuration_text) - if pathlib_Path: - return pathlib_Path(local_filename) - return Path(local_filename) if isinstance(configuration, Path) else local_filename + return cast_Path(local_filename) if isinstance(configuration, cast_Path) else local_filename def connect_label_enumeration(self, enumeration): # type: (Dict[str, int]) -> Dict[str, int] diff --git a/clearml/utilities/attrs.py b/clearml/utilities/attrs.py index 0ecee840..e874808c 100644 --- a/clearml/utilities/attrs.py +++ b/clearml/utilities/attrs.py @@ -2,12 +2,19 @@ import attr from .version import Version +try: + # noinspection PyUnresolvedReferences + import importlib.metadata + attr_version = importlib.metadata.version("attrs") +except ImportError: + attr_version = attr.__version__ + class attrs(object): def __init__(self, *args, **kwargs): if any(x in kwargs for x in ("eq", "order")): raise RuntimeError("Only `cmp` is supported for attr.attrs, not `eq` or `order`") - if Version(attr.__version__) >= Version("19.2"): + if Version(attr_version) >= Version("19.2"): cmp = kwargs.pop("cmp", None) if cmp is not None: kwargs["eq"] = kwargs["order"] = cmp diff --git a/clearml/version.py b/clearml/version.py index 82b6a36f..f43a2786 100644 --- a/clearml/version.py +++ b/clearml/version.py @@ -1 +1 @@ 
-__version__ = '1.13.1' +__version__ = '1.13.2' diff --git a/examples/frameworks/pytorch/requirements.txt b/examples/frameworks/pytorch/requirements.txt index e74fea99..7d2cdfe9 100644 --- a/examples/frameworks/pytorch/requirements.txt +++ b/examples/frameworks/pytorch/requirements.txt @@ -2,9 +2,11 @@ clearml jsonschema==3.2.0 ; python_version <= '3.5' matplotlib pytorch-ignite -tensorboard>=1.14.0 +tensorboard<=2.11.2 ; python_version <= '3.7' +tensorboard>2.11.2 ; python_version > '3.7' tensorboardX torch>=1.1.0 torchvision>=0.3.0 tqdm -protobuf>=4.21.1 +protobuf==3.20.* ; python_version <= '3.7' +protobuf>=4.21.1 ; python_version > '3.7' diff --git a/examples/reporting/config_files.py b/examples/reporting/config_files.py index 1a18c610..d3b43463 100644 --- a/examples/reporting/config_files.py +++ b/examples/reporting/config_files.py @@ -21,14 +21,14 @@ task = Task.init(project_name='FirstTrial', task_name='config_files_example') config_file = task.connect_configuration(Path("data_samples") / "sample.json", name='json config file') -with open(config_file, "rt") as f: +with open(config_file.as_posix(), "rt") as f: config_json = json.load(f) print(config_json) config_file = task.connect_configuration(Path("data_samples") / "config_yaml.yaml", name='yaml config file') -with open(config_file, "rt") as f: +with open(config_file.as_posix(), "rt") as f: config_yaml = yaml.load(f, Loader=yaml.SafeLoader) print(config_yaml) diff --git a/requirements.txt b/requirements.txt index 82654c24..f4862419 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,6 +14,7 @@ python-dateutil>=2.6.1 pyjwt>=2.4.0,<2.9.0 ; python_version > '3.5' pyjwt>=1.6.4,<2.0.0 ; python_version <= '3.5' PyYAML>=3.12 +referencing<0.40 ; python_version >= '3.8' requests>=2.20.0 six>=1.13.0 typing>=3.6.4 ; python_version < '3.5'
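The `config_files.py` change above pairs with the `cast_Path` fix in `task.py`: `connect_configuration` now returns the same path flavor it was given, which is why the example opens the result via `as_posix()`. A minimal round-trip sketch mirroring the example file in this diff:

```python
import json
from pathlib import Path

from clearml import Task

task = Task.init(project_name="FirstTrial", task_name="config_files_example")

# Passing a pathlib.Path returns a pathlib.Path as well (a local temp copy
# when running remotely), hence the explicit as_posix() when opening it.
config_file = task.connect_configuration(Path("data_samples") / "sample.json", name="json config file")
with open(config_file.as_posix(), "rt") as f:
    config_json = json.load(f)
print(config_json)
```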