commit fe5601a8ca
revital 2023-10-18 20:54:44 +03:00
25 changed files with 360 additions and 62 deletions

5
.gitignore vendored
View File

@ -7,8 +7,9 @@
# Python
*.pyc
__pycache__
build/
dist/
/build/
/dist/
*/conda_build/build/
*.egg-info
.env
.venv/

View File

@ -182,7 +182,7 @@ If ClearML is part of your development process / project / publication, please c
```
@misc{clearml,
title = {ClearML - Your entire MLOps stack in one open-source tool},
year = {2019},
year = {2023},
note = {Software available from http://github.com/allegroai/clearml},
url={https://clear.ml/},
author = {ClearML},

View File

@ -99,6 +99,8 @@ class PipelineController(object):
monitor_artifacts = attrib(type=list, default=None) # List of artifact names to monitor
monitor_models = attrib(type=list, default=None) # List of models to monitor
explicit_docker_image = attrib(type=str, default=None) # The Docker image the node uses, specified at creation
recursively_parse_parameters = attrib(type=bool, default=False) # if True, recursively parse parameters in
# lists, dicts, or tuples
def __attrs_post_init__(self):
if self.parents is None:
@ -384,7 +386,8 @@ class PipelineController(object):
cache_executed_step=False, # type: bool
base_task_factory=None, # type: Optional[Callable[[PipelineController.Node], Task]]
retry_on_failure=None, # type: Optional[Union[int, Callable[[PipelineController, PipelineController.Node, int], bool]]] # noqa
status_change_callback=None # type: Optional[Callable[[PipelineController, PipelineController.Node, str], None]] # noqa
status_change_callback=None, # type: Optional[Callable[[PipelineController, PipelineController.Node, str], None]] # noqa
recursively_parse_parameters=False # type: bool
):
# type: (...) -> bool
"""
@ -405,7 +408,10 @@ class PipelineController(object):
- Parameter access ``parameter_override={'Args/input_file': '${<step_name>.parameters.Args/input_file}' }``
- Pipeline Task argument (see `Pipeline.add_parameter`) ``parameter_override={'Args/input_file': '${pipeline.<pipeline_parameter>}' }``
- Task ID ``parameter_override={'Args/input_file': '${stage3.id}' }``
:param recursively_parse_parameters: If True, recursively parse parameters from parameter_override in lists, dicts, or tuples.
Example:
- ``parameter_override={'Args/input_file': ['${<step_name>.artifacts.<artifact_name>.url}', 'file2.txt']}`` will be correctly parsed.
- ``parameter_override={'Args/input_file': ('${<step_name_1>.parameters.Args/input_file}', '${<step_name_2>.parameters.Args/input_file}')}`` will be correctly parsed.
:param configuration_overrides: Optional, override Task configuration objects.
Expected dictionary of configuration object name and configuration object content.
Examples:
@ -572,6 +578,7 @@ class PipelineController(object):
name=name, base_task_id=base_task_id, parents=parents or [],
queue=execution_queue, timeout=time_limit,
parameters=parameter_override or {},
recursively_parse_parameters=recursively_parse_parameters,
configurations=configuration_overrides,
clone_task=clone_base_task,
task_overrides=task_overrides,
@ -2237,7 +2244,7 @@ class PipelineController(object):
updated_hyper_parameters = {}
for k, v in node.parameters.items():
updated_hyper_parameters[k] = self._parse_step_ref(v)
updated_hyper_parameters[k] = self._parse_step_ref(v, recursive=node.recursively_parse_parameters)
task_overrides = self._parse_task_overrides(node.task_overrides) if node.task_overrides else None
@ -2776,11 +2783,12 @@ class PipelineController(object):
except Exception:
pass
def _parse_step_ref(self, value):
def _parse_step_ref(self, value, recursive=False):
# type: (Any) -> Optional[str]
"""
Return the step reference. For example "${step1.parameters.Args/param}"
:param value: string
:param recursive: if True, recursively parse all values in the dict, list or tuple
:return:
"""
# look for all the step references
@ -2793,6 +2801,18 @@ class PipelineController(object):
if not isinstance(new_val, six.string_types):
return new_val
updated_value = updated_value.replace(g, new_val, 1)
# if we have a dict, list or tuple, we need to recursively update the values
if recursive:
if isinstance(value, dict):
updated_value = {}
for k, v in value.items():
updated_value[k] = self._parse_step_ref(v, recursive=True)
elif isinstance(value, list):
updated_value = [self._parse_step_ref(v, recursive=True) for v in value]
elif isinstance(value, tuple):
updated_value = tuple(self._parse_step_ref(v, recursive=True) for v in value)
return updated_value
def _parse_task_overrides(self, task_overrides):
@ -3201,8 +3221,10 @@ class PipelineController(object):
name=artifact_name,
artifact_object=artifact_object,
wait_on_upload=True,
extension_name=".pkl" if isinstance(artifact_object, dict) and
not self._artifact_serialization_function else None,
extension_name=(
".pkl" if isinstance(artifact_object, dict) and not self._artifact_serialization_function
else None
),
serialization_function=self._artifact_serialization_function
)
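A minimal usage sketch of the new `recursively_parse_parameters` flag; with it enabled, step references nested inside lists, dicts, or tuples in `parameter_override` are resolved as well. Project, queue, step, and artifact names below are illustrative only:
```
from clearml import PipelineController

pipe = PipelineController(name="pipeline demo", project="examples", version="1.0.0")
pipe.add_step(name="stage_data", base_task_project="examples", base_task_name="data task")
pipe.add_step(
    name="stage_train",
    parents=["stage_data"],
    base_task_project="examples",
    base_task_name="train task",
    # a step reference nested inside a list is only resolved when
    # recursively_parse_parameters=True
    parameter_override={
        "Args/input_files": ["${stage_data.artifacts.dataset.url}", "file2.txt"],
    },
    recursively_parse_parameters=True,
)
```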

View File

@ -3075,6 +3075,8 @@ class GetScalarMetricDataRequest(Request):
:param model_events: If set then the retrieving model events. Otherwise task
events
:type model_events: bool
:param scroll_id: Pass this value on next call to get next page
:type scroll_id: str
"""
_service = "events"
@ -3095,16 +3097,21 @@ class GetScalarMetricDataRequest(Request):
"type": ["boolean", "null"],
},
"task": {"description": "task ID", "type": ["string", "null"]},
"scroll_id": {
"description": "Pass this value on next call to get next page",
"type": "string",
},
},
"type": "object",
}
def __init__(self, task=None, metric=None, no_scroll=False, model_events=False, **kwargs):
def __init__(self, task=None, metric=None, no_scroll=False, model_events=False, scroll_id=None, **kwargs):
super(GetScalarMetricDataRequest, self).__init__(**kwargs)
self.task = task
self.metric = metric
self.no_scroll = no_scroll
self.model_events = model_events
self.scroll_id = scroll_id
@schema_property("task")
def task(self):
@ -3158,6 +3165,19 @@ class GetScalarMetricDataRequest(Request):
self.assert_isinstance(value, "model_events", (bool,))
self._property_model_events = value
@schema_property("scroll_id")
def scroll_id(self):
return self._property_scroll_id
@scroll_id.setter
def scroll_id(self, value):
if value is None:
self._property_scroll_id = None
return
self.assert_isinstance(value, "scroll_id", six.string_types)
self._property_scroll_id = value
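A hedged sketch of how the new `scroll_id` field is meant to be used for paging scalar events: echo back the value returned in the previous response. Sending the request still requires an authenticated backend session; the task ID, metric name, and scroll ID are placeholders:
```
from clearml.backend_api.services import events

# first page: no scroll_id
first_page = events.GetScalarMetricDataRequest(task="<task_id>", metric="loss")

# next page: pass the scroll_id carried in the previous response
next_page = events.GetScalarMetricDataRequest(
    task="<task_id>", metric="loss", scroll_id="<scroll_id from previous response>"
)
```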
class GetScalarMetricDataResponse(Response):
"""

View File

@ -25,6 +25,8 @@ class InterfaceBase(SessionInterface):
_default_session = None
_num_retry_warning_display = 1
_offline_mode = ENV_OFFLINE_MODE.get()
_JSON_EXCEPTION = (jsonschema.ValidationError, requests.exceptions.InvalidJSONError) \
if hasattr(requests.exceptions, "InvalidJSONError") else (jsonschema.ValidationError,)
@property
def session(self):
@ -83,7 +85,7 @@ class InterfaceBase(SessionInterface):
if raise_on_errors:
raise
res = None
except jsonschema.ValidationError as e:
except cls._JSON_EXCEPTION as e:
if log:
log.error(
'Field %s contains illegal schema: %s', '.'.join(e.path), str(e.message)
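The `_JSON_EXCEPTION` tuple above guards against older `requests` releases that do not define `InvalidJSONError`. The same pattern in isolation, as a minimal standalone sketch:
```
import jsonschema
import requests

# include InvalidJSONError only when this requests version provides it
JSON_EXCEPTIONS = (
    (jsonschema.ValidationError, requests.exceptions.InvalidJSONError)
    if hasattr(requests.exceptions, "InvalidJSONError")
    else (jsonschema.ValidationError,)
)

try:
    jsonschema.validate({"a": 1}, {"type": "object"})
except JSON_EXCEPTIONS as e:
    print("schema/JSON error:", e)
```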

View File

@ -12,6 +12,7 @@ from ..storage import StorageManager
from ..storage.helper import StorageHelper
from ..utilities.async_manager import AsyncManagerMixin
ModelPackage = namedtuple("ModelPackage", "weights design")
@ -77,6 +78,28 @@ class Model(IdObjectBase, AsyncManagerMixin, _StorageUriMixin):
self.send(models.SetReadyRequest(model=self.id, publish_task=False))
self.reload()
def archive(self):
if Session.check_min_api_server_version("2.13"):
self.send(models.ArchiveManyRequest(ids=[self.id]))
self.reload()
else:
from ..model import BaseModel
# edit will reload
self._edit(
system_tags=list(set((self.data.system_tags or []) if hasattr(self.data, "system_tags") else []) | {BaseModel._archived_tag})
)
def unarchive(self):
if Session.check_min_api_server_version("2.13"):
self.send(models.UnarchiveManyRequest(ids=[self.id]))
self.reload()
else:
from ..model import BaseModel
# edit will reload
self._edit(
system_tags=list(set((self.data.system_tags or []) if hasattr(self.data, "system_tags") else []) - {BaseModel._archived_tag})
)
def _reload(self):
"""Reload the model object"""
if self._offline_mode:

View File

@ -46,6 +46,7 @@ class CreateAndPopulate(object):
output_uri=None, # type: Optional[str]
base_task_id=None, # type: Optional[str]
add_task_init_call=True, # type: bool
force_single_script_file=False, # type: bool
raise_on_missing_entries=False, # type: bool
verbose=False, # type: bool
):
@ -84,6 +85,7 @@ class CreateAndPopulate(object):
:param base_task_id: Use a pre-existing task in the system, instead of a local repo/script.
Essentially clones an existing task and overrides arguments/requirements.
:param add_task_init_call: If True, a 'Task.init()' call is added to the script entry point in remote execution.
:param force_single_script_file: If True, do not auto-detect local repository
:param raise_on_missing_entries: If True, raise ValueError on missing entries when populating
:param verbose: If True, print verbose logging
"""
@ -125,6 +127,7 @@ class CreateAndPopulate(object):
self.task_type = task_type
self.output_uri = output_uri
self.task = None
self.force_single_script_file = bool(force_single_script_file)
self.raise_on_missing_entries = raise_on_missing_entries
self.verbose = verbose
@ -159,6 +162,7 @@ class CreateAndPopulate(object):
detect_jupyter_notebook=False,
add_missing_installed_packages=True,
detailed_req_report=False,
force_single_script=self.force_single_script_file,
)
# check if we have no repository and no requirements raise error
@ -237,6 +241,23 @@ class CreateAndPopulate(object):
task_state['script']['diff'] = ''
task_state['script']['working_dir'] = cwd or '.'
task_state['script']['entry_point'] = entry_point or ""
if self.force_single_script_file and Path(self.script).is_file():
create_requirements = self.packages is True
repo_info, requirements = ScriptInfo.get(
filepaths=[Path(self.script).as_posix()],
log=getLogger(),
create_requirements=create_requirements,
uncommitted_from_remote=True,
detect_jupyter_notebook=False,
add_missing_installed_packages=True,
detailed_req_report=False,
force_single_script=self.force_single_script_file,
)
task_state['script']['diff'] = repo_info.script['diff'] or ''
task_state['script']['entry_point'] = repo_info.script['entry_point']
if create_requirements:
task_state['script']['requirements'] = repo_info.script.get('requirements') or {}
else:
# standalone task
task_state['script']['entry_point'] = self.script or ""

View File

@ -246,7 +246,6 @@ class PatchOsFork(object):
os._exit = _at_exit_callback
@staticmethod
def _patched_fork(*args, **kwargs):
if not PatchOsFork._current_task:

View File

@ -71,7 +71,7 @@ class PatchLIGHTgbmModelIO(PatchBaseModelIO):
return ret
@staticmethod
def _load(original_fn, model_file, *args, **kwargs):
def _load(original_fn, model_file=None, *args, **kwargs):
if not PatchLIGHTgbmModelIO._current_task:
return original_fn(model_file, *args, **kwargs)

View File

@ -1589,6 +1589,11 @@ class PatchKerasModelIO(object):
from keras import models as keras_saving # noqa
except ImportError:
keras_saving = None
try:
from keras.src.saving import saving_api as keras_saving_v3
except ImportError:
keras_saving_v3 = None
# check that we are not patching anything twice
if PatchKerasModelIO.__patched_tensorflow:
PatchKerasModelIO.__patched_keras = [
@ -1598,9 +1603,10 @@ class PatchKerasModelIO(object):
Functional if PatchKerasModelIO.__patched_tensorflow[3] != Functional else None,
None,
None,
keras_saving_v3
]
else:
PatchKerasModelIO.__patched_keras = [Network, Sequential, keras_saving, Functional, None, None]
PatchKerasModelIO.__patched_keras = [Network, Sequential, keras_saving, Functional, None, None, keras_saving_v3]
PatchKerasModelIO._patch_io_calls(*PatchKerasModelIO.__patched_keras)
if 'tensorflow' in sys.modules and not PatchKerasModelIO.__patched_tensorflow:
@ -1643,6 +1649,8 @@ class PatchKerasModelIO(object):
except ImportError:
keras_hdf5 = None
keras_saving_v3 = None
if PatchKerasModelIO.__patched_keras:
PatchKerasModelIO.__patched_tensorflow = [
Network if PatchKerasModelIO.__patched_keras[0] != Network else None,
@ -1651,14 +1659,23 @@ class PatchKerasModelIO(object):
Functional if PatchKerasModelIO.__patched_keras[3] != Functional else None,
keras_saving_legacy if PatchKerasModelIO.__patched_keras[4] != keras_saving_legacy else None,
keras_hdf5 if PatchKerasModelIO.__patched_keras[5] != keras_hdf5 else None,
keras_saving_v3 if PatchKerasModelIO.__patched_keras[6] != keras_saving_v3 else None,
]
else:
PatchKerasModelIO.__patched_tensorflow = [
Network, Sequential, keras_saving, Functional, keras_saving_legacy, keras_hdf5]
Network, Sequential, keras_saving, Functional, keras_saving_legacy, keras_hdf5, keras_saving_v3]
PatchKerasModelIO._patch_io_calls(*PatchKerasModelIO.__patched_tensorflow)
@staticmethod
def _patch_io_calls(Network, Sequential, keras_saving, Functional, keras_saving_legacy=None, keras_hdf5=None):
def _patch_io_calls(
Network,
Sequential,
keras_saving,
Functional,
keras_saving_legacy=None,
keras_hdf5=None,
keras_saving_v3=None
):
try:
if Sequential is not None:
Sequential._updated_config = _patched_call(Sequential._updated_config,
@ -1718,6 +1735,9 @@ class PatchKerasModelIO(object):
keras_hdf5.save_model_to_hdf5 = _patched_call(
keras_hdf5.save_model_to_hdf5, PatchKerasModelIO._save_model)
if keras_saving_v3 is not None:
keras_saving_v3.save_model = _patched_call(keras_saving_v3.save_model, PatchKerasModelIO._save_model)
except Exception as ex:
LoggerRoot.get_base_logger(TensorflowBinding).warning(str(ex))
@ -2058,6 +2078,11 @@ class PatchTensorflowModelIO(object):
Checkpoint.write = _patched_call(Checkpoint.write, PatchTensorflowModelIO._ckpt_write)
except Exception:
pass
# noinspection PyBroadException
try:
Checkpoint._write = _patched_call(Checkpoint._write, PatchTensorflowModelIO._ckpt_write)
except Exception:
pass
except ImportError:
pass
except Exception:
@ -2227,21 +2252,24 @@ class PatchTensorflow2ModelIO(object):
return
PatchTensorflow2ModelIO.__patched = True
# noinspection PyBroadException
try:
# hack: make sure tensorflow.__init__ is called
import tensorflow # noqa
from tensorflow.python.training.tracking import util # noqa
# noinspection PyBroadException
try:
util.TrackableSaver.save = _patched_call(util.TrackableSaver.save,
PatchTensorflow2ModelIO._save)
util.TrackableSaver.save = _patched_call(util.TrackableSaver.save, PatchTensorflow2ModelIO._save)
except Exception:
pass
# noinspection PyBroadException
try:
util.TrackableSaver.restore = _patched_call(util.TrackableSaver.restore,
PatchTensorflow2ModelIO._restore)
util.TrackableSaver.restore = _patched_call(
util.TrackableSaver.restore, PatchTensorflow2ModelIO._restore
)
except Exception:
pass
except ImportError:
@ -2249,6 +2277,32 @@ class PatchTensorflow2ModelIO(object):
except Exception:
LoggerRoot.get_base_logger(TensorflowBinding).debug('Failed patching tensorflow v2')
# noinspection PyBroadException
try:
# hack: make sure tensorflow.__init__ is called
import tensorflow # noqa
from tensorflow.python.checkpoint import checkpoint
# noinspection PyBroadException
try:
checkpoint.TrackableSaver.save = _patched_call(
checkpoint.TrackableSaver.save, PatchTensorflow2ModelIO._save
)
except Exception:
pass
# noinspection PyBroadException
try:
checkpoint.TrackableSaver.restore = _patched_call(
checkpoint.TrackableSaver.restore, PatchTensorflow2ModelIO._restore
)
except Exception:
pass
except ImportError:
pass
except Exception:
LoggerRoot.get_base_logger(TensorflowBinding).debug('Failed patching tensorflow v2.11')
@staticmethod
def _save(original_fn, self, file_prefix, *args, **kwargs):
model = original_fn(self, file_prefix, *args, **kwargs)
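The extra patches on `Checkpoint._write` and on `tensorflow.python.checkpoint.checkpoint.TrackableSaver` target newer TensorFlow releases that relocated these internals. A hedged sketch of the user-side code they are meant to capture (project and task names are illustrative):
```
import tensorflow as tf
from clearml import Task

task = Task.init(project_name="examples", task_name="tf2 checkpoint logging")

ckpt = tf.train.Checkpoint(step=tf.Variable(1))
# goes through TrackableSaver.save internally, which the binding patches,
# so the written checkpoint should be registered as an output model
ckpt.save("./checkpoints/model")
```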

View File

@ -89,6 +89,7 @@ class PatchHydra(object):
if overrides and not isinstance(overrides, (list, tuple)):
overrides = [overrides]
overrides += ['{}={}'.format(k, v) for k, v in stored_config.items()]
overrides = [("+" + o) if (o.startswith("+") and not o.startswith("++")) else o for o in overrides]
else:
# We take care of it inside the _patched_run_job
pass
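The list comprehension above promotes single-`+` append overrides to `++` (add-or-override), presumably so that re-applying stored overrides does not error when the key already exists in the composed config. In isolation:
```
overrides = ["+trainer.epochs=10", "++model.lr=0.001", "dataset=cifar10"]
overrides = [("+" + o) if (o.startswith("+") and not o.startswith("++")) else o for o in overrides]
print(overrides)  # ['++trainer.epochs=10', '++model.lr=0.001', 'dataset=cifar10']
```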

View File

@ -2497,7 +2497,7 @@ class Dataset(object):
# check if target folder is not empty, see if it contains everything we need
if target_base_folder and next(target_base_folder.iterdir(), None):
if self._verify_dataset_folder(target_base_folder, part, chunk_selection):
if self._verify_dataset_folder(target_base_folder, part, chunk_selection, max_workers):
target_base_folder.touch()
self._release_lock_ds_target_folder(target_base_folder)
return target_base_folder.as_posix()
@ -2538,7 +2538,7 @@ class Dataset(object):
raise_on_error=False, force=False)
# verify entire dataset (if failed, force downloading parent datasets)
if not self._verify_dataset_folder(target_base_folder, part, chunk_selection):
if not self._verify_dataset_folder(target_base_folder, part, chunk_selection, max_workers):
LoggerRoot.get_base_logger().info('Dataset parents need refreshing, re-fetching all parent datasets')
# we should delete the entire cache folder
self._extract_parent_datasets(
@ -3214,31 +3214,42 @@ class Dataset(object):
raise ValueError("Dataset merging failed: {}".format([e for e in errors if e is not None]))
pool.close()
def _verify_dataset_folder(self, target_base_folder, part, chunk_selection):
# type: (Path, Optional[int], Optional[dict]) -> bool
def _verify_dataset_folder(self, target_base_folder, part, chunk_selection, max_workers):
# type: (Path, Optional[int], Optional[dict], Optional[int]) -> bool
def verify_file_or_link(base_folder, ds_part, ds_chunk_selection, file_entry):
# type: (Path, Optional[int], Optional[dict], FileEntry) -> Optional[bool]
# check if we need the file for the requested dataset part
if ds_part is not None:
f_parts = ds_chunk_selection.get(file_entry.parent_dataset_id, [])
# file is not in requested dataset part, no need to check it.
if self._get_chunk_idx_from_artifact_name(file_entry.artifact_name) not in f_parts:
return None
# check if the local size and the stored size match (faster than comparing hash)
if (base_folder / file_entry.relative_path).stat().st_size != file_entry.size:
return False
return True
target_base_folder = Path(target_base_folder)
# check dataset file size, if we have a full match no need for parent dataset download / merge
verified = True
# noinspection PyBroadException
try:
for f in self._dataset_file_entries.values():
# check if we need it for the current part
if part is not None:
f_parts = chunk_selection.get(f.parent_dataset_id, [])
# this is not in our current part, no need to check it.
if self._get_chunk_idx_from_artifact_name(f.artifact_name) not in f_parts:
continue
futures_ = []
with ThreadPoolExecutor(max_workers=max_workers) as tp:
for f in self._dataset_file_entries.values():
future = tp.submit(verify_file_or_link, target_base_folder, part, chunk_selection, f)
futures_.append(future)
# check if the local size and the stored size match (faster than comparing hash)
if (target_base_folder / f.relative_path).stat().st_size != f.size:
verified = False
break
for f in self._dataset_link_entries.values():
if (target_base_folder / f.relative_path).stat().st_size != f.size:
verified = False
break
for f in self._dataset_link_entries.values():
# don't check whether link is in dataset part, hence None for part and chunk_selection
future = tp.submit(verify_file_or_link, target_base_folder, None, None, f)
futures_.append(future)
verified = all(f.result() is not False for f in futures_)
except Exception:
verified = False
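With `_verify_dataset_folder` now running its size checks in a thread pool, the `max_workers` value flows down from the public copy methods. A hedged sketch (the dataset ID and worker count are placeholders):
```
from clearml import Dataset

ds = Dataset.get(dataset_id="<dataset_id>")
# verification of the local copy now also runs in a thread pool of max_workers threads
local_path = ds.get_local_copy(max_workers=8)
```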

View File

@ -1055,6 +1055,26 @@ class BaseModel(object):
if not self.published:
self._get_base_model().publish()
def archive(self):
# type: () -> ()
"""
Archive the model. If the model is already archived, this is a no-op
"""
try:
self._get_base_model().archive()
except Exception:
pass
def unarchive(self):
# type: () -> ()
"""
Unarchive the model. If the model is not archived, this is a no-op
"""
try:
self._get_base_model().unarchive()
except Exception:
pass
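A hedged usage sketch of the new methods on a model object (the model ID is a placeholder; `InputModel`/`OutputModel` inherit these from `BaseModel`):
```
from clearml import Model

model = Model(model_id="<model_id>")
model.archive()    # no-op if the model is already archived
model.unarchive()  # no-op if the model is not archived
```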
def _init_reporter(self):
if self._reporter:
return
@ -2380,11 +2400,15 @@ class OutputModel(BaseModel):
# make sure the created model is updated:
out_model_file_name = target_filename or weights_filename or register_uri
name = (
Path(out_model_file_name).stem
if out_model_file_name
else (self._task_connect_name or "Output Model")
)
# prefer self._task_connect_name if exists
if self._task_connect_name:
name = self._task_connect_name
elif out_model_file_name:
name = Path(out_model_file_name).stem
else:
name = "Output Model"
if not self._base_model:
model = self._get_force_base_model(task_model_entry=name)
else:

View File

@ -894,7 +894,7 @@ class _GoogleCloudStorageDriver(_Driver):
obj.download_to_filename(str(p))
def test_upload(self, test_path, config, **_):
bucket_url = str(furl(scheme=self.scheme, netloc=config.bucket, path=config.subdir))
bucket_url = str(furl(scheme=self.scheme, netloc=config.bucket))
bucket = self.get_container(container_name=bucket_url, config=config).bucket
test_obj = bucket

View File

@ -12,7 +12,7 @@ from pathlib2 import Path
from .cache import CacheManager
from .callbacks import ProgressReport
from .helper import StorageHelper
from .util import encode_string_to_filename, safe_extract
from .util import encode_string_to_filename, safe_extract, create_zip_directories
from ..debugging.log import LoggerRoot
from ..config import deferred_config
@ -163,7 +163,9 @@ class StorageManager(object):
temp_target_folder.mkdir(parents=True, exist_ok=True)
if suffix == ".zip":
ZipFile(cached_file.as_posix()).extractall(path=temp_target_folder.as_posix())
zip_file = ZipFile(cached_file.as_posix())
create_zip_directories(zip_file, path=temp_target_folder.as_posix())
zip_file.extractall(path=temp_target_folder.as_posix())
elif suffix == ".tar.gz":
with tarfile.open(cached_file.as_posix()) as file:
safe_extract(file, temp_target_folder.as_posix())
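The zip branch above now pre-creates the directory tree listed in the archive before calling `extractall`. From the user side this path is reached through the cached-copy helper, e.g. (the URL is a placeholder):
```
from clearml import StorageManager

# download to the local cache and extract the .zip next to the cached file
local_folder = StorageManager.get_local_copy(
    remote_url="s3://bucket/path/archive.zip",
    extract_archive=True,
)
```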

View File

@ -1,7 +1,7 @@
import fnmatch
import hashlib
import json
import os.path
import os
import re
import sys
from typing import Optional, Union, Sequence, Dict
@ -338,6 +338,37 @@ def is_within_directory(directory, target):
return prefix == abs_directory
def create_zip_directories(zipfile, path=None):
try:
path = os.getcwd() if path is None else os.fspath(path)
for member in zipfile.namelist():
arcname = member.replace("/", os.path.sep)
if os.path.altsep:
arcname = arcname.replace(os.path.altsep, os.path.sep)
# interpret absolute pathname as relative, remove drive letter or
# UNC path, redundant separators, "." and ".." components.
arcname = os.path.splitdrive(arcname)[1]
invalid_path_parts = ("", os.path.curdir, os.path.pardir)
arcname = os.path.sep.join(x for x in arcname.split(os.path.sep) if x not in invalid_path_parts)
if os.path.sep == "\\":
# noinspection PyBroadException
try:
# filter illegal characters on Windows
# noinspection PyProtectedMember
arcname = zipfile._sanitize_windows_name(arcname, os.path.sep)
except Exception:
pass
targetpath = os.path.normpath(os.path.join(path, arcname))
# Create all upper directories if necessary.
upperdirs = os.path.dirname(targetpath)
if upperdirs:
os.makedirs(upperdirs, exist_ok=True)
except Exception as e:
LoggerRoot.get_base_logger().warning("Failed creating zip directories: " + str(e))
def safe_extract(tar, path=".", members=None, numeric_owner=False):
"""Tarfile member sanitization (addresses CVE-2007-4559)"""
for member in tar.getmembers():

View File

@ -791,6 +791,7 @@ class Task(_Task):
argparse_args=None, # type: Optional[Sequence[Tuple[str, str]]]
base_task_id=None, # type: Optional[str]
add_task_init_call=True, # type: bool
force_single_script_file=False, # type: bool
):
# type: (...) -> TaskInstance
"""
@ -832,6 +833,7 @@ class Task(_Task):
:param base_task_id: Use a pre-existing task in the system, instead of a local repo/script.
Essentially clones an existing task and overrides arguments/requirements.
:param add_task_init_call: If True, a 'Task.init()' call is added to the script entry point in remote execution.
:param force_single_script_file: If True, do not auto-detect local repository
:return: The newly created Task (experiment)
:rtype: Task
@ -852,6 +854,7 @@ class Task(_Task):
docker=docker, docker_args=docker_args, docker_bash_setup_script=docker_bash_setup_script,
base_task_id=base_task_id,
add_task_init_call=add_task_init_call,
force_single_script_file=force_single_script_file,
raise_on_missing_entries=False,
)
task = manual_populate.create_task()
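A hedged sketch of the new flag on the user-facing API (project, task, and script names are placeholders):
```
from clearml import Task

task = Task.create(
    project_name="examples",
    task_name="single script task",
    script="train.py",
    # register only this script; skip auto-detecting the surrounding git repository
    force_single_script_file=True,
)
```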

View File

@ -285,11 +285,11 @@ class GPUStatCollection(object):
for nv_process in nv_comp_processes + nv_graphics_processes:
try:
process = get_process_info(nv_process)
processes.append(process)
except psutil.NoSuchProcess:
# TODO: add some reminder for NVML broken context
# e.g. nvidia-smi reset or reboot the system
pass
process = None
processes.append(process)
# we do not actually use these, so no point in collecting them
# # TODO: Do not block if full process info is not requested
@ -313,7 +313,7 @@ class GPUStatCollection(object):
# Convert bytes into MBytes
'memory.used': memory.used // MB if memory else None,
'memory.total': memory.total // MB if memory else None,
'processes': processes,
'processes': None if (processes and all(p is None for p in processes)) else processes
}
if per_process_stats:
GPUStatCollection.clean_processes()

View File

@ -284,6 +284,11 @@ def is_std_or_local_lib(name):
False if installed package
str if local library
"""
# check if one of the builtin modules first
if name in sys.builtin_module_names:
return True
exist = True
if six.PY2:
import imp # noqa
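The early return above treats interpreter built-ins as standard-library modules; `sys.builtin_module_names` lists the modules compiled directly into the interpreter, for example:
```
import sys

print("sys" in sys.builtin_module_names)   # True: compiled into CPython
print("json" in sys.builtin_module_names)  # False: shipped as a regular stdlib package
```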

View File

@ -287,7 +287,7 @@ class WrapperBase(type):
# (http://code.activestate.com/recipes/496741/). It adds special methods
# to the wrapper class so it can proxy the wrapped class. In addition, it
# adds a field __overrides__ in the wrapper class dictionary, containing
# all attributes decorated to be overriden.
# all attributes decorated to be overridden.
_special_names = [
'__abs__', '__add__', '__and__', '__call__', '__cmp__', '__coerce__',
@ -303,7 +303,7 @@ class WrapperBase(type):
'__repr__', '__reversed__', '__rfloorfiv__', '__rlshift__', '__rmod__',
'__rmul__', '__ror__', '__rpow__', '__rrshift__', '__rshift__', '__rsub__',
'__rtruediv__', '__rxor__', '__setitem__', '__setslice__', '__sub__',
'__truediv__', '__xor__', 'next', '__str__', '__repr__',
'__truediv__', '__xor__', 'next', '__str__', '__repr__',
'__round__', '__fspath__', '__bytes__', '__index__'
]

View File

@ -43,6 +43,8 @@ class ResourceMonitor(BackgroundMonitor):
self._process_info = psutil.Process() if report_mem_used_per_process else None
self._last_process_pool = {}
self._last_process_id_list = []
self._gpu_memory_per_process = True
if not self._gpustat:
self._task.get_logger().report_text('ClearML Monitor: GPU monitoring is not available')
else: # if running_remotely():
@ -309,27 +311,40 @@ class ResourceMonitor(BackgroundMonitor):
# On the rest of the samples we return the previous memory measurement
# update mem used by our process and sub processes
if self._process_info and (not self._last_process_pool.get('gpu') or
(time() - self._last_process_pool['gpu'][0]) >= self._report_frequency):
gpu_stat = self._gpustat.new_query(per_process_stats=True)
if self._gpu_memory_per_process and self._process_info and \
(not self._last_process_pool.get('gpu') or
(time() - self._last_process_pool['gpu'][0]) >= self._report_frequency):
gpu_mem = {}
# noinspection PyBroadException
try:
gpu_stat = self._gpustat.new_query(per_process_stats=True)
except Exception:
gpu_stat = self._gpustat.new_query(per_process_stats=False)
for i, g in enumerate(gpu_stat.gpus):
# if processes is None, that means we can't query GPU memory usage per process, so we can stop
if g.processes is None:
self._gpu_memory_per_process = False
break
# only monitor the active gpu's, if none were selected, monitor everything
if self._active_gpus and i not in self._active_gpus:
continue
gpu_mem[i] = 0
for p in g.processes:
if p['pid'] in self._last_process_id_list:
if p is not None and p['pid'] in self._last_process_id_list:
gpu_mem[i] += p.get('gpu_memory_usage', 0)
self._last_process_pool['gpu'] = time(), gpu_mem
else:
# if we do no need to update the memory usage, run global query
# if we have no parent process (backward compatibility), return global stats
gpu_stat = self._gpustat.new_query()
gpu_stat = self._gpustat.new_query(per_process_stats=False)
gpu_mem = self._last_process_pool['gpu'][1] if self._last_process_pool.get('gpu') else None
# generate the statistics dict for actual report
stats = {}
for i, g in enumerate(gpu_stat.gpus):
# only monitor the active gpu's, if none were selected, monitor everything
if self._active_gpus and i not in self._active_gpus:
@ -367,7 +382,7 @@ class ResourceMonitor(BackgroundMonitor):
specs.update(
gpu_count=int(len(gpus)),
gpu_type=', '.join(g.name for g in gpus),
gpu_memory=', '.join('{}GB'.format(round(g.memory_total/1024.0)) for g in gpus),
gpu_memory=', '.join('{}GB'.format(round(g.memory_total / 1024.0)) for g in gpus),
gpu_driver_version=gpu_stat.driver_version or '',
gpu_driver_cuda_version=gpu_stat.driver_cuda_version or '',
)

View File

@ -1 +1 @@
__version__ = '1.12.2'
__version__ = '1.13.1'

View File

@ -0,0 +1,28 @@
import numpy as np
import keras
from clearml import Task
def get_model():
# Create a simple model.
inputs = keras.Input(shape=(32,))
outputs = keras.layers.Dense(1)(inputs)
model = keras.Model(inputs, outputs)
model.compile(optimizer=keras.optimizers.Adam(), loss="mean_squared_error")
return model
Task.init(project_name="examples", task_name="keras_v3")
model = get_model()
test_input = np.random.random((128, 32))
test_target = np.random.random((128, 1))
model.fit(test_input, test_target)
model.save("my_model.keras")
reconstructed_model = keras.models.load_model("my_model.keras")
np.testing.assert_allclose(
model.predict(test_input), reconstructed_model.predict(test_input)
)

View File

@ -0,0 +1,36 @@
# ClearML - example code for logging configuration files to Task
#
import json
from pathlib import Path
import yaml
from clearml import Task
# Connecting ClearML with the current process,
# from here on everything is logged automatically
task = Task.init(project_name='FirstTrial', task_name='config_files_example')
# -----------------------------------------------
# Log config file
# Notice any file format is supported
# In the Web UI you could edit the configuration file directly as text
# and launch on a remote worker with the new configuration automatically applied
# -----------------------------------------------
config_file = task.connect_configuration(Path("data_samples") / "sample.json", name='json config file')
with open(config_file, "rt") as f:
config_json = json.load(f)
print(config_json)
config_file = task.connect_configuration(Path("data_samples") / "config_yaml.yaml", name='yaml config file')
with open(config_file, "rt") as f:
config_yaml = yaml.load(f, Loader=yaml.SafeLoader)
print(config_yaml)
print("done")

View File

@ -11,7 +11,7 @@ Pillow>=4.1.1
psutil>=3.4.2
pyparsing>=2.0.3
python-dateutil>=2.6.1
pyjwt>=2.4.0,<2.5.0 ; python_version > '3.5'
pyjwt>=2.4.0,<2.9.0 ; python_version > '3.5'
pyjwt>=1.6.4,<2.0.0 ; python_version <= '3.5'
PyYAML>=3.12
requests>=2.20.0