mirror of https://github.com/clearml/clearml
synced 2025-06-26 18:16:07 +00:00

Merge branch 'master' of https://github.com/allegroai/clearml

This commit is contained in: commit fe5601a8ca
.gitignore (vendored): 5 changed lines
@@ -7,8 +7,9 @@
# Python
*.pyc
__pycache__
build/
dist/
/build/
/dist/
*/conda_build/build/
*.egg-info
.env
.venv/
@@ -182,7 +182,7 @@ If ClearML is part of your development process / project / publication, please cite us:
```
@misc{clearml,
title = {ClearML - Your entire MLOps stack in one open-source tool},
year = {2019},
year = {2023},
note = {Software available from http://github.com/allegroai/clearml},
url={https://clear.ml/},
author = {ClearML},
@@ -99,6 +99,8 @@ class PipelineController(object):
monitor_artifacts = attrib(type=list, default=None) # List of artifact names to monitor
monitor_models = attrib(type=list, default=None) # List of models to monitor
explicit_docker_image = attrib(type=str, default=None) # The Docker image the node uses, specified at creation
recursively_parse_parameters = attrib(type=bool, default=False) # if True, recursively parse parameters in
# lists, dicts, or tuples

def __attrs_post_init__(self):
if self.parents is None:
@@ -384,7 +386,8 @@ class PipelineController(object):
cache_executed_step=False, # type: bool
base_task_factory=None, # type: Optional[Callable[[PipelineController.Node], Task]]
retry_on_failure=None, # type: Optional[Union[int, Callable[[PipelineController, PipelineController.Node, int], bool]]] # noqa
status_change_callback=None # type: Optional[Callable[[PipelineController, PipelineController.Node, str], None]] # noqa
status_change_callback=None, # type: Optional[Callable[[PipelineController, PipelineController.Node, str], None]] # noqa
recursively_parse_parameters=False # type: bool
):
# type: (...) -> bool
"""
@@ -405,7 +408,10 @@ class PipelineController(object):
- Parameter access ``parameter_override={'Args/input_file': '${<step_name>.parameters.Args/input_file}' }``
- Pipeline Task argument (see `Pipeline.add_parameter`) ``parameter_override={'Args/input_file': '${pipeline.<pipeline_parameter>}' }``
- Task ID ``parameter_override={'Args/input_file': '${stage3.id}' }``

:param recursively_parse_parameters: If True, recursively parse parameters from parameter_override in lists, dicts, or tuples.
Example:
- ``parameter_override={'Args/input_file': ['${<step_name>.artifacts.<artifact_name>.url}', 'file2.txt']}`` will be correctly parsed.
- ``parameter_override={'Args/input_file': ('${<step_name_1>.parameters.Args/input_file}', '${<step_name_2>.parameters.Args/input_file}')}`` will be correctly parsed.
:param configuration_overrides: Optional, override Task configuration objects.
Expected dictionary of configuration object name and configuration object content.
Examples:
@@ -572,6 +578,7 @@ class PipelineController(object):
name=name, base_task_id=base_task_id, parents=parents or [],
queue=execution_queue, timeout=time_limit,
parameters=parameter_override or {},
recursively_parse_parameters=recursively_parse_parameters,
configurations=configuration_overrides,
clone_task=clone_base_task,
task_overrides=task_overrides,
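Usage note: a minimal sketch of the new flag from the caller's side, mirroring the docstring above (project, task, step, and artifact names are placeholders):
```
from clearml import PipelineController

pipe = PipelineController(name="pipeline demo", project="examples", version="1.0.0")

pipe.add_step(
    name="stage_data",
    base_task_project="examples",
    base_task_name="step 1 dataset artifact",
)

pipe.add_step(
    name="stage_process",
    parents=["stage_data"],
    base_task_project="examples",
    base_task_name="step 2 process dataset",
    # the reference nested inside the list is only resolved when
    # recursively_parse_parameters=True; otherwise it is passed through as text
    parameter_override={
        "Args/input_file": ["${stage_data.artifacts.dataset.url}", "file2.txt"],
    },
    recursively_parse_parameters=True,
)

pipe.start_locally(run_pipeline_steps_locally=True)
```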
@@ -2237,7 +2244,7 @@ class PipelineController(object):

updated_hyper_parameters = {}
for k, v in node.parameters.items():
updated_hyper_parameters[k] = self._parse_step_ref(v)
updated_hyper_parameters[k] = self._parse_step_ref(v, recursive=node.recursively_parse_parameters)

task_overrides = self._parse_task_overrides(node.task_overrides) if node.task_overrides else None
@@ -2776,11 +2783,12 @@ class PipelineController(object):
except Exception:
pass

def _parse_step_ref(self, value):
def _parse_step_ref(self, value, recursive=False):
# type: (Any) -> Optional[str]
"""
Return the step reference. For example "${step1.parameters.Args/param}"
:param value: string
:param recursive: if True, recursively parse all values in the dict, list or tuple
:return:
"""
# look for all the step references
@@ -2793,6 +2801,18 @@ class PipelineController(object):
if not isinstance(new_val, six.string_types):
return new_val
updated_value = updated_value.replace(g, new_val, 1)

# if we have a dict, list or tuple, we need to recursively update the values
if recursive:
if isinstance(value, dict):
updated_value = {}
for k, v in value.items():
updated_value[k] = self._parse_step_ref(v, recursive=True)
elif isinstance(value, list):
updated_value = [self._parse_step_ref(v, recursive=True) for v in value]
elif isinstance(value, tuple):
updated_value = tuple(self._parse_step_ref(v, recursive=True) for v in value)

return updated_value

def _parse_task_overrides(self, task_overrides):
@@ -3201,8 +3221,10 @@ class PipelineController(object):
name=artifact_name,
artifact_object=artifact_object,
wait_on_upload=True,
extension_name=".pkl" if isinstance(artifact_object, dict) and
not self._artifact_serialization_function else None,
extension_name=(
".pkl" if isinstance(artifact_object, dict) and not self._artifact_serialization_function
else None
),
serialization_function=self._artifact_serialization_function
)
@@ -3075,6 +3075,8 @@ class GetScalarMetricDataRequest(Request):
:param model_events: If set, retrieve model events. Otherwise task events
:type model_events: bool
:param scroll_id: Pass this value on next call to get next page
:type scroll_id: str
"""

_service = "events"
@@ -3095,16 +3097,21 @@ class GetScalarMetricDataRequest(Request):
"type": ["boolean", "null"],
},
"task": {"description": "task ID", "type": ["string", "null"]},
"scroll_id": {
"description": "Pass this value on next call to get next page",
"type": "string",
},
},
"type": "object",
}

def __init__(self, task=None, metric=None, no_scroll=False, model_events=False, **kwargs):
def __init__(self, task=None, metric=None, no_scroll=False, model_events=False, scroll_id=None, **kwargs):
super(GetScalarMetricDataRequest, self).__init__(**kwargs)
self.task = task
self.metric = metric
self.no_scroll = no_scroll
self.model_events = model_events
self.scroll_id = scroll_id

@schema_property("task")
def task(self):
@@ -3158,6 +3165,19 @@ class GetScalarMetricDataRequest(Request):
self.assert_isinstance(value, "model_events", (bool,))
self._property_model_events = value

@schema_property("scroll_id")
def scroll_id(self):
return self._property_scroll_id

@scroll_id.setter
def scroll_id(self, value):
if value is None:
self._property_scroll_id = None
return

self.assert_isinstance(value, "scroll_id", six.string_types)
self._property_scroll_id = value


class GetScalarMetricDataResponse(Response):
"""
@@ -25,6 +25,8 @@ class InterfaceBase(SessionInterface):
_default_session = None
_num_retry_warning_display = 1
_offline_mode = ENV_OFFLINE_MODE.get()
_JSON_EXCEPTION = (jsonschema.ValidationError, requests.exceptions.InvalidJSONError) \
if hasattr(requests.exceptions, "InvalidJSONError") else (jsonschema.ValidationError,)

@property
def session(self):
@@ -83,7 +85,7 @@ class InterfaceBase(SessionInterface):
if raise_on_errors:
raise
res = None
except jsonschema.ValidationError as e:
except cls._JSON_EXCEPTION as e:
if log:
log.error(
'Field %s contains illegal schema: %s', '.'.join(e.path), str(e.message)
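The class attribute exists because older requests releases do not define InvalidJSONError, so the exception tuple is built defensively and then used in the except clause. A standalone sketch of the same pattern (the validation call is illustrative):
```
import jsonschema
import requests

# InvalidJSONError only exists in newer requests releases, so build the tuple defensively
JSON_ERRORS = (
    (jsonschema.ValidationError, requests.exceptions.InvalidJSONError)
    if hasattr(requests.exceptions, "InvalidJSONError")
    else (jsonschema.ValidationError,)
)

def is_valid(instance, schema):
    try:
        jsonschema.validate(instance=instance, schema=schema)
        return True
    except JSON_ERRORS:
        return False

print(is_valid({"id": 1}, {"type": "object", "properties": {"id": {"type": "integer"}}}))
```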
@@ -12,6 +12,7 @@ from ..storage import StorageManager
from ..storage.helper import StorageHelper
from ..utilities.async_manager import AsyncManagerMixin


ModelPackage = namedtuple("ModelPackage", "weights design")


@@ -77,6 +78,28 @@ class Model(IdObjectBase, AsyncManagerMixin, _StorageUriMixin):
self.send(models.SetReadyRequest(model=self.id, publish_task=False))
self.reload()

def archive(self):
if Session.check_min_api_server_version("2.13"):
self.send(models.ArchiveManyRequest(ids=[self.id]))
self.reload()
else:
from ..model import BaseModel
# edit will reload
self._edit(
system_tags=list(set((self.data.system_tags or []) if hasattr(self.data, "system_tags") else []) | {BaseModel._archived_tag})
)

def unarchive(self):
if Session.check_min_api_server_version("2.13"):
self.send(models.UnarchiveManyRequest(ids=[self.id]))
self.reload()
else:
from ..model import BaseModel
# edit will reload
self._edit(
system_tags=list(set((self.data.system_tags or []) if hasattr(self.data, "system_tags") else []) - {BaseModel._archived_tag})
)

def _reload(self):
"""Reload the model object"""
if self._offline_mode:
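On servers older than API 2.13 the fallback just toggles a system tag through set arithmetic. A standalone illustration of that update; the tag value is an assumption standing in for BaseModel._archived_tag:
```
ARCHIVED_TAG = "archived"  # assumed value of BaseModel._archived_tag

def archive_tags(system_tags):
    # add the archive marker exactly once
    return list(set(system_tags or []) | {ARCHIVED_TAG})

def unarchive_tags(system_tags):
    # drop the archive marker if present
    return list(set(system_tags or []) - {ARCHIVED_TAG})

print(archive_tags(["shared"]))                 # contains 'archived' and 'shared' (set order not guaranteed)
print(unarchive_tags(["shared", "archived"]))   # ['shared']
```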
@@ -46,6 +46,7 @@ class CreateAndPopulate(object):
output_uri=None, # type: Optional[str]
base_task_id=None, # type: Optional[str]
add_task_init_call=True, # type: bool
force_single_script_file=False, # type: bool
raise_on_missing_entries=False, # type: bool
verbose=False, # type: bool
):
@@ -84,6 +85,7 @@ class CreateAndPopulate(object):
:param base_task_id: Use a pre-existing task in the system, instead of a local repo/script.
Essentially clones an existing task and overrides arguments/requirements.
:param add_task_init_call: If True, a 'Task.init()' call is added to the script entry point in remote execution.
:param force_single_script_file: If True, do not auto-detect local repository
:param raise_on_missing_entries: If True, raise ValueError on missing entries when populating
:param verbose: If True, print verbose logging
"""
@@ -125,6 +127,7 @@ class CreateAndPopulate(object):
self.task_type = task_type
self.output_uri = output_uri
self.task = None
self.force_single_script_file = bool(force_single_script_file)
self.raise_on_missing_entries = raise_on_missing_entries
self.verbose = verbose
@@ -159,6 +162,7 @@ class CreateAndPopulate(object):
detect_jupyter_notebook=False,
add_missing_installed_packages=True,
detailed_req_report=False,
force_single_script=self.force_single_script_file,
)

# check if we have no repository and no requirements raise error
@@ -237,6 +241,23 @@ class CreateAndPopulate(object):
task_state['script']['diff'] = ''
task_state['script']['working_dir'] = cwd or '.'
task_state['script']['entry_point'] = entry_point or ""

if self.force_single_script_file and Path(self.script).is_file():
create_requirements = self.packages is True
repo_info, requirements = ScriptInfo.get(
filepaths=[Path(self.script).as_posix()],
log=getLogger(),
create_requirements=create_requirements,
uncommitted_from_remote=True,
detect_jupyter_notebook=False,
add_missing_installed_packages=True,
detailed_req_report=False,
force_single_script=self.force_single_script_file,
)
task_state['script']['diff'] = repo_info.script['diff'] or ''
task_state['script']['entry_point'] = repo_info.script['entry_point']
if create_requirements:
task_state['script']['requirements'] = repo_info.script.get('requirements') or {}
else:
# standalone task
task_state['script']['entry_point'] = self.script or ""
@@ -246,7 +246,6 @@ class PatchOsFork(object):

os._exit = _at_exit_callback


@staticmethod
def _patched_fork(*args, **kwargs):
if not PatchOsFork._current_task:
@@ -71,7 +71,7 @@ class PatchLIGHTgbmModelIO(PatchBaseModelIO):
return ret

@staticmethod
def _load(original_fn, model_file, *args, **kwargs):
def _load(original_fn, model_file=None, *args, **kwargs):
if not PatchLIGHTgbmModelIO._current_task:
return original_fn(model_file, *args, **kwargs)
@@ -1589,6 +1589,11 @@ class PatchKerasModelIO(object):
from keras import models as keras_saving # noqa
except ImportError:
keras_saving = None
try:
from keras.src.saving import saving_api as keras_saving_v3
except ImportError:
keras_saving_v3 = None

# check that we are not patching anything twice
if PatchKerasModelIO.__patched_tensorflow:
PatchKerasModelIO.__patched_keras = [
@@ -1598,9 +1603,10 @@ class PatchKerasModelIO(object):
Functional if PatchKerasModelIO.__patched_tensorflow[3] != Functional else None,
None,
None,
keras_saving_v3
]
else:
PatchKerasModelIO.__patched_keras = [Network, Sequential, keras_saving, Functional, None, None]
PatchKerasModelIO.__patched_keras = [Network, Sequential, keras_saving, Functional, None, None, keras_saving_v3]
PatchKerasModelIO._patch_io_calls(*PatchKerasModelIO.__patched_keras)

if 'tensorflow' in sys.modules and not PatchKerasModelIO.__patched_tensorflow:
@@ -1643,6 +1649,8 @@ class PatchKerasModelIO(object):
except ImportError:
keras_hdf5 = None

keras_saving_v3 = None

if PatchKerasModelIO.__patched_keras:
PatchKerasModelIO.__patched_tensorflow = [
Network if PatchKerasModelIO.__patched_keras[0] != Network else None,
@@ -1651,14 +1659,23 @@ class PatchKerasModelIO(object):
Functional if PatchKerasModelIO.__patched_keras[3] != Functional else None,
keras_saving_legacy if PatchKerasModelIO.__patched_keras[4] != keras_saving_legacy else None,
keras_hdf5 if PatchKerasModelIO.__patched_keras[5] != keras_hdf5 else None,
keras_saving_v3 if PatchKerasModelIO.__patched_keras[6] != keras_saving_v3 else None,
]
else:
PatchKerasModelIO.__patched_tensorflow = [
Network, Sequential, keras_saving, Functional, keras_saving_legacy, keras_hdf5]
Network, Sequential, keras_saving, Functional, keras_saving_legacy, keras_hdf5, keras_saving_v3]
PatchKerasModelIO._patch_io_calls(*PatchKerasModelIO.__patched_tensorflow)
@staticmethod
def _patch_io_calls(Network, Sequential, keras_saving, Functional, keras_saving_legacy=None, keras_hdf5=None):
def _patch_io_calls(
Network,
Sequential,
keras_saving,
Functional,
keras_saving_legacy=None,
keras_hdf5=None,
keras_saving_v3=None
):
try:
if Sequential is not None:
Sequential._updated_config = _patched_call(Sequential._updated_config,
@@ -1718,6 +1735,9 @@ class PatchKerasModelIO(object):
keras_hdf5.save_model_to_hdf5 = _patched_call(
keras_hdf5.save_model_to_hdf5, PatchKerasModelIO._save_model)

if keras_saving_v3 is not None:
keras_saving_v3.save_model = _patched_call(keras_saving_v3.save_model, PatchKerasModelIO._save_model)

except Exception as ex:
LoggerRoot.get_base_logger(TensorflowBinding).warning(str(ex))
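All of these bindings rely on a wrap-and-delegate helper that replaces a framework function with a hook receiving the original callable. A simplified standalone sketch of that idea, not the library's actual _patched_call implementation:
```
import functools

def patched_call(original_fn, patched_fn):
    """Return a wrapper that routes calls through patched_fn with original_fn prepended."""
    @functools.wraps(original_fn)
    def _inner(*args, **kwargs):
        return patched_fn(original_fn, *args, **kwargs)
    return _inner

# example hook: log the target path before delegating to the original save function
def _save_model_hook(original_fn, model, filepath, *args, **kwargs):
    print("about to save model to", filepath)
    return original_fn(model, filepath, *args, **kwargs)

class FakeSavingModule:
    @staticmethod
    def save_model(model, filepath):
        print("saving", model, "->", filepath)

FakeSavingModule.save_model = patched_call(FakeSavingModule.save_model, _save_model_hook)
FakeSavingModule.save_model("my-model", "/tmp/model.keras")
```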
@@ -2058,6 +2078,11 @@ class PatchTensorflowModelIO(object):
Checkpoint.write = _patched_call(Checkpoint.write, PatchTensorflowModelIO._ckpt_write)
except Exception:
pass
# noinspection PyBroadException
try:
Checkpoint._write = _patched_call(Checkpoint._write, PatchTensorflowModelIO._ckpt_write)
except Exception:
pass
except ImportError:
pass
except Exception:
@@ -2227,21 +2252,24 @@ class PatchTensorflow2ModelIO(object):
return

PatchTensorflow2ModelIO.__patched = True

# noinspection PyBroadException
try:
# hack: make sure tensorflow.__init__ is called
import tensorflow # noqa
from tensorflow.python.training.tracking import util # noqa

# noinspection PyBroadException
try:
util.TrackableSaver.save = _patched_call(util.TrackableSaver.save,
PatchTensorflow2ModelIO._save)
util.TrackableSaver.save = _patched_call(util.TrackableSaver.save, PatchTensorflow2ModelIO._save)
except Exception:
pass

# noinspection PyBroadException
try:
util.TrackableSaver.restore = _patched_call(util.TrackableSaver.restore,
PatchTensorflow2ModelIO._restore)
util.TrackableSaver.restore = _patched_call(
util.TrackableSaver.restore, PatchTensorflow2ModelIO._restore
)
except Exception:
pass
except ImportError:
@@ -2249,6 +2277,32 @@ class PatchTensorflow2ModelIO(object):
except Exception:
LoggerRoot.get_base_logger(TensorflowBinding).debug('Failed patching tensorflow v2')

# noinspection PyBroadException
try:
# hack: make sure tensorflow.__init__ is called
import tensorflow # noqa
from tensorflow.python.checkpoint import checkpoint

# noinspection PyBroadException
try:
checkpoint.TrackableSaver.save = _patched_call(
checkpoint.TrackableSaver.save, PatchTensorflow2ModelIO._save
)
except Exception:
pass

# noinspection PyBroadException
try:
checkpoint.TrackableSaver.restore = _patched_call(
checkpoint.TrackableSaver.restore, PatchTensorflow2ModelIO._restore
)
except Exception:
pass
except ImportError:
pass
except Exception:
LoggerRoot.get_base_logger(TensorflowBinding).debug('Failed patching tensorflow v2.11')

@staticmethod
def _save(original_fn, self, file_prefix, *args, **kwargs):
model = original_fn(self, file_prefix, *args, **kwargs)
@@ -89,6 +89,7 @@ class PatchHydra(object):
if overrides and not isinstance(overrides, (list, tuple)):
overrides = [overrides]
overrides += ['{}={}'.format(k, v) for k, v in stored_config.items()]
overrides = [("+" + o) if (o.startswith("+") and not o.startswith("++")) else o for o in overrides]
else:
# We take care of it inside the _patched_run_job
pass
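The added comprehension promotes single-"+" overrides to "++" (Hydra's append-or-override form) so keys already present in the stored config do not clash when re-applied. A quick illustration of the transformation with illustrative values:
```
overrides = ["lr=0.1", "+optimizer=adam", "++batch_size=64"]

# promote "+key=value" to "++key=value"; leave plain and "++" entries untouched
overrides = [("+" + o) if (o.startswith("+") and not o.startswith("++")) else o for o in overrides]

print(overrides)  # ['lr=0.1', '++optimizer=adam', '++batch_size=64']
```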
@@ -2497,7 +2497,7 @@ class Dataset(object):

# check if target folder is not empty, see if it contains everything we need
if target_base_folder and next(target_base_folder.iterdir(), None):
if self._verify_dataset_folder(target_base_folder, part, chunk_selection):
if self._verify_dataset_folder(target_base_folder, part, chunk_selection, max_workers):
target_base_folder.touch()
self._release_lock_ds_target_folder(target_base_folder)
return target_base_folder.as_posix()
@@ -2538,7 +2538,7 @@ class Dataset(object):
raise_on_error=False, force=False)

# verify entire dataset (if failed, force downloading parent datasets)
if not self._verify_dataset_folder(target_base_folder, part, chunk_selection):
if not self._verify_dataset_folder(target_base_folder, part, chunk_selection, max_workers):
LoggerRoot.get_base_logger().info('Dataset parents need refreshing, re-fetching all parent datasets')
# we should delete the entire cache folder
self._extract_parent_datasets(
@@ -3214,31 +3214,42 @@ class Dataset(object):
raise ValueError("Dataset merging failed: {}".format([e for e in errors if e is not None]))
pool.close()

def _verify_dataset_folder(self, target_base_folder, part, chunk_selection):
# type: (Path, Optional[int], Optional[dict]) -> bool
def _verify_dataset_folder(self, target_base_folder, part, chunk_selection, max_workers):
# type: (Path, Optional[int], Optional[dict], Optional[int]) -> bool

def verify_file_or_link(base_folder, ds_part, ds_chunk_selection, file_entry):
# type: (Path, Optional[int], Optional[dict], FileEntry) -> Optional[bool]

# check if we need the file for the requested dataset part
if ds_part is not None:
f_parts = ds_chunk_selection.get(file_entry.parent_dataset_id, [])
# file is not in requested dataset part, no need to check it.
if self._get_chunk_idx_from_artifact_name(file_entry.artifact_name) not in f_parts:
return None

# check if the local size and the stored size match (faster than comparing hash)
if (base_folder / file_entry.relative_path).stat().st_size != file_entry.size:
return False

return True

target_base_folder = Path(target_base_folder)
# check dataset file size, if we have a full match no need for parent dataset download / merge
verified = True
# noinspection PyBroadException
try:
for f in self._dataset_file_entries.values():
# check if we need it for the current part
if part is not None:
f_parts = chunk_selection.get(f.parent_dataset_id, [])
# this is not in our current part, no need to check it.
if self._get_chunk_idx_from_artifact_name(f.artifact_name) not in f_parts:
continue
futures_ = []
with ThreadPoolExecutor(max_workers=max_workers) as tp:
for f in self._dataset_file_entries.values():
future = tp.submit(verify_file_or_link, target_base_folder, part, chunk_selection, f)
futures_.append(future)

# check if the local size and the stored size match (faster than comparing hash)
if (target_base_folder / f.relative_path).stat().st_size != f.size:
verified = False
break

for f in self._dataset_link_entries.values():
if (target_base_folder / f.relative_path).stat().st_size != f.size:
verified = False
break
for f in self._dataset_link_entries.values():
# don't check whether link is in dataset part, hence None for part and chunk_selection
future = tp.submit(verify_file_or_link, target_base_folder, None, None, f)
futures_.append(future)

verified = all(f.result() is not False for f in futures_)
except Exception:
verified = False
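The rewrite fans the per-file size checks out over a thread pool instead of looping serially. A self-contained sketch of the same pattern over plain paths (file names, sizes, and the worker count are illustrative):
```
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

def verify_sizes(expected, base_folder, max_workers=4):
    # expected: dict of relative path -> expected size in bytes
    def check(rel_path, size):
        p = Path(base_folder) / rel_path
        try:
            # size comparison is much cheaper than re-hashing the content
            return p.stat().st_size == size
        except OSError:
            return False

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(check, rel, size) for rel, size in expected.items()]
    # the context manager waits for completion; every check must pass
    return all(f.result() for f in futures)

print(verify_sizes({"data/train.csv": 1024}, "/tmp/dataset_copy"))
```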
@@ -1055,6 +1055,26 @@ class BaseModel(object):
if not self.published:
self._get_base_model().publish()

def archive(self):
# type: () -> ()
"""
Archive the model. If the model is already archived, this is a no-op
"""
try:
self._get_base_model().archive()
except Exception:
pass

def unarchive(self):
# type: () -> ()
"""
Unarchive the model. If the model is not archived, this is a no-op
"""
try:
self._get_base_model().unarchive()
except Exception:
pass

def _init_reporter(self):
if self._reporter:
return
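A hedged usage sketch of the new public methods, assuming they are inherited from BaseModel by the model classes exposed in clearml; the model ID is a placeholder:
```
from clearml import InputModel

# look up an existing registered model by ID and move it in and out of the archive
model = InputModel(model_id="<your_model_id>")
model.archive()
# ... later
model.unarchive()
```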
@@ -2380,11 +2400,15 @@ class OutputModel(BaseModel):

# make sure the created model is updated:
out_model_file_name = target_filename or weights_filename or register_uri
name = (
Path(out_model_file_name).stem
if out_model_file_name
else (self._task_connect_name or "Output Model")
)

# prefer self._task_connect_name if exists
if self._task_connect_name:
name = self._task_connect_name
elif out_model_file_name:
name = Path(out_model_file_name).stem
else:
name = "Output Model"

if not self._base_model:
model = self._get_force_base_model(task_model_entry=name)
else:
@@ -894,7 +894,7 @@ class _GoogleCloudStorageDriver(_Driver):
obj.download_to_filename(str(p))

def test_upload(self, test_path, config, **_):
bucket_url = str(furl(scheme=self.scheme, netloc=config.bucket, path=config.subdir))
bucket_url = str(furl(scheme=self.scheme, netloc=config.bucket))
bucket = self.get_container(container_name=bucket_url, config=config).bucket

test_obj = bucket
@@ -12,7 +12,7 @@ from pathlib2 import Path
from .cache import CacheManager
from .callbacks import ProgressReport
from .helper import StorageHelper
from .util import encode_string_to_filename, safe_extract
from .util import encode_string_to_filename, safe_extract, create_zip_directories
from ..debugging.log import LoggerRoot
from ..config import deferred_config

@@ -163,7 +163,9 @@ class StorageManager(object):
temp_target_folder.mkdir(parents=True, exist_ok=True)

if suffix == ".zip":
ZipFile(cached_file.as_posix()).extractall(path=temp_target_folder.as_posix())
zip_file = ZipFile(cached_file.as_posix())
create_zip_directories(zip_file, path=temp_target_folder.as_posix())
zip_file.extractall(path=temp_target_folder.as_posix())
elif suffix == ".tar.gz":
with tarfile.open(cached_file.as_posix()) as file:
safe_extract(file, temp_target_folder.as_posix())
@@ -1,7 +1,7 @@
import fnmatch
import hashlib
import json
import os.path
import os
import re
import sys
from typing import Optional, Union, Sequence, Dict
@@ -338,6 +338,37 @@ def is_within_directory(directory, target):
return prefix == abs_directory


def create_zip_directories(zipfile, path=None):
try:
path = os.getcwd() if path is None else os.fspath(path)
for member in zipfile.namelist():
arcname = member.replace("/", os.path.sep)
if os.path.altsep:
arcname = arcname.replace(os.path.altsep, os.path.sep)
# interpret absolute pathname as relative, remove drive letter or
# UNC path, redundant separators, "." and ".." components.
arcname = os.path.splitdrive(arcname)[1]
invalid_path_parts = ("", os.path.curdir, os.path.pardir)
arcname = os.path.sep.join(x for x in arcname.split(os.path.sep) if x not in invalid_path_parts)
if os.path.sep == "\\":
# noinspection PyBroadException
try:
# filter illegal characters on Windows
# noinspection PyProtectedMember
arcname = zipfile._sanitize_windows_name(arcname, os.path.sep)
except Exception:
pass

targetpath = os.path.normpath(os.path.join(path, arcname))

# Create all upper directories if necessary.
upperdirs = os.path.dirname(targetpath)
if upperdirs:
os.makedirs(upperdirs, exist_ok=True)
except Exception as e:
LoggerRoot.get_base_logger().warning("Failed creating zip directories: " + str(e))
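In the StorageManager hunk above the helper runs right before extractall, so every directory referenced by the archive exists even when the zip lists files only. A minimal usage sketch; the import path is assumed from the "from .util import ..." line above and the paths are placeholders:
```
from zipfile import ZipFile

from clearml.storage.util import create_zip_directories  # module path assumed

target = "/tmp/extracted_dataset"
with ZipFile("/tmp/archive.zip") as zip_file:
    # pre-create all directories referenced by the archive members, then extract
    create_zip_directories(zip_file, path=target)
    zip_file.extractall(path=target)
```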
def safe_extract(tar, path=".", members=None, numeric_owner=False):
"""Tarfile member sanitization (addresses CVE-2007-4559)"""
for member in tar.getmembers():
@@ -791,6 +791,7 @@ class Task(_Task):
argparse_args=None, # type: Optional[Sequence[Tuple[str, str]]]
base_task_id=None, # type: Optional[str]
add_task_init_call=True, # type: bool
force_single_script_file=False, # type: bool
):
# type: (...) -> TaskInstance
"""
@@ -832,6 +833,7 @@ class Task(_Task):
:param base_task_id: Use a pre-existing task in the system, instead of a local repo/script.
Essentially clones an existing task and overrides arguments/requirements.
:param add_task_init_call: If True, a 'Task.init()' call is added to the script entry point in remote execution.
:param force_single_script_file: If True, do not auto-detect local repository

:return: The newly created Task (experiment)
:rtype: Task
@@ -852,6 +854,7 @@ class Task(_Task):
docker=docker, docker_args=docker_args, docker_bash_setup_script=docker_bash_setup_script,
base_task_id=base_task_id,
add_task_init_call=add_task_init_call,
force_single_script_file=force_single_script_file,
raise_on_missing_entries=False,
)
task = manual_populate.create_task()
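A hedged usage sketch of the flag as exposed through Task.create; project name, task name, and the script path are placeholders:
```
from clearml import Task

# create a task from a single local script, skipping git repository auto-detection
task = Task.create(
    project_name="examples",
    task_name="standalone script",
    script="standalone.py",
    force_single_script_file=True,
)
print(task.id)
```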
@@ -285,11 +285,11 @@ class GPUStatCollection(object):
for nv_process in nv_comp_processes + nv_graphics_processes:
try:
process = get_process_info(nv_process)
processes.append(process)
except psutil.NoSuchProcess:
# TODO: add some reminder for NVML broken context
# e.g. nvidia-smi reset or reboot the system
pass
process = None
processes.append(process)

# we do not actually use these, so no point in collecting them
# # TODO: Do not block if full process info is not requested
@@ -313,7 +313,7 @@ class GPUStatCollection(object):
# Convert bytes into MBytes
'memory.used': memory.used // MB if memory else None,
'memory.total': memory.total // MB if memory else None,
'processes': processes,
'processes': None if (processes and all(p is None for p in processes)) else processes
}
if per_process_stats:
GPUStatCollection.clean_processes()
@@ -284,6 +284,11 @@ def is_std_or_local_lib(name):
False if installed package
str if local library
"""

# check if one of the builtin modules first
if name in sys.builtin_module_names:
return True

exist = True
if six.PY2:
import imp # noqa
@@ -287,7 +287,7 @@ class WrapperBase(type):
# (http://code.activestate.com/recipes/496741/). It adds special methods
# to the wrapper class so it can proxy the wrapped class. In addition, it
# adds a field __overrides__ in the wrapper class dictionary, containing
# all attributes decorated to be overriden.
# all attributes decorated to be overridden.

_special_names = [
'__abs__', '__add__', '__and__', '__call__', '__cmp__', '__coerce__',
@@ -303,7 +303,7 @@ class WrapperBase(type):
'__repr__', '__reversed__', '__rfloorfiv__', '__rlshift__', '__rmod__',
'__rmul__', '__ror__', '__rpow__', '__rrshift__', '__rshift__', '__rsub__',
'__rtruediv__', '__rxor__', '__setitem__', '__setslice__', '__sub__',
'__truediv__', '__xor__', 'next', '__str__', '__repr__',
'__truediv__', '__xor__', 'next', '__str__', '__repr__',
'__round__', '__fspath__', '__bytes__', '__index__'
]
@@ -43,6 +43,8 @@ class ResourceMonitor(BackgroundMonitor):
self._process_info = psutil.Process() if report_mem_used_per_process else None
self._last_process_pool = {}
self._last_process_id_list = []
self._gpu_memory_per_process = True

if not self._gpustat:
self._task.get_logger().report_text('ClearML Monitor: GPU monitoring is not available')
else: # if running_remotely():
@@ -309,27 +311,40 @@ class ResourceMonitor(BackgroundMonitor):
# On the rest of the samples we return the previous memory measurement

# update mem used by our process and sub processes
if self._process_info and (not self._last_process_pool.get('gpu') or
(time() - self._last_process_pool['gpu'][0]) >= self._report_frequency):
gpu_stat = self._gpustat.new_query(per_process_stats=True)
if self._gpu_memory_per_process and self._process_info and \
(not self._last_process_pool.get('gpu') or
(time() - self._last_process_pool['gpu'][0]) >= self._report_frequency):
gpu_mem = {}
# noinspection PyBroadException
try:
gpu_stat = self._gpustat.new_query(per_process_stats=True)
except Exception:
gpu_stat = self._gpustat.new_query(per_process_stats=False)

for i, g in enumerate(gpu_stat.gpus):
# if processes is None, that means we can't query GPU memory usage per process, so we can stop
if g.processes is None:
self._gpu_memory_per_process = False
break
# only monitor the active gpu's, if none were selected, monitor everything
if self._active_gpus and i not in self._active_gpus:
continue

gpu_mem[i] = 0
for p in g.processes:
if p['pid'] in self._last_process_id_list:
if p is not None and p['pid'] in self._last_process_id_list:
gpu_mem[i] += p.get('gpu_memory_usage', 0)

self._last_process_pool['gpu'] = time(), gpu_mem
else:
# if we do not need to update the memory usage, run global query
# if we have no parent process (backward compatibility), return global stats
gpu_stat = self._gpustat.new_query()
gpu_stat = self._gpustat.new_query(per_process_stats=False)
gpu_mem = self._last_process_pool['gpu'][1] if self._last_process_pool.get('gpu') else None

# generate the statistics dict for actual report
stats = {}

for i, g in enumerate(gpu_stat.gpus):
# only monitor the active gpu's, if none were selected, monitor everything
if self._active_gpus and i not in self._active_gpus:
@@ -367,7 +382,7 @@ class ResourceMonitor(BackgroundMonitor):
specs.update(
gpu_count=int(len(gpus)),
gpu_type=', '.join(g.name for g in gpus),
gpu_memory=', '.join('{}GB'.format(round(g.memory_total/1024.0)) for g in gpus),
gpu_memory=', '.join('{}GB'.format(round(g.memory_total / 1024.0)) for g in gpus),
gpu_driver_version=gpu_stat.driver_version or '',
gpu_driver_cuda_version=gpu_stat.driver_cuda_version or '',
)
@@ -1 +1 @@
__version__ = '1.12.2'
__version__ = '1.13.1'
examples/frameworks/keras/keras_v3.py (new file, 28 lines)
@@ -0,0 +1,28 @@
import numpy as np
import keras
from clearml import Task


def get_model():
# Create a simple model.
inputs = keras.Input(shape=(32,))
outputs = keras.layers.Dense(1)(inputs)
model = keras.Model(inputs, outputs)
model.compile(optimizer=keras.optimizers.Adam(), loss="mean_squared_error")
return model

Task.init(project_name="examples", task_name="keras_v3")

model = get_model()

test_input = np.random.random((128, 32))
test_target = np.random.random((128, 1))
model.fit(test_input, test_target)

model.save("my_model.keras")

reconstructed_model = keras.models.load_model("my_model.keras")

np.testing.assert_allclose(
model.predict(test_input), reconstructed_model.predict(test_input)
)
examples/reporting/config_files.py (new file, 36 lines)
@@ -0,0 +1,36 @@
# ClearML - example code for logging configuration files to a Task
#
import json
from pathlib import Path
import yaml

from clearml import Task


# Connecting ClearML with the current process,
# from here on everything is logged automatically
task = Task.init(project_name='FirstTrial', task_name='config_files_example')


# -----------------------------------------------
# Log config file
# Notice any file format is supported
# In the Web UI you could edit the configuration file directly as text
# and launch on a remote worker with the new configuration automatically applied
# -----------------------------------------------

config_file = task.connect_configuration(Path("data_samples") / "sample.json", name='json config file')

with open(config_file, "rt") as f:
config_json = json.load(f)

print(config_json)

config_file = task.connect_configuration(Path("data_samples") / "config_yaml.yaml", name='yaml config file')

with open(config_file, "rt") as f:
config_yaml = yaml.load(f, Loader=yaml.SafeLoader)

print(config_yaml)

print("done")
@@ -11,7 +11,7 @@ Pillow>=4.1.1
psutil>=3.4.2
pyparsing>=2.0.3
python-dateutil>=2.6.1
pyjwt>=2.4.0,<2.5.0 ; python_version > '3.5'
pyjwt>=2.4.0,<2.9.0 ; python_version > '3.5'
pyjwt>=1.6.4,<2.0.0 ; python_version <= '3.5'
PyYAML>=3.12
requests>=2.20.0