Add machine specs in Task.runtime

This commit is contained in:
allegroai 2021-05-19 15:25:23 +03:00
parent 016a5c2d71
commit ed75eac6ab
2 changed files with 47 additions and 1 deletions

View File

@ -1817,7 +1817,7 @@ class Task(IdObjectBase, AccessMixin, SetupUploadMixin):
status = self._data.status if self._data and self._reload_skip_flag else self.data.status status = self._data.status if self._data and self._reload_skip_flag else self.data.status
if status not in (tasks.TaskStatusEnum.created, tasks.TaskStatusEnum.in_progress): if status not in (tasks.TaskStatusEnum.created, tasks.TaskStatusEnum.in_progress):
# the exception being name/comment that we can always change. # the exception being name/comment that we can always change.
if kwargs and all(k in ('name', 'comment', 'tags', 'system_tags') for k in kwargs.keys()): if kwargs and all(k in ('name', 'comment', 'tags', 'system_tags', 'runtime') for k in kwargs.keys()):
pass pass
else: else:
raise ValueError('Task object can only be updated if created or in_progress') raise ValueError('Task object can only be updated if created or in_progress')

View File

@ -1,5 +1,7 @@
import logging import logging
import os import os
import platform
import sys
import warnings import warnings
from time import time from time import time
@ -8,6 +10,7 @@ from pathlib2 import Path
from typing import Text from typing import Text
from .process.mp import BackgroundMonitor from .process.mp import BackgroundMonitor
from ..backend_api import Session
from ..binding.frameworks.tensorflow_bind import IsTensorboardInit from ..binding.frameworks.tensorflow_bind import IsTensorboardInit
try: try:
@ -68,6 +71,20 @@ class ResourceMonitor(BackgroundMonitor):
except Exception: except Exception:
pass pass
# add Task runtime_properties with the machine spec
if Session.check_min_api_version('2.13'):
try:
machine_spec = self._get_machine_specs()
if machine_spec:
self._task.reload()
runtime_properties = self._task.data.runtime or {}
runtime_properties.update(machine_spec)
# noinspection PyProtectedMember
self._task._edit(runtime=runtime_properties)
except Exception as ex:
logging.getLogger('clearml.resource_monitor').debug(
'Failed logging machine specification: {}'.format(ex))
# last_iteration_interval = None # last_iteration_interval = None
# last_iteration_ts = 0 # last_iteration_ts = 0
# repeated_iterations = 0 # repeated_iterations = 0
@ -315,3 +332,32 @@ class ResourceMonitor(BackgroundMonitor):
stats["gpu_%d_mem_used_gb" % i] = float(gpu_mem[i] if gpu_mem else g["memory.used"]) / 1024 stats["gpu_%d_mem_used_gb" % i] = float(gpu_mem[i] if gpu_mem else g["memory.used"]) / 1024
return stats return stats
def _get_machine_specs(self):
    # type: () -> dict
    """
    Collect a description of the machine running this process.

    The returned dictionary always carries the same set of keys when host
    collection succeeds: 'platform', 'python_version', 'python_exec', 'OS',
    'processor', 'cores', 'memory_gb', 'hostname', 'gpu_count', 'gpu_type'
    and 'gpu_memory'. The GPU entries keep their defaults (0 / '' / '') when
    no GPU backend is available or when querying it fails.

    :return: dict with the machine specification, or an empty dict if the
        host information itself could not be collected.
    """
    # noinspection PyBroadException
    try:
        specs = {
            'platform': str(sys.platform),
            'python_version': str(platform.python_version()),
            'python_exec': str(sys.executable),
            'OS': str(platform.platform(aliased=True)),
            'processor': str(platform.machine()),
            'cores': int(psutil.cpu_count()),
            # float divisor: keeps the fractional part even where `/` is integer division
            'memory_gb': round(psutil.virtual_memory().total / 1024.0 ** 3, 1),
            'hostname': str(platform.node()),
            'gpu_count': 0,
            'gpu_type': '',
            'gpu_memory': '',
        }
    except Exception:
        return {}

    # GPU details are optional: a failure here must not discard the host specs above
    # noinspection PyBroadException
    try:
        if self._gpustat:
            gpu_stat = self._gpustat.new_query(shutdown=True)
            if gpu_stat.gpus:
                # report every GPU when no active-GPU filter is set, otherwise only the listed indices
                gpus = [
                    g for i, g in enumerate(gpu_stat.gpus)
                    if not self._active_gpus or i in self._active_gpus
                ]
                # build all GPU fields first and apply them together, so an error
                # half-way through never leaves a partially filled GPU section
                # NOTE(review): memory_total is presumably reported in MiB - confirm against gpustat
                gpu_specs = {
                    'gpu_count': int(len(gpus)),
                    'gpu_type': ', '.join(g.name for g in gpus),
                    'gpu_memory': ', '.join('{}GB'.format(round(g.memory_total/1024.0)) for g in gpus),
                }
                specs.update(gpu_specs)
    except Exception:
        pass

    return specs