mirror of
https://github.com/clearml/clearml
synced 2025-05-14 09:30:40 +00:00
Changed: the resource monitor now prefers syncing to iterations over seconds since the beginning of the experiment
This commit is contained in:
parent
85e783cc6b
commit
d27dc352cb
@ -2,6 +2,7 @@ import base64
|
||||
import sys
|
||||
import threading
|
||||
from collections import defaultdict
|
||||
from functools import partial
|
||||
from logging import ERROR, WARNING, getLogger
|
||||
from typing import Any
|
||||
|
||||
@ -21,6 +22,23 @@ except ImportError:
|
||||
MessageToDict = None
|
||||
|
||||
|
||||
class IsTensorboardInit(object):
    """
    Class-level flag recording whether a TensorBoard writer was ever created.

    The flag lives on the class, so all callers share a single value.
    Nothing in this class lowers the flag once it has been raised.
    """

    # Raised (set to True) by set_tensorboard_used() / _patched_tb__init__()
    _tensorboard_initialized = False

    @classmethod
    def tensorboard_used(cls):
        """
        Query the flag

        :return: True if the flag was raised, False otherwise
        """
        return cls._tensorboard_initialized

    @classmethod
    def set_tensorboard_used(cls):
        """
        Raise the flag
        """
        cls._tensorboard_initialized = True

    @staticmethod
    def _patched_tb__init__(original_init, self, *args, **kwargs):
        """
        Raise the flag, then delegate to the wrapped callable.

        Meant to be bound with functools.partial(_patched_tb__init__, original_init),
        so the resulting callable can stand in for original_init.

        :param original_init: the callable being wrapped
        :param self: first positional argument, forwarded to original_init unchanged
        :return: whatever original_init returns
        """
        # go through the setter so the flag has a single writer
        IsTensorboardInit.set_tensorboard_used()
        return original_init(self, *args, **kwargs)
|
||||
|
||||
|
||||
class EventTrainsWriter(object):
|
||||
"""
|
||||
TF SummaryWriter implementation that converts the tensorboard's summary into
|
||||
@ -68,6 +86,7 @@ class EventTrainsWriter(object):
|
||||
:param max_keep_images: Maximum number of images to save before starting to reuse files (per title/metric pair)
|
||||
"""
|
||||
# We are the events_writer, so that's what we'll pass
|
||||
IsTensorboardInit.set_tensorboard_used()
|
||||
self.max_keep_images = max_keep_images
|
||||
self.report_freq = report_freq
|
||||
self.image_report_freq = image_report_freq if image_report_freq else report_freq
|
||||
@ -407,6 +426,7 @@ class EventTrainsWriter(object):
|
||||
|
||||
class ProxyEventsWriter(object):
|
||||
def __init__(self, events):
    """
    Keep the given events object and mark TensorBoard as in use

    :param events: stored unchanged on the instance as self._events
    """
    # creating this proxy counts as TensorBoard usage, so raise the shared flag
    IsTensorboardInit.set_tensorboard_used()
    self._events = events
|
||||
|
||||
def _get_sentinel_event(self):
|
||||
@ -768,6 +788,10 @@ class PatchTensorFlowEager(object):
|
||||
gen_summary_ops.write_image_summary = PatchTensorFlowEager._write_image_summary
|
||||
PatchTensorFlowEager.__original_fn_hist = gen_summary_ops.write_histogram_summary
|
||||
gen_summary_ops.write_histogram_summary = PatchTensorFlowEager._write_hist_summary
|
||||
gen_summary_ops.create_summary_file_writer = partial(IsTensorboardInit._patched_tb__init__,
|
||||
gen_summary_ops.create_summary_file_writer)
|
||||
gen_summary_ops.create_summary_db_writer = partial(IsTensorboardInit._patched_tb__init__,
|
||||
gen_summary_ops.create_summary_db_writer)
|
||||
except ImportError:
|
||||
pass
|
||||
except Exception as ex:
|
||||
|
@ -810,7 +810,7 @@ class OutputModel(BaseModel):
|
||||
framework=self.framework or framework,
|
||||
comment=comment,
|
||||
cb=delete_previous_weights_file if auto_delete_file else None,
|
||||
iteration=iteration or self._task.data.last_iteration,
|
||||
iteration=iteration or self._task.get_last_iteration(),
|
||||
)
|
||||
elif register_uri:
|
||||
register_uri = StorageHelper.conform_url(register_uri)
|
||||
|
@ -664,6 +664,27 @@ class Task(_Task):
|
||||
"""
|
||||
super(Task, self).set_model_label_enumeration(enumeration=enumeration)
|
||||
|
||||
def get_last_iteration(self):
    """
    Return the last reported iteration (i.e. the maximum iteration the task reported a metric for)
    Notice, this is not a cached call, it will ask the backend for the answer (no local caching)

    :return: integer, last reported iteration number
    """
    # reload before reading, so the value returned is the refreshed one
    self.reload()
    return self.data.last_iteration
|
||||
|
||||
def set_last_iteration(self, last_iteration):
    """
    Forcefully set the last reported iteration
    (i.e. the maximum iteration the task reported a metric for)

    :param last_iteration: last reported iteration number
    :type last_iteration: integer
    """
    # coerce once, so the local copy and the value handed to _edit are the same int
    self.data.last_iteration = int(last_iteration)
    self._edit(last_iteration=self.data.last_iteration)
|
||||
|
||||
def _connect_output_model(self, model):
|
||||
assert isinstance(model, OutputModel)
|
||||
model.connect(self)
|
||||
|
@ -4,6 +4,7 @@ from threading import Thread, Event
|
||||
import psutil
|
||||
from pathlib2 import Path
|
||||
from typing import Text
|
||||
from ..binding.frameworks.tensorflow_bind import IsTensorboardInit
|
||||
|
||||
try:
|
||||
import gpustat
|
||||
@ -15,10 +16,13 @@ class ResourceMonitor(object):
|
||||
_title_machine = ':monitor:machine'
|
||||
_title_gpu = ':monitor:gpu'
|
||||
|
||||
def __init__(self, task, measure_frequency_times_per_sec=2., report_frequency_sec=30.):
|
||||
def __init__(self, task, sample_frequency_per_sec=2., report_frequency_sec=30.,
|
||||
first_report_sec=None, wait_for_first_iteration_to_start_sec=180.):
|
||||
self._task = task
|
||||
self._measure_frequency = measure_frequency_times_per_sec
|
||||
self._sample_frequency = sample_frequency_per_sec
|
||||
self._report_frequency = report_frequency_sec
|
||||
self._first_report_sec = first_report_sec or report_frequency_sec
|
||||
self._wait_for_first_iteration = wait_for_first_iteration_to_start_sec
|
||||
self._num_readouts = 0
|
||||
self._readouts = {}
|
||||
self._previous_readouts = {}
|
||||
@ -41,11 +45,18 @@ class ResourceMonitor(object):
|
||||
def _daemon(self):
|
||||
logger = self._task.get_logger()
|
||||
seconds_since_started = 0
|
||||
reported = 0
|
||||
last_iteration = 0
|
||||
last_iteration_ts = 0
|
||||
last_iteration_interval = None
|
||||
repeated_iterations = 0
|
||||
fallback_to_sec_as_iterations = 0
|
||||
while True:
|
||||
last_report = time()
|
||||
while (time() - last_report) < self._report_frequency:
|
||||
# wait for self._measure_frequency seconds, if event set quit
|
||||
if self._exit_event.wait(1.0 / self._measure_frequency):
|
||||
current_report_frequency = self._report_frequency if reported != 0 else self._first_report_sec
|
||||
while (time() - last_report) < current_report_frequency:
|
||||
# wait for self._sample_frequency seconds, if event set quit
|
||||
if self._exit_event.wait(1.0 / self._sample_frequency):
|
||||
return
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
@ -53,15 +64,43 @@ class ResourceMonitor(object):
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
reported += 1
|
||||
average_readouts = self._get_average_readouts()
|
||||
seconds_since_started += int(round(time() - last_report))
|
||||
# check if we do not report any metric (so it means the last iteration will not be changed)
|
||||
if fallback_to_sec_as_iterations is None:
|
||||
if IsTensorboardInit.tensorboard_used():
|
||||
fallback_to_sec_as_iterations = False
|
||||
elif seconds_since_started >= self._wait_for_first_iteration:
|
||||
fallback_to_sec_as_iterations = True
|
||||
|
||||
# if we do not have last_iteration, we just use seconds as iteration
|
||||
if fallback_to_sec_as_iterations:
|
||||
iteration = seconds_since_started
|
||||
else:
|
||||
iteration = self._task.get_last_iteration()
|
||||
if iteration == last_iteration:
|
||||
repeated_iterations += 1
|
||||
if last_iteration_interval:
|
||||
# to be on the safe side, we don't want to pass the actual next iteration
|
||||
iteration += int(0.95*last_iteration_interval[0] * (seconds_since_started - last_iteration_ts)
|
||||
/ last_iteration_interval[1])
|
||||
else:
|
||||
iteration += 1
|
||||
else:
|
||||
last_iteration_interval = (iteration - last_iteration, seconds_since_started - last_iteration_ts)
|
||||
last_iteration_ts = seconds_since_started
|
||||
last_iteration = iteration
|
||||
repeated_iterations = 0
|
||||
fallback_to_sec_as_iterations = False
|
||||
|
||||
for k, v in average_readouts.items():
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
title = self._title_gpu if k.startswith('gpu_') else self._title_machine
|
||||
# 3 points after the dot
|
||||
value = round(v*1000) / 1000.
|
||||
logger.report_scalar(title=title, series=k, iteration=seconds_since_started, value=value)
|
||||
logger.report_scalar(title=title, series=k, iteration=iteration, value=value)
|
||||
except Exception:
|
||||
pass
|
||||
self._clear_readouts()
|
||||
|
Loading…
Reference in New Issue
Block a user