mirror of
https://github.com/clearml/clearml
synced 2025-06-26 18:16:07 +00:00
Fix automation BOHB budget display calculation, Job.started() and daemon sleep
This commit is contained in:
parent
2066d9ff9d
commit
7dad7e57e4
@ -299,9 +299,15 @@ class OptimizerBOHB(SearchStrategy, RandomSeed):
|
|||||||
# Step 3: Run an optimizer
|
# Step 3: Run an optimizer
|
||||||
self._bohb = BOHB(configspace=self._convert_hyper_parameters_to_cs(),
|
self._bohb = BOHB(configspace=self._convert_hyper_parameters_to_cs(),
|
||||||
run_id=fake_run_id,
|
run_id=fake_run_id,
|
||||||
num_samples=self.total_max_jobs,
|
# num_samples=self.total_max_jobs, # will be set by self._bohb_kwargs
|
||||||
min_budget=float(self._min_iteration_per_job) / float(self._max_iteration_per_job),
|
min_budget=float(self._min_iteration_per_job) / float(self._max_iteration_per_job),
|
||||||
**self._bohb_kwargs)
|
**self._bohb_kwargs)
|
||||||
|
# scale the budget according to the successive halving iterations
|
||||||
|
if self.budget.jobs.limit:
|
||||||
|
self.budget.jobs.limit *= len(self._bohb.budgets)
|
||||||
|
if self.budget.iterations.limit:
|
||||||
|
self.budget.iterations.limit *= len(self._bohb.budgets)
|
||||||
|
# start optimization
|
||||||
self._res = self._bohb.run(n_iterations=self.total_max_jobs, min_n_workers=self._num_concurrent_workers)
|
self._res = self._bohb.run(n_iterations=self.total_max_jobs, min_n_workers=self._num_concurrent_workers)
|
||||||
|
|
||||||
# Step 4: if we get here, Shutdown
|
# Step 4: if we get here, Shutdown
|
||||||
|
@ -105,6 +105,8 @@ class TrainsJob(object):
|
|||||||
if not self.task_started and str(self.task.status) != Task.TaskStatusEnum.in_progress:
|
if not self.task_started and str(self.task.status) != Task.TaskStatusEnum.in_progress:
|
||||||
return -1
|
return -1
|
||||||
self.task_started = True
|
self.task_started = True
|
||||||
|
if not self.task.data.started:
|
||||||
|
self.task.reload()
|
||||||
if not self.task.data.started:
|
if not self.task.data.started:
|
||||||
return -1
|
return -1
|
||||||
return (datetime.now(tz=self.task.data.started.tzinfo) - self.task.data.started).total_seconds()
|
return (datetime.now(tz=self.task.data.started.tzinfo) - self.task.data.started).total_seconds()
|
||||||
@ -215,6 +217,20 @@ class TrainsJob(object):
|
|||||||
"""
|
"""
|
||||||
return self.task.status in (Task.TaskStatusEnum.queued, Task.TaskStatusEnum.created)
|
return self.task.status in (Task.TaskStatusEnum.queued, Task.TaskStatusEnum.created)
|
||||||
|
|
||||||
|
def started(self):
    # type: () -> bool
    """
    Return True if job already started, or ended (or False if created/pending)

    Once the job has been seen as started, ``self.task_started`` is set to True and
    later calls return True without looking at the task status again.

    :return bool: False if the task is currently in draft mode or pending
    """
    # "Not started yet" means the task is still created or queued. An in_progress
    # task has already started, so it must not be part of this check.
    if not self.task_started and self.task.status in (
            Task.TaskStatusEnum.queued, Task.TaskStatusEnum.created):
        return False

    # remember the transition so subsequent calls short-circuit
    self.task_started = True
    return True
|
||||||
|
|
||||||
|
|
||||||
# noinspection PyMethodMayBeStatic, PyUnusedLocal
|
# noinspection PyMethodMayBeStatic, PyUnusedLocal
|
||||||
class _JobStub(object):
|
class _JobStub(object):
|
||||||
@ -326,3 +342,7 @@ class _JobStub(object):
|
|||||||
def is_pending(self):
|
def is_pending(self):
|
||||||
# type: () -> bool
|
# type: () -> bool
|
||||||
return self.task_started is None
|
return self.task_started is None
|
||||||
|
|
||||||
|
def started(self):
    # type: () -> bool
    """
    Return True when the stub job is no longer pending.

    :return bool: the negation of ``self.is_pending()``
    """
    still_pending = self.is_pending()
    return not still_pending
|
||||||
|
@ -369,12 +369,14 @@ class SearchStrategy(object):
|
|||||||
|
|
||||||
if self.time_limit_per_job:
|
if self.time_limit_per_job:
|
||||||
elapsed = job.elapsed() / 60.
|
elapsed = job.elapsed() / 60.
|
||||||
|
if elapsed > 0:
|
||||||
self.budget.compute_time.update(job.task_id(), elapsed)
|
self.budget.compute_time.update(job.task_id(), elapsed)
|
||||||
if elapsed > self.time_limit_per_job:
|
if elapsed > self.time_limit_per_job:
|
||||||
abort_job = True
|
abort_job = True
|
||||||
|
|
||||||
if self.max_iteration_per_job:
|
if self.max_iteration_per_job:
|
||||||
iterations = self._get_job_iterations(job)
|
iterations = self._get_job_iterations(job)
|
||||||
|
if iterations > 0:
|
||||||
self.budget.iterations.update(job.task_id(), iterations)
|
self.budget.iterations.update(job.task_id(), iterations)
|
||||||
if iterations > self.max_iteration_per_job:
|
if iterations > self.max_iteration_per_job:
|
||||||
abort_job = True
|
abort_job = True
|
||||||
@ -1139,6 +1141,9 @@ class HyperParameterOptimizer(object):
|
|||||||
|
|
||||||
if timeout >= 0:
|
if timeout >= 0:
|
||||||
timeout = min(self._report_period_min * 60., timeout if timeout else self._report_period_min * 60.)
|
timeout = min(self._report_period_min * 60., timeout if timeout else self._report_period_min * 60.)
|
||||||
|
# make sure that we have the first report fired before we actually go to sleep, wait for 15 sec.
|
||||||
|
if counter <= 0:
|
||||||
|
timeout = 15
|
||||||
print('Progress report #{} completed, sleeping for {} minutes'.format(counter, timeout / 60.))
|
print('Progress report #{} completed, sleeping for {} minutes'.format(counter, timeout / 60.))
|
||||||
if self._stop_event.wait(timeout=timeout):
|
if self._stop_event.wait(timeout=timeout):
|
||||||
# wait for one last report
|
# wait for one last report
|
||||||
|
Loading…
Reference in New Issue
Block a user