From ce5fd31070c48c855aad20982dd1301fe89e7175 Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Sun, 10 Jan 2021 13:03:40 +0200 Subject: [PATCH] Fix dataset upload aborted on server watchdog --- .../backend_interface/task/development/worker.py | 4 ++-- clearml/datasets/dataset.py | 13 +++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/clearml/backend_interface/task/development/worker.py b/clearml/backend_interface/task/development/worker.py index 2189af86..a8947278 100644 --- a/clearml/backend_interface/task/development/worker.py +++ b/clearml/backend_interface/task/development/worker.py @@ -30,10 +30,10 @@ class DevWorker(object): return False return True - def register(self, task): + def register(self, task, stop_signal_support=None): if self._thread: return True - if TaskStopSignal.enabled: + if (stop_signal_support is None and TaskStopSignal.enabled) or stop_signal_support is True: self._dev_stop_signal = TaskStopSignal(task=task) self._support_ping = hasattr(tasks, 'PingRequest') # if there is nothing to monitor, leave diff --git a/clearml/datasets/dataset.py b/clearml/datasets/dataset.py index d240809d..6ce505cb 100644 --- a/clearml/datasets/dataset.py +++ b/clearml/datasets/dataset.py @@ -15,6 +15,7 @@ from pathlib2 import Path from .. import Task, StorageManager from ..backend_api.session.client import APIClient +from ..backend_interface.task.development.worker import DevWorker from ..backend_interface.util import mutually_exclusive, exact_match_regex from ..debugging.log import LoggerRoot from ..storage.helper import StorageHelper @@ -83,6 +84,13 @@ class Dataset(object): # noinspection PyProtectedMember task._edit(script=task.data.script) + # if the task is running make sure we ping to the server so it will not be aborted by a watchdog + if task.status in ('created', 'in_progress'): + self._task_pinger = DevWorker() + self._task_pinger.register(task, stop_signal_support=False) + else: + self._task_pinger = None + # store current dataset Task self._task = task # store current dataset id @@ -389,6 +397,11 @@ class Dataset(object): self._task.comment = 'Dependencies: {}\n'.format(hashed_nodes) self._task.close() self._task.completed() + + if self._task_pinger: + self._task_pinger.unregister() + self._task_pinger = None + return True def is_final(self):