diff --git a/clearml_agent/commands/worker.py b/clearml_agent/commands/worker.py index 36cef19..a93d900 100644 --- a/clearml_agent/commands/worker.py +++ b/clearml_agent/commands/worker.py @@ -878,7 +878,7 @@ class Worker(ServiceCommandSection): # if we are in dynamic gpus / services mode, # we should send termination signal to all child processes if self._services_mode: - terminate_all_child_processes(timeout=120) + terminate_all_child_processes(timeout=20, include_parent=False) # if we are here, just kill all sub processes kill_all_child_processes() @@ -1371,6 +1371,7 @@ class Worker(ServiceCommandSection): service_mode_internal_agent_started = None stopping = False status = None + process = None try: _last_machine_update_ts = time() stop_reason = None @@ -1427,6 +1428,8 @@ class Worker(ServiceCommandSection): status = ex.returncode except KeyboardInterrupt: # so someone else will catch us + if process: + kill_all_child_processes(process.pid) raise except Exception: # we should not get here, but better safe than sorry @@ -1438,6 +1441,10 @@ class Worker(ServiceCommandSection): stop_reason = TaskStopReason.exception status = -1 + # full cleanup (just in case) + if process: + kill_all_child_processes(process.pid) + # if running in services mode, keep the file open # in case the docker was so quick it started and finished, check the stop reason if self._services_mode and service_mode_internal_agent_started and stop_reason == 'Service started': @@ -3091,7 +3098,7 @@ class Worker(ServiceCommandSection): warning('Could not terminate process pid={}'.format(pid)) return True - # wither we have a match for the worker_id or we just pick the first one, and kill it. + # either we have a match for the worker_id or we just pick the first one, and kill it. if (worker_id and uid == worker_id) or (not worker_id and uid.startswith('{}:'.format(worker_name))): # this is us kill it print('Terminating clearml-agent worker_id={} pid={}'.format(uid, pid)) diff --git a/clearml_agent/helper/process.py b/clearml_agent/helper/process.py index 427d9c9..c92c3cf 100644 --- a/clearml_agent/helper/process.py +++ b/clearml_agent/helper/process.py @@ -42,20 +42,31 @@ def get_bash_output(cmd, strip=False, stderr=subprocess.STDOUT, stdin=False): return output if not strip or not output else output.strip() -def terminate_process(pid, timeout=10., ignore_zombie=True): +def terminate_process(pid, timeout=10., ignore_zombie=True, include_children=False): # noinspection PyBroadException try: proc = psutil.Process(pid) + children = proc.children(recursive=True) if include_children else [] proc.terminate() cnt = 0 while proc.is_running() and (ignore_zombie or proc.status() != 'zombie') and cnt < timeout: sleep(1.) cnt += 1 proc.terminate() + + # terminate children + for c in children: + c.terminate() + cnt = 0 while proc.is_running() and (ignore_zombie or proc.status() != 'zombie') and cnt < timeout: sleep(1.) cnt += 1 + + # kill children + for c in children: + c.kill() + proc.kill() except Exception: pass @@ -66,9 +77,8 @@ def terminate_process(pid, timeout=10., ignore_zombie=True): return True -def kill_all_child_processes(pid=None): +def kill_all_child_processes(pid=None, include_parent=True): # get current process if pid not provided - include_parent = True if not pid: pid = os.getpid() include_parent = False @@ -96,7 +106,7 @@ def terminate_all_child_processes(pid=None, timeout=10., include_parent=True): return for child in parent.children(recursive=False): print('Terminating child process {}'.format(child.pid)) - terminate_process(child.pid, timeout=timeout, ignore_zombie=False) + terminate_process(child.pid, timeout=timeout, ignore_zombie=False, include_children=True) if include_parent: terminate_process(parent.pid, timeout=timeout, ignore_zombie=False)