Fix --stop with dynamic gpus

This commit is contained in:
allegroai 2021-05-20 10:58:46 +03:00
parent 3c4e976093
commit e93384b99b
2 changed files with 23 additions and 6 deletions

View File

@@ -878,7 +878,7 @@ class Worker(ServiceCommandSection):
# if we are in dynamic gpus / services mode,
# we should send termination signal to all child processes
if self._services_mode:
terminate_all_child_processes(timeout=120)
terminate_all_child_processes(timeout=20, include_parent=False)
# if we are here, just kill all sub processes
kill_all_child_processes()
@@ -1371,6 +1371,7 @@ class Worker(ServiceCommandSection):
service_mode_internal_agent_started = None
stopping = False
status = None
process = None
try:
_last_machine_update_ts = time()
stop_reason = None
@@ -1427,6 +1428,8 @@ class Worker(ServiceCommandSection):
status = ex.returncode
except KeyboardInterrupt:
# so someone else will catch us
if process:
kill_all_child_processes(process.pid)
raise
except Exception:
# we should not get here, but better safe than sorry
@@ -1438,6 +1441,10 @@ class Worker(ServiceCommandSection):
stop_reason = TaskStopReason.exception
status = -1
# full cleanup (just in case)
if process:
kill_all_child_processes(process.pid)
# if running in services mode, keep the file open
# in case the docker was so quick it started and finished, check the stop reason
if self._services_mode and service_mode_internal_agent_started and stop_reason == 'Service started':
@@ -3091,7 +3098,7 @@ class Worker(ServiceCommandSection):
warning('Could not terminate process pid={}'.format(pid))
return True
# wither we have a match for the worker_id or we just pick the first one, and kill it.
# either we have a match for the worker_id or we just pick the first one, and kill it.
if (worker_id and uid == worker_id) or (not worker_id and uid.startswith('{}:'.format(worker_name))):
# this is us kill it
print('Terminating clearml-agent worker_id={} pid={}'.format(uid, pid))

View File

@@ -42,20 +42,31 @@ def get_bash_output(cmd, strip=False, stderr=subprocess.STDOUT, stdin=False):
return output if not strip or not output else output.strip()
def terminate_process(pid, timeout=10., ignore_zombie=True):
def terminate_process(pid, timeout=10., ignore_zombie=True, include_children=False):
# noinspection PyBroadException
try:
proc = psutil.Process(pid)
children = proc.children(recursive=True) if include_children else []
proc.terminate()
cnt = 0
while proc.is_running() and (ignore_zombie or proc.status() != 'zombie') and cnt < timeout:
sleep(1.)
cnt += 1
proc.terminate()
# terminate children
for c in children:
c.terminate()
cnt = 0
while proc.is_running() and (ignore_zombie or proc.status() != 'zombie') and cnt < timeout:
sleep(1.)
cnt += 1
# kill children
for c in children:
c.kill()
proc.kill()
except Exception:
pass
@@ -66,9 +77,8 @@ def terminate_process(pid, timeout=10., ignore_zombie=True):
return True
def kill_all_child_processes(pid=None):
def kill_all_child_processes(pid=None, include_parent=True):
# get current process if pid not provided
include_parent = True
if not pid:
pid = os.getpid()
include_parent = False
@@ -96,7 +106,7 @@ def terminate_all_child_processes(pid=None, timeout=10., include_parent=True):
return
for child in parent.children(recursive=False):
print('Terminating child process {}'.format(child.pid))
terminate_process(child.pid, timeout=timeout, ignore_zombie=False)
terminate_process(child.pid, timeout=timeout, ignore_zombie=False, include_children=True)
if include_parent:
terminate_process(parent.pid, timeout=timeout, ignore_zombie=False)