mirror of
https://github.com/clearml/clearml-agent
synced 2025-01-31 09:06:52 +00:00
Fix --stop with dynamic gpus
This commit is contained in:
parent
3c4e976093
commit
e93384b99b
@ -878,7 +878,7 @@ class Worker(ServiceCommandSection):
|
||||
# if we are in dynamic gpus / services mode,
|
||||
# we should send termination signal to all child processes
|
||||
if self._services_mode:
|
||||
terminate_all_child_processes(timeout=120)
|
||||
terminate_all_child_processes(timeout=20, include_parent=False)
|
||||
|
||||
# if we are here, just kill all sub processes
|
||||
kill_all_child_processes()
|
||||
@ -1371,6 +1371,7 @@ class Worker(ServiceCommandSection):
|
||||
service_mode_internal_agent_started = None
|
||||
stopping = False
|
||||
status = None
|
||||
process = None
|
||||
try:
|
||||
_last_machine_update_ts = time()
|
||||
stop_reason = None
|
||||
@ -1427,6 +1428,8 @@ class Worker(ServiceCommandSection):
|
||||
status = ex.returncode
|
||||
except KeyboardInterrupt:
|
||||
# so someone else will catch us
|
||||
if process:
|
||||
kill_all_child_processes(process.pid)
|
||||
raise
|
||||
except Exception:
|
||||
# we should not get here, but better safe than sorry
|
||||
@ -1438,6 +1441,10 @@ class Worker(ServiceCommandSection):
|
||||
stop_reason = TaskStopReason.exception
|
||||
status = -1
|
||||
|
||||
# full cleanup (just in case)
|
||||
if process:
|
||||
kill_all_child_processes(process.pid)
|
||||
|
||||
# if running in services mode, keep the file open
|
||||
# in case the docker was so quick it started and finished, check the stop reason
|
||||
if self._services_mode and service_mode_internal_agent_started and stop_reason == 'Service started':
|
||||
@ -3091,7 +3098,7 @@ class Worker(ServiceCommandSection):
|
||||
warning('Could not terminate process pid={}'.format(pid))
|
||||
return True
|
||||
|
||||
# wither we have a match for the worker_id or we just pick the first one, and kill it.
|
||||
# either we have a match for the worker_id or we just pick the first one, and kill it.
|
||||
if (worker_id and uid == worker_id) or (not worker_id and uid.startswith('{}:'.format(worker_name))):
|
||||
# this is us kill it
|
||||
print('Terminating clearml-agent worker_id={} pid={}'.format(uid, pid))
|
||||
|
@ -42,20 +42,31 @@ def get_bash_output(cmd, strip=False, stderr=subprocess.STDOUT, stdin=False):
|
||||
return output if not strip or not output else output.strip()
|
||||
|
||||
|
||||
def terminate_process(pid, timeout=10., ignore_zombie=True):
|
||||
def terminate_process(pid, timeout=10., ignore_zombie=True, include_children=False):
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
proc = psutil.Process(pid)
|
||||
children = proc.children(recursive=True) if include_children else []
|
||||
proc.terminate()
|
||||
cnt = 0
|
||||
while proc.is_running() and (ignore_zombie or proc.status() != 'zombie') and cnt < timeout:
|
||||
sleep(1.)
|
||||
cnt += 1
|
||||
proc.terminate()
|
||||
|
||||
# terminate children
|
||||
for c in children:
|
||||
c.terminate()
|
||||
|
||||
cnt = 0
|
||||
while proc.is_running() and (ignore_zombie or proc.status() != 'zombie') and cnt < timeout:
|
||||
sleep(1.)
|
||||
cnt += 1
|
||||
|
||||
# kill children
|
||||
for c in children:
|
||||
c.kill()
|
||||
|
||||
proc.kill()
|
||||
except Exception:
|
||||
pass
|
||||
@ -66,9 +77,8 @@ def terminate_process(pid, timeout=10., ignore_zombie=True):
|
||||
return True
|
||||
|
||||
|
||||
def kill_all_child_processes(pid=None):
|
||||
def kill_all_child_processes(pid=None, include_parent=True):
|
||||
# get current process if pid not provided
|
||||
include_parent = True
|
||||
if not pid:
|
||||
pid = os.getpid()
|
||||
include_parent = False
|
||||
@ -96,7 +106,7 @@ def terminate_all_child_processes(pid=None, timeout=10., include_parent=True):
|
||||
return
|
||||
for child in parent.children(recursive=False):
|
||||
print('Terminating child process {}'.format(child.pid))
|
||||
terminate_process(child.pid, timeout=timeout, ignore_zombie=False)
|
||||
terminate_process(child.pid, timeout=timeout, ignore_zombie=False, include_children=True)
|
||||
if include_parent:
|
||||
terminate_process(parent.pid, timeout=timeout, ignore_zombie=False)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user