mirror of
https://github.com/clearml/clearml-agent
synced 2025-06-08 23:47:15 +00:00
Fix --stop with dynamic gpus
This commit is contained in:
parent
3c4e976093
commit
e93384b99b
@ -878,7 +878,7 @@ class Worker(ServiceCommandSection):
|
|||||||
# if we are in dynamic gpus / services mode,
|
# if we are in dynamic gpus / services mode,
|
||||||
# we should send termination signal to all child processes
|
# we should send termination signal to all child processes
|
||||||
if self._services_mode:
|
if self._services_mode:
|
||||||
terminate_all_child_processes(timeout=120)
|
terminate_all_child_processes(timeout=20, include_parent=False)
|
||||||
|
|
||||||
# if we are here, just kill all sub processes
|
# if we are here, just kill all sub processes
|
||||||
kill_all_child_processes()
|
kill_all_child_processes()
|
||||||
@ -1371,6 +1371,7 @@ class Worker(ServiceCommandSection):
|
|||||||
service_mode_internal_agent_started = None
|
service_mode_internal_agent_started = None
|
||||||
stopping = False
|
stopping = False
|
||||||
status = None
|
status = None
|
||||||
|
process = None
|
||||||
try:
|
try:
|
||||||
_last_machine_update_ts = time()
|
_last_machine_update_ts = time()
|
||||||
stop_reason = None
|
stop_reason = None
|
||||||
@ -1427,6 +1428,8 @@ class Worker(ServiceCommandSection):
|
|||||||
status = ex.returncode
|
status = ex.returncode
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
# so someone else will catch us
|
# so someone else will catch us
|
||||||
|
if process:
|
||||||
|
kill_all_child_processes(process.pid)
|
||||||
raise
|
raise
|
||||||
except Exception:
|
except Exception:
|
||||||
# we should not get here, but better safe than sorry
|
# we should not get here, but better safe than sorry
|
||||||
@ -1438,6 +1441,10 @@ class Worker(ServiceCommandSection):
|
|||||||
stop_reason = TaskStopReason.exception
|
stop_reason = TaskStopReason.exception
|
||||||
status = -1
|
status = -1
|
||||||
|
|
||||||
|
# full cleanup (just in case)
|
||||||
|
if process:
|
||||||
|
kill_all_child_processes(process.pid)
|
||||||
|
|
||||||
# if running in services mode, keep the file open
|
# if running in services mode, keep the file open
|
||||||
# in case the docker was so quick it started and finished, check the stop reason
|
# in case the docker was so quick it started and finished, check the stop reason
|
||||||
if self._services_mode and service_mode_internal_agent_started and stop_reason == 'Service started':
|
if self._services_mode and service_mode_internal_agent_started and stop_reason == 'Service started':
|
||||||
@ -3091,7 +3098,7 @@ class Worker(ServiceCommandSection):
|
|||||||
warning('Could not terminate process pid={}'.format(pid))
|
warning('Could not terminate process pid={}'.format(pid))
|
||||||
return True
|
return True
|
||||||
|
|
||||||
# wither we have a match for the worker_id or we just pick the first one, and kill it.
|
# either we have a match for the worker_id or we just pick the first one, and kill it.
|
||||||
if (worker_id and uid == worker_id) or (not worker_id and uid.startswith('{}:'.format(worker_name))):
|
if (worker_id and uid == worker_id) or (not worker_id and uid.startswith('{}:'.format(worker_name))):
|
||||||
# this is us kill it
|
# this is us kill it
|
||||||
print('Terminating clearml-agent worker_id={} pid={}'.format(uid, pid))
|
print('Terminating clearml-agent worker_id={} pid={}'.format(uid, pid))
|
||||||
|
@ -42,20 +42,31 @@ def get_bash_output(cmd, strip=False, stderr=subprocess.STDOUT, stdin=False):
|
|||||||
return output if not strip or not output else output.strip()
|
return output if not strip or not output else output.strip()
|
||||||
|
|
||||||
|
|
||||||
def terminate_process(pid, timeout=10., ignore_zombie=True):
|
def terminate_process(pid, timeout=10., ignore_zombie=True, include_children=False):
|
||||||
# noinspection PyBroadException
|
# noinspection PyBroadException
|
||||||
try:
|
try:
|
||||||
proc = psutil.Process(pid)
|
proc = psutil.Process(pid)
|
||||||
|
children = proc.children(recursive=True) if include_children else []
|
||||||
proc.terminate()
|
proc.terminate()
|
||||||
cnt = 0
|
cnt = 0
|
||||||
while proc.is_running() and (ignore_zombie or proc.status() != 'zombie') and cnt < timeout:
|
while proc.is_running() and (ignore_zombie or proc.status() != 'zombie') and cnt < timeout:
|
||||||
sleep(1.)
|
sleep(1.)
|
||||||
cnt += 1
|
cnt += 1
|
||||||
proc.terminate()
|
proc.terminate()
|
||||||
|
|
||||||
|
# terminate children
|
||||||
|
for c in children:
|
||||||
|
c.terminate()
|
||||||
|
|
||||||
cnt = 0
|
cnt = 0
|
||||||
while proc.is_running() and (ignore_zombie or proc.status() != 'zombie') and cnt < timeout:
|
while proc.is_running() and (ignore_zombie or proc.status() != 'zombie') and cnt < timeout:
|
||||||
sleep(1.)
|
sleep(1.)
|
||||||
cnt += 1
|
cnt += 1
|
||||||
|
|
||||||
|
# kill children
|
||||||
|
for c in children:
|
||||||
|
c.kill()
|
||||||
|
|
||||||
proc.kill()
|
proc.kill()
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
@ -66,9 +77,8 @@ def terminate_process(pid, timeout=10., ignore_zombie=True):
|
|||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
def kill_all_child_processes(pid=None):
|
def kill_all_child_processes(pid=None, include_parent=True):
|
||||||
# get current process if pid not provided
|
# get current process if pid not provided
|
||||||
include_parent = True
|
|
||||||
if not pid:
|
if not pid:
|
||||||
pid = os.getpid()
|
pid = os.getpid()
|
||||||
include_parent = False
|
include_parent = False
|
||||||
@ -96,7 +106,7 @@ def terminate_all_child_processes(pid=None, timeout=10., include_parent=True):
|
|||||||
return
|
return
|
||||||
for child in parent.children(recursive=False):
|
for child in parent.children(recursive=False):
|
||||||
print('Terminating child process {}'.format(child.pid))
|
print('Terminating child process {}'.format(child.pid))
|
||||||
terminate_process(child.pid, timeout=timeout, ignore_zombie=False)
|
terminate_process(child.pid, timeout=timeout, ignore_zombie=False, include_children=True)
|
||||||
if include_parent:
|
if include_parent:
|
||||||
terminate_process(parent.pid, timeout=timeout, ignore_zombie=False)
|
terminate_process(parent.pid, timeout=timeout, ignore_zombie=False)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user