Mirror of https://github.com/clearml/clearml-agent (synced 2025-01-31 09:06:52 +00:00)
Fix --services-mode with venvs
This commit is contained in:
parent 407deb84e9
commit ca242424ab
@@ -412,6 +412,7 @@ class Worker(ServiceCommandSection):
         self._daemon_foreground = None
         self._standalone_mode = None
         self._services_mode = None
+        self._dynamic_gpus = None
         self._force_current_version = None
         self._redirected_stdout_file_no = None
         self._uptime_config = self._session.config.get("agent.uptime", None)
@@ -499,6 +500,9 @@ class Worker(ServiceCommandSection):
         )
 
+        docker_image = None
+        worker_id = '{}:service:{}'.format(self.worker_id, task_id) \
+            if self._services_mode and not self._dynamic_gpus else self.worker_id
 
         if self.docker_image_func:
             try:
                 response = get_task(self._session, task_id, only_fields=["execution.docker_cmd"])
@@ -526,7 +530,7 @@ class Worker(ServiceCommandSection):
             if self._services_mode:
                 # if this is services mode, give the docker a unique worker id, as it will register itself.
                 full_docker_cmd = self.docker_image_func(
-                    worker_id='{}:service:{}'.format(self.worker_id, task_id),
+                    worker_id=worker_id,
                     docker_image=docker_image, docker_arguments=docker_arguments)
             else:
                 full_docker_cmd = self.docker_image_func(docker_image=docker_image, docker_arguments=docker_arguments)
@@ -568,7 +572,7 @@ class Worker(ServiceCommandSection):
         status = ExitStatus.failure
         try:
             # set WORKER ID
-            ENV_WORKER_ID.set(self.worker_id)
+            ENV_WORKER_ID.set(worker_id)
 
             if self._docker_force_pull and docker_image:
                 full_pull_cmd = ['docker', 'pull', docker_image]
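Note: the three hunks above compute a per-task worker id once and reuse it for both the docker command and ENV_WORKER_ID. A standalone sketch of the derivation in plain Python, with hypothetical placeholder values (not from this commit):

# Sketch of the worker-id derivation above; the agent id and task id
# are hypothetical placeholders.
base_worker_id = 'my-host:0'
task_id = 'aabbccdd'
services_mode, dynamic_gpus = True, None

worker_id = '{}:service:{}'.format(base_worker_id, task_id) \
    if services_mode and not dynamic_gpus else base_worker_id
print(worker_id)  # my-host:0:service:aabbccdd

In services mode each launched task registers itself as its own worker, so the id must be unique per task; with dynamic GPUs the agent keeps its own id instead.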
@@ -958,16 +962,12 @@ class Worker(ServiceCommandSection):
         self._session.print_configuration()
 
     def daemon(self, queues, log_level, foreground=False, docker=False, detached=False, order_fairness=False, **kwargs):
-        # if we do not need to create queues, make sure they are valid
-        # match previous behaviour when we validated queue names before everything else
-        queues = self._resolve_queue_names(queues, create_if_missing=kwargs.get('create_queue', False))
-
         self._standalone_mode = kwargs.get('standalone_mode', False)
         self._services_mode = kwargs.get('services_mode', False)
         # must have docker in services_mode
         if self._services_mode:
             kwargs = self._verify_command_states(kwargs)
             docker = docker or kwargs.get('docker')
         self._uptime_config = kwargs.get('uptime', None) or self._uptime_config
         self._downtime_config = kwargs.get('downtime', None) or self._downtime_config
         if self._uptime_config and self._downtime_config:
@@ -980,6 +980,13 @@ class Worker(ServiceCommandSection):
         # support --dynamic-gpus
         dynamic_gpus, gpu_indexes, queues = self._parse_dynamic_gpus(kwargs, queues)
 
+        if self._services_mode and dynamic_gpus:
+            raise ValueError("Combining --dynamic-gpus and --services-mode is not supported")
+
+        # if we do not need to create queues, make sure they are valid
+        # match previous behaviour when we validated queue names before everything else
+        queues = self._resolve_queue_names(queues, create_if_missing=kwargs.get('create_queue', False))
+
         # We are not running a daemon we are killing one.
         # find the pid send termination signal and leave
         if kwargs.get('stop', False):
@@ -1002,7 +1009,7 @@ class Worker(ServiceCommandSection):
 
         # make sure we only have a single instance,
         # also make sure we set worker_id properly and cache folders
-        self._singleton()
+        self._singleton(dynamic=bool(dynamic_gpus))
 
         # check if we have the latest version
         start_check_update_daemon()
@@ -1123,6 +1130,11 @@ class Worker(ServiceCommandSection):
         if not dynamic_gpus:
             return None, None, queues
 
+        queue_names = [q.name for q in queues]
+        if not all('=' in q for q in queue_names):
+            raise ValueError("using --dynamic-gpus, --queues [{}], "
+                             "queue must be in format <queue_name>=<num_gpus>".format(queue_names))
+
         gpu_indexes = kwargs.get('gpus')
 
         # test gpus were passed correctly
@@ -1137,17 +1149,16 @@ class Worker(ServiceCommandSection):
         except Exception:
             raise ValueError('Failed parsing --gpus {}'.format(kwargs.get('gpus')))
 
-        dynamic_gpus = [(s[:-1 - len(s.split('=')[-1])], int(s.split('=')[-1])) for s in dynamic_gpus]
+        dynamic_gpus = [(s[:-1 - len(s.split('=')[-1])], int(s.split('=')[-1])) for s in queue_names]
+        queue_names = [q for q, _ in dynamic_gpus]
         # resolve queue ids
-        dynamic_gpus_q = self._resolve_queue_names([q for q, _ in dynamic_gpus], create_if_missing=False)
+        dynamic_gpus_q = self._resolve_queue_names(
+            queue_names, create_if_missing=kwargs.get('create_queue', False))
         dynamic_gpus = list(zip(dynamic_gpus_q, [i for _, i in dynamic_gpus]))
-        if set(dynamic_gpus_q) != set(queues):
-            raise ValueError(
-                "--dynamic-gpus [{}] and --queues [{}] must contain the same queues".format(
-                    dynamic_gpus, queues))
         dynamic_gpus = sorted(
             dynamic_gpus, reverse=True, key=cmp_to_key(
-                lambda x, y: -1 if x[1] < y[1] or x[1] == y[1] and queues.index(x[0]) > queues.index(y[0])
+                lambda x, y: -1 if x[1] < y[1] or x[1] == y[1] and
+                dynamic_gpus_q.index(x[0]) > dynamic_gpus_q.index(y[0])
                 else +1))
         # order queue priority based on the combination we have
         queues = [q for q, _ in dynamic_gpus]
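Note: the list comprehension above strips the '=<num_gpus>' suffix from each queue spec, and the cmp_to_key sort tries larger GPU requests first while preserving queue order on ties. A standalone sketch with hypothetical queue names:

# Sketch of the queue-spec parsing and priority sort in _parse_dynamic_gpus;
# the queue names are hypothetical examples.
from functools import cmp_to_key

queue_names = ['single_gpu=1', 'dual_gpus=2']
pairs = [(s[:-1 - len(s.split('=')[-1])], int(s.split('=')[-1])) for s in queue_names]
print(pairs)  # [('single_gpu', 1), ('dual_gpus', 2)]

names = [q for q, _ in pairs]
pairs = sorted(
    pairs, reverse=True, key=cmp_to_key(
        lambda x, y: -1 if x[1] < y[1] or x[1] == y[1] and
        names.index(x[0]) > names.index(y[0])
        else +1))
print(pairs)  # [('dual_gpus', 2), ('single_gpu', 1)]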
@@ -1158,6 +1169,8 @@ class Worker(ServiceCommandSection):
                 key='available_gpus', value=','.join(str(g) for g in available_gpus)):
             raise ValueError("Dynamic GPU allocation is not supported by the ClearML-server")
 
+        self._dynamic_gpus = True
+
         return dynamic_gpus, gpu_indexes, queues
 
     def report_monitor(self, report):
@@ -1631,10 +1644,16 @@ class Worker(ServiceCommandSection):
 
         # We expect the same behaviour in case full_monitoring was set, and in case docker mode is used
         if full_monitoring or docker is not False:
-            if full_monitoring and not (ENV_WORKER_ID.get() or '').strip():
-                self._session.config["agent"]["worker_id"] = ''
+            if full_monitoring:
+                if not (ENV_WORKER_ID.get() or '').strip():
+                    self._session.config["agent"]["worker_id"] = ''
                 # make sure we support multiple instances if we need to
                 self._singleton()
+                self.temp_config_path = self.temp_config_path or safe_mkstemp(
+                    suffix=".cfg", prefix=".clearml_agent.", text=True, name_only=True
+                )
+                self.dump_config(self.temp_config_path)
+                self._session._config_file = self.temp_config_path
 
             worker_params = WorkerParams(
                 log_level=log_level,
@@ -1647,6 +1666,10 @@ class Worker(ServiceCommandSection):
 
             self.stop_monitor()
             self._unregister()
+
+            if full_monitoring and self.temp_config_path:
+                safe_remove_file(self._session.config_file)
+                Singleton.close_pid_file()
             return
 
         self._session.print_configuration()
@@ -2852,9 +2875,9 @@ class Worker(ServiceCommandSection):
                 worker_name, worker_id))
             return False
 
-    def _singleton(self):
+    def _singleton(self, dynamic=False):
         # ensure singleton
-        worker_id, worker_name = self._generate_worker_id_name()
+        worker_id, worker_name = self._generate_worker_id_name(dynamic=dynamic)
 
         # if we are running in services mode, we allow double register since
         # docker-compose will kill instances before they cleanup
@@ -2869,13 +2892,14 @@ class Worker(ServiceCommandSection):
             # update folders based on free slot
             self._session.create_cache_folders(slot_index=worker_slot)
 
-    def _generate_worker_id_name(self):
+    def _generate_worker_id_name(self, dynamic=False):
         worker_id = self._session.config["agent.worker_id"]
         worker_name = self._session.config["agent.worker_name"]
         if not worker_id and os.environ.get('NVIDIA_VISIBLE_DEVICES') is not None:
             nvidia_visible_devices = os.environ.get('NVIDIA_VISIBLE_DEVICES')
             if nvidia_visible_devices and nvidia_visible_devices.lower() != 'none':
-                worker_id = '{}:gpu{}'.format(worker_name, nvidia_visible_devices)
+                worker_id = '{}:{}gpu{}'.format(
+                    worker_name, 'd' if dynamic else '', nvidia_visible_devices)
             elif nvidia_visible_devices == '':
                 pass
             else:
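Note: the 'd' prefix distinguishes dynamically allocated GPU workers from statically pinned ones. A sketch of the new id format with hypothetical values:

# Sketch of the id format change in _generate_worker_id_name;
# worker_name and the device list are hypothetical.
worker_name = 'my-host'
nvidia_visible_devices = '0,1'
for dynamic in (False, True):
    print('{}:{}gpu{}'.format(
        worker_name, 'd' if dynamic else '', nvidia_visible_devices))
# my-host:gpu0,1
# my-host:dgpu0,1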
@@ -135,7 +135,8 @@ class FolderCache(object):
         # if we do not have enough free space, do nothing.
         if not self._check_min_free_space():
             warning('Could not add cache entry, not enough free space on drive, '
-                    'free space threshold {} GB'.format(self._min_free_space_gb))
+                    'free space threshold {} GB. Clearing all cache entries!'.format(self._min_free_space_gb))
+            self._remove_old_entries(max_cache_entries=0)
             return False
 
         # create the new entry for us
@@ -175,10 +176,11 @@ class FolderCache(object):
         """
         return self._last_copied_entry_folder
 
-    def _remove_old_entries(self):
-        # type: () -> ()
+    def _remove_old_entries(self, max_cache_entries=None):
+        # type: (Optional[int]) -> ()
         """
         Notice we only keep self._max_cache_entries-1, assuming we will be adding a new entry soon
+        :param int max_cache_entries: if not None use instead of self._max_cache_entries
         """
         folder_entries = [(cache_folder, cache_folder.stat().st_mtime)
                           for cache_folder in self._cache_folder.glob('*')
@@ -193,7 +195,8 @@ class FolderCache(object):
             warning('Could not lock cache folder {}: {}'.format(self._cache_folder, ex))
             return
 
-        number_of_entries_to_keep = self._max_cache_entries-1
+        number_of_entries_to_keep = self._max_cache_entries - 1 \
+            if max_cache_entries is None else max(0, int(max_cache_entries))
         for folder, ts in folder_entries[number_of_entries_to_keep:]:
             try:
                 shutil.rmtree(folder.as_posix(), ignore_errors=True)
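Note: passing max_cache_entries=0 makes the low-free-space path above clear every cache entry, while the default still leaves room for the entry about to be added. A standalone sketch of the keep-count expression, assuming a hypothetical configured maximum of 10:

# Sketch of the new keep-count logic in _remove_old_entries;
# the configured maximum of 10 is a hypothetical value.
max_entries_configured = 10

def entries_to_keep(max_cache_entries=None):
    return max_entries_configured - 1 \
        if max_cache_entries is None else max(0, int(max_cache_entries))

print(entries_to_keep())   # 9  (default: keep room for the new entry)
print(entries_to_keep(0))  # 0  (low-free-space path: clear everything)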
@@ -7,7 +7,7 @@ from tempfile import gettempdir, NamedTemporaryFile
 from typing import List, Tuple, Optional
 
 from clearml_agent.definitions import ENV_DOCKER_HOST_MOUNT
-from clearml_agent.helper.base import warning, is_windows_platform
+from clearml_agent.helper.base import warning, is_windows_platform, safe_remove_file
 
 
 class Singleton(object):
@@ -22,6 +22,13 @@ class Singleton(object):
     _lock_timeout = 10
     _pid = None
 
+    @classmethod
+    def close_pid_file(cls):
+        if cls._pid_file:
+            cls._pid_file.close()
+            safe_remove_file(cls._pid_file.name)
+        cls._pid_file = None
+
     @classmethod
     def update_pid_file(cls):
         new_pid = str(os.getpid())
@@ -115,7 +122,7 @@ class Singleton(object):
 
     @classmethod
     def _register_instance(cls, unique_worker_id=None, worker_name=None, api_client=None, allow_double=False):
-        if cls.worker_id:
+        if cls.worker_id and cls.instance_slot is not None:
             return cls.worker_id, cls.instance_slot
         # make sure we have a unique name
         instance_num = 0
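Note: close_pid_file lets a shutting-down daemon remove its own PID file (used by the full_monitoring cleanup in the Worker hunks above). A simplified, self-contained sketch of the same pattern; SingletonSketch is a stand-in, not the real clearml_agent Singleton:

# Simplified sketch of the close-and-remove-pid-file pattern.
import os
import tempfile

class SingletonSketch(object):
    _pid_file = None

    @classmethod
    def close_pid_file(cls):
        if cls._pid_file:
            cls._pid_file.close()
            try:
                os.remove(cls._pid_file.name)  # stands in for safe_remove_file
            except OSError:
                pass
        cls._pid_file = None

SingletonSketch._pid_file = tempfile.NamedTemporaryFile(delete=False)
SingletonSketch.close_pid_file()  # closes and deletes the pid file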
@@ -97,10 +97,10 @@ DAEMON_ARGS = dict({
         'action': 'store_true',
     },
     '--dynamic-gpus': {
-        'help': 'Allow to dynamically allocate gpus based on queue properties, pass \'<queue_name>=<num_gpus>\'.'
-                ' Example: \'dual_gpus=2 single_gpu=1\'',
-        'nargs': '*',
-        'default': None,
+        'help': 'Allow to dynamically allocate gpus based on queue properties, '
+                'configure with \'--queues <queue_name>=<num_gpus>\'.'
+                ' Example: \'--dynamic-gpus --queue dual_gpus=2 single_gpu=1\'',
+        'action': 'store_true',
     },
     '--uptime': {
         'help': 'Specify uptime for clearml-agent in "<hours> <days>" format. for example, use "17-20 TUE" to set '
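Note: --dynamic-gpus becomes a boolean switch and the per-queue GPU counts move into the queue arguments, e.g. '--dynamic-gpus --queue dual_gpus=2 single_gpu=1' instead of '--dynamic-gpus dual_gpus=2 single_gpu=1'. A minimal argparse sketch of the new shape; the '--queue' definition below is an assumed illustration, not the agent's actual interface code:

# Illustrative argparse sketch of the new flag shape.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--queue', nargs='+', default=None)  # assumed mirror of the real flag
parser.add_argument('--dynamic-gpus', action='store_true')

args = parser.parse_args(['--dynamic-gpus', '--queue', 'dual_gpus=2', 'single_gpu=1'])
print(args.dynamic_gpus)  # True
print(args.queue)         # ['dual_gpus=2', 'single_gpu=1']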