Add support for dynamic gpus opportunistic scheduling (with min/max gpus per queue)

2025-03-03 18:52:22 +00:00 · 2021-04-20 18:11:16 +03:00 · 2021-04-20 18:11:16 +03:00 · 5c8675e43a
commit 5c8675e43a
parent 60a58f6fad
2 changed files with 24 additions and 18 deletions
--- a/clearml_agent/commands/worker.py
+++ b/clearml_agent/commands/worker.py
@@ -767,14 +767,13 @@ class Worker(ServiceCommandSection):
                            break
                        if not len(response.queue.entries):
                            continue
-                        # check if we have enough available gpus
-                        if gpu_queues[queue] > len(available_gpus):
+                        # check if we do not have enough available gpus
+                        if gpu_queues[queue][0] > len(available_gpus):
                            # not enough available_gpus, we should sleep and start over
                            if self._daemon_foreground or worker_params.debug:
-                                print("Not enough free GPUs {}/{}, sleeping for {:.1f} seconds".format(
-                                    len(available_gpus), gpu_queues[queue], self._polling_interval))
-                            sleep(self._polling_interval)
-                            break
+                                print("Not enough free GPUs for queue={} {}/{}".format(
+                                    queue, len(available_gpus), gpu_queues[queue][0]))
+                            continue

                    # get next task in queue
                    try:
@@ -813,8 +812,10 @@ class Worker(ServiceCommandSection):
                        # the following is only executed in dynamic gpus mode
                        if gpu_queues and gpu_queues.get(queue):
                            # pick the first available GPUs
-                            gpus = available_gpus[:gpu_queues.get(queue)]
-                            available_gpus = available_gpus[gpu_queues.get(queue):]
+                            # gpu_queues[queue] = (min_gpus, max_gpus)
+                            # get as many gpus as possible with max_gpus as limit, the min is covered before
+                            gpus = available_gpus[:gpu_queues.get(queue)[1]]
+                            available_gpus = available_gpus[gpu_queues.get(queue)[1]:]
                            self.set_runtime_properties(
                                key='available_gpus', value=','.join(str(g) for g in available_gpus))
                            os.environ['CUDA_VISIBLE_DEVICES'] = \
@@ -1114,6 +1115,10 @@ class Worker(ServiceCommandSection):

        columns = ("id", "name", "tags")
        print("Listening to queues:")
+        if dynamic_gpus:
+            columns = ("id", "name", "tags", "gpus")
+            for q in queues_info:
+                q['gpus'] = str(dict(dynamic_gpus).get(q['id']) or '')
        print_table(queues_info, columns=columns, titles=columns)

        # register worker
@@ -1134,7 +1139,7 @@ class Worker(ServiceCommandSection):
            # make sure we have CUDA set if we have --gpus
            if kwargs.get('gpus') and self._session.config.get('agent.cuda_version', None) in (None, 0, '0'):
                message = 'Running with GPUs but no CUDA version was detected!\n' \
-                          '\tSet OS environemnt CUDA_VERSION & CUDNN_VERSION to the correct version\n' \
+                          '\tSet OS environment CUDA_VERSION & CUDNN_VERSION to the correct version\n' \
                          '\tExample: export CUDA_VERSION=10.1 or (Windows: set CUDA_VERSION=10.1)'
                if is_conda(self._session.config):
                    self._unregister(queues)
@@ -1246,7 +1251,14 @@ class Worker(ServiceCommandSection):
                '--dynamic_gpus must be use with '
                'specific gpus for example "0-7" or "0,1,2,3"'.format(kwargs.get('gpus')))

-        dynamic_gpus = [(s[:-1 - len(s.split('=')[-1])], int(s.split('=')[-1])) for s in queue_names]
+        dynamic_gpus = []
+        for s in queue_names:
+            s_p = s.split('=')
+            name = s[:-1 - len(s_p[-1])]
+            min_max_g = int(s_p[-1].split('-')[0] or 1), int(s_p[-1].split('-')[-1])
+            if min(min_max_g) <= 0:
+                raise ValueError("Parsing min/max number of gpus <= 0 is not allowed: \"{}\"".format(s))
+            dynamic_gpus.append((name, min_max_g,))
        queue_names = [q for q, _ in dynamic_gpus]
        # resolve queue ids
        dynamic_gpus_q = self._resolve_queue_names(
@@ -1257,13 +1269,6 @@ class Worker(ServiceCommandSection):

        self._dynamic_gpus = True

-        # order queue priority based on the combination we have
-        dynamic_gpus = sorted(
-            dynamic_gpus, reverse=True, key=cmp_to_key(
-                lambda x, y: -1 if x[1] < y[1] or x[1] == y[1] and
-                                   dynamic_gpus_q.index(x[0]) > dynamic_gpus_q.index(y[0])
-                else +1))
-
        return dynamic_gpus, gpu_indexes, queues

    def _register_dynamic_gpus(self, gpu_indexes):
--- a/clearml_agent/interface/worker.py
+++ b/clearml_agent/interface/worker.py
@@ -99,7 +99,8 @@ DAEMON_ARGS = dict({
    '--dynamic-gpus': {
        'help': 'Allow to dynamically allocate gpus based on queue properties, '
                'configure with \'--queues <queue_name>=<num_gpus>\'.'
-                ' Example: \'--dynamic-gpus --gpus 0-3 --queue dual_gpus=2 single_gpu=1\'',
+                ' Example: \'--dynamic-gpus --gpus 0-3 --queue dual_gpus=2 single_gpu=1\''
+                ' Example Opportunistic: \'--dynamic-gpus --gpus 0-3 --queue dual_gpus=2 max_quad_gpus=1-4 \'',
        'action': 'store_true',
    },
    '--uptime': {