From d87521c36c6f8deb86db1662acf304c9dce74c33 Mon Sep 17 00:00:00 2001 From: clearml <> Date: Mon, 24 Feb 2025 13:44:26 +0200 Subject: [PATCH] Add support for container rulebook overrides ('force_container_rules: true') and container rulebook task update ('update_back_task: true'). This addition allows users to override container arguments forcefully based on the tasks properties (repo, tags, project, user etc.), as well as offer additional defaults based on python required packages or python versions --- .../backend_api/config/default/agent.conf | 37 +++++- clearml_agent/commands/resolver.py | 113 +++++++++--------- clearml_agent/commands/worker.py | 52 +++++++- docs/clearml.conf | 29 ++++- 4 files changed, 160 insertions(+), 71 deletions(-) diff --git a/clearml_agent/backend_api/config/default/agent.conf b/clearml_agent/backend_api/config/default/agent.conf index 5b4b6a6..fc573af 100644 --- a/clearml_agent/backend_api/config/default/agent.conf +++ b/clearml_agent/backend_api/config/default/agent.conf @@ -236,20 +236,35 @@ docker_force_pull: false default_docker: { - # default docker image to use when running in docker mode - image: "nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04" + # default container image to use when running in docker mode + image: "nvidia/cuda:12.6.3-cudnn-runtime-ubuntu20.04" # optional arguments to pass to docker image # arguments: ["--ipc=host", ] - # Choose the default docker based on the Task properties, - # Notice: Enterprise feature, ignored otherwise - # Examples: 'script.requirements', 'script.binary', 'script.repository', 'script.branch', 'project' - # Notice: Matching is done via regular expression, for example "^searchme$" will match exactly "searchme" string + # if true update the Task container section based on the selected rule/default + # can also be set/override per specific rule + update_back_task: true + + # **Notice**: Enterprise feature, ignored otherwise + # Choose the default container based on the Task properties, + # container values that can be changed: "image", "arguments" , "setup_shell_script" + # Notice: adding "force_container_rules: true" to a rule, + # will cause it to IGNORE the Task's requested container and use the configuration of the rule, + # including all its entries (image, arguments, setup_shell_script)! + # Rule selector keys: 'script.requirements', 'script.binary', + # 'script.repository', 'script.branch', + # 'project', 'project_id', 'user_id', 'container', 'tags' + # Notice: Matching is done via regular expression and needs to match ALL entries inside the rule, + # matching re example: "^searchme$" will match exactly "searchme" string inside an entry + # specifically for tags single tag match is enough: + # "^my_tag$" will match a Task that has + # multiple tags one of them matches ["general_tag", "my_tag$"] "match_rules": [ { "image": "python:3.6-bullseye", "arguments": "--ipc=host", + "update_back_task": true, "match": { "script": { "binary": "python3.6$", @@ -259,6 +274,7 @@ { "image": "python:3.7-bullseye", "arguments": "--ipc=host", + "update_back_task": true, "match": { "script": { "binary": "python3.7$", @@ -310,6 +326,15 @@ }, } }, + { + "image": "python:3.13-bullseye", + "arguments": "--ipc=host", + "match": { + "script": { + "binary": "python3.13$", + }, + } + }, ] } diff --git a/clearml_agent/commands/resolver.py b/clearml_agent/commands/resolver.py index aa7679d..e6b0f41 100644 --- a/clearml_agent/commands/resolver.py +++ b/clearml_agent/commands/resolver.py @@ -34,7 +34,7 @@ def resolve_default_container(session, task_id, container_config, ignore_match_r json={'id': [task_id], 'only_fields': ['script.requirements', 'script.binary', 'script.repository', 'script.branch', - 'project', 'container'], + 'project', 'container', 'tags', 'user'], 'search_hidden': True}, method=Request.def_method, async_enable=False, @@ -72,62 +72,50 @@ def resolve_default_container(session, task_id, container_config, ignore_match_r except (ValueError, TypeError): pass + match_term_lookup = { + "project": project_full_name, + "project_id": task_info.get('project', ''), + "script.repository": repository, + "script.branch": branch, + "script.binary": binary, + "user_id": task_info.get('user', ""), + "container": requested_container.get('image', ''), + "tags": task_info.get('tags', []), + } + task_packages_lookup = {} for entry in container_lookup: match = entry.get('match', None) if not match: continue - if match.get('project', None): - # noinspection PyBroadException - try: - if not re.search(match.get('project', None), project_full_name): - continue - except Exception: - print('Failed parsing regular expression \"{}\" in rule: {}'.format( - match.get('project', None), entry)) - continue - - if match.get('script.repository', None): - # noinspection PyBroadException - try: - if not re.search(match.get('script.repository', None), repository): - continue - except Exception: - print('Failed parsing regular expression \"{}\" in rule: {}'.format( - match.get('script.repository', None), entry)) - continue - - if match.get('script.branch', None): - # noinspection PyBroadException - try: - if not re.search(match.get('script.branch', None), branch): - continue - except Exception: - print('Failed parsing regular expression \"{}\" in rule: {}'.format( - match.get('script.branch', None), entry)) - continue - - if match.get('script.binary', None): - # noinspection PyBroadException - try: - if not re.search(match.get('script.binary', None), binary): - continue - except Exception: - print('Failed parsing regular expression \"{}\" in rule: {}'.format( - match.get('script.binary', None), entry)) - continue - - # if match.get('image', None): - # # noinspection PyBroadException - # try: - # if not re.search(match.get('image', None), requested_container.get('image', '')): - # continue - # except Exception: - # print('Failed parsing regular expression \"{}\" in rule: {}'.format( - # match.get('image', None), entry)) - # continue matched = True + for key, value in match_term_lookup.items(): + term = match.get(key, None) + if not term: + continue + values = [value] if not isinstance(value, (list, tuple)) else value + # noinspection PyBroadException + try: + terms = [term] if not isinstance(term, (list, tuple)) else term + # we fail if we didn't find ANY match in the list + if all(any(bool(re.search(t, v)) for v in values) for t in terms): + # we found a match, go to the next match term + pass + else: + # no match, stop, and we should go to the next rule + matched = False + break + except Exception: + print('Failed parsing regular expression \"{}\" in rule: {}'.format(term, entry)) + matched = False + break + + # we had at least a single key that was Not matched in this rule, go to the next one + if not matched: + continue + + # look for the complicated stuff (i.e. requirements) for req_section in ['script.requirements.pip', 'script.requirements.conda']: if not match.get(req_section, None): continue @@ -163,13 +151,28 @@ def resolve_default_container(session, task_id, container_config, ignore_match_r matched = False break + # if we found a match in the rulebook if matched: - if not container_config.get('image'): - container_config['image'] = entry.get('image', None) - if not container_config.get('arguments'): + allow_override = bool(entry.get('force_container_rules', None)) + container_config["force_container_rules"] = allow_override + + if not container_config.get('image') or allow_override: + container_config['image'] = entry.get('image', None) or '' + + if not container_config.get('arguments') or allow_override: container_config['arguments'] = entry.get('arguments', None) or '' - if isinstance(container_config.get('arguments'), str): - container_config['arguments'] = shlex.split(str(container_config.get('arguments') or '').strip()) + + if not container_config.get('setup_shell_script') or allow_override: + container_config['setup_shell_script'] = entry.get('setup_shell_script', None) or '' + if isinstance(container_config['setup_shell_script'], (list, tuple)): + container_config['setup_shell_script'] = "\n".join(container_config['setup_shell_script']) + + update_back_task = entry.get('update_back_task', None) + if update_back_task is None: + update_back_task = session.config.get('agent.default_docker.update_back_task', None) + + container_config['update_back_task'] = update_back_task + print('INFO: Matching default container with rule:\n{}'.format(json.dumps(entry))) return container_config diff --git a/clearml_agent/commands/worker.py b/clearml_agent/commands/worker.py index c2e1661..22f0a5d 100644 --- a/clearml_agent/commands/worker.py +++ b/clearml_agent/commands/worker.py @@ -388,7 +388,7 @@ def get_task_fields(session, task_id, fields: list, log=None) -> dict: return {} -def get_task_container(session, task_id, ignore_match_rules=False): +def get_task_container(session, task_id, ignore_match_rules=False, allow_force_container_rules=True): """ Returns dict with Task docker container setup {container: '', arguments: '', setup_shell_script: ''} """ @@ -423,12 +423,54 @@ def get_task_container(session, task_id, ignore_match_rules=False): except (ValueError, TypeError): pass - if (not container or not container.get('image')) and session.check_min_api_version("2.13"): - container = resolve_default_container( + no_default_container = not container or not container.get('image') + if no_default_container or allow_force_container_rules and session.check_min_api_version("2.13"): + original_container = copy(container) or {} + updated_container = resolve_default_container( session=session, task_id=task_id, - container_config=container, - ignore_match_rules=ignore_match_rules, + container_config=original_container, + ignore_match_rules=ignore_match_rules and not no_default_container, ) + if no_default_container and not ignore_match_rules: + # if we do not have a default container image / args, use the defaults from the resolver + container = updated_container + elif allow_force_container_rules and updated_container.get('force_container_rules'): + # if we allow to force rules (and we have requested container) + # and the 'force_container_rules' is turned on in the rule, then overwrite container + container = updated_container + + # make sure we pop the new added fields + container.pop("force_container_rules", None) + + # check if we need to update the Task based on the new matched container defaults or overrides + update_back_task = container.pop("update_back_task", None) or updated_container.pop("update_back_task", None) + if update_back_task: + # update back the task + print('INFO: Updating the Task with the selected container with rule') + try: + res = session.send_request( + service='tasks', action='edit', method=Request.def_method, + version='2.13', + json={ + "task": task_id, + "force": True, + "container": { + "image": container.get('image') or "", + "arguments": container.get('arguments') or "", + "setup_shell_script": container.get('setup_shell_script') or "", + } + }, + ) + if not res.ok: + raise Exception("failed setting runtime property") + except Exception as ex: + print("WARNING: failed setting container properties for task '{}': {}".format(task_id, ex)) + + # make sure we preserve backwards compatibility with the expected entries types + if isinstance(container.get('arguments'), str): + container['arguments'] = shlex.split(str(container.get('arguments') or '').strip()) + if container.get('image'): + container['image'] = container.get('image').strip() return container diff --git a/docs/clearml.conf b/docs/clearml.conf index b78adfb..5eae0f8 100644 --- a/docs/clearml.conf +++ b/docs/clearml.conf @@ -230,20 +230,39 @@ agent { docker_force_pull: false default_docker: { - # default docker image to use when running in docker mode - image: "nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04" + # default container image to use when running in docker mode + image: "nvidia/cuda:12.6.3-cudnn-runtime-ubuntu20.04" # optional arguments to pass to docker image # arguments: ["--ipc=host"] - # lookup table rules for default container - # first matched rule will be picked, according to rule order - # enterprise version only + # **Notice**: Enterprise feature, ignored otherwise + # Choose the default container based on the Task properties, + # container values that can be changed: "image", "arguments" , "setup_shell_script" + # Notice: adding "force_container_rules: true" to a rule, + # will cause it to IGNORE the Task's requested container and use the configuration of the rule, + # including all its entries (image, arguments, setup_shell_script)! + # Rule selector keys: 'script.requirements', 'script.binary', + # 'script.repository', 'script.branch', + # 'project', 'project_id', 'user_id', 'container', 'tags' + # Notice: Matching is done via regular expression and needs to match ALL entries inside the rule, + # matching re example: "^searchme$" will match exactly "searchme" string inside an entry + # specifically for tags single tag match is enough: + # "^my_tag$" will match a Task that has + # multiple tags one of them matches ["general_tag", "my_tag$"] + # # match_rules: [ # { + # # default container image to use when running in docker mode # image: "nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04" + # # optional arguments to pass to docker image # arguments: "-e define=value" + # # if true update the Task container section based on the selected rule/default + # # can also be set/override per specific rule + # update_back_task: true + # # match: { + # force_container_rules: true # script{ # # Optional: must match all requirements (not partial) # requirements: {