diff --git a/clearml_agent/backend_api/config/default/agent.conf b/clearml_agent/backend_api/config/default/agent.conf index 5b4b6a6..fc573af 100644 --- a/clearml_agent/backend_api/config/default/agent.conf +++ b/clearml_agent/backend_api/config/default/agent.conf @@ -236,20 +236,35 @@ docker_force_pull: false default_docker: { - # default docker image to use when running in docker mode - image: "nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04" + # default container image to use when running in docker mode + image: "nvidia/cuda:12.6.3-cudnn-runtime-ubuntu20.04" # optional arguments to pass to docker image # arguments: ["--ipc=host", ] - # Choose the default docker based on the Task properties, - # Notice: Enterprise feature, ignored otherwise - # Examples: 'script.requirements', 'script.binary', 'script.repository', 'script.branch', 'project' - # Notice: Matching is done via regular expression, for example "^searchme$" will match exactly "searchme" string + # if true update the Task container section based on the selected rule/default + # can also be set/override per specific rule + update_back_task: true + + # **Notice**: Enterprise feature, ignored otherwise + # Choose the default container based on the Task properties, + # container values that can be changed: "image", "arguments" , "setup_shell_script" + # Notice: adding "force_container_rules: true" to a rule, + # will cause it to IGNORE the Task's requested container and use the configuration of the rule, + # including all its entries (image, arguments, setup_shell_script)! + # Rule selector keys: 'script.requirements', 'script.binary', + # 'script.repository', 'script.branch', + # 'project', 'project_id', 'user_id', 'container', 'tags' + # Notice: Matching is done via regular expression and needs to match ALL entries inside the rule, + # matching re example: "^searchme$" will match exactly "searchme" string inside an entry + # specifically for tags single tag match is enough: + # "^my_tag$" will match a Task that has + # multiple tags one of them matches ["general_tag", "my_tag$"] "match_rules": [ { "image": "python:3.6-bullseye", "arguments": "--ipc=host", + "update_back_task": true, "match": { "script": { "binary": "python3.6$", @@ -259,6 +274,7 @@ { "image": "python:3.7-bullseye", "arguments": "--ipc=host", + "update_back_task": true, "match": { "script": { "binary": "python3.7$", @@ -310,6 +326,15 @@ }, } }, + { + "image": "python:3.13-bullseye", + "arguments": "--ipc=host", + "match": { + "script": { + "binary": "python3.13$", + }, + } + }, ] } diff --git a/clearml_agent/commands/resolver.py b/clearml_agent/commands/resolver.py index aa7679d..e6b0f41 100644 --- a/clearml_agent/commands/resolver.py +++ b/clearml_agent/commands/resolver.py @@ -34,7 +34,7 @@ def resolve_default_container(session, task_id, container_config, ignore_match_r json={'id': [task_id], 'only_fields': ['script.requirements', 'script.binary', 'script.repository', 'script.branch', - 'project', 'container'], + 'project', 'container', 'tags', 'user'], 'search_hidden': True}, method=Request.def_method, async_enable=False, @@ -72,62 +72,50 @@ def resolve_default_container(session, task_id, container_config, ignore_match_r except (ValueError, TypeError): pass + match_term_lookup = { + "project": project_full_name, + "project_id": task_info.get('project', ''), + "script.repository": repository, + "script.branch": branch, + "script.binary": binary, + "user_id": task_info.get('user', ""), + "container": requested_container.get('image', ''), + "tags": task_info.get('tags', []), + } + task_packages_lookup = {} for entry in container_lookup: match = entry.get('match', None) if not match: continue - if match.get('project', None): - # noinspection PyBroadException - try: - if not re.search(match.get('project', None), project_full_name): - continue - except Exception: - print('Failed parsing regular expression \"{}\" in rule: {}'.format( - match.get('project', None), entry)) - continue - - if match.get('script.repository', None): - # noinspection PyBroadException - try: - if not re.search(match.get('script.repository', None), repository): - continue - except Exception: - print('Failed parsing regular expression \"{}\" in rule: {}'.format( - match.get('script.repository', None), entry)) - continue - - if match.get('script.branch', None): - # noinspection PyBroadException - try: - if not re.search(match.get('script.branch', None), branch): - continue - except Exception: - print('Failed parsing regular expression \"{}\" in rule: {}'.format( - match.get('script.branch', None), entry)) - continue - - if match.get('script.binary', None): - # noinspection PyBroadException - try: - if not re.search(match.get('script.binary', None), binary): - continue - except Exception: - print('Failed parsing regular expression \"{}\" in rule: {}'.format( - match.get('script.binary', None), entry)) - continue - - # if match.get('image', None): - # # noinspection PyBroadException - # try: - # if not re.search(match.get('image', None), requested_container.get('image', '')): - # continue - # except Exception: - # print('Failed parsing regular expression \"{}\" in rule: {}'.format( - # match.get('image', None), entry)) - # continue matched = True + for key, value in match_term_lookup.items(): + term = match.get(key, None) + if not term: + continue + values = [value] if not isinstance(value, (list, tuple)) else value + # noinspection PyBroadException + try: + terms = [term] if not isinstance(term, (list, tuple)) else term + # we fail if we didn't find ANY match in the list + if all(any(bool(re.search(t, v)) for v in values) for t in terms): + # we found a match, go to the next match term + pass + else: + # no match, stop, and we should go to the next rule + matched = False + break + except Exception: + print('Failed parsing regular expression \"{}\" in rule: {}'.format(term, entry)) + matched = False + break + + # we had at least a single key that was Not matched in this rule, go to the next one + if not matched: + continue + + # look for the complicated stuff (i.e. requirements) for req_section in ['script.requirements.pip', 'script.requirements.conda']: if not match.get(req_section, None): continue @@ -163,13 +151,28 @@ def resolve_default_container(session, task_id, container_config, ignore_match_r matched = False break + # if we found a match in the rulebook if matched: - if not container_config.get('image'): - container_config['image'] = entry.get('image', None) - if not container_config.get('arguments'): + allow_override = bool(entry.get('force_container_rules', None)) + container_config["force_container_rules"] = allow_override + + if not container_config.get('image') or allow_override: + container_config['image'] = entry.get('image', None) or '' + + if not container_config.get('arguments') or allow_override: container_config['arguments'] = entry.get('arguments', None) or '' - if isinstance(container_config.get('arguments'), str): - container_config['arguments'] = shlex.split(str(container_config.get('arguments') or '').strip()) + + if not container_config.get('setup_shell_script') or allow_override: + container_config['setup_shell_script'] = entry.get('setup_shell_script', None) or '' + if isinstance(container_config['setup_shell_script'], (list, tuple)): + container_config['setup_shell_script'] = "\n".join(container_config['setup_shell_script']) + + update_back_task = entry.get('update_back_task', None) + if update_back_task is None: + update_back_task = session.config.get('agent.default_docker.update_back_task', None) + + container_config['update_back_task'] = update_back_task + print('INFO: Matching default container with rule:\n{}'.format(json.dumps(entry))) return container_config diff --git a/clearml_agent/commands/worker.py b/clearml_agent/commands/worker.py index c2e1661..22f0a5d 100644 --- a/clearml_agent/commands/worker.py +++ b/clearml_agent/commands/worker.py @@ -388,7 +388,7 @@ def get_task_fields(session, task_id, fields: list, log=None) -> dict: return {} -def get_task_container(session, task_id, ignore_match_rules=False): +def get_task_container(session, task_id, ignore_match_rules=False, allow_force_container_rules=True): """ Returns dict with Task docker container setup {container: '', arguments: '', setup_shell_script: ''} """ @@ -423,12 +423,54 @@ def get_task_container(session, task_id, ignore_match_rules=False): except (ValueError, TypeError): pass - if (not container or not container.get('image')) and session.check_min_api_version("2.13"): - container = resolve_default_container( + no_default_container = not container or not container.get('image') + if no_default_container or allow_force_container_rules and session.check_min_api_version("2.13"): + original_container = copy(container) or {} + updated_container = resolve_default_container( session=session, task_id=task_id, - container_config=container, - ignore_match_rules=ignore_match_rules, + container_config=original_container, + ignore_match_rules=ignore_match_rules and not no_default_container, ) + if no_default_container and not ignore_match_rules: + # if we do not have a default container image / args, use the defaults from the resolver + container = updated_container + elif allow_force_container_rules and updated_container.get('force_container_rules'): + # if we allow to force rules (and we have requested container) + # and the 'force_container_rules' is turned on in the rule, then overwrite container + container = updated_container + + # make sure we pop the new added fields + container.pop("force_container_rules", None) + + # check if we need to update the Task based on the new matched container defaults or overrides + update_back_task = container.pop("update_back_task", None) or updated_container.pop("update_back_task", None) + if update_back_task: + # update back the task + print('INFO: Updating the Task with the selected container with rule') + try: + res = session.send_request( + service='tasks', action='edit', method=Request.def_method, + version='2.13', + json={ + "task": task_id, + "force": True, + "container": { + "image": container.get('image') or "", + "arguments": container.get('arguments') or "", + "setup_shell_script": container.get('setup_shell_script') or "", + } + }, + ) + if not res.ok: + raise Exception("failed setting runtime property") + except Exception as ex: + print("WARNING: failed setting container properties for task '{}': {}".format(task_id, ex)) + + # make sure we preserve backwards compatibility with the expected entries types + if isinstance(container.get('arguments'), str): + container['arguments'] = shlex.split(str(container.get('arguments') or '').strip()) + if container.get('image'): + container['image'] = container.get('image').strip() return container diff --git a/docs/clearml.conf b/docs/clearml.conf index b78adfb..5eae0f8 100644 --- a/docs/clearml.conf +++ b/docs/clearml.conf @@ -230,20 +230,39 @@ agent { docker_force_pull: false default_docker: { - # default docker image to use when running in docker mode - image: "nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04" + # default container image to use when running in docker mode + image: "nvidia/cuda:12.6.3-cudnn-runtime-ubuntu20.04" # optional arguments to pass to docker image # arguments: ["--ipc=host"] - # lookup table rules for default container - # first matched rule will be picked, according to rule order - # enterprise version only + # **Notice**: Enterprise feature, ignored otherwise + # Choose the default container based on the Task properties, + # container values that can be changed: "image", "arguments" , "setup_shell_script" + # Notice: adding "force_container_rules: true" to a rule, + # will cause it to IGNORE the Task's requested container and use the configuration of the rule, + # including all its entries (image, arguments, setup_shell_script)! + # Rule selector keys: 'script.requirements', 'script.binary', + # 'script.repository', 'script.branch', + # 'project', 'project_id', 'user_id', 'container', 'tags' + # Notice: Matching is done via regular expression and needs to match ALL entries inside the rule, + # matching re example: "^searchme$" will match exactly "searchme" string inside an entry + # specifically for tags single tag match is enough: + # "^my_tag$" will match a Task that has + # multiple tags one of them matches ["general_tag", "my_tag$"] + # # match_rules: [ # { + # # default container image to use when running in docker mode # image: "nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04" + # # optional arguments to pass to docker image # arguments: "-e define=value" + # # if true update the Task container section based on the selected rule/default + # # can also be set/override per specific rule + # update_back_task: true + # # match: { + # force_container_rules: true # script{ # # Optional: must match all requirements (not partial) # requirements: {