Add support for container rulebook overrides ('force_container_rules: true') and container rulebook task update ('update_back_task: true').

This addition allows users to override container arguments forcefully based on the tasks properties (repo, tags, project, user etc.), as well as offer additional defaults based on python required packages or python versions
This commit is contained in:
clearml 2025-02-24 13:44:26 +02:00
parent 8887453328
commit d87521c36c
4 changed files with 160 additions and 71 deletions

View File

@ -236,20 +236,35 @@
docker_force_pull: false
default_docker: {
# default docker image to use when running in docker mode
image: "nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04"
# default container image to use when running in docker mode
image: "nvidia/cuda:12.6.3-cudnn-runtime-ubuntu20.04"
# optional arguments to pass to docker image
# arguments: ["--ipc=host", ]
# Choose the default docker based on the Task properties,
# Notice: Enterprise feature, ignored otherwise
# Examples: 'script.requirements', 'script.binary', 'script.repository', 'script.branch', 'project'
# Notice: Matching is done via regular expression, for example "^searchme$" will match exactly "searchme" string
# if true update the Task container section based on the selected rule/default
# can also be set/override per specific rule
update_back_task: true
# **Notice**: Enterprise feature, ignored otherwise
# Choose the default container based on the Task properties,
# container values that can be changed: "image", "arguments" , "setup_shell_script"
# Notice: adding "force_container_rules: true" to a rule,
# will cause it to IGNORE the Task's requested container and use the configuration of the rule,
# including all its entries (image, arguments, setup_shell_script)!
# Rule selector keys: 'script.requirements', 'script.binary',
# 'script.repository', 'script.branch',
# 'project', 'project_id', 'user_id', 'container', 'tags'
# Notice: Matching is done via regular expression and needs to match ALL entries inside the rule,
# matching re example: "^searchme$" will match exactly "searchme" string inside an entry
# specifically for tags single tag match is enough:
# "^my_tag$" will match a Task that has
# multiple tags one of them matches ["general_tag", "my_tag$"]
"match_rules": [
{
"image": "python:3.6-bullseye",
"arguments": "--ipc=host",
"update_back_task": true,
"match": {
"script": {
"binary": "python3.6$",
@ -259,6 +274,7 @@
{
"image": "python:3.7-bullseye",
"arguments": "--ipc=host",
"update_back_task": true,
"match": {
"script": {
"binary": "python3.7$",
@ -310,6 +326,15 @@
},
}
},
{
"image": "python:3.13-bullseye",
"arguments": "--ipc=host",
"match": {
"script": {
"binary": "python3.13$",
},
}
},
]
}

View File

@ -34,7 +34,7 @@ def resolve_default_container(session, task_id, container_config, ignore_match_r
json={'id': [task_id],
'only_fields': ['script.requirements', 'script.binary',
'script.repository', 'script.branch',
'project', 'container'],
'project', 'container', 'tags', 'user'],
'search_hidden': True},
method=Request.def_method,
async_enable=False,
@ -72,62 +72,50 @@ def resolve_default_container(session, task_id, container_config, ignore_match_r
except (ValueError, TypeError):
pass
match_term_lookup = {
"project": project_full_name,
"project_id": task_info.get('project', ''),
"script.repository": repository,
"script.branch": branch,
"script.binary": binary,
"user_id": task_info.get('user', ""),
"container": requested_container.get('image', ''),
"tags": task_info.get('tags', []),
}
task_packages_lookup = {}
for entry in container_lookup:
match = entry.get('match', None)
if not match:
continue
if match.get('project', None):
# noinspection PyBroadException
try:
if not re.search(match.get('project', None), project_full_name):
continue
except Exception:
print('Failed parsing regular expression \"{}\" in rule: {}'.format(
match.get('project', None), entry))
continue
if match.get('script.repository', None):
# noinspection PyBroadException
try:
if not re.search(match.get('script.repository', None), repository):
continue
except Exception:
print('Failed parsing regular expression \"{}\" in rule: {}'.format(
match.get('script.repository', None), entry))
continue
if match.get('script.branch', None):
# noinspection PyBroadException
try:
if not re.search(match.get('script.branch', None), branch):
continue
except Exception:
print('Failed parsing regular expression \"{}\" in rule: {}'.format(
match.get('script.branch', None), entry))
continue
if match.get('script.binary', None):
# noinspection PyBroadException
try:
if not re.search(match.get('script.binary', None), binary):
continue
except Exception:
print('Failed parsing regular expression \"{}\" in rule: {}'.format(
match.get('script.binary', None), entry))
continue
# if match.get('image', None):
# # noinspection PyBroadException
# try:
# if not re.search(match.get('image', None), requested_container.get('image', '')):
# continue
# except Exception:
# print('Failed parsing regular expression \"{}\" in rule: {}'.format(
# match.get('image', None), entry))
# continue
matched = True
for key, value in match_term_lookup.items():
term = match.get(key, None)
if not term:
continue
values = [value] if not isinstance(value, (list, tuple)) else value
# noinspection PyBroadException
try:
terms = [term] if not isinstance(term, (list, tuple)) else term
# we fail if we didn't find ANY match in the list
if all(any(bool(re.search(t, v)) for v in values) for t in terms):
# we found a match, go to the next match term
pass
else:
# no match, stop, and we should go to the next rule
matched = False
break
except Exception:
print('Failed parsing regular expression \"{}\" in rule: {}'.format(term, entry))
matched = False
break
# we had at least a single key that was Not matched in this rule, go to the next one
if not matched:
continue
# look for the complicated stuff (i.e. requirements)
for req_section in ['script.requirements.pip', 'script.requirements.conda']:
if not match.get(req_section, None):
continue
@ -163,13 +151,28 @@ def resolve_default_container(session, task_id, container_config, ignore_match_r
matched = False
break
# if we found a match in the rulebook
if matched:
if not container_config.get('image'):
container_config['image'] = entry.get('image', None)
if not container_config.get('arguments'):
allow_override = bool(entry.get('force_container_rules', None))
container_config["force_container_rules"] = allow_override
if not container_config.get('image') or allow_override:
container_config['image'] = entry.get('image', None) or ''
if not container_config.get('arguments') or allow_override:
container_config['arguments'] = entry.get('arguments', None) or ''
if isinstance(container_config.get('arguments'), str):
container_config['arguments'] = shlex.split(str(container_config.get('arguments') or '').strip())
if not container_config.get('setup_shell_script') or allow_override:
container_config['setup_shell_script'] = entry.get('setup_shell_script', None) or ''
if isinstance(container_config['setup_shell_script'], (list, tuple)):
container_config['setup_shell_script'] = "\n".join(container_config['setup_shell_script'])
update_back_task = entry.get('update_back_task', None)
if update_back_task is None:
update_back_task = session.config.get('agent.default_docker.update_back_task', None)
container_config['update_back_task'] = update_back_task
print('INFO: Matching default container with rule:\n{}'.format(json.dumps(entry)))
return container_config

View File

@ -388,7 +388,7 @@ def get_task_fields(session, task_id, fields: list, log=None) -> dict:
return {}
def get_task_container(session, task_id, ignore_match_rules=False):
def get_task_container(session, task_id, ignore_match_rules=False, allow_force_container_rules=True):
"""
Returns dict with Task docker container setup {container: '', arguments: '', setup_shell_script: ''}
"""
@ -423,12 +423,54 @@ def get_task_container(session, task_id, ignore_match_rules=False):
except (ValueError, TypeError):
pass
if (not container or not container.get('image')) and session.check_min_api_version("2.13"):
container = resolve_default_container(
no_default_container = not container or not container.get('image')
if no_default_container or allow_force_container_rules and session.check_min_api_version("2.13"):
original_container = copy(container) or {}
updated_container = resolve_default_container(
session=session, task_id=task_id,
container_config=container,
ignore_match_rules=ignore_match_rules,
container_config=original_container,
ignore_match_rules=ignore_match_rules and not no_default_container,
)
if no_default_container and not ignore_match_rules:
# if we do not have a default container image / args, use the defaults from the resolver
container = updated_container
elif allow_force_container_rules and updated_container.get('force_container_rules'):
# if we allow to force rules (and we have requested container)
# and the 'force_container_rules' is turned on in the rule, then overwrite container
container = updated_container
# make sure we pop the new added fields
container.pop("force_container_rules", None)
# check if we need to update the Task based on the new matched container defaults or overrides
update_back_task = container.pop("update_back_task", None) or updated_container.pop("update_back_task", None)
if update_back_task:
# update back the task
print('INFO: Updating the Task with the selected container with rule')
try:
res = session.send_request(
service='tasks', action='edit', method=Request.def_method,
version='2.13',
json={
"task": task_id,
"force": True,
"container": {
"image": container.get('image') or "",
"arguments": container.get('arguments') or "",
"setup_shell_script": container.get('setup_shell_script') or "",
}
},
)
if not res.ok:
raise Exception("failed setting runtime property")
except Exception as ex:
print("WARNING: failed setting container properties for task '{}': {}".format(task_id, ex))
# make sure we preserve backwards compatibility with the expected entries types
if isinstance(container.get('arguments'), str):
container['arguments'] = shlex.split(str(container.get('arguments') or '').strip())
if container.get('image'):
container['image'] = container.get('image').strip()
return container

View File

@ -230,20 +230,39 @@ agent {
docker_force_pull: false
default_docker: {
# default docker image to use when running in docker mode
image: "nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04"
# default container image to use when running in docker mode
image: "nvidia/cuda:12.6.3-cudnn-runtime-ubuntu20.04"
# optional arguments to pass to docker image
# arguments: ["--ipc=host"]
# lookup table rules for default container
# first matched rule will be picked, according to rule order
# enterprise version only
# **Notice**: Enterprise feature, ignored otherwise
# Choose the default container based on the Task properties,
# container values that can be changed: "image", "arguments" , "setup_shell_script"
# Notice: adding "force_container_rules: true" to a rule,
# will cause it to IGNORE the Task's requested container and use the configuration of the rule,
# including all its entries (image, arguments, setup_shell_script)!
# Rule selector keys: 'script.requirements', 'script.binary',
# 'script.repository', 'script.branch',
# 'project', 'project_id', 'user_id', 'container', 'tags'
# Notice: Matching is done via regular expression and needs to match ALL entries inside the rule,
# matching re example: "^searchme$" will match exactly "searchme" string inside an entry
# specifically for tags single tag match is enough:
# "^my_tag$" will match a Task that has
# multiple tags one of them matches ["general_tag", "my_tag$"]
#
# match_rules: [
# {
# # default container image to use when running in docker mode
# image: "nvidia/cuda:11.0.3-cudnn8-runtime-ubuntu20.04"
# # optional arguments to pass to docker image
# arguments: "-e define=value"
# # if true update the Task container section based on the selected rule/default
# # can also be set/override per specific rule
# update_back_task: true
#
# match: {
# force_container_rules: true
# script{
# # Optional: must match all requirements (not partial)
# requirements: {