mirror of
				https://github.com/clearml/clearml-agent
				synced 2025-06-26 18:16:15 +00:00 
			
		
		
		
	Add docker port mapping parsing and reassigning feature support
Add initial component import from clearml-sdk for easier integration
This commit is contained in:
		
							parent
							
								
									8f28d2882a
								
							
						
					
					
						commit
						97cb47d48e
					
				| @ -181,6 +181,7 @@ ENV_DOCKER_HOST_MOUNT = EnvironmentConfig( | |||||||
| ENV_VENV_CACHE_PATH = EnvironmentConfig("CLEARML_AGENT_VENV_CACHE_PATH") | ENV_VENV_CACHE_PATH = EnvironmentConfig("CLEARML_AGENT_VENV_CACHE_PATH") | ||||||
| ENV_EXTRA_DOCKER_ARGS = EnvironmentConfig("CLEARML_AGENT_EXTRA_DOCKER_ARGS", type=list) | ENV_EXTRA_DOCKER_ARGS = EnvironmentConfig("CLEARML_AGENT_EXTRA_DOCKER_ARGS", type=list) | ||||||
| ENV_EXTRA_DOCKER_LABELS = EnvironmentConfig("CLEARML_AGENT_EXTRA_DOCKER_LABELS", type=list) | ENV_EXTRA_DOCKER_LABELS = EnvironmentConfig("CLEARML_AGENT_EXTRA_DOCKER_LABELS", type=list) | ||||||
|  | ENV_FORCE_HOST_MACHINE_IP = EnvironmentConfig("CLEARML_AGENT_FORCE_HOST_MACHINE_IP") | ||||||
| ENV_DEBUG_INFO = EnvironmentConfig("CLEARML_AGENT_DEBUG_INFO") | ENV_DEBUG_INFO = EnvironmentConfig("CLEARML_AGENT_DEBUG_INFO") | ||||||
| ENV_CHILD_AGENTS_COUNT_CMD = EnvironmentConfig("CLEARML_AGENT_CHILD_AGENTS_COUNT_CMD") | ENV_CHILD_AGENTS_COUNT_CMD = EnvironmentConfig("CLEARML_AGENT_CHILD_AGENTS_COUNT_CMD") | ||||||
| ENV_DOCKER_ARGS_FILTERS = EnvironmentConfig("CLEARML_AGENT_DOCKER_ARGS_FILTERS") | ENV_DOCKER_ARGS_FILTERS = EnvironmentConfig("CLEARML_AGENT_DOCKER_ARGS_FILTERS") | ||||||
|  | |||||||
| @ -1,6 +1,6 @@ | |||||||
| import re | import re | ||||||
| import shlex | import shlex | ||||||
| from typing import Tuple, List, TYPE_CHECKING | from typing import Tuple, List, TYPE_CHECKING, Optional | ||||||
| from urllib.parse import urlunparse, urlparse | from urllib.parse import urlunparse, urlparse | ||||||
| 
 | 
 | ||||||
| from clearml_agent.definitions import ( | from clearml_agent.definitions import ( | ||||||
| @ -11,7 +11,10 @@ from clearml_agent.definitions import ( | |||||||
|     ENV_AGENT_AUTH_TOKEN, |     ENV_AGENT_AUTH_TOKEN, | ||||||
|     ENV_DOCKER_IMAGE, |     ENV_DOCKER_IMAGE, | ||||||
|     ENV_DOCKER_ARGS_HIDE_ENV, |     ENV_DOCKER_ARGS_HIDE_ENV, | ||||||
|  |     ENV_FORCE_HOST_MACHINE_IP, | ||||||
| ) | ) | ||||||
|  | from clearml_agent.helper.sdk_client.utilities.networking import get_private_ip | ||||||
|  | from clearml_agent.helper.os.networking import TcpPorts | ||||||
| 
 | 
 | ||||||
| if TYPE_CHECKING: | if TYPE_CHECKING: | ||||||
|     from clearml_agent.session import Session |     from clearml_agent.session import Session | ||||||
| @ -42,6 +45,8 @@ def sanitize_urls(s: str) -> Tuple[str, bool]: | |||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class DockerArgsSanitizer: | class DockerArgsSanitizer: | ||||||
|  |     _machine_ip = None | ||||||
|  | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|     def sanitize_docker_command(cls, session, docker_command): |     def sanitize_docker_command(cls, session, docker_command): | ||||||
|         # type: (Session, List[str]) -> List[str] |         # type: (Session, List[str]) -> List[str] | ||||||
| @ -108,14 +113,22 @@ class DockerArgsSanitizer: | |||||||
|         return args |         return args | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     def filter_switches(docker_args: List[str], exclude_switches: List[str]) -> List[str]: |     def filter_switches( | ||||||
|  |             docker_args: List[str], | ||||||
|  |             exclude_switches: List[str] = None, | ||||||
|  |             include_switches: List[str] = None | ||||||
|  |     ) -> List[str]: | ||||||
|  | 
 | ||||||
|  |         assert not (include_switches and exclude_switches), "Either include_switches or exclude_switches but not both" | ||||||
|  | 
 | ||||||
|         # shortcut if we are sure we have no matches |         # shortcut if we are sure we have no matches | ||||||
|         if (not exclude_switches or |         if not include_switches and ( | ||||||
|                 not any("-{}".format(s) in " ".join(docker_args) for s in exclude_switches)): |                 not exclude_switches or not any("-{}".format(s) in " ".join(docker_args) for s in exclude_switches)): | ||||||
|             return docker_args |             return docker_args | ||||||
| 
 | 
 | ||||||
|         args = [] |         args = [] | ||||||
|         in_switch_args = True |         in_switch_args = True if not include_switches else False | ||||||
|  | 
 | ||||||
|         for token in docker_args: |         for token in docker_args: | ||||||
|             if token.strip().startswith("-"): |             if token.strip().startswith("-"): | ||||||
|                 if "=" in token: |                 if "=" in token: | ||||||
| @ -125,7 +138,10 @@ class DockerArgsSanitizer: | |||||||
|                     switch = token |                     switch = token | ||||||
|                     in_switch_args = True |                     in_switch_args = True | ||||||
| 
 | 
 | ||||||
|                 if switch.lstrip("-") in exclude_switches: |                 if not include_switches and switch.lstrip("-") in exclude_switches: | ||||||
|  |                     # if in excluded, skip the switch and following arguments | ||||||
|  |                     in_switch_args = False | ||||||
|  |                 elif not exclude_switches and switch.lstrip("-") not in include_switches: | ||||||
|                     # if in excluded, skip the switch and following arguments |                     # if in excluded, skip the switch and following arguments | ||||||
|                     in_switch_args = False |                     in_switch_args = False | ||||||
|                 else: |                 else: | ||||||
| @ -167,3 +183,90 @@ class DockerArgsSanitizer: | |||||||
|                 extra_docker_arguments = DockerArgsSanitizer.filter_switches(extra_docker_arguments, switches) |                 extra_docker_arguments = DockerArgsSanitizer.filter_switches(extra_docker_arguments, switches) | ||||||
|                 base_cmd += [a for a in extra_docker_arguments if a] |                 base_cmd += [a for a in extra_docker_arguments if a] | ||||||
|         return base_cmd |         return base_cmd | ||||||
|  | 
 | ||||||
|  |     @staticmethod | ||||||
def resolve_port_mapping(config, docker_arguments: List[str]) -> Optional[tuple]:
    """
    Resolve the published port mappings (-p / --publish) found in the docker arguments.

    If we have port mappings in the docker cmd, this function will do two things:
    1. It will add an environment variable (CLEARML_AGENT_HOST_IP) with the host machine IP address
    2. It will return a runtime property ("_external_host_tcp_port_mapping") with the port mapping merged

    Host ports that are already in use are replaced with a free consecutive port range of the same size.

    :param config: configuration object, passed to get_private_ip() when the host IP is not forced
        with the CLEARML_AGENT_FORCE_HOST_MACHINE_IP environment variable
    :param docker_arguments: list of docker command line arguments (the input list is not modified)
    :return: None if there is nothing to do (no arguments, no published ports, or host networking),
        otherwise a tuple of (new docker arguments, runtime properties dict)
    """
    if not docker_arguments:
        return None

    # make a copy, we are going to change it
    docker_arguments = docker_arguments[:]
    port_mapping_filtered = [
        p for p in DockerArgsSanitizer.filter_switches(docker_arguments, include_switches=["p", "publish"])
        if p and p.strip()
    ]
    if not port_mapping_filtered:
        return None

    # test if network=host was requested, docker will ignore published ports anyhow, so no use in parsing them
    network_filtered = DockerArgsSanitizer.filter_switches(
        docker_arguments, include_switches=["network", "net"])
    # matches both the separate token form ("host") and the inline form ("--network=host")
    if any(t and (t.strip() == "host" or "host" in t.strip().split("=")) for t in network_filtered):
        return None

    # verifying available ports, remapping if necessary
    port_checks = TcpPorts()
    for i_p, port_map in enumerate(port_mapping_filtered):
        port_map = port_map.strip()
        # skip the flag itself
        if port_map.startswith("-"):
            continue

        slash_parts = port_map.split("/")
        # todo: support udp?!
        # example: "8080:80/udp"
        if slash_parts[-1] == "udp":
            continue

        # either no type specified or tcp, layout is [ip:]host_ports:container_ports
        colon_parts = slash_parts[0].split(":")
        if len(colon_parts) < 2:
            # container port only (e.g. "80"): no explicit host port to verify
            continue

        host_ports = colon_parts[-2].split("-")
        try:
            first_port, last_port = int(host_ports[0]), int(host_ports[-1])
        except ValueError:
            # empty or non-numeric host port (e.g. "127.0.0.1::80"), leave the mapping untouched
            continue

        # verify ports available
        if all(port_checks.check_tcp_port_available(p) for p in range(first_port, last_port + 1)):
            continue

        # we need to find a new range (this is a consecutive range)
        new_port_range = port_checks.find_port_range(last_port + 1 - first_port)
        if not new_port_range:
            # we could not find any, leave it as it?!
            break

        colon_parts[-2] = (
            "{}-{}".format(new_port_range[0], new_port_range[-1])
            if len(new_port_range) > 1 else str(new_port_range[0])
        )
        # the ports come first, followed by the optional "/protocol" suffix
        new_port_map = "/".join([":".join(colon_parts)] + slash_parts[1:])

        # replace the ports in the docker arguments (first matching token only)
        for i, argument in enumerate(docker_arguments):
            if argument.strip() != port_map:
                continue
            docker_arguments[i] = new_port_map
            port_mapping_filtered[i_p] = new_port_map
            break

    additional_cmd = []
    # the host IP is resolved once and cached on the class
    if not DockerArgsSanitizer._machine_ip:
        DockerArgsSanitizer._machine_ip = ENV_FORCE_HOST_MACHINE_IP.get() or get_private_ip(config)

    if DockerArgsSanitizer._machine_ip:
        additional_cmd += ["-e", "CLEARML_AGENT_HOST_IP={}".format(DockerArgsSanitizer._machine_ip)]

    # sanitize, remove ip/type
    ports = ",".join(
        ":".join(t.strip().split("/")[0].split(":")[-2:])
        for t in port_mapping_filtered if t.strip() and not t.strip().startswith("-")
    )

    # update Tasks runtime
    additional_task_runtime = {"_external_host_tcp_port_mapping": ports}

    return docker_arguments + additional_cmd, additional_task_runtime
|  | |||||||
							
								
								
									
										42
									
								
								clearml_agent/helper/os/networking.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										42
									
								
								clearml_agent/helper/os/networking.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,42 @@ | |||||||
|  | import psutil | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
class TcpPorts(object):
    """
    Keep track of the local ports that are in use and allocate free ones.

    The initial set of used ports is taken from psutil.net_connections(); ports handed out
    by this object can be remembered so they are not handed out twice.
    """

    def __init__(self):
        # a set gives constant time membership tests and drops duplicate ports
        self._used_ports = {i.laddr.port for i in psutil.net_connections()}

    def check_tcp_port_available(self, port: int, remember_port: bool = True) -> bool:
        """
        return True if the port is available

        :param port: port number
        :param remember_port: if True add the port into the used ports set
        :return: True port is available
        """
        if port in self._used_ports:
            return False
        if remember_port:
            self._used_ports.add(port)
        return True

    def find_port_range(self, number_of_ports: int, remember_port: bool = True,
                        range_min: int = 10000, range_max: int = 60000) -> list:
        """
        Find a range of consecutive ports that are not marked as used

        :param number_of_ports: number of consecutive ports needed
        :param remember_port: if True add the returned ports into the used ports set
        :param range_min: first port number to consider
        :param range_max: end of the searched range (not included)
        :return: list of consecutive port numbers, or an empty list if no such range was found
        """
        if number_of_ports <= 0:
            # nothing can be allocated, do not scan the entire range
            return []

        new_allocation = []
        for p in range(range_min, range_max):
            if p in self._used_ports:
                # a used port breaks the sequence, start over
                new_allocation = []
                continue

            new_allocation.append(p)
            if len(new_allocation) == number_of_ports:
                break
        else:
            # scanned the entire range without finding enough consecutive ports
            return []

        if remember_port:
            self._used_ports.update(new_allocation)

        return new_allocation
							
								
								
									
										0
									
								
								clearml_agent/helper/sdk_client/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								clearml_agent/helper/sdk_client/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										98
									
								
								clearml_agent/helper/sdk_client/utilities/networking.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										98
									
								
								clearml_agent/helper/sdk_client/utilities/networking.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,98 @@ | |||||||
|  | import requests | ||||||
|  | import socket | ||||||
|  | import subprocess | ||||||
|  | from typing import Optional | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
def get_private_ip(config_obj):
    # type: (Config) -> str
    """
    Get the private IP of this machine

    The socket based lookup is tried first, then the subprocess based one; the first
    lookup that does not raise provides the result.

    :param config_obj: configuration object handed to each lookup function
    :return: A string representing the IP of this machine
    :raises Exception: if every lookup failed
    """
    for resolver in (_get_private_ip_from_socket, _get_private_ip_from_subprocess):
        # noinspection PyBroadException
        try:
            private_ip = resolver(config_obj)
        except Exception:
            # this lookup failed, fall through to the next one
            continue
        return private_ip

    raise Exception("error getting private IP")
|  | 
 | ||||||
|  | 
 | ||||||
def get_public_ip(config_obj):
    # type: (Config) -> Optional[str]
    """
    Get the public IP of this machine. External services such as `https://api.ipify.org` or `https://ident.me`
    are used to get the IP

    The list of services is read from the "api.public_ip_service_urls" configuration entry when
    it is set; services are queried in order until one of them returns an IP.

    :param config_obj: configuration object (may be None, the default services are then used)
    :return: A string representing the IP of this machine or `None` if getting the IP failed
    """
    # todo: add documentation in api section in conf file
    configured_urls = config_obj.get("api.public_ip_service_urls", None) if config_obj else None
    service_urls = configured_urls or ["https://api.ipify.org", "https://ident.me"]

    # lazy generator, a service is only contacted if the previous ones did not return an IP
    answers = (get_public_ip_from_external_service(url) for url in service_urls)
    return next((ip for ip in answers if ip), None)
|  | 
 | ||||||
|  | 
 | ||||||
def get_public_ip_from_external_service(external_service, timeout=5):
    # type: (str, Optional[int]) -> Optional[str]
    """
    Get the public IP of this machine from an external service.
    Fetching the IP is done via a GET request. The whole content of the response
    (ignoring leading/trailing whitespace) should be the IP address

    :param external_service: The address of the external service
    :param timeout: The GET request timeout

    :return: A string representing the IP of this machine or `None` if getting the IP failed
    """
    # noinspection PyBroadException
    try:
        response = requests.get(external_service, timeout=timeout)
        if not response.ok:
            return None
        # strip surrounding whitespace, a trailing newline would fail the address validation below
        ip = response.content.decode("utf8").strip()
    except Exception:
        return None

    # check that we actually received an IP address (IPv4 first, then IPv6)
    for address_family in (socket.AF_INET, socket.AF_INET6):
        # noinspection PyBroadException
        try:
            socket.inet_pton(address_family, ip)
        except Exception:
            continue
        return ip

    return None
|  | 
 | ||||||
|  | 
 | ||||||
|  | def _get_private_ip_from_socket(config_obj): | ||||||
|  | 
 | ||||||
|  |     # todo: add documentation in api section in conf file | ||||||
|  |     public_ip_ping = (config_obj.get("api.public_ip_ping", None) if config_obj else None) or "8.8.8.8" | ||||||
|  | 
 | ||||||
|  |     s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) | ||||||
|  |     s.settimeout(0) | ||||||
|  |     try: | ||||||
|  |         s.connect((public_ip_ping, 1)) | ||||||
|  |         ip = s.getsockname()[0] | ||||||
|  |     except Exception as e: | ||||||
|  |         raise e | ||||||
|  |     finally: | ||||||
|  |         s.close() | ||||||
|  |     return ip | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
def _get_private_ip_from_subprocess(_):
    """
    Get the local IP from the output of the `hostname -I` command

    :param _: unused (keeps the same signature as the other private IP lookup functions)
    :return: the first whitespace separated entry of the command output
    :raises Exception: if the command cannot be executed, fails, or prints nothing
    """
    # fixed command, pass it as an argument list so no shell is needed
    output = subprocess.check_output(["hostname", "-I"])
    return output.split()[0].decode("utf-8")
							
								
								
									
										43
									
								
								clearml_agent/helper/task_runtime.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										43
									
								
								clearml_agent/helper/task_runtime.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,43 @@ | |||||||
|  | from typing import Optional | ||||||
|  | 
 | ||||||
|  | from ..backend_api.session import Request | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
class TaskRuntime(object):
    """
    Read and update the "runtime" properties of a Task using the `tasks` service of the session
    """

    def __init__(self, session):
        # session object used for sending the requests (send_request() is called on it)
        self._session = session

    def get_task_runtime(self, task_id) -> Optional[dict]:
        """
        Get the runtime properties of a Task (tasks.get_by_id, requesting only the "runtime" field)

        :param task_id: ID of the Task to query
        :return: the value of the Task "runtime" field (empty dict if the field is missing),
            or None if the request failed (the error is printed)
        """
        try:
            response = self._session.send_request(
                service='tasks',
                action='get_by_id',
                method=Request.def_method,
                json={"task": task_id, "only_fields": ["runtime"]},
            )
            if not response.ok:
                raise ValueError(f"request returned {response.status_code}")

            payload = response.json().get("data")
            if payload and "task" in payload:
                return payload["task"].get("runtime", {})

            raise ValueError("empty data in result")
        except Exception as ex:
            print(f"ERROR: Failed getting runtime properties for task {task_id}: {ex}")

        return None

    def update_task_runtime(self, task_id: str, runtime: dict) -> bool:
        """
        Merge the given properties into the Task runtime properties and store them (tasks.edit)

        :param task_id: ID of the Task to update
        :param runtime: runtime properties to add / override
        :return: True if the update request succeeded, False otherwise (a warning is printed)
        """
        # NOTE(review): when fetching the current runtime fails we start from an empty dict,
        # so the edit request only carries the new properties - confirm this is intended
        merged_runtime = self.get_task_runtime(task_id)
        if not merged_runtime:
            merged_runtime = {}
        merged_runtime.update(runtime)

        try:
            response = self._session.send_request(
                service='tasks',
                action='edit',
                method=Request.def_method,
                json={"task": task_id, "force": True, "runtime": merged_runtime},
            )
            if response.ok:
                return True
            raise Exception("failed setting runtime property")
        except Exception as ex:
            print("WARNING: failed setting custom runtime properties for task '{}': {}".format(task_id, ex))

        return False
		Loading…
	
		Reference in New Issue
	
	Block a user
	 clearml
						clearml