mirror of
https://github.com/clearml/clearml-agent
synced 2025-03-13 06:58:37 +00:00
Add support for use-owner-token in k8s glue
This commit is contained in:
parent
c9ffb8a053
commit
2e5298b737
@ -1291,12 +1291,9 @@ class Worker(ServiceCommandSection):
|
|||||||
raise ValueError("Running in Docker mode, 'docker' command was not found")
|
raise ValueError("Running in Docker mode, 'docker' command was not found")
|
||||||
|
|
||||||
self._worker_tags = kwargs.get('child_report_tags', None)
|
self._worker_tags = kwargs.get('child_report_tags', None)
|
||||||
self._impersonate_as_task_owner = kwargs.get('use_owner_token', False)
|
|
||||||
if self._impersonate_as_task_owner:
|
self._use_owner_token(kwargs.get('use_owner_token', False))
|
||||||
if not self._session.check_min_api_version("2.14"):
|
|
||||||
raise ValueError("Server does not support --use-owner-token option (incompatible API version)")
|
|
||||||
if self._session.feature_set == "basic":
|
|
||||||
raise ValueError("Server does not support --use-owner-token option")
|
|
||||||
self._standalone_mode = kwargs.get('standalone_mode', False)
|
self._standalone_mode = kwargs.get('standalone_mode', False)
|
||||||
self._services_mode = kwargs.get('services_mode', False)
|
self._services_mode = kwargs.get('services_mode', False)
|
||||||
# must have docker in services_mode
|
# must have docker in services_mode
|
||||||
@ -3953,6 +3950,14 @@ class Worker(ServiceCommandSection):
|
|||||||
# type: (str) -> bool
|
# type: (str) -> bool
|
||||||
return re.fullmatch(r"^[a-zA-Z0-9][a-zA-Z0-9_.-]+$", name) is not None
|
return re.fullmatch(r"^[a-zA-Z0-9][a-zA-Z0-9_.-]+$", name) is not None
|
||||||
|
|
||||||
|
def _use_owner_token(self, use_owner_token=False):
|
||||||
|
self._impersonate_as_task_owner = use_owner_token
|
||||||
|
if self._impersonate_as_task_owner:
|
||||||
|
if not self._session.check_min_api_version("2.14"):
|
||||||
|
raise ValueError("Server does not support --use-owner-token option (incompatible API version)")
|
||||||
|
if self._session.feature_set == "basic":
|
||||||
|
raise ValueError("Server does not support --use-owner-token option")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
pass
|
pass
|
||||||
|
@ -18,7 +18,7 @@ from typing import Text, List, Callable, Any, Collection, Optional, Union
|
|||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
from clearml_agent.commands.events import Events
|
from clearml_agent.commands.events import Events
|
||||||
from clearml_agent.commands.worker import Worker, get_task_container, set_task_container
|
from clearml_agent.commands.worker import Worker, get_task_container, set_task_container, get_next_task
|
||||||
from clearml_agent.definitions import ENV_DOCKER_IMAGE
|
from clearml_agent.definitions import ENV_DOCKER_IMAGE
|
||||||
from clearml_agent.errors import APIError
|
from clearml_agent.errors import APIError
|
||||||
from clearml_agent.helper.base import safe_remove_file
|
from clearml_agent.helper.base import safe_remove_file
|
||||||
@ -362,7 +362,7 @@ class K8sIntegration(Worker):
|
|||||||
print('Failed getting number of used pods: {}'.format(ex))
|
print('Failed getting number of used pods: {}'.format(ex))
|
||||||
return -2
|
return -2
|
||||||
|
|
||||||
def run_one_task(self, queue: Text, task_id: Text, worker_args=None, **_):
|
def run_one_task(self, queue: Text, task_id: Text, worker_args=None, task_session=None, **_):
|
||||||
print('Pulling task {} launching on kubernetes cluster'.format(task_id))
|
print('Pulling task {} launching on kubernetes cluster'.format(task_id))
|
||||||
task_data = self._session.api_client.tasks.get_all(id=[task_id])[0]
|
task_data = self._session.api_client.tasks.get_all(id=[task_id])[0]
|
||||||
|
|
||||||
@ -398,10 +398,18 @@ class K8sIntegration(Worker):
|
|||||||
self.conf_file_content
|
self.conf_file_content
|
||||||
or Path(self._session._config_file).read_text()
|
or Path(self._session._config_file).read_text()
|
||||||
).encode("ascii")
|
).encode("ascii")
|
||||||
create_clearml_conf = "echo '{}' | base64 --decode >> ~/clearml.conf".format(
|
|
||||||
|
create_clearml_conf = ["echo '{}' | base64 --decode >> ~/clearml.conf".format(
|
||||||
base64.b64encode(
|
base64.b64encode(
|
||||||
hocon_config_encoded
|
hocon_config_encoded
|
||||||
).decode('ascii')
|
).decode('ascii')
|
||||||
|
)]
|
||||||
|
|
||||||
|
if task_session:
|
||||||
|
create_clearml_conf.append(
|
||||||
|
"export CLEARML_AUTH_TOKEN=$(echo '{}' | base64 --decode)".format(
|
||||||
|
base64.b64encode(task_session.token.encode("ascii")).decode('ascii')
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.ports_mode:
|
if self.ports_mode:
|
||||||
@ -594,19 +602,22 @@ class K8sIntegration(Worker):
|
|||||||
extra_docker_bash_script=extra_docker_bash_script)
|
extra_docker_bash_script=extra_docker_bash_script)
|
||||||
for line in container_bash_script])
|
for line in container_bash_script])
|
||||||
|
|
||||||
create_init_script = \
|
extra_bash_commands = list(create_clearml_conf or [])
|
||||||
"echo '{}' | base64 --decode >> ~/__start_agent__.sh ; " \
|
|
||||||
|
extra_bash_commands.append(
|
||||||
|
"echo '{}' | base64 --decode >> ~/__start_agent__.sh ; "
|
||||||
"/bin/bash ~/__start_agent__.sh".format(
|
"/bin/bash ~/__start_agent__.sh".format(
|
||||||
base64.b64encode(
|
base64.b64encode(
|
||||||
script_encoded.encode('ascii')
|
script_encoded.encode('ascii')
|
||||||
).decode('ascii'))
|
).decode('ascii'))
|
||||||
|
)
|
||||||
|
|
||||||
# Notice: we always leave with exit code 0, so pods are never restarted
|
# Notice: we always leave with exit code 0, so pods are never restarted
|
||||||
container = self._merge_containers(
|
container = self._merge_containers(
|
||||||
container,
|
container,
|
||||||
dict(name=name, image=docker_image,
|
dict(name=name, image=docker_image,
|
||||||
command=['/bin/bash'],
|
command=['/bin/bash'],
|
||||||
args=['-c', '{} ; {} ; exit 0'.format(create_clearml_conf, create_init_script)])
|
args=['-c', '{} ; exit 0'.format(' ; '.join(extra_bash_commands))])
|
||||||
)
|
)
|
||||||
|
|
||||||
if template['spec']['containers']:
|
if template['spec']['containers']:
|
||||||
@ -685,7 +696,7 @@ class K8sIntegration(Worker):
|
|||||||
"--",
|
"--",
|
||||||
"/bin/sh",
|
"/bin/sh",
|
||||||
"-c",
|
"-c",
|
||||||
"{} ; {}".format(create_clearml_conf, container_bash_script.format(
|
"{} ; {}".format(" ; ".join(create_clearml_conf or []), container_bash_script.format(
|
||||||
extra_bash_init_cmd=self.extra_bash_init_script or "",
|
extra_bash_init_cmd=self.extra_bash_init_script or "",
|
||||||
extra_docker_bash_script=docker_bash or "",
|
extra_docker_bash_script=docker_bash or "",
|
||||||
task_id=task_id
|
task_id=task_id
|
||||||
@ -742,14 +753,16 @@ class K8sIntegration(Worker):
|
|||||||
|
|
||||||
# get next task in queue
|
# get next task in queue
|
||||||
try:
|
try:
|
||||||
response = self._session.api_client.queues.get_next_task(queue=queue)
|
response = get_next_task(
|
||||||
|
self._session, queue=queue, get_task_info=self._impersonate_as_task_owner
|
||||||
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("Warning: Could not access task queue [{}], error: {}".format(queue, e))
|
print("Warning: Could not access task queue [{}], error: {}".format(queue, e))
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
task_id = response.entry.task
|
task_id = response["entry"]["task"]
|
||||||
except AttributeError:
|
except (KeyError, TypeError, AttributeError):
|
||||||
print("No tasks in queue {}".format(queue))
|
print("No tasks in queue {}".format(queue))
|
||||||
continue
|
continue
|
||||||
events_service.send_log_events(
|
events_service.send_log_events(
|
||||||
@ -761,8 +774,26 @@ class K8sIntegration(Worker):
|
|||||||
level="INFO",
|
level="INFO",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
task_session = None
|
||||||
|
if self._impersonate_as_task_owner:
|
||||||
|
try:
|
||||||
|
task_user = response["task_info"]["user"]
|
||||||
|
task_company = response["task_info"]["company"]
|
||||||
|
except (KeyError, TypeError, AttributeError):
|
||||||
|
print("Error: cannot retrieve owner user for the task '{}', skipping".format(task_id))
|
||||||
|
continue
|
||||||
|
|
||||||
|
task_session = self.get_task_session(task_user, task_company)
|
||||||
|
if not task_session:
|
||||||
|
print(
|
||||||
|
"Error: Could not login as the user '{}' for the task '{}', skipping".format(
|
||||||
|
task_user, task_id
|
||||||
|
)
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
self.report_monitor(ResourceMonitor.StatusReport(queues=queues, queue=queue, task=task_id))
|
self.report_monitor(ResourceMonitor.StatusReport(queues=queues, queue=queue, task=task_id))
|
||||||
self.run_one_task(queue, task_id, worker_params)
|
self.run_one_task(queue, task_id, worker_params, task_session)
|
||||||
self.report_monitor(ResourceMonitor.StatusReport(queues=self.queues))
|
self.report_monitor(ResourceMonitor.StatusReport(queues=self.queues))
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
@ -773,7 +804,7 @@ class K8sIntegration(Worker):
|
|||||||
if self._session.config["agent.reload_config"]:
|
if self._session.config["agent.reload_config"]:
|
||||||
self.reload_config()
|
self.reload_config()
|
||||||
|
|
||||||
def k8s_daemon(self, queue):
|
def k8s_daemon(self, queue, **kwargs):
|
||||||
"""
|
"""
|
||||||
Start the k8s Glue service.
|
Start the k8s Glue service.
|
||||||
This service will be pulling tasks from *queue* and scheduling them for execution using kubectl.
|
This service will be pulling tasks from *queue* and scheduling them for execution using kubectl.
|
||||||
@ -784,8 +815,10 @@ class K8sIntegration(Worker):
|
|||||||
|
|
||||||
:param list(str) queue: queue name to pull from
|
:param list(str) queue: queue name to pull from
|
||||||
"""
|
"""
|
||||||
return self.daemon(queues=[ObjectID(name=queue)] if queue else None,
|
return self.daemon(
|
||||||
log_level=logging.INFO, foreground=True, docker=False)
|
queues=[ObjectID(name=queue)] if queue else None,
|
||||||
|
log_level=logging.INFO, foreground=True, docker=False, **kwargs,
|
||||||
|
)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_ssh_server_bash(cls, ssh_port_number):
|
def get_ssh_server_bash(cls, ssh_port_number):
|
||||||
|
@ -65,6 +65,10 @@ def parse_args():
|
|||||||
help="Limit the maximum number of pods that this service can run at the same time."
|
help="Limit the maximum number of pods that this service can run at the same time."
|
||||||
"Should not be used with ports-mode"
|
"Should not be used with ports-mode"
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--use-owner-token", action="store_true", default=False,
|
||||||
|
help="Generate and use task owner token for the execution of each task"
|
||||||
|
)
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
@ -87,7 +91,7 @@ def main():
|
|||||||
ssh_port_number=args.ssh_server_port) if args.ssh_server_port else None,
|
ssh_port_number=args.ssh_server_port) if args.ssh_server_port else None,
|
||||||
namespace=args.namespace, max_pods_limit=args.max_pods or None,
|
namespace=args.namespace, max_pods_limit=args.max_pods or None,
|
||||||
)
|
)
|
||||||
k8s.k8s_daemon(args.queue)
|
k8s.k8s_daemon(args.queue, use_owner_token=args.use_owner_token)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
Loading…
Reference in New Issue
Block a user