diff --git a/README.md b/README.md index 038071c..8773ccc 100644 --- a/README.md +++ b/README.md @@ -205,7 +205,7 @@ In the `clearml` web UI, find the experiment (Task) you wish to debug. Click on the ID button next to the Task name, and copy the unique ID. ``` bash -clearml-session --debugging +clearml-session --debugging-session ``` Click on the JupyterLab/VSCode link, or connect directly to the SSH session @@ -219,9 +219,12 @@ clearml-session --help ``` console clearml-session - CLI for launching JupyterLab / VSCode on a remote machine usage: clearml-session [-h] [--version] [--attach [ATTACH]] - [--debugging DEBUGGING] [--queue QUEUE] - [--docker DOCKER] [--public-ip [true/false]] + [--debugging-session DEBUGGING_SESSION] [--queue QUEUE] + [--docker DOCKER] [--docker-args DOCKER_ARGS] + [--public-ip [true/false]] + [--remote-ssh-port REMOTE_SSH_PORT] [--vscode-server [true/false]] + [--vscode-version VSCODE_VERSION] [--jupyter-lab [true/false]] [--git-credentials [true/false]] [--user-folder USER_FOLDER] @@ -231,11 +234,11 @@ usage: clearml-session [-h] [--version] [--attach [ATTACH]] [--config-file CONFIG_FILE] [--remote-gateway [REMOTE_GATEWAY]] [--base-task-id BASE_TASK_ID] [--project PROJECT] - [--disable-keepalive] + [--keepalive [true/false]] [--queue-excluded-tag [QUEUE_EXCLUDED_TAG [QUEUE_EXCLUDED_TAG ...]]] [--queue-include-tag [QUEUE_INCLUDE_TAG [QUEUE_INCLUDE_TAG ...]]] [--skip-docker-network] [--password PASSWORD] - [--username USERNAME] + [--username USERNAME] [--verbose] clearml-session - CLI for launching JupyterLab / VSCode on a remote machine @@ -244,23 +247,34 @@ optional arguments: --version Display the clearml-session utility version --attach [ATTACH] Attach to running interactive session (default: previous session) - --debugging DEBUGGING + --debugging-session DEBUGGING_SESSION Pass existing Task id (experiment), create a copy of the experiment on a remote machine, and launch jupyter/ssh for interactive access. Example - --debugging + --debugging-session --queue QUEUE Select the queue to launch the interactive session on (default: previously used queue) --docker DOCKER Select the docker image to use in the interactive session on (default: previously used docker image or `nvidia/cuda:10.1-runtime-ubuntu18.04`) + --docker-args DOCKER_ARGS + Add additional arguments for the docker image to use + in the interactive session on (default: previously + used docker-args) --public-ip [true/false] If True register the public IP of the remote machine. Set if running on the cloud. Default: false (use for local / on-premises) + --remote-ssh-port REMOTE_SSH_PORT + Set the remote ssh server port, running on the agent`s + machine. (default: 10022) --vscode-server [true/false] Install vscode server (code-server) on interactive session (default: true) + --vscode-version VSCODE_VERSION + Set vscode server (code-server) version, as well as + vscode python extension version + (example: "3.7.4:2020.10.332292344") --jupyter-lab [true/false] Install Jupyter-Lab on interactive session (default: true) @@ -287,17 +301,19 @@ optional arguments: Advanced: Change the configuration file used to store the previous state (default: ~/.clearml_session.json) --remote-gateway [REMOTE_GATEWAY] - Advanced: Specify gateway ip/address to be passed to - interactive session (for use with k8s ingestion / ELB) + Advanced: Specify gateway ip/address:port to be passed + to interactive session (for use with k8s ingestion / + ELB) --base-task-id BASE_TASK_ID Advanced: Set the base task ID for the interactive session. (default: previously used Task). Use `none` for the default interactive session --project PROJECT Advanced: Set the project name for the interactive session Task - --disable-keepalive Advanced: If set, disable the transparent proxy always - keeping the sockets alive. Default: false, use - transparent socket mitigating connection drops. + --keepalive [true/false] + Advanced: If set, enables the transparent proxy always + keeping the sockets alive. Default: False, do not use + transparent socket for mitigating connection drops. --queue-excluded-tag [QUEUE_EXCLUDED_TAG [QUEUE_EXCLUDED_TAG ...]] Advanced: Excluded queues with this specific tag from the selection @@ -313,6 +329,8 @@ optional arguments: used one) --username USERNAME Advanced: Select ssh username for the interactive session (default: `root` or previously used one) + --verbose Advanced: If set, print verbose progress information, + e.g. the remote machine setup process log Notice! all arguments are stored as new defaults for the next session ``` diff --git a/clearml_session/__main__.py b/clearml_session/__main__.py index 804cfd9..c0d2185 100644 --- a/clearml_session/__main__.py +++ b/clearml_session/__main__.py @@ -1,3 +1,4 @@ +import base64 import hashlib import json import logging @@ -20,6 +21,7 @@ import psutil from clearml import Task from clearml.backend_api.session.client import APIClient from clearml.config import config_obj +from clearml.backend_api import Session from .tcp_proxy import TcpProxy from .single_thread_proxy import SingleThreadProxy @@ -136,35 +138,59 @@ def create_base_task(state, project_name=None, task_name=None): task = Task.create(project_name=project_name or 'DevOps', task_name=task_name or 'Interactive Session', task_type=Task.TaskTypes.application) - task_state = task.export_task() + task_script = task.data.script.to_dict() base_script_file = os.path.abspath(os.path.join(__file__, '..', 'tcp_proxy.py')) with open(base_script_file, 'rt') as f: - task_state['script']['diff'] = f.read() + task_script['diff'] = f.read() base_script_file = os.path.abspath(os.path.join(__file__, '..', 'interactive_session_task.py')) with open(base_script_file, 'rt') as f: - task_state['script']['diff'] += '\n\n' + f.read() + task_script['diff'] += '\n\n' + f.read() - task_state['script']['working_dir'] = '.' - task_state['script']['entry_point'] = 'interactive_session.py' - task_state['script']['requirements'] = {'pip': '\n'.join( - ["clearml"] + (["jupyter", "jupyterlab", "jupyterlab_git"] if state.get('jupyter_lab') else []) + + task_script['working_dir'] = '.' + task_script['entry_point'] = 'interactive_session.py' + task_script['requirements'] = {'pip': '\n'.join( + ["clearml>=1.1.5"] + (["jupyter", "jupyterlab", "jupyterlab_git"] if state.get('jupyter_lab') else []) + (['pylint'] if state.get('vscode_server') else []))} - task.update_task(task_state) + section, _, _ = _get_config_section_name() - task.set_parameters({ - "{}/user_base_directory".format(section): "~/", - "{}/ssh_server".format(section): True, - "{}/ssh_password".format(section): "training", - "{}/default_docker".format(section): "nvidia/cuda", - "{}/user_key".format(section): '', - "{}/user_secret".format(section): '', - "properties/external_address": '', - "properties/internal_ssh_port": '', - "properties/jupyter_token": '', - "properties/jupyter_port": '', - }) + + if Session.check_min_api_version('2.13'): + _runtime_prop = dict(task._get_runtime_properties()) + _runtime_prop.update({ + "_user_key": '', + "_user_secret": '', + "_jupyter_token": '', + "_ssh_password": "training", + }) + # noinspection PyProtectedMember + task._set_runtime_properties(_runtime_prop) + task.set_parameters({ + "{}/user_base_directory".format(section): "~/", + "{}/ssh_server".format(section): True, + "{}/default_docker".format(section): "nvidia/cuda", + "properties/external_address": '', + "properties/internal_ssh_port": '', + "properties/jupyter_port": '', + }) + else: + task.set_parameters({ + "{}/user_base_directory".format(section): "~/", + "{}/ssh_server".format(section): True, + "{}/ssh_password".format(section): "training", + "{}/default_docker".format(section): "nvidia/cuda", + "{}/user_key".format(section): '', + "{}/user_secret".format(section): '', + "properties/external_address": '', + "properties/internal_ssh_port": '', + "properties/jupyter_token": '', + "properties/jupyter_port": '', + }) + task.set_system_tags([system_tag]) - task.reset(force=True) + + # only update the data at the end, so reload requests are smaller + # noinspection PyProtectedMember + task._edit(script=task_script) return task @@ -197,24 +223,44 @@ def create_debugging_task(state, debug_task_id): (['pylint'] if state.get('vscode_server') else []) task.update_task(task_state) section, _, _ = _get_config_section_name() - task.set_parameters({ - "{}/user_base_directory".format(section): "~/", - "{}/ssh_server".format(section): True, - "{}/ssh_password".format(section): "training", - "{}/default_docker".format(section): "nvidia/cuda", - "{}/user_key".format(section): '', - "{}/user_secret".format(section): '', - "properties/external_address": '', - "properties/internal_ssh_port": '', - "properties/jupyter_token": '', - "properties/jupyter_port": '', - }) + + if Session.check_min_api_version('2.13'): + _runtime_prop = dict(task._get_runtime_properties()) + _runtime_prop.update({ + "_user_key": '', + "_user_secret": '', + "_jupyter_token": '', + "_ssh_password": "training", + }) + # noinspection PyProtectedMember + task._set_runtime_properties(_runtime_prop) + task.set_parameters({ + "{}/user_base_directory".format(section): "~/", + "{}/ssh_server".format(section): True, + "{}/default_docker".format(section): "nvidia/cuda", + "properties/external_address": '', + "properties/internal_ssh_port": '', + "properties/jupyter_port": '', + }) + else: + task.set_parameters({ + "{}/user_base_directory".format(section): "~/", + "{}/ssh_server".format(section): True, + "{}/ssh_password".format(section): "training", + "{}/default_docker".format(section): "nvidia/cuda", + "{}/user_key".format(section): '', + "{}/user_secret".format(section): '', + "properties/external_address": '', + "properties/internal_ssh_port": '', + "properties/jupyter_token": '', + "properties/jupyter_port": '', + }) task.set_system_tags([system_tag]) task.reset(force=True) return task -def delete_old_tasks(client, base_task_id): +def delete_old_tasks(state, client, base_task_id): print('Removing stale interactive sessions') current_user_id = _get_user_id(client) previous_tasks = client.tasks.get_all(**{ @@ -222,9 +268,13 @@ def delete_old_tasks(client, base_task_id): 'parent': base_task_id or None, 'system_tags': None if base_task_id else [system_tag], 'page_size': 100, 'page': 0, - 'user': [current_user_id], 'only_fields': ['id'] + 'user': [current_user_id], + 'only_fields': ['id'] }) - for t in previous_tasks: + + for i, t in enumerate(previous_tasks): + if state.get('verbose'): + print('Removing {}/{} stale sessions'.format(i+1, len(previous_tasks))) try: client.tasks.delete(task=t.id, force=True) except Exception as ex: @@ -265,6 +315,17 @@ def _get_user_id(client): return current_user_id +def _b64_encode_file(file): + # noinspection PyBroadException + try: + import gzip + with open(file, 'rt') as f: + git_credentials = gzip.compress(f.read().encode('utf8')) + return base64.encodebytes(git_credentials).decode('ascii') + except Exception: + return None + + def get_project_id(state): project_id = None project_name = state.get('project') or None @@ -383,8 +444,8 @@ def load_state(state_file): state = json.load(f) except Exception: state = {} - # never reload --debug state - state.pop('debug', None) + # never reload --verbose state + state.pop('verbose', None) return state @@ -402,13 +463,31 @@ def clone_task(state, project_id): task = create_base_task(state, project_name=state.get('project')) new_task = True + print('Configuring new session') + runtime_prop_support = Session.check_min_api_version("2.13") + if runtime_prop_support: + # noinspection PyProtectedMember + runtime_properties = dict(task._get_runtime_properties() or {}) + runtime_properties['_jupyter_token'] = '' + runtime_properties['_ssh_password'] = str(state['password']) + runtime_properties['_user_key'] = str(config_obj.get("api.credentials.access_key")) + runtime_properties['_user_secret'] = (config_obj.get("api.credentials.secret_key")) + # noinspection PyProtectedMember + task._set_runtime_properties(runtime_properties) + task_params = task.get_parameters(backwards_compatibility=False) if 'General/ssh_server' in task_params: section = 'General' init_section = 'init_script' else: section, _, init_section = _get_config_section_name() - task_params['properties/jupyter_token'] = '' + + if not runtime_prop_support: + task_params['properties/jupyter_token'] = '' + task_params['{}/ssh_password'.format(section)] = state['password'] + task_params['{}/user_key'.format(section)] = config_obj.get("api.credentials.access_key") + task_params['{}/user_secret'.format(section)] = config_obj.get("api.credentials.secret_key") + task_params['properties/jupyter_port'] = '' if state.get('remote_gateway') is not None: remote_gateway_parts = str(state.get('remote_gateway')).split(':') @@ -416,9 +495,6 @@ def clone_task(state, project_id): if len(remote_gateway_parts) > 1: task_params['properties/external_ssh_port'] = remote_gateway_parts[1] task_params['{}/ssh_server'.format(section)] = str(True) - task_params['{}/ssh_password'.format(section)] = state['password'] - task_params['{}/user_key'.format(section)] = config_obj.get("api.credentials.access_key") - task_params['{}/user_secret'.format(section)] = config_obj.get("api.credentials.secret_key") task_params["{}/jupyterlab".format(section)] = bool(state.get('jupyter_lab')) task_params["{}/vscode_server".format(section)] = bool(state.get('vscode_server')) task_params["{}/public_ip".format(section)] = bool(state.get('public_ip')) @@ -443,13 +519,30 @@ def clone_task(state, project_id): # store the .git-credentials if state.get('git_credentials'): git_cred_file = os.path.join(os.path.expanduser('~'), '.git-credentials') - if os.path.isfile(git_cred_file): - task.connect_configuration( - configuration=git_cred_file, name='git_credentials', description='git credentials') git_conf_file = os.path.join(os.path.expanduser('~'), '.gitconfig') - if os.path.isfile(git_conf_file): - task.connect_configuration( - configuration=git_conf_file, name='git_config', description='git config') + if not os.path.isfile(git_cred_file): + git_cred_file = None + if not os.path.isfile(git_conf_file): + git_conf_file = None + + if runtime_prop_support: + # noinspection PyProtectedMember + runtime_properties = dict(task._get_runtime_properties() or {}) + if git_cred_file: + runtime_properties['_git_credentials'] = _b64_encode_file(git_cred_file) + if git_conf_file: + runtime_properties['_git_config'] = _b64_encode_file(git_conf_file) + # store back + if git_cred_file or git_conf_file: + # noinspection PyProtectedMember + task._set_runtime_properties(runtime_properties) + else: + if git_cred_file: + task.connect_configuration( + configuration=git_cred_file, name='git_credentials', description='git credentials') + if git_conf_file: + task.connect_configuration( + configuration=git_conf_file, name='git_config', description='git config') if state.get('packages'): requirements = task.data.script.requirements or {} @@ -476,21 +569,28 @@ def wait_for_machine(state, task): # wait until task is running print('Waiting for remote machine allocation [id={}]'.format(task.id)) last_status = None - while last_status != 'in_progress' and last_status in (None, 'created', 'queued', 'unknown',): + last_message = None + stopped_counter = 0 + while last_status != 'in_progress' and last_status in (None, 'created', 'queued', 'unknown', 'stopped'): print('.', end='', flush=True) if last_status is not None: sleep(2.) - status = task.get_status() - if last_status != status: + stopped_counter = (stopped_counter+1) if last_status == 'stopped' else 0 + if stopped_counter > 5: + break + # noinspection PyProtectedMember + status, message = task._get_status() + status = str(status) + if last_status != status or last_message != message: # noinspection PyProtectedMember - last_status = task._get_status()[1] - print('Status [{}]{}'.format(status, ' - {}'.format(last_status) if last_status else '')) + print('Status [{}]{} {}'.format(status, ' - {}'.format(last_status) if last_status else '', message)) last_status = status + last_message = message print('Remote machine allocated') print('Setting remote environment [Task id={}]'.format(task.id)) print('Setup process details: {}'.format(task.get_output_log_web_page())) - print('Waiting for environment setup to complete [usually about 20-30 seconds]') + print('Waiting for environment setup to complete [usually about 20-30 seconds, see last log line/s below]') # monitor progress, until we get the new jupyter, then we know it is working task.reload() @@ -513,24 +613,38 @@ def wait_for_machine(state, task): last_lines = [] period_counter = 0 while any(bool(not task.get_parameter(p)) for p in wait_properties) and task.get_status() == 'in_progress': - lines = task.get_reported_console_output(10) if state.get('debug') else [] + lines = task.get_reported_console_output(10 if state.get('verbose') else 1) if last_lines != lines: # new line if we had '.' counter in the previous run if period_counter: - print('') + if state.get('verbose'): + print('') period_counter = 0 try: index = next(i for i, line in enumerate(lines) if last_lines and line == last_lines[-1]) - print('> ' + ''.join(lines[index+1:]).rstrip().replace('\n', '\n> ')) + print_line = '> ' + ''.join(lines[index+1:]).rstrip().replace('\n', '\n> ') except StopIteration: - print('> ' + ''.join(lines).rstrip().replace('\n', '\n> ')) + print_line = '> ' + ''.join(lines).rstrip().replace('\n', '\n> ') + + if state.get('verbose'): + print(print_line) + else: + print_line = [l for l in print_line.split('\n') if l.rstrip()] + if print_line: + print('\r' + print_line[-1], end='', flush=True) last_lines = lines else: - print('.', end='', flush=True) period_counter += 1 + print(('' if state.get('verbose') else '\r') + '.'*period_counter, end='', flush=True) sleep(3.) task.reload() + + # clear the line + if not state.get('verbose'): + print('\r ', end='', flush=True) + print('\n') + if task.get_status() != 'in_progress': raise ValueError("Remote setup failed (status={}) see details: {}".format( task.get_status(), task.get_output_log_web_page())) @@ -609,7 +723,7 @@ def monitor_ssh_tunnel(state, task): vscode_port = None connect_state = {'reconnect': False} - if not state.get('disable_keepalive'): + if state.get('keepalive'): if state.get('jupyter_lab'): SingleThreadProxy(local_jupyter_port, local_jupyter_port_) if state.get('vscode_server'): @@ -628,18 +742,25 @@ def monitor_ssh_tunnel(state, task): if not all([ssh_port, jupyter_token, jupyter_port, internal_ssh_port, ssh_password, remote_address]): task.reload() task_parameters = task.get_parameters() - section = 'General' if 'General/ssh_server' in task_parameters else default_section + if Session.check_min_api_version("2.13"): + # noinspection PyProtectedMember + runtime_prop = task._get_runtime_properties() + ssh_password = runtime_prop.get('_ssh_password') or state.get('password', '') + jupyter_token = runtime_prop.get('_jupyter_token') + else: + section = 'General' if 'General/ssh_server' in task_parameters else default_section + ssh_password = task_parameters.get('{}/ssh_password'.format(section)) or state.get('password', '') + jupyter_token = task_parameters.get('properties/jupyter_token') + remote_address = \ task_parameters.get('properties/k8s-gateway-address') or \ task_parameters.get('properties/external_address') - ssh_password = task_parameters.get('{}/ssh_password'.format(section)) or state.get('password', '') internal_ssh_port = task_parameters.get('properties/internal_ssh_port') jupyter_port = task_parameters.get('properties/jupyter_port') - jupyter_token = task_parameters.get('properties/jupyter_token') ssh_port = \ task_parameters.get('properties/k8s-pod-port') or \ task_parameters.get('properties/external_ssh_port') or internal_ssh_port - if not state.get('disable_keepalive'): + if state.get('keepalive'): internal_ssh_port = task_parameters.get('properties/internal_stable_ssh_port') or internal_ssh_port local_remote_pair_list = [(local_ssh_port_, internal_ssh_port)] if state.get('jupyter_lab'): @@ -671,7 +792,7 @@ def monitor_ssh_tunnel(state, task): state.get('username') or 'root', remote_address, ssh_port, ssh_password, local_remote_pair_list=local_remote_pair_list, - debug=state.get('debug', False), + debug=state.get('verbose', False), ) if ssh_process and ssh_process.isalive(): @@ -684,9 +805,13 @@ def monitor_ssh_tunnel(state, task): msg += \ '\nJupyter Lab URL: http://localhost:{local_jupyter_port}/?token={jupyter_token}'.format( local_jupyter_port=local_jupyter_port, jupyter_token=jupyter_token.rstrip()) + if state.get('user_folder'): + msg += "&file-browser-path={}".format(state.get('user_folder')) if vscode_port: msg += '\nVSCode server available at http://localhost:{local_vscode_port}/'.format( local_vscode_port=local_vscode_port) + if state.get('user_folder'): + msg += "?folder={}".format(state.get('user_folder')) print(msg) print('\nConnection is up and running\n' @@ -810,9 +935,10 @@ def setup_parser(parser): '(default: previously used Task). Use `none` for the default interactive session') parser.add_argument('--project', type=str, default=None, help='Advanced: Set the project name for the interactive session Task') - parser.add_argument('--disable-keepalive', action='store_true', default=None, - help='Advanced: If set, disable the transparent proxy always keeping the sockets alive. ' - 'Default: false, use transparent socket mitigating connection drops.') + parser.add_argument('--keepalive', default=False, nargs='?', const='true', metavar='true/false', + type=lambda x: (str(x).strip().lower() in ('true', 'yes')), + help='Advanced: If set, enables the transparent proxy always keeping the sockets alive. ' + 'Default: False, do not use transparent socket for mitigating connection drops.') parser.add_argument('--queue-excluded-tag', default=None, nargs='*', help='Advanced: Excluded queues with this specific tag from the selection') parser.add_argument('--queue-include-tag', default=None, nargs='*', @@ -826,8 +952,9 @@ def setup_parser(parser): parser.add_argument('--username', type=str, default=None, help='Advanced: Select ssh username for the interactive session ' '(default: `root` or previously used one)') - parser.add_argument('--debug', action='store_true', default=None, - help='Advanced: If set, print debugging information') + parser.add_argument('--verbose', action='store_true', default=None, + help='Advanced: If set, print verbose progress information, ' + 'e.g. the remote machine setup process log') def get_version(): @@ -862,8 +989,8 @@ def cli(): state_file = os.path.abspath(os.path.expandvars(os.path.expanduser(args.config_file))) state = load_state(state_file) - if args.debug: - state['debug'] = args.debug + if args.verbose: + state['verbose'] = args.verbose client = APIClient() @@ -894,7 +1021,7 @@ def cli(): project_id = get_project_id(state) # remove old Tasks created by us. - delete_old_tasks(client, state.get('base_task_id')) + delete_old_tasks(state, client, state.get('base_task_id')) # Clone the Task and adjust parameters task = clone_task(state, project_id) diff --git a/clearml_session/interactive_session_task.py b/clearml_session/interactive_session_task.py index dbfbc1d..b89904d 100644 --- a/clearml_session/interactive_session_task.py +++ b/clearml_session/interactive_session_task.py @@ -1,3 +1,4 @@ +import base64 import json import os import socket @@ -13,6 +14,7 @@ import psutil from pathlib2 import Path from clearml import Task, StorageManager +from clearml.backend_api import Session # noinspection SpellCheckingInspection @@ -97,7 +99,25 @@ def init_task(param, a_default_ssh_fingerprint): project_name="DevOps", task_name="Allocate Jupyter Notebook Instance", task_type=Task.TaskTypes.service) # Add jupyter server base folder - task.connect(param, name=config_section_name) + if Session.check_min_api_version('2.13'): + param.pop('user_key', None) + param.pop('user_secret', None) + param.pop('ssh_password', None) + task.connect(param, name=config_section_name) + # noinspection PyProtectedMember + runtime_prop = dict(task._get_runtime_properties()) + # remove the user key/secret the moment we have it + param['user_key'] = runtime_prop.pop('_user_key', None) + param['user_secret'] = runtime_prop.pop('_user_secret', None) + # no need to reset, we will need it + param['ssh_password'] = runtime_prop.get('_ssh_password') + # Force removing properties + # noinspection PyProtectedMember + task._edit(runtime=runtime_prop) + task.reload() + else: + task.connect(param, name=config_section_name) + # connect ssh finger print configuration (with fallback if section is missing) old_default_ssh_fingerprint = deepcopy(a_default_ssh_fingerprint) try: @@ -123,16 +143,17 @@ def setup_os_env(param): "_API_SECRET_KEY", "_API_HOST_VERIFY_CERT", "_DOCKER_IMAGE", + "_DOCKER_BASH_SCRIPT", ) # set default docker image, with network configuration if param.get('default_docker', '').strip(): - os.environ["TRAINS_DOCKER_IMAGE"] = param['default_docker'].strip() os.environ["CLEARML_DOCKER_IMAGE"] = param['default_docker'].strip() # setup os environment env = deepcopy(os.environ) for key in os.environ: - if (key.startswith("TRAINS") or key.startswith("CLEARML")) and not any(key.endswith(p) for p in preserve): + # only set CLEARML_ remove any TRAINS_ + if key.startswith("TRAINS") or (key.startswith("CLEARML") and not any(key.endswith(p) for p in preserve)): env.pop(key, None) return env @@ -188,8 +209,7 @@ def monitor_jupyter_server(fd, local_filename, process, task, jupyter_port, host # we could not locate the token, try again if not token: continue - # update the task with the correct links and token - task.set_parameter(name='properties/jupyter_token', value=str(token)) + # we ignore the reported port, because jupyter server will get confused # if we have multiple servers running and will point to the wrong port/server task.set_parameter(name='properties/jupyter_port', value=str(jupyter_port)) @@ -197,8 +217,20 @@ def monitor_jupyter_server(fd, local_filename, process, task, jupyter_port, host 'https' if "https://" in line else 'http', hostnames, jupyter_port, token ) + + # update the task with the correct links and token + if Session.check_min_api_version("2.13"): + # noinspection PyProtectedMember + runtime_prop = task._get_runtime_properties() + runtime_prop['_jupyter_token'] = str(token) + runtime_prop['_jupyter_url'] = str(jupyter_url) + # noinspection PyProtectedMember + task._set_runtime_properties(runtime_prop) + else: + task.set_parameter(name='properties/jupyter_token', value=str(token)) + task.set_parameter(name='properties/jupyter_url', value=jupyter_url) + print('\nJupyter Lab URL: {}\n'.format(jupyter_url)) - task.set_parameter(name='properties/jupyter_url', value=jupyter_url) # cleanup # noinspection PyBroadException @@ -219,8 +251,8 @@ def start_vscode_server(hostname, hostnames, param, task, env): # get vscode version and python extension version # they are extremely flaky, this combination works, most do not. - vscode_version = '3.9.2' - python_ext_version = '2021.3.658691958' + vscode_version = '3.12.0' + python_ext_version = '2021.10.1365161279' if param.get("vscode_version"): vscode_version_parts = param.get("vscode_version").split(':') vscode_version = vscode_version_parts[0] @@ -231,27 +263,40 @@ def start_vscode_server(hostname, hostnames, param, task, env): env = dict(**env) env.pop('PYTHONPATH', None) + pre_installed = False + python_ext = None + # find a free tcp port port = get_free_port(9000, 9100) if os.geteuid() == 0: - # installing VSCODE: + # check if preinstalled + # noinspection PyBroadException try: - python_ext = StorageManager.get_local_copy( - 'https://github.com/microsoft/vscode-python/releases/download/{}/ms-python-release.vsix'.format( - python_ext_version), - extract_archive=False) - code_server_deb = StorageManager.get_local_copy( - 'https://github.com/cdr/code-server/releases/download/' - 'v{version}/code-server_{version}_amd64.deb'.format(version=vscode_version), - extract_archive=False) - os.system("dpkg -i {}".format(code_server_deb)) - except Exception as ex: - print("Failed installing vscode server: {}".format(ex)) - return - vscode_path = 'code-server' + vscode_path = subprocess.check_output('which code-server', shell=True).decode().strip() + pre_installed = bool(vscode_path) + except Exception: + vscode_path = None + + if not vscode_path: + # installing VSCODE: + try: + python_ext = StorageManager.get_local_copy( + 'https://github.com/microsoft/vscode-python/releases/download/{}/ms-python-release.vsix'.format( + python_ext_version), + extract_archive=False) + code_server_deb = StorageManager.get_local_copy( + 'https://github.com/cdr/code-server/releases/download/' + 'v{version}/code-server_{version}_amd64.deb'.format(version=vscode_version), + extract_archive=False) + os.system("dpkg -i {}".format(code_server_deb)) + except Exception as ex: + print("Failed installing vscode server: {}".format(ex)) + return + vscode_path = 'code-server' else: python_ext = None + pre_installed = True # check if code-server exists # noinspection PyBroadException try: @@ -280,51 +325,65 @@ def start_vscode_server(hostname, hostnames, param, task, env): try: fd, local_filename = mkstemp() - subprocess.Popen( - [ - vscode_path, - "--auth", - "none", - "--bind-addr", - "127.0.0.1:{}".format(port), - "--user-data-dir", user_folder, - "--extensions-dir", exts_folder, - "--install-extension", "ms-toolsai.jupyter", - # "--install-extension", "donjayamanne.python-extension-pack" - ] + ["--install-extension", python_ext] if python_ext else [], - env=env, - stdout=fd, - stderr=fd, - ) - settings = Path(os.path.expanduser(os.path.join(user_folder, 'User/settings.json'))) - settings.parent.mkdir(parents=True, exist_ok=True) - # noinspection PyBroadException - try: - with open(settings.as_posix(), 'rt') as f: - base_json = json.load(f) - except Exception: - base_json = {} - # noinspection PyBroadException - try: - base_json.update({ - "extensions.autoCheckUpdates": False, - "extensions.autoUpdate": False, - "python.pythonPath": sys.executable, - "terminal.integrated.shell.linux": "/bin/bash" if Path("/bin/bash").is_file() else None, - }) - with open(settings.as_posix(), 'wt') as f: - json.dump(base_json, f) - except Exception: - pass + if pre_installed: + user_folder = os.path.expanduser("~/.local/share/code-server/") + if not os.path.isdir(user_folder): + user_folder = None + exts_folder = None + else: + exts_folder = os.path.expanduser("~/.local/share/code-server/extensions/") + else: + subprocess.Popen( + [ + vscode_path, + "--auth", + "none", + "--bind-addr", + "127.0.0.1:{}".format(port), + "--user-data-dir", user_folder, + "--extensions-dir", exts_folder, + "--install-extension", "ms-toolsai.jupyter", + # "--install-extension", "donjayamanne.python-extension-pack" + ] + ["--install-extension", "ms-python.python@{}".format(python_ext_version)] if python_ext else [], + env=env, + stdout=fd, + stderr=fd, + ) + + if user_folder: + settings = Path(os.path.expanduser(os.path.join(user_folder, 'User/settings.json'))) + settings.parent.mkdir(parents=True, exist_ok=True) + # noinspection PyBroadException + try: + with open(settings.as_posix(), 'rt') as f: + base_json = json.load(f) + except Exception: + base_json = {} + # noinspection PyBroadException + try: + base_json.update({ + "extensions.autoCheckUpdates": False, + "extensions.autoUpdate": False, + "python.pythonPath": sys.executable, + "terminal.integrated.shell.linux": "/bin/bash" if Path("/bin/bash").is_file() else None, + }) + with open(settings.as_posix(), 'wt') as f: + json.dump(base_json, f) + except Exception: + pass + proc = subprocess.Popen( ['bash', '-c', - '{} --auth none --bind-addr 127.0.0.1:{} --disable-update-check ' - '--user-data-dir {} --extensions-dir {}'.format(vscode_path, port, user_folder, exts_folder)], + '{} --auth none --bind-addr 127.0.0.1:{} --disable-update-check {} {}'.format( + vscode_path, port, + '--user-data-dir \"{}\"'.format(user_folder) if user_folder else '', + '--extensions-dir \"{}\"'.format(exts_folder) if exts_folder else '')], env=env, stdout=fd, stderr=fd, cwd=cwd, ) + try: error_code = proc.wait(timeout=1) raise ValueError("code-server failed starting, return code {}".format(error_code)) @@ -343,7 +402,7 @@ def start_jupyter_server(hostname, hostnames, param, task, env): print('no jupyterlab to monitor - going to sleep') while True: sleep(10.) - return + return # noqa # execute jupyter notebook fd, local_filename = mkstemp() @@ -418,15 +477,15 @@ def setup_ssh_server(hostname, hostnames, param, task): "&& " # noqa: W605 "echo 'ClientAliveInterval 10' >> /etc/ssh/sshd_config && " "echo 'ClientAliveCountMax 20' >> /etc/ssh/sshd_config && " - "echo 'AcceptEnv TRAINS_API_ACCESS_KEY TRAINS_API_SECRET_KEY " + "echo 'AcceptEnv CLEARML_API_ACCESS_KEY CLEARML_API_SECRET_KEY " "CLEARML_API_ACCESS_KEY CLEARML_API_SECRET_KEY' >> /etc/ssh/sshd_config && " 'echo "export VISIBLE=now" >> /etc/profile && ' 'echo "export PATH=$PATH" >> /etc/profile && ' - 'echo "ldconfig" >> /etc/profile && ' - 'echo "export TRAINS_CONFIG_FILE={trains_config_file}" >> /etc/profile'.format( + 'echo "ldconfig" 2>/dev/null >> /etc/profile && ' + 'echo "export CLEARML_CONFIG_FILE={trains_config_file}" >> /etc/profile'.format( password=ssh_password, port=port, - trains_config_file=os.environ.get("CLEARML_CONFIG_FILE") or os.environ.get("TRAINS_CONFIG_FILE"), + trains_config_file=os.environ.get("CLEARML_CONFIG_FILE") or os.environ.get("CLEARML_CONFIG_FILE"), ) ) sshd_path = '/usr/sbin/sshd' @@ -449,7 +508,7 @@ def setup_ssh_server(hostname, hostnames, param, task): "UsePAM yes" + "\n"\ "AuthorizedKeysFile {}".format(os.path.join(ssh_config_path, 'authorized_keys')) + "\n"\ "PidFile {}".format(os.path.join(ssh_config_path, 'sshd.pid')) + "\n"\ - "AcceptEnv TRAINS_API_ACCESS_KEY TRAINS_API_SECRET_KEY "\ + "AcceptEnv CLEARML_API_ACCESS_KEY CLEARML_API_SECRET_KEY "\ "CLEARML_API_ACCESS_KEY CLEARML_API_SECRET_KEY"+"\n" for k in default_ssh_fingerprint: filename = os.path.join(ssh_config_path, '{}'.format(k.replace('__pub', '.pub'))) @@ -511,13 +570,42 @@ def setup_ssh_server(hostname, hostnames, param, task): print("Error: {}\n\n#\n# Error: SSH server could not be launched\n#\n".format(ex)) +def _b64_decode_file(encoded_string): + # noinspection PyBroadException + try: + import gzip + value = gzip.decompress(base64.decodebytes(encoded_string.encode('ascii'))).decode('utf8') + return value + except Exception: + return None + + def setup_user_env(param, task): env = setup_os_env(param) + + # apply vault if we have it + vault_environment = {} + if param.get("user_key") and param.get("user_secret"): + # noinspection PyBroadException + try: + print('Applying vault configuration') + from clearml.backend_api.session.defs import ENV_ENABLE_ENV_CONFIG_SECTION, ENV_ENABLE_FILES_CONFIG_SECTION + prev_env, prev_files = ENV_ENABLE_ENV_CONFIG_SECTION.get(), ENV_ENABLE_FILES_CONFIG_SECTION.get() + ENV_ENABLE_ENV_CONFIG_SECTION.set(True), ENV_ENABLE_FILES_CONFIG_SECTION.set(True) + prev_envs = deepcopy(os.environ) + Session(api_key=param.get("user_key"), secret_key=param.get("user_secret")) + vault_environment = {k: v for k, v in os.environ.items() if prev_envs.get(k) != v} + ENV_ENABLE_ENV_CONFIG_SECTION.set(prev_env), ENV_ENABLE_FILES_CONFIG_SECTION.set(prev_files) + if vault_environment: + print('Vault environment added: {}'.format(list(vault_environment.keys()))) + except Exception as ex: + print('Applying vault configuration failed: {}'.format(ex)) + # do not change user bash/profile if os.geteuid() != 0: if param.get("user_key") and param.get("user_secret"): - env['TRAINS_API_ACCESS_KEY'] = param.get("user_key") - env['TRAINS_API_SECRET_KEY'] = param.get("user_secret") + env['CLEARML_API_ACCESS_KEY'] = param.get("user_key") + env['CLEARML_API_SECRET_KEY'] = param.get("user_secret") return env # create symbolic link to the venv @@ -530,20 +618,24 @@ def setup_user_env(param, task): pass # set default user credentials if param.get("user_key") and param.get("user_secret"): - os.system("echo 'export TRAINS_API_ACCESS_KEY=\"{}\"' >> ~/.bashrc".format( + os.system("echo 'export CLEARML_API_ACCESS_KEY=\"{}\"' >> ~/.bashrc".format( param.get("user_key", "").replace('$', '\\$'))) - os.system("echo 'export TRAINS_API_SECRET_KEY=\"{}\"' >> ~/.bashrc".format( + os.system("echo 'export CLEARML_API_SECRET_KEY=\"{}\"' >> ~/.bashrc".format( param.get("user_secret", "").replace('$', '\\$'))) - os.system("echo 'export TRAINS_DOCKER_IMAGE=\"{}\"' >> ~/.bashrc".format( - param.get("default_docker", "").strip() or env.get('TRAINS_DOCKER_IMAGE', ''))) - os.system("echo 'export TRAINS_API_ACCESS_KEY=\"{}\"' >> ~/.profile".format( + os.system("echo 'export CLEARML_DOCKER_IMAGE=\"{}\"' >> ~/.bashrc".format( + param.get("default_docker", "").strip() or env.get('CLEARML_DOCKER_IMAGE', ''))) + os.system("echo 'export CLEARML_API_ACCESS_KEY=\"{}\"' >> ~/.profile".format( param.get("user_key", "").replace('$', '\\$'))) - os.system("echo 'export TRAINS_API_SECRET_KEY=\"{}\"' >> ~/.profile".format( + os.system("echo 'export CLEARML_API_SECRET_KEY=\"{}\"' >> ~/.profile".format( param.get("user_secret", "").replace('$', '\\$'))) - os.system("echo 'export TRAINS_DOCKER_IMAGE=\"{}\"' >> ~/.profile".format( - param.get("default_docker", "").strip() or env.get('TRAINS_DOCKER_IMAGE', ''))) - env['TRAINS_API_ACCESS_KEY'] = param.get("user_key") - env['TRAINS_API_SECRET_KEY'] = param.get("user_secret") + os.system("echo 'export CLEARML_DOCKER_IMAGE=\"{}\"' >> ~/.profile".format( + param.get("default_docker", "").strip() or env.get('CLEARML_DOCKER_IMAGE', ''))) + for k, v in vault_environment.items(): + os.system("echo 'export {}=\"{}\"' >> ~/.profile".format(k, v)) + os.system("echo 'export {}=\"{}\"' >> ~/.bashrc".format(k, v)) + env[k] = str(v) if v else "" + env['CLEARML_API_ACCESS_KEY'] = param.get("user_key") + env['CLEARML_API_SECRET_KEY'] = param.get("user_secret") # set default folder for user if param.get("user_base_directory"): base_dir = param.get("user_base_directory") @@ -557,8 +649,27 @@ def setup_user_env(param, task): os.system("echo '. {}' >> ~/.profile".format(os.path.join(environment, 'bin', 'activate'))) # check if we need to create .git-credentials - # noinspection PyProtectedMember - git_credentials = task._get_configuration_text('git_credentials') + + runtime_property_support = Session.check_min_api_version("2.13") + if runtime_property_support: + # noinspection PyProtectedMember + runtime_prop = dict(task._get_runtime_properties()) + git_credentials = runtime_prop.pop('_git_credentials', None) + git_config = runtime_prop.pop('_git_config', None) + # force removing properties + # noinspection PyProtectedMember + task._edit(runtime=runtime_prop) + task.reload() + if git_credentials is not None: + git_credentials = _b64_decode_file(git_credentials) + if git_config is not None: + git_config = _b64_decode_file(git_config) + else: + # noinspection PyProtectedMember + git_credentials = task._get_configuration_text('git_credentials') + # noinspection PyProtectedMember + git_config = task._get_configuration_text('git_config') + if git_credentials: git_cred_file = os.path.expanduser('~/.config/git/credentials') # noinspection PyBroadException @@ -568,8 +679,7 @@ def setup_user_env(param, task): f.write(git_credentials) except Exception: print('Could not write {} file'.format(git_cred_file)) - # noinspection PyProtectedMember - git_config = task._get_configuration_text('git_config') + if git_config: git_config_file = os.path.expanduser('~/.config/git/config') # noinspection PyBroadException diff --git a/clearml_session/version.py b/clearml_session/version.py index bfeb9e7..40ed83d 100644 --- a/clearml_session/version.py +++ b/clearml_session/version.py @@ -1 +1 @@ -__version__ = '0.3.4' +__version__ = '0.3.5' diff --git a/requirements.txt b/requirements.txt index 2030eeb..76bbc81 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -clearml +clearml >= 1.1.5 pexpect ; sys_platform != 'win32' wexpect ; sys_platform == 'win32'