Merge branch 'allegroai:main' into main

2025-06-26 18:16:55 +00:00 · 2022-01-24 10:36:15 +02:00 · 2022-01-24 10:36:15 +02:00 · b3ab43de13
commit b3ab43de13
parent 1f511daa98 e956065db6
5 changed files with 424 additions and 169 deletions
--- a/README.md
+++ b/README.md
@ -205,7 +205,7 @@ In the `clearml` web UI, find the experiment (Task) you wish to debug.
 Click on the ID button next to the Task name, and copy the unique ID.

 ``` bash
-clearml-session --debugging <experiment_id_here>
+clearml-session --debugging-session <experiment_id_here>
 ```

 Click on the JupyterLab/VSCode link, or connect directly to the SSH session
@ -219,9 +219,12 @@ clearml-session --help
 ``` console
 clearml-session - CLI for launching JupyterLab / VSCode on a remote machine
 usage: clearml-session [-h] [--version] [--attach [ATTACH]]
-                       [--debugging DEBUGGING] [--queue QUEUE]
-                       [--docker DOCKER] [--public-ip [true/false]]
+                       [--debugging-session DEBUGGING_SESSION] [--queue QUEUE]
+                       [--docker DOCKER] [--docker-args DOCKER_ARGS]
+                       [--public-ip [true/false]]
+                       [--remote-ssh-port REMOTE_SSH_PORT]
                       [--vscode-server [true/false]]
+                       [--vscode-version VSCODE_VERSION]
                       [--jupyter-lab [true/false]]
                       [--git-credentials [true/false]]
                       [--user-folder USER_FOLDER]
@ -231,11 +234,11 @@ usage: clearml-session [-h] [--version] [--attach [ATTACH]]
                       [--config-file CONFIG_FILE]
                       [--remote-gateway [REMOTE_GATEWAY]]
                       [--base-task-id BASE_TASK_ID] [--project PROJECT]
-                       [--disable-keepalive]
+                       [--keepalive [true/false]]
                       [--queue-excluded-tag [QUEUE_EXCLUDED_TAG [QUEUE_EXCLUDED_TAG ...]]]
                       [--queue-include-tag [QUEUE_INCLUDE_TAG [QUEUE_INCLUDE_TAG ...]]]
                       [--skip-docker-network] [--password PASSWORD]
-                       [--username USERNAME]
+                       [--username USERNAME] [--verbose]

 clearml-session - CLI for launching JupyterLab / VSCode on a remote machine

@ -244,23 +247,34 @@ optional arguments:
  --version             Display the clearml-session utility version
  --attach [ATTACH]     Attach to running interactive session (default:
                        previous session)
-  --debugging DEBUGGING
+  --debugging-session DEBUGGING_SESSION
                        Pass existing Task id (experiment), create a copy of
                        the experiment on a remote machine, and launch
                        jupyter/ssh for interactive access. Example
-                        --debugging <task_id>
+                        --debugging-session <task_id>
  --queue QUEUE         Select the queue to launch the interactive session on
                        (default: previously used queue)
  --docker DOCKER       Select the docker image to use in the interactive
                        session on (default: previously used docker image or
                        `nvidia/cuda:10.1-runtime-ubuntu18.04`)
+  --docker-args DOCKER_ARGS
+                        Add additional arguments for the docker image to use
+                        in the interactive session on (default: previously
+                        used docker-args)
  --public-ip [true/false]
                        If True register the public IP of the remote machine.
                        Set if running on the cloud. Default: false (use for
                        local / on-premises)
+  --remote-ssh-port REMOTE_SSH_PORT
+                        Set the remote ssh server port, running on the agent`s
+                        machine. (default: 10022)
  --vscode-server [true/false]
                        Install vscode server (code-server) on interactive
                        session (default: true)
+  --vscode-version VSCODE_VERSION
+                        Set vscode server (code-server) version, as well as
+                        vscode python extension version <vscode:python-ext>
+                        (example: "3.7.4:2020.10.332292344")
  --jupyter-lab [true/false]
                        Install Jupyter-Lab on interactive session (default:
                        true)
@ -287,17 +301,19 @@ optional arguments:
                        Advanced: Change the configuration file used to store
                        the previous state (default: ~/.clearml_session.json)
  --remote-gateway [REMOTE_GATEWAY]
-                        Advanced: Specify gateway ip/address to be passed to
-                        interactive session (for use with k8s ingestion / ELB)
+                        Advanced: Specify gateway ip/address:port to be passed
+                        to interactive session (for use with k8s ingestion /
+                        ELB)
  --base-task-id BASE_TASK_ID
                        Advanced: Set the base task ID for the interactive
                        session. (default: previously used Task). Use `none`
                        for the default interactive session
  --project PROJECT     Advanced: Set the project name for the interactive
                        session Task
-  --disable-keepalive   Advanced: If set, disable the transparent proxy always
-                        keeping the sockets alive. Default: false, use
-                        transparent socket mitigating connection drops.
+  --keepalive [true/false]
+                        Advanced: If set, enables the transparent proxy always
+                        keeping the sockets alive. Default: False, do not use
+                        transparent socket for mitigating connection drops.
  --queue-excluded-tag [QUEUE_EXCLUDED_TAG [QUEUE_EXCLUDED_TAG ...]]
                        Advanced: Excluded queues with this specific tag from
                        the selection
@ -313,6 +329,8 @@ optional arguments:
                        used one)
  --username USERNAME   Advanced: Select ssh username for the interactive
                        session (default: `root` or previously used one)
+  --verbose             Advanced: If set, print verbose progress information,
+                        e.g. the remote machine setup process log

 Notice! all arguments are stored as new defaults for the next session
 ```
--- a/clearml_session/main.py
+++ b/clearml_session/main.py
@ -1,3 +1,4 @@
+import base64
 import hashlib
 import json
 import logging
@ -20,6 +21,7 @@ import psutil
 from clearml import Task
 from clearml.backend_api.session.client import APIClient
 from clearml.config import config_obj
+from clearml.backend_api import Session
 from .tcp_proxy import TcpProxy
 from .single_thread_proxy import SingleThreadProxy

@ -136,35 +138,59 @@ def create_base_task(state, project_name=None, task_name=None):
    task = Task.create(project_name=project_name or 'DevOps',
                       task_name=task_name or 'Interactive Session',
                       task_type=Task.TaskTypes.application)
-    task_state = task.export_task()
+    task_script = task.data.script.to_dict()
    base_script_file = os.path.abspath(os.path.join(__file__, '..', 'tcp_proxy.py'))
    with open(base_script_file, 'rt') as f:
-        task_state['script']['diff'] = f.read()
+        task_script['diff'] = f.read()
    base_script_file = os.path.abspath(os.path.join(__file__, '..', 'interactive_session_task.py'))
    with open(base_script_file, 'rt') as f:
-        task_state['script']['diff'] += '\n\n' + f.read()
+        task_script['diff'] += '\n\n' + f.read()

-    task_state['script']['working_dir'] = '.'
-    task_state['script']['entry_point'] = 'interactive_session.py'
-    task_state['script']['requirements'] = {'pip': '\n'.join(
-        ["clearml"] + (["jupyter", "jupyterlab", "jupyterlab_git"] if state.get('jupyter_lab') else []) +
+    task_script['working_dir'] = '.'
+    task_script['entry_point'] = 'interactive_session.py'
+    task_script['requirements'] = {'pip': '\n'.join(
+        ["clearml>=1.1.5"] + (["jupyter", "jupyterlab", "jupyterlab_git"] if state.get('jupyter_lab') else []) +
        (['pylint'] if state.get('vscode_server') else []))}
-    task.update_task(task_state)
+
    section, _, _ = _get_config_section_name()
-    task.set_parameters({
-        "{}/user_base_directory".format(section): "~/",
-        "{}/ssh_server".format(section): True,
-        "{}/ssh_password".format(section): "training",
-        "{}/default_docker".format(section): "nvidia/cuda",
-        "{}/user_key".format(section): '',
-        "{}/user_secret".format(section): '',
-        "properties/external_address": '',
-        "properties/internal_ssh_port": '',
-        "properties/jupyter_token": '',
-        "properties/jupyter_port": '',
-    })
+
+    if Session.check_min_api_version('2.13'):
+        _runtime_prop = dict(task._get_runtime_properties())
+        _runtime_prop.update({
+            "_user_key": '',
+            "_user_secret": '',
+            "_jupyter_token": '',
+            "_ssh_password": "training",
+        })
+        # noinspection PyProtectedMember
+        task._set_runtime_properties(_runtime_prop)
+        task.set_parameters({
+            "{}/user_base_directory".format(section): "~/",
+            "{}/ssh_server".format(section): True,
+            "{}/default_docker".format(section): "nvidia/cuda",
+            "properties/external_address": '',
+            "properties/internal_ssh_port": '',
+            "properties/jupyter_port": '',
+        })
+    else:
+        task.set_parameters({
+            "{}/user_base_directory".format(section): "~/",
+            "{}/ssh_server".format(section): True,
+            "{}/ssh_password".format(section): "training",
+            "{}/default_docker".format(section): "nvidia/cuda",
+            "{}/user_key".format(section): '',
+            "{}/user_secret".format(section): '',
+            "properties/external_address": '',
+            "properties/internal_ssh_port": '',
+            "properties/jupyter_token": '',
+            "properties/jupyter_port": '',
+        })
+
    task.set_system_tags([system_tag])
-    task.reset(force=True)
+
+    # only update the data at the end, so reload requests are smaller
+    # noinspection PyProtectedMember
+    task._edit(script=task_script)
    return task


@ -197,24 +223,44 @@ def create_debugging_task(state, debug_task_id):
        (['pylint'] if state.get('vscode_server') else [])
    task.update_task(task_state)
    section, _, _ = _get_config_section_name()
-    task.set_parameters({
-        "{}/user_base_directory".format(section): "~/",
-        "{}/ssh_server".format(section): True,
-        "{}/ssh_password".format(section): "training",
-        "{}/default_docker".format(section): "nvidia/cuda",
-        "{}/user_key".format(section): '',
-        "{}/user_secret".format(section): '',
-        "properties/external_address": '',
-        "properties/internal_ssh_port": '',
-        "properties/jupyter_token": '',
-        "properties/jupyter_port": '',
-    })
+
+    if Session.check_min_api_version('2.13'):
+        _runtime_prop = dict(task._get_runtime_properties())
+        _runtime_prop.update({
+            "_user_key": '',
+            "_user_secret": '',
+            "_jupyter_token": '',
+            "_ssh_password": "training",
+        })
+        # noinspection PyProtectedMember
+        task._set_runtime_properties(_runtime_prop)
+        task.set_parameters({
+            "{}/user_base_directory".format(section): "~/",
+            "{}/ssh_server".format(section): True,
+            "{}/default_docker".format(section): "nvidia/cuda",
+            "properties/external_address": '',
+            "properties/internal_ssh_port": '',
+            "properties/jupyter_port": '',
+        })
+    else:
+        task.set_parameters({
+            "{}/user_base_directory".format(section): "~/",
+            "{}/ssh_server".format(section): True,
+            "{}/ssh_password".format(section): "training",
+            "{}/default_docker".format(section): "nvidia/cuda",
+            "{}/user_key".format(section): '',
+            "{}/user_secret".format(section): '',
+            "properties/external_address": '',
+            "properties/internal_ssh_port": '',
+            "properties/jupyter_token": '',
+            "properties/jupyter_port": '',
+        })
    task.set_system_tags([system_tag])
    task.reset(force=True)
    return task


-def delete_old_tasks(client, base_task_id):
+def delete_old_tasks(state, client, base_task_id):
    print('Removing stale interactive sessions')
    current_user_id = _get_user_id(client)
    previous_tasks = client.tasks.get_all(**{
@ -222,9 +268,13 @@ def delete_old_tasks(client, base_task_id):
        'parent': base_task_id or None,
        'system_tags': None if base_task_id else [system_tag],
        'page_size': 100, 'page': 0,
-        'user': [current_user_id], 'only_fields': ['id']
+        'user': [current_user_id],
+        'only_fields': ['id']
    })
-    for t in previous_tasks:
+
+    for i, t in enumerate(previous_tasks):
+        if state.get('verbose'):
+            print('Removing {}/{} stale sessions'.format(i+1, len(previous_tasks)))
        try:
            client.tasks.delete(task=t.id, force=True)
        except Exception as ex:
@ -265,6 +315,17 @@ def _get_user_id(client):
    return current_user_id


+def _b64_encode_file(file):
+    # noinspection PyBroadException
+    try:
+        import gzip
+        with open(file, 'rt') as f:
+            git_credentials = gzip.compress(f.read().encode('utf8'))
+        return base64.encodebytes(git_credentials).decode('ascii')
+    except Exception:
+        return None
+
+
 def get_project_id(state):
    project_id = None
    project_name = state.get('project') or None
@ -383,8 +444,8 @@ def load_state(state_file):
            state = json.load(f)
    except Exception:
        state = {}
-    # never reload --debug state
-    state.pop('debug', None)
+    # never reload --verbose state
+    state.pop('verbose', None)
    return state


@ -402,13 +463,31 @@ def clone_task(state, project_id):
        task = create_base_task(state, project_name=state.get('project'))
        new_task = True

+    print('Configuring new session')
+    runtime_prop_support = Session.check_min_api_version("2.13")
+    if runtime_prop_support:
+        # noinspection PyProtectedMember
+        runtime_properties = dict(task._get_runtime_properties() or {})
+        runtime_properties['_jupyter_token'] = ''
+        runtime_properties['_ssh_password'] = str(state['password'])
+        runtime_properties['_user_key'] = str(config_obj.get("api.credentials.access_key"))
+        runtime_properties['_user_secret'] = (config_obj.get("api.credentials.secret_key"))
+        # noinspection PyProtectedMember
+        task._set_runtime_properties(runtime_properties)
+
    task_params = task.get_parameters(backwards_compatibility=False)
    if 'General/ssh_server' in task_params:
        section = 'General'
        init_section = 'init_script'
    else:
        section, _, init_section = _get_config_section_name()
-    task_params['properties/jupyter_token'] = ''
+
+    if not runtime_prop_support:
+        task_params['properties/jupyter_token'] = ''
+        task_params['{}/ssh_password'.format(section)] = state['password']
+        task_params['{}/user_key'.format(section)] = config_obj.get("api.credentials.access_key")
+        task_params['{}/user_secret'.format(section)] = config_obj.get("api.credentials.secret_key")
+
    task_params['properties/jupyter_port'] = ''
    if state.get('remote_gateway') is not None:
        remote_gateway_parts = str(state.get('remote_gateway')).split(':')
@ -416,9 +495,6 @@ def clone_task(state, project_id):
        if len(remote_gateway_parts) > 1:
            task_params['properties/external_ssh_port'] = remote_gateway_parts[1]
    task_params['{}/ssh_server'.format(section)] = str(True)
-    task_params['{}/ssh_password'.format(section)] = state['password']
-    task_params['{}/user_key'.format(section)] = config_obj.get("api.credentials.access_key")
-    task_params['{}/user_secret'.format(section)] = config_obj.get("api.credentials.secret_key")
    task_params["{}/jupyterlab".format(section)] = bool(state.get('jupyter_lab'))
    task_params["{}/vscode_server".format(section)] = bool(state.get('vscode_server'))
    task_params["{}/public_ip".format(section)] = bool(state.get('public_ip'))
@ -443,13 +519,30 @@ def clone_task(state, project_id):
    # store the .git-credentials
    if state.get('git_credentials'):
        git_cred_file = os.path.join(os.path.expanduser('~'), '.git-credentials')
-        if os.path.isfile(git_cred_file):
-            task.connect_configuration(
-                configuration=git_cred_file, name='git_credentials', description='git credentials')
        git_conf_file = os.path.join(os.path.expanduser('~'), '.gitconfig')
-        if os.path.isfile(git_conf_file):
-            task.connect_configuration(
-                configuration=git_conf_file, name='git_config', description='git config')
+        if not os.path.isfile(git_cred_file):
+            git_cred_file = None
+        if not os.path.isfile(git_conf_file):
+            git_conf_file = None
+
+        if runtime_prop_support:
+            # noinspection PyProtectedMember
+            runtime_properties = dict(task._get_runtime_properties() or {})
+            if git_cred_file:
+                runtime_properties['_git_credentials'] = _b64_encode_file(git_cred_file)
+            if git_conf_file:
+                runtime_properties['_git_config'] = _b64_encode_file(git_conf_file)
+            # store back
+            if git_cred_file or git_conf_file:
+                # noinspection PyProtectedMember
+                task._set_runtime_properties(runtime_properties)
+        else:
+            if git_cred_file:
+                task.connect_configuration(
+                    configuration=git_cred_file, name='git_credentials', description='git credentials')
+            if git_conf_file:
+                task.connect_configuration(
+                    configuration=git_conf_file, name='git_config', description='git config')

    if state.get('packages'):
        requirements = task.data.script.requirements or {}
@ -476,21 +569,28 @@ def wait_for_machine(state, task):
    # wait until task is running
    print('Waiting for remote machine allocation [id={}]'.format(task.id))
    last_status = None
-    while last_status != 'in_progress' and last_status in (None, 'created', 'queued', 'unknown',):
+    last_message = None
+    stopped_counter = 0
+    while last_status != 'in_progress' and last_status in (None, 'created', 'queued', 'unknown', 'stopped'):
        print('.', end='', flush=True)
        if last_status is not None:
            sleep(2.)
-        status = task.get_status()
-        if last_status != status:
+            stopped_counter = (stopped_counter+1) if last_status == 'stopped' else 0
+            if stopped_counter > 5:
+                break
+        # noinspection PyProtectedMember
+        status, message = task._get_status()
+        status = str(status)
+        if last_status != status or last_message != message:
            # noinspection PyProtectedMember
-            last_status = task._get_status()[1]
-            print('Status [{}]{}'.format(status, ' - {}'.format(last_status) if last_status else ''))
+            print('Status [{}]{} {}'.format(status, ' - {}'.format(last_status) if last_status else '', message))
        last_status = status
+        last_message = message

    print('Remote machine allocated')
    print('Setting remote environment [Task id={}]'.format(task.id))
    print('Setup process details: {}'.format(task.get_output_log_web_page()))
-    print('Waiting for environment setup to complete [usually about 20-30 seconds]')
+    print('Waiting for environment setup to complete [usually about 20-30 seconds, see last log line/s below]')
    # monitor progress, until we get the new jupyter, then we know it is working
    task.reload()

@ -513,24 +613,38 @@ def wait_for_machine(state, task):
    last_lines = []
    period_counter = 0
    while any(bool(not task.get_parameter(p)) for p in wait_properties) and task.get_status() == 'in_progress':
-        lines = task.get_reported_console_output(10) if state.get('debug') else []
+        lines = task.get_reported_console_output(10 if state.get('verbose') else 1)
        if last_lines != lines:
            # new line if we had '.' counter in the previous run
            if period_counter:
-                print('')
+                if state.get('verbose'):
+                    print('')
                period_counter = 0
            try:
                index = next(i for i, line in enumerate(lines) if last_lines and line == last_lines[-1])
-                print('> ' + ''.join(lines[index+1:]).rstrip().replace('\n', '\n> '))
+                print_line = '> ' + ''.join(lines[index+1:]).rstrip().replace('\n', '\n> ')
            except StopIteration:
-                print('> ' + ''.join(lines).rstrip().replace('\n', '\n> '))
+                print_line = '> ' + ''.join(lines).rstrip().replace('\n', '\n> ')
+
+            if state.get('verbose'):
+                print(print_line)
+            else:
+                print_line = [l for l in print_line.split('\n') if l.rstrip()]
+                if print_line:
+                    print('\r' + print_line[-1], end='', flush=True)
            last_lines = lines
        else:
-            print('.', end='', flush=True)
            period_counter += 1
+            print(('' if state.get('verbose') else '\r') + '.'*period_counter, end='', flush=True)

        sleep(3.)
        task.reload()
+
+    # clear the line
+    if not state.get('verbose'):
+        print('\r     ', end='', flush=True)
+        print('\n')
+        
    if task.get_status() != 'in_progress':
        raise ValueError("Remote setup failed (status={}) see details: {}".format(
            task.get_status(), task.get_output_log_web_page()))
@ -609,7 +723,7 @@ def monitor_ssh_tunnel(state, task):
    vscode_port = None

    connect_state = {'reconnect': False}
-    if not state.get('disable_keepalive'):
+    if state.get('keepalive'):
        if state.get('jupyter_lab'):
            SingleThreadProxy(local_jupyter_port, local_jupyter_port_)
        if state.get('vscode_server'):
@ -628,18 +742,25 @@ def monitor_ssh_tunnel(state, task):
            if not all([ssh_port, jupyter_token, jupyter_port, internal_ssh_port, ssh_password, remote_address]):
                task.reload()
                task_parameters = task.get_parameters()
-                section = 'General' if 'General/ssh_server' in task_parameters else default_section
+                if Session.check_min_api_version("2.13"):
+                    # noinspection PyProtectedMember
+                    runtime_prop = task._get_runtime_properties()
+                    ssh_password = runtime_prop.get('_ssh_password') or state.get('password', '')
+                    jupyter_token = runtime_prop.get('_jupyter_token')
+                else:
+                    section = 'General' if 'General/ssh_server' in task_parameters else default_section
+                    ssh_password = task_parameters.get('{}/ssh_password'.format(section)) or state.get('password', '')
+                    jupyter_token = task_parameters.get('properties/jupyter_token')
+
                remote_address = \
                    task_parameters.get('properties/k8s-gateway-address') or \
                    task_parameters.get('properties/external_address')
-                ssh_password = task_parameters.get('{}/ssh_password'.format(section)) or state.get('password', '')
                internal_ssh_port = task_parameters.get('properties/internal_ssh_port')
                jupyter_port = task_parameters.get('properties/jupyter_port')
-                jupyter_token = task_parameters.get('properties/jupyter_token')
                ssh_port = \
                    task_parameters.get('properties/k8s-pod-port') or \
                    task_parameters.get('properties/external_ssh_port') or internal_ssh_port
-                if not state.get('disable_keepalive'):
+                if state.get('keepalive'):
                    internal_ssh_port = task_parameters.get('properties/internal_stable_ssh_port') or internal_ssh_port
                local_remote_pair_list = [(local_ssh_port_, internal_ssh_port)]
                if state.get('jupyter_lab'):
@ -671,7 +792,7 @@ def monitor_ssh_tunnel(state, task):
                    state.get('username') or 'root',
                    remote_address, ssh_port, ssh_password,
                    local_remote_pair_list=local_remote_pair_list,
-                    debug=state.get('debug', False),
+                    debug=state.get('verbose', False),
                )

                if ssh_process and ssh_process.isalive():
@ -684,9 +805,13 @@ def monitor_ssh_tunnel(state, task):
                        msg += \
                            '\nJupyter Lab URL: http://localhost:{local_jupyter_port}/?token={jupyter_token}'.format(
                                local_jupyter_port=local_jupyter_port, jupyter_token=jupyter_token.rstrip())
+                        if state.get('user_folder'):
+                            msg += "&file-browser-path={}".format(state.get('user_folder'))
                    if vscode_port:
                        msg += '\nVSCode server available at http://localhost:{local_vscode_port}/'.format(
                            local_vscode_port=local_vscode_port)
+                        if state.get('user_folder'):
+                            msg += "?folder={}".format(state.get('user_folder'))
                    print(msg)

                    print('\nConnection is up and running\n'
@ -810,9 +935,10 @@ def setup_parser(parser):
                             '(default: previously used Task). Use `none` for the default interactive session')
    parser.add_argument('--project', type=str, default=None,
                        help='Advanced: Set the project name for the interactive session Task')
-    parser.add_argument('--disable-keepalive', action='store_true', default=None,
-                        help='Advanced: If set, disable the transparent proxy always keeping the sockets alive. '
-                             'Default: false, use transparent socket mitigating connection drops.')
+    parser.add_argument('--keepalive', default=False, nargs='?', const='true', metavar='true/false',
+                        type=lambda x: (str(x).strip().lower() in ('true', 'yes')),
+                        help='Advanced: If set, enables the transparent proxy always keeping the sockets alive. '
+                             'Default: False, do not use transparent socket for mitigating connection drops.')
    parser.add_argument('--queue-excluded-tag', default=None, nargs='*',
                        help='Advanced: Excluded queues with this specific tag from the selection')
    parser.add_argument('--queue-include-tag', default=None, nargs='*',
@ -826,8 +952,9 @@ def setup_parser(parser):
    parser.add_argument('--username', type=str, default=None,
                        help='Advanced: Select ssh username for the interactive session '
                             '(default: `root` or previously used one)')
-    parser.add_argument('--debug', action='store_true', default=None,
-                        help='Advanced: If set, print debugging information')
+    parser.add_argument('--verbose', action='store_true', default=None,
+                        help='Advanced: If set, print verbose progress information, '
+                             'e.g. the remote machine setup process log')


 def get_version():
@ -862,8 +989,8 @@ def cli():
    state_file = os.path.abspath(os.path.expandvars(os.path.expanduser(args.config_file)))
    state = load_state(state_file)

-    if args.debug:
-        state['debug'] = args.debug
+    if args.verbose:
+        state['verbose'] = args.verbose

    client = APIClient()

@ -894,7 +1021,7 @@ def cli():
        project_id = get_project_id(state)

        # remove old Tasks created by us.
-        delete_old_tasks(client, state.get('base_task_id'))
+        delete_old_tasks(state, client, state.get('base_task_id'))

        # Clone the Task and adjust parameters
        task = clone_task(state, project_id)
--- a/clearml_session/interactive_session_task.py
+++ b/clearml_session/interactive_session_task.py
@ -1,3 +1,4 @@
+import base64
 import json
 import os
 import socket
@ -13,6 +14,7 @@ import psutil
 from pathlib2 import Path

 from clearml import Task, StorageManager
+from clearml.backend_api import Session


 # noinspection SpellCheckingInspection
@ -97,7 +99,25 @@ def init_task(param, a_default_ssh_fingerprint):
        project_name="DevOps", task_name="Allocate Jupyter Notebook Instance", task_type=Task.TaskTypes.service)

    # Add jupyter server base folder
-    task.connect(param, name=config_section_name)
+    if Session.check_min_api_version('2.13'):
+        param.pop('user_key', None)
+        param.pop('user_secret', None)
+        param.pop('ssh_password', None)
+        task.connect(param, name=config_section_name)
+        # noinspection PyProtectedMember
+        runtime_prop = dict(task._get_runtime_properties())
+        # remove the user key/secret the moment we have it
+        param['user_key'] = runtime_prop.pop('_user_key', None)
+        param['user_secret'] = runtime_prop.pop('_user_secret', None)
+        # no need to reset, we will need it
+        param['ssh_password'] = runtime_prop.get('_ssh_password')
+        # Force removing properties
+        # noinspection PyProtectedMember
+        task._edit(runtime=runtime_prop)
+        task.reload()
+    else:
+        task.connect(param, name=config_section_name)
+
    # connect ssh finger print configuration (with fallback if section is missing)
    old_default_ssh_fingerprint = deepcopy(a_default_ssh_fingerprint)
    try:
@ -123,16 +143,17 @@ def setup_os_env(param):
        "_API_SECRET_KEY",
        "_API_HOST_VERIFY_CERT",
        "_DOCKER_IMAGE",
+        "_DOCKER_BASH_SCRIPT",
    )
    # set default docker image, with network configuration
    if param.get('default_docker', '').strip():
-        os.environ["TRAINS_DOCKER_IMAGE"] = param['default_docker'].strip()
        os.environ["CLEARML_DOCKER_IMAGE"] = param['default_docker'].strip()

    # setup os environment
    env = deepcopy(os.environ)
    for key in os.environ:
-        if (key.startswith("TRAINS") or key.startswith("CLEARML")) and not any(key.endswith(p) for p in preserve):
+        # only set CLEARML_ remove any TRAINS_
+        if key.startswith("TRAINS") or (key.startswith("CLEARML") and not any(key.endswith(p) for p in preserve)):
            env.pop(key, None)

    return env
@ -188,8 +209,7 @@ def monitor_jupyter_server(fd, local_filename, process, task, jupyter_port, host
        # we could not locate the token, try again
        if not token:
            continue
-        # update the task with the correct links and token
-        task.set_parameter(name='properties/jupyter_token', value=str(token))
+
        # we ignore the reported port, because jupyter server will get confused
        # if we have multiple servers running and will point to the wrong port/server
        task.set_parameter(name='properties/jupyter_port', value=str(jupyter_port))
@ -197,8 +217,20 @@ def monitor_jupyter_server(fd, local_filename, process, task, jupyter_port, host
            'https' if "https://" in line else 'http',
            hostnames, jupyter_port, token
        )
+
+        # update the task with the correct links and token
+        if Session.check_min_api_version("2.13"):
+            # noinspection PyProtectedMember
+            runtime_prop = task._get_runtime_properties()
+            runtime_prop['_jupyter_token'] = str(token)
+            runtime_prop['_jupyter_url'] = str(jupyter_url)
+            # noinspection PyProtectedMember
+            task._set_runtime_properties(runtime_prop)
+        else:
+            task.set_parameter(name='properties/jupyter_token', value=str(token))
+            task.set_parameter(name='properties/jupyter_url', value=jupyter_url)
+
        print('\nJupyter Lab URL: {}\n'.format(jupyter_url))
-        task.set_parameter(name='properties/jupyter_url', value=jupyter_url)

    # cleanup
    # noinspection PyBroadException
@ -219,8 +251,8 @@ def start_vscode_server(hostname, hostnames, param, task, env):

    # get vscode version and python extension version
    # they are extremely flaky, this combination works, most do not.
-    vscode_version = '3.9.2'
-    python_ext_version = '2021.3.658691958'
+    vscode_version = '3.12.0'
+    python_ext_version = '2021.10.1365161279'
    if param.get("vscode_version"):
        vscode_version_parts = param.get("vscode_version").split(':')
        vscode_version = vscode_version_parts[0]
@ -231,27 +263,40 @@ def start_vscode_server(hostname, hostnames, param, task, env):
    env = dict(**env)
    env.pop('PYTHONPATH', None)

+    pre_installed = False
+    python_ext = None
+
    # find a free tcp port
    port = get_free_port(9000, 9100)

    if os.geteuid() == 0:
-        # installing VSCODE:
+        # check if preinstalled
+        # noinspection PyBroadException
        try:
-            python_ext = StorageManager.get_local_copy(
-                'https://github.com/microsoft/vscode-python/releases/download/{}/ms-python-release.vsix'.format(
-                    python_ext_version),
-                extract_archive=False)
-            code_server_deb = StorageManager.get_local_copy(
-                'https://github.com/cdr/code-server/releases/download/'
-                'v{version}/code-server_{version}_amd64.deb'.format(version=vscode_version),
-                extract_archive=False)
-            os.system("dpkg -i {}".format(code_server_deb))
-        except Exception as ex:
-            print("Failed installing vscode server: {}".format(ex))
-            return
-        vscode_path = 'code-server'
+            vscode_path = subprocess.check_output('which code-server', shell=True).decode().strip()
+            pre_installed = bool(vscode_path)
+        except Exception:
+            vscode_path = None
+
+        if not vscode_path:
+            # installing VSCODE:
+            try:
+                python_ext = StorageManager.get_local_copy(
+                    'https://github.com/microsoft/vscode-python/releases/download/{}/ms-python-release.vsix'.format(
+                        python_ext_version),
+                    extract_archive=False)
+                code_server_deb = StorageManager.get_local_copy(
+                    'https://github.com/cdr/code-server/releases/download/'
+                    'v{version}/code-server_{version}_amd64.deb'.format(version=vscode_version),
+                    extract_archive=False)
+                os.system("dpkg -i {}".format(code_server_deb))
+            except Exception as ex:
+                print("Failed installing vscode server: {}".format(ex))
+                return
+            vscode_path = 'code-server'
    else:
        python_ext = None
+        pre_installed = True
        # check if code-server exists
        # noinspection PyBroadException
        try:
@ -280,51 +325,65 @@ def start_vscode_server(hostname, hostnames, param, task, env):

    try:
        fd, local_filename = mkstemp()
-        subprocess.Popen(
-            [
-                vscode_path,
-                "--auth",
-                "none",
-                "--bind-addr",
-                "127.0.0.1:{}".format(port),
-                "--user-data-dir", user_folder,
-                "--extensions-dir", exts_folder,
-                "--install-extension", "ms-toolsai.jupyter",
-                # "--install-extension", "donjayamanne.python-extension-pack"
-            ] + ["--install-extension", python_ext] if python_ext else [],
-            env=env,
-            stdout=fd,
-            stderr=fd,
-        )
-        settings = Path(os.path.expanduser(os.path.join(user_folder, 'User/settings.json')))
-        settings.parent.mkdir(parents=True, exist_ok=True)
-        # noinspection PyBroadException
-        try:
-            with open(settings.as_posix(), 'rt') as f:
-                base_json = json.load(f)
-        except Exception:
-            base_json = {}
-        # noinspection PyBroadException
-        try:
-            base_json.update({
-                "extensions.autoCheckUpdates": False,
-                "extensions.autoUpdate": False,
-                "python.pythonPath": sys.executable,
-                "terminal.integrated.shell.linux": "/bin/bash" if Path("/bin/bash").is_file() else None,
-            })
-            with open(settings.as_posix(), 'wt') as f:
-                json.dump(base_json, f)
-        except Exception:
-            pass
+        if pre_installed:
+            user_folder = os.path.expanduser("~/.local/share/code-server/")
+            if not os.path.isdir(user_folder):
+                user_folder = None
+                exts_folder = None
+            else:
+                exts_folder = os.path.expanduser("~/.local/share/code-server/extensions/")
+        else:
+            subprocess.Popen(
+                [
+                    vscode_path,
+                    "--auth",
+                    "none",
+                    "--bind-addr",
+                    "127.0.0.1:{}".format(port),
+                    "--user-data-dir", user_folder,
+                    "--extensions-dir", exts_folder,
+                    "--install-extension", "ms-toolsai.jupyter",
+                    # "--install-extension", "donjayamanne.python-extension-pack"
+                ] + ["--install-extension", "ms-python.python@{}".format(python_ext_version)] if python_ext else [],
+                env=env,
+                stdout=fd,
+                stderr=fd,
+            )
+
+        if user_folder:
+            settings = Path(os.path.expanduser(os.path.join(user_folder, 'User/settings.json')))
+            settings.parent.mkdir(parents=True, exist_ok=True)
+            # noinspection PyBroadException
+            try:
+                with open(settings.as_posix(), 'rt') as f:
+                    base_json = json.load(f)
+            except Exception:
+                base_json = {}
+            # noinspection PyBroadException
+            try:
+                base_json.update({
+                    "extensions.autoCheckUpdates": False,
+                    "extensions.autoUpdate": False,
+                    "python.pythonPath": sys.executable,
+                    "terminal.integrated.shell.linux": "/bin/bash" if Path("/bin/bash").is_file() else None,
+                })
+                with open(settings.as_posix(), 'wt') as f:
+                    json.dump(base_json, f)
+            except Exception:
+                pass
+
        proc = subprocess.Popen(
            ['bash', '-c',
-             '{} --auth none --bind-addr 127.0.0.1:{} --disable-update-check '
-             '--user-data-dir {} --extensions-dir {}'.format(vscode_path, port, user_folder, exts_folder)],
+             '{} --auth none --bind-addr 127.0.0.1:{} --disable-update-check {} {}'.format(
+                 vscode_path, port,
+                 '--user-data-dir \"{}\"'.format(user_folder) if user_folder else '',
+                 '--extensions-dir \"{}\"'.format(exts_folder) if exts_folder else '')],
            env=env,
            stdout=fd,
            stderr=fd,
            cwd=cwd,
        )
+
        try:
            error_code = proc.wait(timeout=1)
            raise ValueError("code-server failed starting, return code {}".format(error_code))
@ -343,7 +402,7 @@ def start_jupyter_server(hostname, hostnames, param, task, env):
        print('no jupyterlab to monitor - going to sleep')
        while True:
            sleep(10.)
-        return
+        return  # noqa

    # execute jupyter notebook
    fd, local_filename = mkstemp()
@ -418,15 +477,15 @@ def setup_ssh_server(hostname, hostnames, param, task):
                "&& "  # noqa: W605
                "echo 'ClientAliveInterval 10' >> /etc/ssh/sshd_config && "
                "echo 'ClientAliveCountMax 20' >> /etc/ssh/sshd_config && "
-                "echo 'AcceptEnv TRAINS_API_ACCESS_KEY TRAINS_API_SECRET_KEY "
+                "echo 'AcceptEnv CLEARML_API_ACCESS_KEY CLEARML_API_SECRET_KEY "
                "CLEARML_API_ACCESS_KEY CLEARML_API_SECRET_KEY' >> /etc/ssh/sshd_config && "
                'echo "export VISIBLE=now" >> /etc/profile && '
                'echo "export PATH=$PATH" >> /etc/profile && '
-                'echo "ldconfig" >> /etc/profile && '
-                'echo "export TRAINS_CONFIG_FILE={trains_config_file}" >> /etc/profile'.format(
+                'echo "ldconfig" 2>/dev/null >> /etc/profile && '
+                'echo "export CLEARML_CONFIG_FILE={trains_config_file}" >> /etc/profile'.format(
                    password=ssh_password,
                    port=port,
-                    trains_config_file=os.environ.get("CLEARML_CONFIG_FILE") or os.environ.get("TRAINS_CONFIG_FILE"),
+                    trains_config_file=os.environ.get("CLEARML_CONFIG_FILE") or os.environ.get("CLEARML_CONFIG_FILE"),
                )
            )
            sshd_path = '/usr/sbin/sshd'
@ -449,7 +508,7 @@ def setup_ssh_server(hostname, hostnames, param, task):
                        "UsePAM yes" + "\n"\
                        "AuthorizedKeysFile {}".format(os.path.join(ssh_config_path, 'authorized_keys')) + "\n"\
                        "PidFile {}".format(os.path.join(ssh_config_path, 'sshd.pid')) + "\n"\
-                        "AcceptEnv TRAINS_API_ACCESS_KEY TRAINS_API_SECRET_KEY "\
+                        "AcceptEnv CLEARML_API_ACCESS_KEY CLEARML_API_SECRET_KEY "\
                        "CLEARML_API_ACCESS_KEY CLEARML_API_SECRET_KEY"+"\n"
                    for k in default_ssh_fingerprint:
                        filename = os.path.join(ssh_config_path, '{}'.format(k.replace('__pub', '.pub')))
@ -511,13 +570,42 @@ def setup_ssh_server(hostname, hostnames, param, task):
        print("Error: {}\n\n#\n# Error: SSH server could not be launched\n#\n".format(ex))


+def _b64_decode_file(encoded_string):
+    # noinspection PyBroadException
+    try:
+        import gzip
+        value = gzip.decompress(base64.decodebytes(encoded_string.encode('ascii'))).decode('utf8')
+        return value
+    except Exception:
+        return None
+
+
 def setup_user_env(param, task):
    env = setup_os_env(param)
+
+    # apply vault if we have it
+    vault_environment = {}
+    if param.get("user_key") and param.get("user_secret"):
+        # noinspection PyBroadException
+        try:
+            print('Applying vault configuration')
+            from clearml.backend_api.session.defs import ENV_ENABLE_ENV_CONFIG_SECTION, ENV_ENABLE_FILES_CONFIG_SECTION
+            prev_env, prev_files = ENV_ENABLE_ENV_CONFIG_SECTION.get(), ENV_ENABLE_FILES_CONFIG_SECTION.get()
+            ENV_ENABLE_ENV_CONFIG_SECTION.set(True), ENV_ENABLE_FILES_CONFIG_SECTION.set(True)
+            prev_envs = deepcopy(os.environ)
+            Session(api_key=param.get("user_key"), secret_key=param.get("user_secret"))
+            vault_environment = {k: v for k, v in os.environ.items() if prev_envs.get(k) != v}
+            ENV_ENABLE_ENV_CONFIG_SECTION.set(prev_env), ENV_ENABLE_FILES_CONFIG_SECTION.set(prev_files)
+            if vault_environment:
+                print('Vault environment added: {}'.format(list(vault_environment.keys())))
+        except Exception as ex:
+            print('Applying vault configuration failed: {}'.format(ex))
+
    # do not change user bash/profile
    if os.geteuid() != 0:
        if param.get("user_key") and param.get("user_secret"):
-            env['TRAINS_API_ACCESS_KEY'] = param.get("user_key")
-            env['TRAINS_API_SECRET_KEY'] = param.get("user_secret")
+            env['CLEARML_API_ACCESS_KEY'] = param.get("user_key")
+            env['CLEARML_API_SECRET_KEY'] = param.get("user_secret")
        return env

    # create symbolic link to the venv
@ -530,20 +618,24 @@ def setup_user_env(param, task):
        pass
    # set default user credentials
    if param.get("user_key") and param.get("user_secret"):
-        os.system("echo 'export TRAINS_API_ACCESS_KEY=\"{}\"' >> ~/.bashrc".format(
+        os.system("echo 'export CLEARML_API_ACCESS_KEY=\"{}\"' >> ~/.bashrc".format(
            param.get("user_key", "").replace('$', '\\$')))
-        os.system("echo 'export TRAINS_API_SECRET_KEY=\"{}\"' >> ~/.bashrc".format(
+        os.system("echo 'export CLEARML_API_SECRET_KEY=\"{}\"' >> ~/.bashrc".format(
            param.get("user_secret", "").replace('$', '\\$')))
-        os.system("echo 'export TRAINS_DOCKER_IMAGE=\"{}\"' >> ~/.bashrc".format(
-            param.get("default_docker", "").strip() or env.get('TRAINS_DOCKER_IMAGE', '')))
-        os.system("echo 'export TRAINS_API_ACCESS_KEY=\"{}\"' >> ~/.profile".format(
+        os.system("echo 'export CLEARML_DOCKER_IMAGE=\"{}\"' >> ~/.bashrc".format(
+            param.get("default_docker", "").strip() or env.get('CLEARML_DOCKER_IMAGE', '')))
+        os.system("echo 'export CLEARML_API_ACCESS_KEY=\"{}\"' >> ~/.profile".format(
            param.get("user_key", "").replace('$', '\\$')))
-        os.system("echo 'export TRAINS_API_SECRET_KEY=\"{}\"' >> ~/.profile".format(
+        os.system("echo 'export CLEARML_API_SECRET_KEY=\"{}\"' >> ~/.profile".format(
            param.get("user_secret", "").replace('$', '\\$')))
-        os.system("echo 'export TRAINS_DOCKER_IMAGE=\"{}\"' >> ~/.profile".format(
-            param.get("default_docker", "").strip() or env.get('TRAINS_DOCKER_IMAGE', '')))
-        env['TRAINS_API_ACCESS_KEY'] = param.get("user_key")
-        env['TRAINS_API_SECRET_KEY'] = param.get("user_secret")
+        os.system("echo 'export CLEARML_DOCKER_IMAGE=\"{}\"' >> ~/.profile".format(
+            param.get("default_docker", "").strip() or env.get('CLEARML_DOCKER_IMAGE', '')))
+        for k, v in vault_environment.items():
+            os.system("echo 'export {}=\"{}\"' >> ~/.profile".format(k, v))
+            os.system("echo 'export {}=\"{}\"' >> ~/.bashrc".format(k, v))
+            env[k] = str(v) if v else ""
+        env['CLEARML_API_ACCESS_KEY'] = param.get("user_key")
+        env['CLEARML_API_SECRET_KEY'] = param.get("user_secret")
    # set default folder for user
    if param.get("user_base_directory"):
        base_dir = param.get("user_base_directory")
@ -557,8 +649,27 @@ def setup_user_env(param, task):
    os.system("echo '. {}' >> ~/.profile".format(os.path.join(environment, 'bin', 'activate')))

    # check if we need to create .git-credentials
-    # noinspection PyProtectedMember
-    git_credentials = task._get_configuration_text('git_credentials')
+
+    runtime_property_support = Session.check_min_api_version("2.13")
+    if runtime_property_support:
+        # noinspection PyProtectedMember
+        runtime_prop = dict(task._get_runtime_properties())
+        git_credentials = runtime_prop.pop('_git_credentials', None)
+        git_config = runtime_prop.pop('_git_config', None)
+        # force removing properties
+        # noinspection PyProtectedMember
+        task._edit(runtime=runtime_prop)
+        task.reload()
+        if git_credentials is not None:
+            git_credentials = _b64_decode_file(git_credentials)
+        if git_config is not None:
+            git_config = _b64_decode_file(git_config)
+    else:
+        # noinspection PyProtectedMember
+        git_credentials = task._get_configuration_text('git_credentials')
+        # noinspection PyProtectedMember
+        git_config = task._get_configuration_text('git_config')
+
    if git_credentials:
        git_cred_file = os.path.expanduser('~/.config/git/credentials')
        # noinspection PyBroadException
@ -568,8 +679,7 @@ def setup_user_env(param, task):
                f.write(git_credentials)
        except Exception:
            print('Could not write {} file'.format(git_cred_file))
-    # noinspection PyProtectedMember
-    git_config = task._get_configuration_text('git_config')
+
    if git_config:
        git_config_file = os.path.expanduser('~/.config/git/config')
        # noinspection PyBroadException
--- a/clearml_session/version.py
+++ b/clearml_session/version.py
@ -1 +1 @@
-__version__ = '0.3.4'
+__version__ = '0.3.5'
--- a/requirements.txt
+++ b/requirements.txt
@ -1,3 +1,3 @@
-clearml
+clearml >= 1.1.5
 pexpect ; sys_platform != 'win32'
 wexpect ; sys_platform == 'win32'