Add --router-enabled to support clearml router service

2025-06-26 18:16:55 +00:00 · 2025-01-05 12:18:33 +02:00 · 2025-01-05 12:18:33 +02:00 · 90ac85339a
commit 90ac85339a
parent e0a79f7ce7
3 changed files with 85 additions and 57 deletions
--- a/README.md
+++ b/README.md
@ -307,21 +307,20 @@ clearml-session --help
 ```console
 clearml-session - CLI for launching JupyterLab / VSCode / SSH on a remote machine
 usage: clearml-session [-h] [--version] [--attach [ATTACH]] [--shutdown [SHUTDOWN]] [--shell]
-                       [--debugging-session DEBUGGING_SESSION] [--queue QUEUE] [--docker DOCKER]
+                       [--debugging-session DEBUGGING_SESSION] [--queue QUEUE] [--router-enabled [true/false]] [--docker DOCKER]
                       [--docker-args DOCKER_ARGS] [--public-ip [true/false]] [--remote-ssh-port REMOTE_SSH_PORT]
-                       [--vscode-server [true/false]] [--vscode-version VSCODE_VERSION]
-                       [--vscode-extensions VSCODE_EXTENSIONS] [--jupyter-lab [true/false]]
-                       [--upload-files UPLOAD_FILES] [--continue-session CONTINUE_SESSION]
-                       [--store-workspace STORE_WORKSPACE] [--git-credentials [true/false]]
-                       [--user-folder USER_FOLDER] [--packages [PACKAGES [PACKAGES ...]]]
-                       [--requirements REQUIREMENTS] [--init-script [INIT_SCRIPT]] [--config-file CONFIG_FILE]
-                       [--remote-gateway [REMOTE_GATEWAY]] [--base-task-id BASE_TASK_ID] [--project PROJECT]
-                       [--session-name SESSION_NAME] [--session-tags [SESSION_TAGS [SESSION_TAGS ...]]]
+                       [--vscode-server [true/false]] [--vscode-version VSCODE_VERSION] [--vscode-extensions VSCODE_EXTENSIONS]
+                       [--jupyter-lab [true/false]] [--upload-files UPLOAD_FILES] [--continue-session CONTINUE_SESSION]
+                       [--store-workspace STORE_WORKSPACE] [--git-credentials [true/false]] [--user-folder USER_FOLDER]
+                       [--packages [PACKAGES [PACKAGES ...]]] [--requirements REQUIREMENTS] [--init-script [INIT_SCRIPT]]
+                       [--config-file CONFIG_FILE] [--remote-gateway [REMOTE_GATEWAY]] [--base-task-id BASE_TASK_ID]
+                       [--project PROJECT] [--session-name SESSION_NAME] [--session-tags [SESSION_TAGS [SESSION_TAGS ...]]]
                       [--disable-session-cleanup [true/false]] [--keepalive [true/false]]
                       [--queue-excluded-tag [QUEUE_EXCLUDED_TAG [QUEUE_EXCLUDED_TAG ...]]]
-                       [--queue-include-tag [QUEUE_INCLUDE_TAG [QUEUE_INCLUDE_TAG ...]]]
-                       [--skip-docker-network [true/false]] [--password PASSWORD] [--username USERNAME]
-                       [--force-dropbear [true/false]] [--verbose] [--yes]
+                       [--queue-include-tag [QUEUE_INCLUDE_TAG [QUEUE_INCLUDE_TAG ...]]] [--skip-docker-network [true/false]]
+                       [--password PASSWORD] [--randomize [RANDOMIZE [RANDOMIZE ...]]] [--username USERNAME]
+                       [--force-dropbear [true/false]] [--disable-store-defaults] [--disable-fingerprint-check] [--verbose]
+                       [--yes]
                       {list,info,shutdown} ...

 clearml-session - CLI for launching JupyterLab / VSCode / SSH on a remote machine
@ -338,65 +337,64 @@ optional arguments:
  --attach [ATTACH]     Attach to running interactive session (default: previous session)
  --shutdown [SHUTDOWN], -S [SHUTDOWN]
                        Shut down an active session (default: previous session)
-  --shell               Open the SSH shell session directly, notice quitting the SSH session will Not shut down the
-                        remote session
+  --shell               Open the SSH shell session directly, notice quitting the SSH session will Not shut down the remote session
  --debugging-session DEBUGGING_SESSION
-                        Pass existing Task id (experiment), create a copy of the experiment on a remote machine,
-                        and launch jupyter/ssh for interactive access. Example --debugging-session <task_id>
+                        Pass existing Task id (experiment), create a copy of the experiment on a remote machine, and launch
+                        jupyter/ssh for interactive access. Example --debugging-session <task_id>
  --queue QUEUE         Select the queue to launch the interactive session on (default: previously used queue)
-  --docker DOCKER       Select the docker image to use in the interactive session (default: previously used
-                        docker image or `nvidia/cuda:11.6.2-runtime-ubuntu20.04`)
+  --router-enabled [true/false]
+                        If we have a clearml Router set, make sure we request direct TCP routing to our container.
+  --docker DOCKER       Select the docker image to use in the interactive session (default: previously used docker image or
+                        `nvidia/cuda:11.6.2-runtime-ubuntu20.04`)
  --docker-args DOCKER_ARGS
-                        Add additional arguments for the docker image to use in the interactive session on
-                        (default: previously used docker-args)
+                        Add additional arguments for the docker image to use in the interactive session on (default: previously
+                        used docker-args)
  --public-ip [true/false]
-                        If True, register the public IP of the remote machine. Set if running on the cloud.
-                        Default: false (use for local / on-premises)
+                        If True, register the public IP of the remote machine. Set if running on the cloud. Default: false (use
+                        for local / on-premises)
  --remote-ssh-port REMOTE_SSH_PORT
                        Set the remote ssh server port, running on the agent`s machine. (default: 10022)
  --vscode-server [true/false]
                        Install vscode server (code-server) on interactive session (default: true)
  --vscode-version VSCODE_VERSION
-                        Set vscode server (code-server) version, as well as vscode python extension version
-                        <vscode:python-ext> (example: "3.7.4:2020.10.332292344")
+                        Set vscode server (code-server) version, as well as vscode python extension version <vscode:python-ext>
+                        (example: "3.7.4:2020.10.332292344")
  --vscode-extensions VSCODE_EXTENSIONS
-                        Install additional vscode extensions, as well as vscode python extension (example: "ms-
-                        python.python,ms-python.black-formatter,ms-python.pylint,ms-python.flake8")
+                        Install additional vscode extensions, as well as vscode python extension (example: "ms-python.python,ms-
+                        python.black-formatter,ms-python.pylint,ms-python.flake8")
  --jupyter-lab [true/false]
                        Install Jupyter-Lab on interactive session (default: true)
  --upload-files UPLOAD_FILES
-                        Advanced: Upload local files/folders to the remote session. Example: `/my/local/data/`
-                        will upload the local folder and extract it into the container in ~/session-files/
+                        Advanced: Upload local files/folders to the remote session. Example: `/my/local/data/` will upload the
+                        local folder and extract it into the container in ~/session-files/
  --continue-session CONTINUE_SESSION
                        Continue previous session (ID provided) restoring your workspace (see --store-workspace)
  --store-workspace STORE_WORKSPACE
-                        Upload/Restore remote workspace folder. Example: `~/workspace/` will automatically
-                        restore/store the *containers* folder and extract it into the next session. Use with
-                        --continue-session to continue your previous work from your exact container state
+                        Upload/Restore remote workspace folder. Example: `~/workspace/` will automatically restore/store the
+                        *containers* folder and extract it into the next session. Use with --continue-session to continue your
+                        previous work from your exact container state
  --git-credentials [true/false]
                        If true, local .git-credentials file is sent to the interactive session. (default: false)
  --user-folder USER_FOLDER
                        Advanced: Set the remote base folder (default: ~/)
  --packages [PACKAGES [PACKAGES ...]]
-                        Additional packages to add, supports version numbers (default: previously added packages).
-                        examples: --packages torch==1.7 tqdm
+                        Additional packages to add, supports version numbers (default: previously added packages). examples:
+                        --packages torch==1.7 tqdm
  --requirements REQUIREMENTS
-                        Specify requirements.txt file to install when setting the interactive session.
-                        Requirements file is read and stored in `packages` section as default for the next
-                        sessions. Can be overridden by calling `--packages`
+                        Specify requirements.txt file to install when setting the interactive session. Requirements file is read
+                        and stored in `packages` section as default for the next sessions. Can be overridden by calling
+                        `--packages`
  --init-script [INIT_SCRIPT]
-                        Specify BASH init script file to be executed when setting the interactive session. Script
-                        content is read and stored as default script for the next sessions. To clear the init-
-                        script do not pass a file
+                        Specify BASH init script file to be executed when setting the interactive session. Script content is
+                        read and stored as default script for the next sessions. To clear the init-script do not pass a file
  --config-file CONFIG_FILE
                        Advanced: Change the configuration file used to store the previous state (default:
                        ~/.clearml_session.json)
  --remote-gateway [REMOTE_GATEWAY]
-                        Advanced: Specify gateway ip/address:port to be passed to interactive session (for use
-                        with k8s ingestion / ELB)
+                        Advanced: Specify gateway ip/address:port to be passed to interactive session (for use with k8s
+                        ingestion / ELB)
  --base-task-id BASE_TASK_ID
-                        Advanced: Set the base task ID for the interactive session. (default: previously used
-                        Task). Use `none` for the default interactive session
+                        Advanced: Set the base task ID for the interactive session. (default: previously used Task). Use `none`
+                        for the default interactive session
  --project PROJECT     Advanced: Set the project name for the interactive session Task
  --session-name SESSION_NAME
                        Advanced: Set the name of the interactive session Task
@ -405,33 +403,32 @@ optional arguments:
  --disable-session-cleanup [true/false]
                        Advanced: If set, previous interactive sessions are not deleted
  --keepalive [true/false]
-                        Advanced: If set, enables the transparent proxy always keeping the sockets alive. Default:
-                        False, do not use transparent sockets for mitigating connection drops.
+                        Advanced: If set, enables the transparent proxy always keeping the sockets alive. Default: False, do not
+                        use transparent sockets for mitigating connection drops.
  --queue-excluded-tag [QUEUE_EXCLUDED_TAG [QUEUE_EXCLUDED_TAG ...]]
                        Advanced: Excluded queues with this specific tag from the selection
  --queue-include-tag [QUEUE_INCLUDE_TAG [QUEUE_INCLUDE_TAG ...]]
                        Advanced: Only include queues with this specific tag from the selection
  --skip-docker-network [true/false]
-                        Advanced: If set, `--network host` is **not** passed to docker (assumes k8s network
-                        ingestion) (default: false)
-  --password PASSWORD   Advanced: Select ssh password for the interactive session (default: `randomly-generated`
-                        or previously used one)
-  --username USERNAME   Advanced: Select ssh username for the interactive session (default: `root` or previously
+                        Advanced: If set, `--network host` is **not** passed to docker (assumes k8s network ingestion) (default:
+                        false)
+  --password PASSWORD   Advanced: Select ssh password for the interactive session (default: `randomly-generated` or previously
                        used one)
-  --randomize           Advanced: Recreate a new random ssh password for the interactive session options: 
-                        `--randomize` one time recreate, --randomize `always` create a new random password for 
-                        every session                        
+  --randomize [RANDOMIZE [RANDOMIZE ...]]
+                        Advanced: Recreate a new random ssh password for the interactive session options: `--randomize` one time
+                        recreate random password, --randomize `always` create a new random password for every session
+  --username USERNAME   Advanced: Select ssh username for the interactive session (default: `root` or previously used one)
  --force-dropbear [true/false]
                        Force using `dropbear` instead of SSHd
  --disable-store-defaults
                        If set, do not store current setup as new default configuration
  --disable-fingerprint-check
                        Advanced: If set, ignore the remote SSH server fingerprint check
-  --verbose             Advanced: If set, print verbose progress information, e.g. the remote machine setup
-                        process log
+  --verbose             Advanced: If set, print verbose progress information, e.g. the remote machine setup process log
  --yes, -y             Automatic yes to prompts; assume "yes" as answer to all prompts and run non-interactively

 Notice! all arguments are stored as new defaults for the next execution
+
 ```


--- a/clearml_session/main.py
+++ b/clearml_session/main.py
@ -715,6 +715,7 @@ def clone_task(state, project_id=None):
    task_params["{}/force_dropbear".format(section)] = bool(state.get('force_dropbear'))
    task_params["{}/store_workspace".format(section)] = state.get('store_workspace')
    task_params["{}/use_ssh_proxy".format(section)] = state.get('keepalive')
+    task_params["{}/router_enabled".format(section)] = bool(state.get('router_enabled'))
    if state.get('user_folder'):
        task_params['{}/user_base_directory'.format(section)] = state.get('user_folder')
    docker = state.get('docker') or task.get_base_docker()
@ -977,7 +978,8 @@ def start_ssh_tunnel(username, remote_address, ssh_port, ssh_password, local_rem
        if debug:
            print("ERROR: running local SSH client [{}] failed connecting to {}: {}".format(command, args, ex))
        else:
-            print("ERROR: running local SSH client failed connecting to {}: {}".format(remote_address, ex))
+            print("ERROR: running local SSH client failed connecting to {} [{}]\n"
+                  "       for additional details re-run with --verbose".format(remote_address, type(ex)))

        if child:
            child.terminate(force=True)
@ -1165,6 +1167,8 @@ def monitor_ssh_tunnel(state, task, ssh_setup_completed_callback=None):
                else:
                    logging.getLogger().warning('SSH tunneling failed, retrying in {} seconds'.format(3))
                    sleep(3.)
+                    # clear ssh port, so that we reload it from Task (i.e. sync with router if it's there)
+                    ssh_port = None
                    continue

            connect_state['reconnect'] = False
@ -1355,6 +1359,10 @@ def setup_parser(parser):
                             'and launch jupyter/ssh for interactive access. Example --debugging-session <task_id>')
    parser.add_argument('--queue', type=str, default=None,
                        help='Select the queue to launch the interactive session on (default: previously used queue)')
+    parser.add_argument("--router-enabled", default=None, nargs='?', const='true', metavar='true/false',
+                        type=lambda x: (str(x).strip().lower() in ('true', 'yes')),
+                        help="If we have a clearml Router set, make sure we request direct TCP routing "
+                             "to our container. ")
    parser.add_argument('--docker', type=str, default=None,
                        help='Select the docker image to use in the interactive session on '
                             '(default: previously used docker image or `{}`)'.format(default_docker_image))
--- a/clearml_session/interactive_session_task.py
+++ b/clearml_session/interactive_session_task.py
@ -591,6 +591,7 @@ def setup_ssh_server(hostname, hostnames, param, task, env):
    print("Installing SSH Server on {} [{}]".format(hostname, hostnames))
    ssh_password = param.get("ssh_password", "training")

+    proxy_port = port = None
    ssh_port = None
    if Session.check_min_api_version("2.13"):
        try:
@ -797,6 +798,8 @@ def setup_ssh_server(hostname, hostnames, param, task, env):
    except Exception as ex:
        print("Error: {}\n\n#\n# Error: SSH server could not be launched\n#\n".format(ex))

+    return proxy_port or port
+

 def _b64_decode_file(encoded_string):
    # noinspection PyBroadException
@ -1019,6 +1022,7 @@ def get_host_name(task, param):
    # update host name
    if (not task.get_parameter(name='properties/external_address') and
            not task.get_parameter(name='properties/k8s-gateway-address')):
+
        if task._get_runtime_properties().get("external_address"):
            external_addr = task._get_runtime_properties().get("external_address")
        else:
@ -1274,9 +1278,15 @@ def main():
        "force_dropbear": False,
        "store_workspace": None,
        "use_ssh_proxy": False,
+        "router_enabled": False,
    }
    task = init_task(param, default_ssh_fingerprint)

+    # if router is enabled, do not request a public IP, enforce local IP
+    if param.get("router_enabled") and param.get("public_ip"):
+        print("External TCP router configured, disabling `public_ip` request")
+        param["public_ip"] = False
+
    run_user_init_script(task)

    # restore workspace if exists
@ -1294,7 +1304,20 @@ def main():

    env = setup_user_env(param, task)

-    setup_ssh_server(hostname, hostnames, param, task, env)
+    ssh_port = setup_ssh_server(hostname, hostnames, param, task, env)
+
+    # make sure we set it to the runtime properties
+    if ssh_port and param.get("router_enabled"):
+        # noinspection PyProtectedMember
+        address = task._get_runtime_properties().get("external_address") or ""
+        print("Requesting TCP route from router ingress to {} port {}".format(address, ssh_port))
+        # noinspection PyProtectedMember
+        task._set_runtime_properties({
+            "external_address": address,
+            "external_tcp_port": ssh_port,
+            "_SERVICE": "EXTERNAL_TCP",
+        })
+        task.set_system_tags((task.get_system_tags() or []) + ["external_service"])

    start_vscode_server(hostname, hostnames, param, task, env)