From 90ac85339a1d42575933c21f230c9496ef6aa3fb Mon Sep 17 00:00:00 2001 From: clearml <> Date: Sun, 5 Jan 2025 12:18:33 +0200 Subject: [PATCH] Add --router-enabled to support clearml router service --- README.md | 107 ++++++++++---------- clearml_session/__main__.py | 10 +- clearml_session/interactive_session_task.py | 25 ++++- 3 files changed, 85 insertions(+), 57 deletions(-) diff --git a/README.md b/README.md index 474fd11..03737fd 100644 --- a/README.md +++ b/README.md @@ -307,21 +307,20 @@ clearml-session --help ```console clearml-session - CLI for launching JupyterLab / VSCode / SSH on a remote machine usage: clearml-session [-h] [--version] [--attach [ATTACH]] [--shutdown [SHUTDOWN]] [--shell] - [--debugging-session DEBUGGING_SESSION] [--queue QUEUE] [--docker DOCKER] + [--debugging-session DEBUGGING_SESSION] [--queue QUEUE] [--router-enabled] [--docker DOCKER] [--docker-args DOCKER_ARGS] [--public-ip [true/false]] [--remote-ssh-port REMOTE_SSH_PORT] - [--vscode-server [true/false]] [--vscode-version VSCODE_VERSION] - [--vscode-extensions VSCODE_EXTENSIONS] [--jupyter-lab [true/false]] - [--upload-files UPLOAD_FILES] [--continue-session CONTINUE_SESSION] - [--store-workspace STORE_WORKSPACE] [--git-credentials [true/false]] - [--user-folder USER_FOLDER] [--packages [PACKAGES [PACKAGES ...]]] - [--requirements REQUIREMENTS] [--init-script [INIT_SCRIPT]] [--config-file CONFIG_FILE] - [--remote-gateway [REMOTE_GATEWAY]] [--base-task-id BASE_TASK_ID] [--project PROJECT] - [--session-name SESSION_NAME] [--session-tags [SESSION_TAGS [SESSION_TAGS ...]]] + [--vscode-server [true/false]] [--vscode-version VSCODE_VERSION] [--vscode-extensions VSCODE_EXTENSIONS] + [--jupyter-lab [true/false]] [--upload-files UPLOAD_FILES] [--continue-session CONTINUE_SESSION] + [--store-workspace STORE_WORKSPACE] [--git-credentials [true/false]] [--user-folder USER_FOLDER] + [--packages [PACKAGES [PACKAGES ...]]] [--requirements REQUIREMENTS] [--init-script [INIT_SCRIPT]] + 
[--config-file CONFIG_FILE] [--remote-gateway [REMOTE_GATEWAY]] [--base-task-id BASE_TASK_ID] + [--project PROJECT] [--session-name SESSION_NAME] [--session-tags [SESSION_TAGS [SESSION_TAGS ...]]] [--disable-session-cleanup [true/false]] [--keepalive [true/false]] [--queue-excluded-tag [QUEUE_EXCLUDED_TAG [QUEUE_EXCLUDED_TAG ...]]] - [--queue-include-tag [QUEUE_INCLUDE_TAG [QUEUE_INCLUDE_TAG ...]]] - [--skip-docker-network [true/false]] [--password PASSWORD] [--username USERNAME] - [--force-dropbear [true/false]] [--verbose] [--yes] + [--queue-include-tag [QUEUE_INCLUDE_TAG [QUEUE_INCLUDE_TAG ...]]] [--skip-docker-network [true/false]] + [--password PASSWORD] [--randomize [RANDOMIZE [RANDOMIZE ...]]] [--username USERNAME] + [--force-dropbear [true/false]] [--disable-store-defaults] [--disable-fingerprint-check] [--verbose] + [--yes] {list,info,shutdown} ... clearml-session - CLI for launching JupyterLab / VSCode / SSH on a remote machine @@ -338,65 +337,64 @@ optional arguments: --attach [ATTACH] Attach to running interactive session (default: previous session) --shutdown [SHUTDOWN], -S [SHUTDOWN] Shut down an active session (default: previous session) - --shell Open the SSH shell session directly, notice quitting the SSH session will Not shut down the - remote session + --shell Open the SSH shell session directly, notice quitting the SSH session will Not shut down the remote session --debugging-session DEBUGGING_SESSION - Pass existing Task id (experiment), create a copy of the experiment on a remote machine, - and launch jupyter/ssh for interactive access. Example --debugging-session + Pass existing Task id (experiment), create a copy of the experiment on a remote machine, and launch + jupyter/ssh for interactive access. 
Example --debugging-session --queue QUEUE Select the queue to launch the interactive session on (default: previously used queue) - --docker DOCKER Select the docker image to use in the interactive session (default: previously used - docker image or `nvidia/cuda:11.6.2-runtime-ubuntu20.04`) + --router-enabled If we have a clearml Router set, make sure we request direct TCP routing to our container. + --docker DOCKER Select the docker image to use in the interactive session (default: previously used docker image or + `nvidia/cuda:11.6.2-runtime-ubuntu20.04`) --docker-args DOCKER_ARGS - Add additional arguments for the docker image to use in the interactive session on - (default: previously used docker-args) + Add additional arguments for the docker image to use in the interactive session on (default: previously + used docker-args) --public-ip [true/false] - If True, register the public IP of the remote machine. Set if running on the cloud. - Default: false (use for local / on-premises) + If True, register the public IP of the remote machine. Set if running on the cloud. Default: false (use + for local / on-premises) --remote-ssh-port REMOTE_SSH_PORT Set the remote ssh server port, running on the agent`s machine. 
(default: 10022) --vscode-server [true/false] Install vscode server (code-server) on interactive session (default: true) --vscode-version VSCODE_VERSION - Set vscode server (code-server) version, as well as vscode python extension version - (example: "3.7.4:2020.10.332292344") + Set vscode server (code-server) version, as well as vscode python extension version + (example: "3.7.4:2020.10.332292344") --vscode-extensions VSCODE_EXTENSIONS - Install additional vscode extensions, as well as vscode python extension (example: "ms- - python.python,ms-python.black-formatter,ms-python.pylint,ms-python.flake8") + Install additional vscode extensions, as well as vscode python extension (example: "ms-python.python,ms- + python.black-formatter,ms-python.pylint,ms-python.flake8") --jupyter-lab [true/false] Install Jupyter-Lab on interactive session (default: true) --upload-files UPLOAD_FILES - Advanced: Upload local files/folders to the remote session. Example: `/my/local/data/` - will upload the local folder and extract it into the container in ~/session-files/ + Advanced: Upload local files/folders to the remote session. Example: `/my/local/data/` will upload the + local folder and extract it into the container in ~/session-files/ --continue-session CONTINUE_SESSION Continue previous session (ID provided) restoring your workspace (see --store-workspace) --store-workspace STORE_WORKSPACE - Upload/Restore remote workspace folder. Example: `~/workspace/` will automatically - restore/store the *containers* folder and extract it into the next session. Use with - --continue-session to continue your previous work from your exact container state + Upload/Restore remote workspace folder. Example: `~/workspace/` will automatically restore/store the + *containers* folder and extract it into the next session. 
Use with --continue-session to continue your + previous work from your exact container state --git-credentials [true/false] If true, local .git-credentials file is sent to the interactive session. (default: false) --user-folder USER_FOLDER Advanced: Set the remote base folder (default: ~/) --packages [PACKAGES [PACKAGES ...]] - Additional packages to add, supports version numbers (default: previously added packages). - examples: --packages torch==1.7 tqdm + Additional packages to add, supports version numbers (default: previously added packages). examples: + --packages torch==1.7 tqdm --requirements REQUIREMENTS - Specify requirements.txt file to install when setting the interactive session. - Requirements file is read and stored in `packages` section as default for the next - sessions. Can be overridden by calling `--packages` + Specify requirements.txt file to install when setting the interactive session. Requirements file is read + and stored in `packages` section as default for the next sessions. Can be overridden by calling + `--packages` --init-script [INIT_SCRIPT] - Specify BASH init script file to be executed when setting the interactive session. Script - content is read and stored as default script for the next sessions. To clear the init- - script do not pass a file + Specify BASH init script file to be executed when setting the interactive session. Script content is + read and stored as default script for the next sessions. To clear the init-script do not pass a file --config-file CONFIG_FILE Advanced: Change the configuration file used to store the previous state (default: ~/.clearml_session.json) --remote-gateway [REMOTE_GATEWAY] - Advanced: Specify gateway ip/address:port to be passed to interactive session (for use - with k8s ingestion / ELB) + Advanced: Specify gateway ip/address:port to be passed to interactive session (for use with k8s + ingestion / ELB) --base-task-id BASE_TASK_ID - Advanced: Set the base task ID for the interactive session. 
(default: previously used - Task). Use `none` for the default interactive session + Advanced: Set the base task ID for the interactive session. (default: previously used Task). Use `none` + for the default interactive session --project PROJECT Advanced: Set the project name for the interactive session Task --session-name SESSION_NAME Advanced: Set the name of the interactive session Task @@ -405,33 +403,32 @@ optional arguments: --disable-session-cleanup [true/false] Advanced: If set, previous interactive sessions are not deleted --keepalive [true/false] - Advanced: If set, enables the transparent proxy always keeping the sockets alive. Default: - False, do not use transparent sockets for mitigating connection drops. + Advanced: If set, enables the transparent proxy always keeping the sockets alive. Default: False, do not + use transparent socket for mitigating connection drops. --queue-excluded-tag [QUEUE_EXCLUDED_TAG [QUEUE_EXCLUDED_TAG ...]] Advanced: Excluded queues with this specific tag from the selection --queue-include-tag [QUEUE_INCLUDE_TAG [QUEUE_INCLUDE_TAG ...]] Advanced: Only include queues with this specific tag from the selection --skip-docker-network [true/false] - Advanced: If set, `--network host` is **not** passed to docker (assumes k8s network - ingestion) (default: false) - --password PASSWORD Advanced: Select ssh password for the interactive session (default: `randomly-generated` - or previously used one) - --username USERNAME Advanced: Select ssh username for the interactive session (default: `root` or previously + Advanced: If set, `--network host` is **not** passed to docker (assumes k8s network ingestion) (default: + false) + --password PASSWORD Advanced: Select ssh password for the interactive session (default: `randomly-generated` or previously used one) - --randomize Advanced: Recreate a new random ssh password for the interactive session options: - `--randomize` one time recreate, --randomize `always` create a new random password for - 
every session + --randomize [RANDOMIZE [RANDOMIZE ...]] + Advanced: Recreate a new random ssh password for the interactive session options: `--randomize` one time + recreate random password, --randomize `always` create a new random password for every session + --username USERNAME Advanced: Select ssh username for the interactive session (default: `root` or previously used one) --force-dropbear [true/false] Force using `dropbear` instead of SSHd --disable-store-defaults If set, do not store current setup as new default configuration --disable-fingerprint-check Advanced: If set, ignore the remote SSH server fingerprint check - --verbose Advanced: If set, print verbose progress information, e.g. the remote machine setup - process log + --verbose Advanced: If set, print verbose progress information, e.g. the remote machine setup process log --yes, -y Automatic yes to prompts; assume "yes" as answer to all prompts and run non-interactively Notice! all arguments are stored as new defaults for the next execution + ``` diff --git a/clearml_session/__main__.py b/clearml_session/__main__.py index 2c6bb1d..330d3f2 100644 --- a/clearml_session/__main__.py +++ b/clearml_session/__main__.py @@ -715,6 +715,7 @@ def clone_task(state, project_id=None): task_params["{}/force_dropbear".format(section)] = bool(state.get('force_dropbear')) task_params["{}/store_workspace".format(section)] = state.get('store_workspace') task_params["{}/use_ssh_proxy".format(section)] = state.get('keepalive') + task_params["{}/router_enabled".format(section)] = bool(state.get('router_enabled')) if state.get('user_folder'): task_params['{}/user_base_directory'.format(section)] = state.get('user_folder') docker = state.get('docker') or task.get_base_docker() @@ -977,7 +978,8 @@ def start_ssh_tunnel(username, remote_address, ssh_port, ssh_password, local_rem if debug: print("ERROR: running local SSH client [{}] failed connecting to {}: {}".format(command, args, ex)) else: - print("ERROR: running local SSH 
client failed connecting to {}: {}".format(remote_address, ex)) + print("ERROR: running local SSH client failed connecting to {} [{}]\n" + " for additional details re-run with --verbose".format(remote_address, type(ex))) if child: child.terminate(force=True) @@ -1165,6 +1167,8 @@ def monitor_ssh_tunnel(state, task, ssh_setup_completed_callback=None): else: logging.getLogger().warning('SSH tunneling failed, retrying in {} seconds'.format(3)) sleep(3.) + # clear ssh port, so that we reload it from Task (i.e. sync with router if it's there) + ssh_port = None continue connect_state['reconnect'] = False @@ -1355,6 +1359,10 @@ def setup_parser(parser): 'and launch jupyter/ssh for interactive access. Example --debugging-session ') parser.add_argument('--queue', type=str, default=None, help='Select the queue to launch the interactive session on (default: previously used queue)') + parser.add_argument("--router-enabled", default=None, nargs='?', const='true', metavar='true/false', + type=lambda x: (str(x).strip().lower() in ('true', 'yes')), + help="If we have a clearml Router set, make sure we request direct TCP routing " + "to our container. 
") parser.add_argument('--docker', type=str, default=None, help='Select the docker image to use in the interactive session on ' '(default: previously used docker image or `{}`)'.format(default_docker_image)) diff --git a/clearml_session/interactive_session_task.py b/clearml_session/interactive_session_task.py index 624c09a..99723ea 100644 --- a/clearml_session/interactive_session_task.py +++ b/clearml_session/interactive_session_task.py @@ -591,6 +591,7 @@ def setup_ssh_server(hostname, hostnames, param, task, env): print("Installing SSH Server on {} [{}]".format(hostname, hostnames)) ssh_password = param.get("ssh_password", "training") + proxy_port = port = None ssh_port = None if Session.check_min_api_version("2.13"): try: @@ -797,6 +798,8 @@ def setup_ssh_server(hostname, hostnames, param, task, env): except Exception as ex: print("Error: {}\n\n#\n# Error: SSH server could not be launched\n#\n".format(ex)) + return proxy_port or port + def _b64_decode_file(encoded_string): # noinspection PyBroadException @@ -1019,6 +1022,7 @@ def get_host_name(task, param): # update host name if (not task.get_parameter(name='properties/external_address') and not task.get_parameter(name='properties/k8s-gateway-address')): + if task._get_runtime_properties().get("external_address"): external_addr = task._get_runtime_properties().get("external_address") else: @@ -1274,9 +1278,15 @@ def main(): "force_dropbear": False, "store_workspace": None, "use_ssh_proxy": False, + "router_enabled": False, } task = init_task(param, default_ssh_fingerprint) + # if router is enabled, do not request a public IP, enforce local IP + if param.get("router_enabled") and param.get("public_ip"): + print("External TCP router configured, disabling `public_ip` request") + param["public_ip"] = False + run_user_init_script(task) # restore workspace if exists @@ -1294,7 +1304,20 @@ def main(): env = setup_user_env(param, task) - setup_ssh_server(hostname, hostnames, param, task, env) + ssh_port = 
setup_ssh_server(hostname, hostnames, param, task, env) + + # make sure we set it to the runtime properties + if ssh_port and param.get("router_enabled"): + # noinspection PyProtectedMember + address = task._get_runtime_properties().get("external_address") or "" + print("Requesting TCP route from router ingress to {} port {}".format(address, ssh_port)) + # noinspection PyProtectedMember + task._set_runtime_properties({ + "external_address": address, + "external_tcp_port": ssh_port, + "_SERVICE": "EXTERNAL_TCP", + }) + task.set_system_tags((task.get_system_tags() or []) + ["external_service"]) start_vscode_server(hostname, hostnames, param, task, env)