From 3f3cb50d45edfb473d5a5bd408ed01f10592007c Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Thu, 7 Mar 2024 11:24:11 +0200
Subject: [PATCH] Add Workspace Synchronization - Update default docker to
nvidia/cuda:11.6.2-runtime-ubuntu20.04 - Add --session-name --session-tag to
allow setting session Task name and tags - Add list/info/shutdown commands -
Add --store-workspace allowing remote session to store a specific folder and
continue next session with a fully restored folder - Add
--disable-session-cleanup to disable old session cleanup
---
README.md | 382 ++++++++++++--------
clearml_session/__main__.py | 346 ++++++++++++++++--
clearml_session/interactive_session_task.py | 205 ++++++++++-
requirements.txt | 3 +-
4 files changed, 733 insertions(+), 203 deletions(-)
diff --git a/README.md b/README.md
index cf88e77..305b0d6 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,9 @@
-## **`clearml-session` CLI for launching JupyterLab / VSCode on a remote machine**
+## **`clearml-session` CLI for launching JupyterLab / VSCode / SSH on a remote machine**
+
+## đĨ NEW in version `0.13` [Workspace Syncing](#store-and-synchronize-interactive-session-workspace) đ
[![GitHub license](https://img.shields.io/github/license/allegroai/clearml-session.svg)](https://img.shields.io/github/license/allegroai/clearml-session.svg)
@@ -17,14 +19,20 @@
**`clearml-session`** is a utility for launching detachable remote interactive sessions (MacOS, Windows, Linux)
### tl;dr
-CLI to launch remote sessions for JupyterLab / VSCode-server / SSH, inside any docker image!
+**CLI to launch a remote session of Jupyter-Lab / VSCode / SSH,
+inside any docker container on any deployment, Cloud / Kubernetes / Bare-Metal**
-### What does it do?
+### đ° What does it do?
Starting a clearml (ob)session from your local machine triggers the following:
- ClearML allocates a remote instance (GPU) from your dedicated pool
- On the allocated instance it will spin **jupyter-lab** + **vscode server** + **SSH** access for
interactive usage (i.e., development)
- ClearML will start monitoring machine performance, allowing DevOps to detect stale instances and spin them down
+- NEW đĨ Kubernetes support, develop directly inside your pods! No kubectl required!
+Read more about `clearml-agent` and interactive sessions [here](https://clear.ml/docs/latest/docs/clearml_agent/#kubernetes)
+- NEW đ Automatically store & sync your [interactive session workspace](#store-and-synchronize-interactive-session-workspace).
+`clearml-session` will automatically create a snapshot of your entire workspace when shutting it down,
+and later restore into a new session on a different remote machine
> âšī¸ **Remote PyCharm:** You can also work with PyCharm in a remote session over SSH. Use the [PyCharm Plugin](https://github.com/allegroai/clearml-pycharm-plugin) to automatically sync local configurations with a remote session.
@@ -42,7 +50,7 @@ interactive usage (i.e., development)
Supported OS: MacOS, Windows, Linux
-## Secure & Stable
+## đ Secure & Stable
**clearml-session** creates a single, secure, and encrypted connection to the remote machine over SSH.
SSH credentials are automatically generated by the CLI and contain fully random 32 bytes password.
@@ -58,7 +66,7 @@ the base SSH network tunnel, without breaking JupyterLab/VSCode-server or your o
---
-## How to use: Interactive Session
+## ⥠How to use: Interactive Session
1. run `clearml-session`
@@ -70,39 +78,7 @@ the base SSH network tunnel, without breaking JupyterLab/VSCode-server or your o
pass **git credentials**, etc.
See below for full CLI options.
-## Frequently Asked Questions:
-
-#### How Does ClearML enable this?
-
-The `clearml-session` creates a new interactive `Task` in the system (default project: DevOps).
-
-This `Task` is responsible for setting the SSH and JupyterLab/VSCode on the host machine.
-
-The local `clearml-session` awaits for the interactive Task to finish with the initial setup, then
-it connects via SSH to the host machine (see "safe and stable" above), and tunnels
-both SSH and JupyterLab over the SSH connection.
-
-The end results is a local link which you can use to access the JupyterLab/VSCode on the remote machine, over a **secure and encrypted** connection!
-
-#### How can this be used to scale up/out development resources?
-
-**Clearml** has a cloud autoscaler, so you can easily and automatically spin machines for development!
-
-There is also a default docker image to use when initiating a task.
-
-This means that using **clearml-session**s
-with the autoscaler enabled, allows for turn-key secure development environment inside a docker of your choosing.
-
-Learn more about it [here](https://clear.ml/docs/latest/docs/guides/services/aws_autoscaler)
-
-#### Does this fit Work From Home situations?
-**YES**. Install `clearml-agent` on target machines inside the organization, connect over your company VPN
-and use `clearml-session` to gain access to a dedicated on-prem machine with the docker of your choosing
-(with out-of-the-box support for any internal docker artifactory).
-
-Learn more about how to utilize your office workstations and on-prem machines [here](https://clear.ml/docs/latest/docs/clearml_agent)
-
-## Tutorials
+## đ Tutorials
### Getting started
@@ -144,10 +120,20 @@ Enter "r" (or "reconnect") to reconnect the session (for example after suspend)
or "Shutdown" to shut down remote interactive session
```
-Click on the JupyterLab link (http://localhost:8878/?token=xyz)
+Click on the **Jupyter Lab** link (http://localhost:8878/?token=xyz)
+Or **VScode** (running inside your remote container) (http://localhost:8898/)
+Or drop into **SSH** shell by typing `shell`
+
Open your terminal, clone your code & start working :)
-### Leaving a session and reconnecting from the same machine
+> ℹ️ **TIP**: You can add additional python packages to your remote session setup by adding `--packages` to the command line,
+> for example to add `boto3` add `--packages "boto3>1"`
+
+> ℹ️ **TIP**: If you need direct SSH into the remote container from your terminal,
+> you can directly drop into a shell by adding `--shell` to the command line
+
+
+### Leaving a session and reconnecting to it
On the `clearml-session` CLI terminal, enter 'quit' or press `Ctrl-C`
It will close the CLI but preserve the remote session (i.e. remote session will remain running)
@@ -169,7 +155,7 @@ Connect to active session id=3d38e738c5ff458a9ec465e77e19da23 [Y]/n?
On the `clearml-session` CLI terminal, enter 'shutdown' (case-insensitive)
It will shut down the remote session, free the resource and close the CLI
-``` console
+```console
Enter "r" (or "reconnect") to reconnect the session (for example after suspend)
`s` (or "shell") to connect to the SSH session
`Ctrl-C` (or "quit") to abort (remote session remains active)
@@ -182,178 +168,262 @@ Remote session shutdown
Goodbye
```
+You can also use the CLI to shut down a specific clearml interactive session
+
+```bash
+clearml-session shutdown --id
+```
+
### Connecting to a running interactive session from a different machine
Continue working on an interactive session from **any** machine.
In the `clearml` web UI, go to DevOps project, and find your interactive session.
Click on the ID button next to the Task name, and copy the unique ID.
-``` bash
-clearml-session --attach
+```bash
+clearml-session --attach
```
Click on the JupyterLab/VSCode link, or connect directly to the SSH session
+> ✨ **TIP**: You can work & debug your colleagues code and workspace by sharing the `session id`
+> and connect to the same remote container together with `--attach`
+
+
+### Store and synchronize interactive session workspace
+
+Specify the remote workspace root-folder by adding `--store-workspace ~/workspace` to the command line.
+In the remote session container, put all your code / data under the `~/workspace` directory.
+When your session is shut down, the workspace folder will be automatically packaged and stored on the clearml file server.
+In your next `clearml-session` execution specify again `--store-workspace ~/workspace` and clearml-session
+will grab the previous workspace snapshot and restore it into the new remote container in `~/workspace`.
+
+```bash
+clearml-session --store-workspace ~/workspace --docker python:3.10-bullseye
+```
+
+To continue the last aborted session and restore the workspace
+
+```bash
+clearml-session --store-workspace ~/workspace --docker python:3.10-bullseye
+```
+
+```console
+clearml-session - CLI for launching JupyterLab / VSCode / SSH on a remote machine
+Verifying credentials
+Use previous queue (resource) '1xGPU' [Y]/n?
+
+Interactive session config:
+...
+Restore workspace from session id=01bf86f038314434878b2413343ba746 'interactive_session' @ 2024-03-02 20:34:03 [Y]/n?
+Restoring workspace from previous session id=01bf86f038314434878b2413343ba746
+```
+
+To continue a **specific** session ID and restore its workspace
+
+```bash
+clearml-session --continue-session --store-workspace ~/workspace --docker python:3.10-bullseye
+```
+
+### Upload local files to remote session
+
+If you need to upload files from your local machine into the remote session,
+specify the file or directory with `--upload-files /mnt/data/stuff`.
+The entire content of the directory / file will be copied into your remote `clearml-session`
+container under the `~/session-files/` directory.
+
+Can be used in conjunction with `--store-workspace` to easily move workloads between local development machine
+and remote machines with 100% persistent workspace synchronization.
+
+```bash
+clearml-session --upload-files /mnt/data/stuff
+```
+
+
### Debug a previously executed experiment
-If you have a previously executed experiment in the system,
-you can create an exact copy of the experiment and debug it on the remote interactive session.
+If you have a previously executed experiment (Task) on the clearml platform,
+you can create an exact copy of the experiment (Task) and debug it on the remote interactive session.
`clearml-session` will replicate the exact remote environment, add JupyterLab/VSCode/SSH and allow you interactively
-execute and debug the experiment, on the allocated remote machine.
+execute and debug the experiment, on the interactive remote container.
In the `clearml` web UI, find the experiment (Task) you wish to debug.
-Click on the ID button next to the Task name, and copy the unique ID.
+Click on the ID button next to the Task name, and copy the unique ID, then execute:
-``` bash
+```bash
clearml-session --debugging-session
```
-Click on the JupyterLab/VSCode link, or connect directly to the SSH session
+Click on the JupyterLab/VSCode link, or drop directly into an SSH shell by typing `shell`
+
+
+## â Frequently Asked Questions
+
+#### How does it work?
+
+The `clearml-session` creates a new interactive `Task` in the system (default project: DevOps).
+
+This `Task` is responsible for setting the SSH and JupyterLab/VSCode on the host machine.
+
+The local `clearml-session` awaits for the interactive Task to finish with the initial setup, then
+it connects via SSH to the host machine (see "safe and stable" above), and tunnels
+both SSH and JupyterLab over the SSH connection.
+
+The end results is a local link which you can use to access the JupyterLab/VSCode on the remote machine, over a **secure and encrypted** connection!
+
+#### Does `clearml-session` support Kubernetes clusters?
+
+Yes! `clearml-session` utilizes the `clearml-agent` kubernetes glue together with routing capabilities in order to allow
+any clearml-session to spin a container (pod) on the kubernetes cluster and securely connect **directly** into the pod.
+This feature does not require any kubernetes access from the users, and simplifies code
+development on kubernetes clusters as well as job scheduling & launching.
+Read more on how to deploy clearml on kubernetes [here](https://clear.ml/docs/latest/docs/clearml_agent/#kubernetes)
+
+#### How can I use `clearml-session` to scale up / out development resources?
+
+**Clearml** has a [cloud autoscaler](https://clear.ml/docs/latest/docs/cloud_autoscaling/autoscaling_overview), so you can easily and automatically spin machines for development!
+
+There is also a default docker image to use when initiating a task.
+
+This means that using **clearml-session**s
+with the autoscaler enabled, allows for turn-key secure development environment inside a docker of your choosing.
+
+Learn more about it [here](https://clear.ml/docs/latest/docs/guides/services/aws_autoscaler) & [here](https://clear.ml/docs/latest/docs/webapp/applications/apps_gpu_compute)
+
+#### Does `clearml-session` fit Work-From-Home setup?
+**YES**. Install `clearml-agent` on target machines inside the organization, connect over your company VPN
+and use `clearml-session` to gain access to a dedicated on-prem machine with the docker of your choosing
+(with out-of-the-box support for any internal docker artifactory).
+
+Learn more about how to utilize your office workstations and on-prem machines [here](https://clear.ml/docs/latest/docs/clearml_agent)
## CLI options
-``` bash
+```bash
clearml-session --help
```
-``` console
-clearml-session - CLI for launching JupyterLab / VSCode on a remote machine
-usage: clearml-session [-h] [--version] [--attach [ATTACH]]
- [--shutdown [SHUTDOWN]] [--shell]
- [--debugging-session DEBUGGING_SESSION] [--queue QUEUE]
- [--docker DOCKER] [--docker-args DOCKER_ARGS]
- [--public-ip [true/false]]
- [--remote-ssh-port REMOTE_SSH_PORT]
- [--vscode-server [true/false]]
- [--vscode-version VSCODE_VERSION]
- [--vscode-extensions VSCODE_EXTENSIONS]
- [--jupyter-lab [true/false]]
- [--upload-files UPLOAD_FILES]
- [--git-credentials [true/false]]
- [--user-folder USER_FOLDER]
- [--packages [PACKAGES [PACKAGES ...]]]
- [--requirements REQUIREMENTS]
- [--init-script [INIT_SCRIPT]]
- [--config-file CONFIG_FILE]
- [--remote-gateway [REMOTE_GATEWAY]]
- [--base-task-id BASE_TASK_ID] [--project PROJECT]
- [--keepalive [true/false]]
+```console
+clearml-session - CLI for launching JupyterLab / VSCode / SSH on a remote machine
+usage: clearml-session [-h] [--version] [--attach [ATTACH]] [--shutdown [SHUTDOWN]] [--shell]
+ [--debugging-session DEBUGGING_SESSION] [--queue QUEUE] [--docker DOCKER]
+ [--docker-args DOCKER_ARGS] [--public-ip [true/false]] [--remote-ssh-port REMOTE_SSH_PORT]
+ [--vscode-server [true/false]] [--vscode-version VSCODE_VERSION]
+ [--vscode-extensions VSCODE_EXTENSIONS] [--jupyter-lab [true/false]]
+ [--upload-files UPLOAD_FILES] [--continue-session CONTINUE_SESSION]
+ [--store-workspace STORE_WORKSPACE] [--git-credentials [true/false]]
+ [--user-folder USER_FOLDER] [--packages [PACKAGES [PACKAGES ...]]]
+ [--requirements REQUIREMENTS] [--init-script [INIT_SCRIPT]] [--config-file CONFIG_FILE]
+ [--remote-gateway [REMOTE_GATEWAY]] [--base-task-id BASE_TASK_ID] [--project PROJECT]
+ [--session-name SESSION_NAME] [--session-tags [SESSION_TAGS [SESSION_TAGS ...]]]
+ [--disable-session-cleanup [true/false]] [--keepalive [true/false]]
[--queue-excluded-tag [QUEUE_EXCLUDED_TAG [QUEUE_EXCLUDED_TAG ...]]]
[--queue-include-tag [QUEUE_INCLUDE_TAG [QUEUE_INCLUDE_TAG ...]]]
- [--skip-docker-network [true/false]]
- [--password PASSWORD] [--username USERNAME]
- [--force_dropbear [true/false]] [--verbose] [--yes]
+ [--skip-docker-network [true/false]] [--password PASSWORD] [--username USERNAME]
+ [--force-dropbear [true/false]] [--verbose] [--yes]
+ {list,info,shutdown} ...
-clearml-session - CLI for launching JupyterLab / VSCode on a remote machine
+clearml-session - CLI for launching JupyterLab / VSCode / SSH on a remote machine
+
+positional arguments:
+ {list,info,shutdown} ClearML session control commands
+ list List running Sessions
+ info Detailed information on specific session
+ shutdown Shutdown specific session
optional arguments:
-h, --help show this help message and exit
--version Display the clearml-session utility version
- --attach [ATTACH] Attach to running interactive session (default:
- previous session)
+ --attach [ATTACH] Attach to running interactive session (default: previous session)
--shutdown [SHUTDOWN], -S [SHUTDOWN]
- Shut down an active session (default: previous
- session)
- --shell Open the SSH shell session directly, notice quiting
- the SSH session will Not shutdown the remote session
+ Shut down an active session (default: previous session)
+ --shell Open the SSH shell session directly, notice quiting the SSH session will Not shutdown the
+ remote session
--debugging-session DEBUGGING_SESSION
- Pass existing Task id (experiment), create a copy of
- the experiment on a remote machine, and launch
- jupyter/ssh for interactive access. Example
- --debugging-session
- --queue QUEUE Select the queue to launch the interactive session on
- (default: previously used queue)
- --docker DOCKER Select the docker image to use in the interactive
- session on (default: previously used docker image or
- `nvidia/cuda:10.1-runtime-ubuntu18.04`)
+ Pass existing Task id (experiment), create a copy of the experiment on a remote machine,
+ and launch jupyter/ssh for interactive access. Example --debugging-session
+ --queue QUEUE Select the queue to launch the interactive session on (default: previously used queue)
+ --docker DOCKER Select the docker image to use in the interactive session on (default: previously used
+ docker image or `nvidia/cuda:11.6.2-runtime-ubuntu20.04`)
--docker-args DOCKER_ARGS
- Add additional arguments for the docker image to use
- in the interactive session on (default: previously
- used docker-args)
+ Add additional arguments for the docker image to use in the interactive session on
+ (default: previously used docker-args)
--public-ip [true/false]
- If True register the public IP of the remote machine.
- Set if running on the cloud. Default: false (use for
- local / on-premises)
+ If True register the public IP of the remote machine. Set if running on the cloud.
+ Default: false (use for local / on-premises)
--remote-ssh-port REMOTE_SSH_PORT
- Set the remote ssh server port, running on the agent`s
- machine. (default: 10022)
+ Set the remote ssh server port, running on the agent`s machine. (default: 10022)
--vscode-server [true/false]
- Install vscode server (code-server) on interactive
- session (default: true)
+ Install vscode server (code-server) on interactive session (default: true)
--vscode-version VSCODE_VERSION
- Set vscode server (code-server) version, as well as
- vscode python extension version
- (example: "3.7.4:2020.10.332292344")
+ Set vscode server (code-server) version, as well as vscode python extension version
+ (example: "3.7.4:2020.10.332292344")
--vscode-extensions VSCODE_EXTENSIONS
- Install additional vscode extensions, as well as
- vscode python extension (example: "ms-
- python.python,ms-python.black-formatter,ms-
- python.pylint,ms-python.flake8")
+ Install additional vscode extensions, as well as vscode python extension (example: "ms-
+ python.python,ms-python.black-formatter,ms-python.pylint,ms-python.flake8")
--jupyter-lab [true/false]
- Install Jupyter-Lab on interactive session (default:
- true)
+ Install Jupyter-Lab on interactive session (default: true)
--upload-files UPLOAD_FILES
- Advanced: Upload local files/folders to the remote
- session. Example: `/my/local/data/` will upload the
- local folder and extract it into the container in
- ~/session-files/
+ Advanced: Upload local files/folders to the remote session. Example: `/my/local/data/`
+ will upload the local folder and extract it into the container in ~/session-files/
+ --continue-session CONTINUE_SESSION
+ Continue previous session (ID provided) restoring your workspace (see --store-workspace)
+ --store-workspace STORE_WORKSPACE
+ Upload/Restore remote workspace folder. Example: `~/workspace/` will automatically
+                        restore/store the *containers* folder and extract it into the next session. Use with
+ --continue-session to continue your previous work from your exact container state
--git-credentials [true/false]
- If true, local .git-credentials file is sent to the
- interactive session. (default: false)
+ If true, local .git-credentials file is sent to the interactive session. (default: false)
--user-folder USER_FOLDER
Advanced: Set the remote base folder (default: ~/)
--packages [PACKAGES [PACKAGES ...]]
- Additional packages to add, supports version numbers
- (default: previously added packages). examples:
- --packages torch==1.7 tqdm
+ Additional packages to add, supports version numbers (default: previously added packages).
+ examples: --packages torch==1.7 tqdm
--requirements REQUIREMENTS
- Specify requirements.txt file to install when setting
- the interactive session. Requirements file is read and
- stored in `packages` section as default for the next
+ Specify requirements.txt file to install when setting the interactive session.
+ Requirements file is read and stored in `packages` section as default for the next
sessions. Can be overridden by calling `--packages`
--init-script [INIT_SCRIPT]
- Specify BASH init script file to be executed when
- setting the interactive session. Script content is
- read and stored as default script for the next
- sessions. To clear the init-script do not pass a file
+ Specify BASH init script file to be executed when setting the interactive session. Script
+ content is read and stored as default script for the next sessions. To clear the init-
+ script do not pass a file
--config-file CONFIG_FILE
- Advanced: Change the configuration file used to store
- the previous state (default: ~/.clearml_session.json)
+ Advanced: Change the configuration file used to store the previous state (default:
+ ~/.clearml_session.json)
--remote-gateway [REMOTE_GATEWAY]
- Advanced: Specify gateway ip/address:port to be passed
- to interactive session (for use with k8s ingestion /
- ELB)
+ Advanced: Specify gateway ip/address:port to be passed to interactive session (for use
+ with k8s ingestion / ELB)
--base-task-id BASE_TASK_ID
- Advanced: Set the base task ID for the interactive
- session. (default: previously used Task). Use `none`
- for the default interactive session
- --project PROJECT Advanced: Set the project name for the interactive
- session Task
+ Advanced: Set the base task ID for the interactive session. (default: previously used
+ Task). Use `none` for the default interactive session
+ --project PROJECT Advanced: Set the project name for the interactive session Task
+ --session-name SESSION_NAME
+ Advanced: Set the name of the interactive session Task
+ --session-tags [SESSION_TAGS [SESSION_TAGS ...]]
+ Advanced: Add tags to the interactive session for increased visibility
+ --disable-session-cleanup [true/false]
+ Advanced: If set, previous interactive sessions are not deleted
--keepalive [true/false]
- Advanced: If set, enables the transparent proxy always
- keeping the sockets alive. Default: False, do not use
- transparent socket for mitigating connection drops.
+ Advanced: If set, enables the transparent proxy always keeping the sockets alive. Default:
+ False, do not use transparent socket for mitigating connection drops.
--queue-excluded-tag [QUEUE_EXCLUDED_TAG [QUEUE_EXCLUDED_TAG ...]]
- Advanced: Excluded queues with this specific tag from
- the selection
+ Advanced: Excluded queues with this specific tag from the selection
--queue-include-tag [QUEUE_INCLUDE_TAG [QUEUE_INCLUDE_TAG ...]]
- Advanced: Only include queues with this specific tag
- from the selection
+ Advanced: Only include queues with this specific tag from the selection
--skip-docker-network [true/false]
- Advanced: If set, `--network host` is **not** passed
- to docker (assumes k8s network ingestion) (default:
- false)
- --password PASSWORD Advanced: Select ssh password for the interactive
- session (default: `randomly-generated` or previously
+ Advanced: If set, `--network host` is **not** passed to docker (assumes k8s network
+ ingestion) (default: false)
+ --password PASSWORD Advanced: Select ssh password for the interactive session (default: `randomly-generated`
+ or previously used one)
+ --username USERNAME Advanced: Select ssh username for the interactive session (default: `root` or previously
used one)
- --username USERNAME Advanced: Select ssh username for the interactive
- session (default: `root` or previously used one)
- --force_dropbear [true/false]
+ --force-dropbear [true/false]
Force using `dropbear` instead of SSHd
- --verbose Advanced: If set, print verbose progress information,
- e.g. the remote machine setup process log
- --yes, -y Automatic yes to prompts; assume "yes" as answer to
- all prompts and run non-interactively
+ --verbose Advanced: If set, print verbose progress information, e.g. the remote machine setup
+ process log
+ --yes, -y Automatic yes to prompts; assume "yes" as answer to all prompts and run non-interactively
-Notice! all arguments are stored as new defaults for the next session
+Notice! all arguments are stored as new defaults for the next execution
```
+
+
diff --git a/clearml_session/__main__.py b/clearml_session/__main__.py
index 167f578..f844327 100644
--- a/clearml_session/__main__.py
+++ b/clearml_session/__main__.py
@@ -35,7 +35,7 @@ except Exception:
pass
system_tag = 'interactive'
-default_docker_image = 'nvidia/cuda:10.1-runtime-ubuntu18.04'
+default_docker_image = 'nvidia/cuda:11.6.2-runtime-ubuntu20.04'
class NonInteractiveError(Exception):
@@ -146,10 +146,21 @@ def _get_available_ports(list_initial_ports):
return available_ports
-def create_base_task(state, project_name=None, task_name=None):
- task = Task.create(project_name=project_name or 'DevOps',
- task_name=task_name or 'Interactive Session',
- task_type=Task.TaskTypes.application)
+def create_base_task(state, project_name=None, task_name=None, continue_task_id=None, project_id=None):
+ if continue_task_id:
+ task = Task.clone(
+ source_task=continue_task_id,
+ project=project_id,
+ parent=continue_task_id,
+ name=task_name or 'Interactive Session'
+ )
+ else:
+ task = Task.create(
+ project_name=project_name or 'DevOps',
+ task_name=task_name or 'Interactive Session',
+ task_type=Task.TaskTypes.application
+ )
+
task_script = task.data.script.to_dict()
base_script_file = os.path.abspath(os.path.join(__file__, '..', 'tcp_proxy.py'))
with open(base_script_file, 'rt') as f:
@@ -218,13 +229,13 @@ def create_base_task(state, project_name=None, task_name=None):
return task
-def create_debugging_task(state, debug_task_id):
+def create_debugging_task(state, debug_task_id, task_name=None, task_project_id=None):
debug_task = Task.get_task(task_id=debug_task_id)
# if there is no git repository, we cannot debug it
if not debug_task.data.script.repository:
raise ValueError("Debugging task has no git repository, single script debugging is not supported.")
- task = Task.clone(source_task=debug_task_id, parent=debug_task_id)
+ task = Task.clone(source_task=debug_task_id, parent=debug_task_id, name=task_name, project=task_project_id)
task_state = task.export_task()
@@ -285,23 +296,87 @@ def create_debugging_task(state, debug_task_id):
return task
-def delete_old_tasks(state, client, base_task_id):
+def find_prev_session(state, client):
+ # nothing to do
+ if not state.get("store_workspace"):
+ return
+
+ current_user_id = _get_user_id(client)
+ previous_tasks = client.tasks.get_all(**{
+ 'status': ['failed', 'stopped', 'completed'],
+ 'system_tags': [system_tag],
+ 'page_size': 100, 'page': 0,
+ 'order_by': ['-last_update'],
+ 'user': [current_user_id],
+ 'only_fields': ['id']
+ })
+
+ continue_session_id = state.get("continue_session")
+ # if we do not find something, we ignore it
+ state["continue_session"] = None
+
+ for i, t in enumerate(previous_tasks):
+ try:
+ task = Task.get_task(task_id=t.id)
+ if state.get("store_workspace") and task.artifacts:
+ if continue_session_id and continue_session_id == t.id:
+ print("Restoring workspace from previous session id={} [{}]".format(
+ continue_session_id, task.data.last_update))
+ state["continue_session"] = t.id
+ break
+ elif not continue_session_id and i == 0:
+ if not state.get("yes"):
+ choice = input("Restore workspace from session id={} '{}' @ {} [Y]/n? ".format(
+ t.id, task.name, str(task.data.last_update).split(".")[0]))
+ if str(choice).strip().lower() in ('n', 'no'):
+ continue
+
+ print("Restoring workspace from previous session id={}".format(t.id))
+ state["continue_session"] = t.id
+ break
+ except Exception as ex:
+            logging.getLogger().warning('Failed retrieving old session {}: {}'.format(t.id, ex))
+
+
+def delete_old_tasks(state, client, base_task_id, skip_latest_session=True):
+ if state["disable_session_cleanup"]:
+ return
+
print('Removing stale interactive sessions')
+
current_user_id = _get_user_id(client)
previous_tasks = client.tasks.get_all(**{
'status': ['failed', 'stopped', 'completed'],
'parent': base_task_id or None,
'system_tags': None if base_task_id else [system_tag],
'page_size': 100, 'page': 0,
+ 'order_by': ['-last_update'],
'user': [current_user_id],
'only_fields': ['id']
})
for i, t in enumerate(previous_tasks):
+ # skip the selected Task which has our new workspace
+ if state.get("continue_session") == t.id:
+ continue
+
if state.get('verbose'):
print('Removing {}/{} stale sessions'.format(i+1, len(previous_tasks)))
+ # no need to worry about workspace snapshots,
+ # because they are input artifacts and thus will Not actually be deleted
+ # we will delete them manually if the Task has its own workspace snapshot
try:
- Task.get_task(task_id=t.id).delete(delete_artifacts_and_models=True, skip_models_used_by_other_tasks=True, raise_on_error=True)
+ task = Task.get_task(task_id=t.id)
+ # if we have any artifacts on this session Task
+ if skip_latest_session and task.artifacts and i == 0:
+ # do not delete this workspace yet (only next time)
+ continue
+
+ task.delete(
+ delete_artifacts_and_models=True,
+ skip_models_used_by_other_tasks=True,
+ raise_on_error=True
+ )
except Exception as ex:
logging.getLogger().warning('{}\nFailed deleting old session {}'.format(ex, t.id))
try:
@@ -317,7 +392,8 @@ def _get_running_tasks(client, prev_task_id):
'system_tags': [system_tag],
'page_size': 10, 'page': 0,
'order_by': ['-last_update'],
- 'user': [current_user_id], 'only_fields': ['id', 'created', 'parent']
+ 'user': [current_user_id],
+ 'only_fields': ['id', 'created', 'parent']
})
tasks_id_created = [(t.id, t.created, t.parent) for t in previous_tasks]
if prev_task_id and prev_task_id not in (t[0] for t in tasks_id_created):
@@ -363,13 +439,9 @@ def _b64_encode_file(file):
def get_project_id(project_name):
project_id = None
if project_name:
- projects = Task.get_projects()
- project_id = [p for p in projects if p.name == project_name]
- if project_id:
- project_id = project_id[0]
- else:
+ project_id = Task.get_project_id(project_name=project_name)
+ if not project_id:
logging.getLogger().warning("could not locate project by the named '{}'".format(project_name))
- project_id = None
return project_id
@@ -465,17 +537,19 @@ def get_user_inputs(args, parser, state, client):
print("\nInteractive session config:\n{}\n".format(
json.dumps({k: v for k, v in state.items() if not str(k).startswith('__')}, indent=4, sort_keys=True)))
+ return state
+
+
+def ask_launch(args):
# no need to ask just return the value
- if assume_yes:
- return state
+ if args.yes:
+ return
choice = input('Launch interactive session [Y]/n? ')
if str(choice).strip().lower() in ('n', 'no'):
print('User aborted')
exit(0)
- return state
-
def save_state(state, state_file):
# if we are running in debugging mode,
@@ -504,25 +578,47 @@ def load_state(state_file):
state.pop('yes', None)
state.pop('shell', None)
state.pop('upload_files', None)
+ state.pop('continue_session', None)
return state
def clone_task(state, project_id=None):
new_task = False
+ project_id = \
+ project_id or (get_project_id(project_name=state.get('project')) if state.get('project') else None)
+
if state.get('debugging_session'):
print('Starting new debugging session to {}'.format(state.get('debugging_session')))
- task = create_debugging_task(state, state.get('debugging_session'))
+ task = create_debugging_task(
+ state,
+ state.get('debugging_session'),
+ task_name=state.get('session_name'),
+ task_project_id=project_id
+ )
elif state.get('base_task_id'):
- print('Cloning base session {}'.format(state['base_task_id']))
- project_id = \
- project_id or (get_project_id(project_name=state.get('project')) if state.get('project') else None)
- task = Task.clone(source_task=state['base_task_id'], project=project_id, parent=state['base_task_id'])
+ base_task_id = state.get('base_task_id')
+ print('Cloning base session {}'.format(base_task_id))
+ task = Task.clone(
+ source_task=base_task_id,
+ project=project_id,
+ parent=base_task_id,
+ name=state.get('session_name')
+ )
task.set_system_tags([system_tag])
else:
print('Creating new session')
- task = create_base_task(state, project_name=state.get('project'))
+ task = create_base_task(
+ state,
+ project_name=state.get('project'),
+ task_name=state.get('session_name'),
+ continue_task_id=state.get('continue_session'),
+ project_id=project_id
+ )
new_task = True
+ if state.get("session_tags"):
+ task.set_tags(state.get("session_tags"))
+
print('Configuring new session')
runtime_prop_support = Session.check_min_api_version("2.13")
if runtime_prop_support:
@@ -562,6 +658,7 @@ def clone_task(state, project_id=None):
task_params["{}/vscode_version".format(section)] = state.get('vscode_version') or ''
task_params["{}/vscode_extensions".format(section)] = state.get('vscode_extensions') or ''
task_params["{}/force_dropbear".format(section)] = bool(state.get('force_dropbear'))
+ task_params["{}/store_workspace".format(section)] = state.get('store_workspace')
if state.get('user_folder'):
task_params['{}/user_base_directory'.format(section)] = state.get('user_folder')
docker = state.get('docker') or task.get_base_docker()
@@ -624,6 +721,10 @@ def clone_task(state, project_id=None):
task.update_task({'script': {'requirements': requirements}})
task.set_parameters(task_params)
print('New session created [id={}]'.format(task.id))
+ if state.get("continue_session") and state.get("store_workspace"):
+ print('Restoring remote workspace from [{}] into {}'.format(
+ state.get("continue_session"), state.get("store_workspace")))
+
return task
@@ -819,6 +920,12 @@ def monitor_ssh_tunnel(state, task):
local_vscode_port_ = local_vscode_port
default_section = _get_config_section_name()[0]
+
+ workspace_header_msg = ''
+ if task.get_parameter("{}/store_workspace".format(default_section)):
+ workspace_header_msg = "Workspace at '{}' will be automatically synchronized when shutting down".format(
+ task.get_parameter("{}/store_workspace".format(default_section)))
+
local_remote_pair_list = []
shutdown = False
try:
@@ -914,6 +1021,9 @@ def monitor_ssh_tunnel(state, task):
local_vscode_port=local_vscode_port)
if state.get('user_folder'):
msg += "?folder={}".format(state.get('user_folder'))
+ if workspace_header_msg:
+ msg += "\n\n{}".format(workspace_header_msg)
+
print(msg)
print(connect_message)
else:
@@ -986,6 +1096,114 @@ def monitor_ssh_tunnel(state, task):
pass
+class CliCommands:
+ state = dict()
+
+ @classmethod
+ def list_sessions(cls, args):
+ client = APIClient()
+ filters = {
+ 'status': ['in_progress'],
+ 'system_tags': [system_tag],
+ 'page_size': 500, 'page': 0,
+ 'order_by': ['-last_update'],
+ 'only_fields': ['id', 'created', 'name', 'project', 'tags']
+ }
+ if args.session_tags:
+ filters['tags'] = args.session_tags
+ if args.project:
+ filters['project'] = [Task.get_project_id(project_name=args.project)]
+ if not args.all_users:
+ current_user_id = _get_user_id(client)
+ filters['user'] = [current_user_id]
+
+ msg = "Listing active sessions tags=[{}] project=[{}] all_users={}".format(
+ args.session_tags or "*", args.project or "*", args.all_users)
+ print(msg + "\n" + ("-" * len(msg)))
+
+ session_tasks = client.tasks.get_all(**filters)
+ if not session_tasks:
+ print("No interactive sessions found")
+ return 0
+
+ project_names = dict()
+ for i, t in enumerate(session_tasks):
+ # noinspection PyProtectedMember
+ pname = project_names.get(t.project, Task._get_project_name(t.project)) if t.project else ""
+ print("{}] id={} name='{}' tags={} project='{}'".format(i, t.id, t.name, t.tags, pname))
+
+ return 0
+
+ @classmethod
+ def session_info(cls, args):
+ print("Fetching interactive session details:")
+ client = APIClient()
+ try:
+ tasks = client.tasks.get_all(**{
+ 'id': [args.id],
+ 'page_size': 10, 'page': 0,
+ 'order_by': ['-last_update'],
+ 'only_fields': ['id', 'created', 'parent', 'status', 'project', 'tags', 'system_tags', 'type']
+ })
+ except APIError:
+ tasks = None
+
+ if tasks:
+ tid = tasks[0].id
+ t = Task.get_task(task_id=tid)
+ print(
+ " status={}\n".format(t.data.status) +
+ " id={}\n".format(t.id) +
+ " name={}\n".format(t.name) +
+ " project={}\n".format(t.get_project_name()) +
+ " tags={}\n".format(t.get_tags()) +
+ " log={}\n".format(t.get_output_log_web_page())
+ )
+ return 0
+ else:
+ print("ERROR: Interactive session id={} not found".format(args.id))
+ return 1
+
+ @classmethod
+ def shutdown_session(cls, args):
+ task_id = args.id or args.shutdown
+ print("Shutting down session id={}".format(task_id))
+ client = APIClient()
+ try:
+ tasks = client.tasks.get_all(**{
+ 'id': [args.id],
+ 'page_size': 10, 'page': 0,
+ 'order_by': ['-last_update'],
+ 'only_fields': ['id', 'created', 'parent', 'status', 'project', 'tags', 'system_tags', 'type']
+ })
+ except APIError:
+ tasks = None
+
+ if not tasks:
+ print("ERROR: Interactive session id={} not found".format(args.id))
+ return 1
+
+ try:
+ task = _get_previous_session(
+ client, args, cls.state,
+ task_id=task_id,
+ verb="Shutting down",
+ question_verb="Shutdown",
+ ask_for_explicit_id=True
+ )
+ except ValueError:
+ print("Warning: session not running - skipping shutdown")
+ return 0
+
+ if not task:
+ print("Warning: skipping session shutdown")
+ return 0
+
+ task.mark_stopped()
+ print("Session #{} shutdown".format(task.id))
+ return 0
+
+
def setup_parser(parser):
parser.add_argument('--version', action='store_true', default=None,
help='Display the clearml-session utility version')
@@ -1030,6 +1248,15 @@ def setup_parser(parser):
help='Advanced: Upload local files/folders to the remote session. '
'Example: `/my/local/data/` will upload the local folder and extract it '
'into the container in ~/session-files/')
+ parser.add_argument('--continue-session', type=str, default=None,
+ help='Continue previous session (ID provided) '
+ 'restoring your workspace (see --store-workspace)')
+ parser.add_argument('--store-workspace', type=str, default=None,
+ help='Upload/Restore remote workspace folder. '
+ 'Example: `~/workspace/` will automatically restore/store the *container\'s* folder '
+ 'and extract it into the next session. '
+ 'Use with --continue-session to continue your '
+ 'previous work from your exact container state')
parser.add_argument('--git-credentials', default=False, nargs='?', const='true', metavar='true/false',
type=lambda x: (str(x).strip().lower() in ('true', 'yes')),
help='If true, local .git-credentials file is sent to the interactive session. '
@@ -1059,6 +1286,13 @@ def setup_parser(parser):
'(default: previously used Task). Use `none` for the default interactive session')
parser.add_argument('--project', type=str, default=None,
help='Advanced: Set the project name for the interactive session Task')
+ parser.add_argument('--session-name', type=str, default=None,
+ help='Advanced: Set the name of the interactive session Task')
+ parser.add_argument('--session-tags', type=str, nargs='*', default=None,
+ help='Advanced: Add tags to the interactive session for increased visibility')
+ parser.add_argument('--disable-session-cleanup', default=False, nargs='?', const='true', metavar='true/false',
+ type=lambda x: (str(x).strip().lower() in ('true', 'yes')),
+ help='Advanced: If set, previous interactive sessions are not deleted')
parser.add_argument('--keepalive', default=False, nargs='?', const='true', metavar='true/false',
type=lambda x: (str(x).strip().lower() in ('true', 'yes')),
help='Advanced: If set, enables the transparent proxy always keeping the sockets alive. '
@@ -1067,7 +1301,7 @@ def setup_parser(parser):
help='Advanced: Excluded queues with this specific tag from the selection')
parser.add_argument('--queue-include-tag', default=None, nargs='*',
help='Advanced: Only include queues with this specific tag from the selection')
- parser.add_argument('--skip-docker-network', default=None, nargs='?', const='true', metavar='true/false',
+ parser.add_argument('--skip-docker-network', default=None, nargs='?', const='true', metavar='true/false',
type=lambda x: (str(x).strip().lower() in ('true', 'yes')),
help='Advanced: If set, `--network host` is **not** passed to docker '
'(assumes k8s network ingestion) (default: false)')
@@ -1077,7 +1311,7 @@ def setup_parser(parser):
parser.add_argument('--username', type=str, default=None,
help='Advanced: Select ssh username for the interactive session '
'(default: `root` or previously used one)')
- parser.add_argument('--force_dropbear', default=None, nargs='?', const='true', metavar='true/false',
+ parser.add_argument('--force-dropbear', default=None, nargs='?', const='true', metavar='true/false',
type=lambda x: (str(x).strip().lower() in ('true', 'yes')),
help='Force using `dropbear` instead of SSHd')
parser.add_argument('--verbose', action='store_true', default=None,
@@ -1088,6 +1322,26 @@ def setup_parser(parser):
help='Automatic yes to prompts; assume \"yes\" as answer '
'to all prompts and run non-interactively',)
+ subparsers = parser.add_subparsers(help='ClearML session control commands', dest='command')
+
+ parser_list = subparsers.add_parser('list', help='List running Sessions')
+ parser_list.add_argument(
+ '--all_users', '-a',
+ action='store_true', default=False,
+ help='Return all running sessions (from all users). '
+ 'Default: return only sessions of the current user',)
+ parser_list.set_defaults(func=CliCommands.list_sessions)
+
+ parser_info = subparsers.add_parser('info', help='Detailed information on specific session')
+ parser_info.add_argument(
+ '--id', type=str, default=None, help='Interactive session information details')
+ parser_info.set_defaults(func=CliCommands.session_info)
+
+ parser_shutdown = subparsers.add_parser('shutdown', help='Shutdown specific session')
+ parser_shutdown.add_argument(
+ '--id', type=str, default=None, help='Session ID to be shutdown')
+ parser_shutdown.set_defaults(func=CliCommands.shutdown_session)
+
def get_version():
from .version import __version__
@@ -1095,11 +1349,11 @@ def get_version():
def cli():
- title = 'clearml-session - CLI for launching JupyterLab / VSCode on a remote machine'
+ title = 'clearml-session - CLI for launching JupyterLab / VSCode / SSH on a remote machine'
print(title)
parser = ArgumentParser(
prog='clearml-session', description=title,
- epilog='Notice! all arguments are stored as new defaults for the next session')
+ epilog='Notice! all arguments are stored as new defaults for the next execution')
setup_parser(parser)
# get the args
@@ -1109,14 +1363,6 @@ def cli():
print('Version {}'.format(get_version()))
exit(0)
- # check ssh
- if not _check_ssh_executable():
- raise ValueError("Could not locate SSH executable")
-
- # check clearml.conf
- if not _check_configuration():
- raise ValueError("ClearML configuration not found. Please run `clearml-init`")
-
# load previous state
state_file = os.path.abspath(os.path.expandvars(os.path.expanduser(args.config_file)))
state = load_state(state_file)
@@ -1126,8 +1372,26 @@ def cli():
state['shell'] = bool(args.shell)
+ if args.command:
+ if args.command in ("info", "shutdown") and not args.id:
+ print("Notice! session info requires ID but it was not provided")
+ return
+
+ CliCommands.state = state
+ args.func(args)
+ return
+
+ # check ssh
+ if not _check_ssh_executable():
+ raise ValueError("Could not locate SSH executable")
+
+ # check clearml.conf
+ if not _check_configuration():
+ raise ValueError("ClearML configuration not found. Please run `clearml-init`")
+
client = APIClient()
+ # to be deprecated
if args.shutdown is not None:
task = _get_previous_session(
client, args, state, task_id=args.shutdown, verb="Shutting down",
@@ -1169,6 +1433,12 @@ def cli():
# save state
save_state(state, state_file)
+ # find previous workspace if needed
+ find_prev_session(state, client)
+
+ # ask user final approval
+ ask_launch(args)
+
# remove old Tasks created by us.
delete_old_tasks(state, client, state.get('base_task_id'))
diff --git a/clearml_session/interactive_session_task.py b/clearml_session/interactive_session_task.py
index 0a3819c..27c0b00 100644
--- a/clearml_session/interactive_session_task.py
+++ b/clearml_session/interactive_session_task.py
@@ -1,18 +1,22 @@
import base64
import json
import os
+import shutil
import socket
import subprocess
import sys
from copy import deepcopy
import getpass
-from tempfile import mkstemp, gettempdir
-from time import sleep
+from functools import partial
+from tempfile import mkstemp, gettempdir, mkdtemp
+from time import sleep, time
+from datetime import datetime
import psutil
import requests
from clearml import Task, StorageManager
from clearml.backend_api import Session
+from clearml.backend_api.services import tasks
from pathlib2 import Path
# noinspection SpellCheckingInspection
@@ -75,8 +79,10 @@ default_ssh_fingerprint = {
config_section_name = 'interactive_session'
config_object_section_ssh = 'SSH'
config_object_section_bash_init = 'interactive_init_script'
-
-
+artifact_workspace_name = "workspace"
+sync_runtime_property = "workspace_sync_ts"
+sync_workspace_creating_id = "created_by_session"
+__poor_lock = []
__allocated_ports = []
@@ -94,7 +100,10 @@ def init_task(param, a_default_ssh_fingerprint):
Task.add_requirements('jupyterlab')
Task.add_requirements('jupyterlab_git')
task = Task.init(
- project_name="DevOps", task_name="Allocate Jupyter Notebook Instance", task_type=Task.TaskTypes.service)
+ project_name="DevOps",
+ task_name="Allocate Jupyter Notebook Instance",
+ task_type=Task.TaskTypes.service
+ )
# Add jupyter server base folder
if Session.check_min_api_version('2.13'):
@@ -116,7 +125,7 @@ def init_task(param, a_default_ssh_fingerprint):
else:
task.connect(param, name=config_section_name)
- # connect ssh finger print configuration (with fallback if section is missing)
+ # connect ssh fingerprint configuration (with fallback if section is missing)
old_default_ssh_fingerprint = deepcopy(a_default_ssh_fingerprint)
try:
task.connect_configuration(configuration=a_default_ssh_fingerprint, name=config_object_section_ssh)
@@ -476,6 +485,13 @@ def start_jupyter_server(hostname, hostnames, param, task, env, bind_ip="127.0.0
env = dict(**env)
env['PATH'] = '{}:{}'.format(Path(sys.executable).parent.as_posix(), env.get('PATH', ''))
+ try:
+ # set default shell to bash if not defined
+ if not env.get("SHELL") and shutil.which("bash"):
+ env['SHELL'] = shutil.which("bash")
+ except Exception as ex:
+ print("WARNING: failed finding default shell bash: {}".format(ex))
+
# make sure we have the needed cwd
# noinspection PyBroadException
try:
@@ -999,6 +1015,169 @@ def run_user_init_script(task):
os.environ['CLEARML_DOCKER_BASH_SCRIPT'] = str(init_script)
+def _sync_workspace_snapshot(task, param):
+ workspace_folder = param.get("store_workspace")
+ if not workspace_folder:
+ # nothing to do
+ return
+
+ print("Syncing workspace {}".format(workspace_folder))
+
+ workspace_folder = Path(os.path.expandvars(workspace_folder)).expanduser()
+ if not workspace_folder.is_dir():
+ print("WARNING: failed to create workspace snapshot from '{}' - "
+ "directory does not exist".format(workspace_folder))
+ return
+
+ # build a hash of the workspace file listing (paths, sizes, mtimes) to detect changes
+ files_desc = ""
+ for f in workspace_folder.rglob("*"):
+ fs = f.stat()
+ files_desc += "{}: {}[{}]\n".format(f.absolute(), fs.st_size, fs.st_mtime)
+ workspace_hash = hash(str(files_desc))
+ if param.get("workspace_hash") == workspace_hash:
+ print("Skipping workspace snapshot upload, "
+ "already uploaded no files changed since last sync {}".format(param.get(sync_runtime_property)))
+ return
+
+ print("Uploading workspace: {}".format(workspace_folder))
+
+ # force running status - so that we can upload the artifact
+ if task.status not in ("in_progress", ):
+ task.mark_started(force=True)
+
+ try:
+ # create a tar file of the folder
+ # put a consistent file name into a temp folder because the filename is part of
+ # the compressed artifact, and we want consistency in hash.
+ # After compressing, the archive is uploaded and then removed (delete_after_upload=True).
+ temp_folder = Path(mkdtemp(prefix='workspace_'))
+ local_gzip = (temp_folder / "workspace_snapshot").as_posix()
+ # notice it will add a ".tar.gz" suffix to the file
+ local_gzip = shutil.make_archive(
+ base_name=local_gzip, format="gztar", root_dir=workspace_folder.as_posix())
+ if not local_gzip:
+ print("ERROR: Failed compressing workspace [{}]".format(workspace_folder))
+ raise ValueError("Failed compressing workspace")
+
+ # list archived files for preview
+ files = list(workspace_folder.rglob("*"))
+ archive_preview = 'Archive content {}:\n'.format(workspace_folder)
+ for filename in sorted(files):
+ if filename.is_file():
+ relative_file_name = filename.relative_to(workspace_folder)
+ archive_preview += '{} - {:,} B\n'.format(relative_file_name, filename.stat().st_size)
+
+ # upload actual snapshot tgz
+ task.upload_artifact(
+ name=artifact_workspace_name,
+ artifact_object=Path(local_gzip),
+ delete_after_upload=True,
+ preview=archive_preview,
+ metadata={"timestamp": str(datetime.utcnow()), sync_workspace_creating_id: task.id},
+ wait_on_upload=True,
+ retries=3
+ )
+
+ try:
+ temp_folder.rmdir()
+ except Exception as ex:
+ print("Warning: Failed removing temp artifact folder: {}".format(ex))
+
+ print("Finalizing workspace sync")
+
+ # change artifact to input artifact
+ task.reload()
+ # find our artifact and update it
+ for a in task.data.execution.artifacts:
+ if a.key != artifact_workspace_name:
+ # nothing to do
+ continue
+ elif a.mode == tasks.ArtifactModeEnum.input:
+ # the old input entry - we are changing to output artifact
+ # the reason is that we want this entry to be deleted with this Task
+ # in contrast to Input entries that are Not deleted when deleting the Task
+ a.mode = tasks.ArtifactModeEnum.output
+ a.key = "old_" + str(a.key)
+ else:
+ # set the new entry as an input artifact
+ a.mode = tasks.ArtifactModeEnum.input
+
+ # noinspection PyProtectedMember
+ task._edit(execution=task.data.execution, force=True)
+ task.reload()
+
+ # update our timestamp & hash
+ param[sync_runtime_property] = time()
+ param["workspace_hash"] = workspace_hash
+ # noinspection PyProtectedMember
+ task._set_runtime_properties(runtime_properties={sync_runtime_property: time()})
+ print("[{}] Workspace '{}' snapshot synced".format(datetime.utcnow(), workspace_folder))
+ except Exception as ex:
+ print("ERROR: Failed syncing workspace [{}]: {}".format(workspace_folder, ex))
+ finally:
+ task.mark_stopped(force=True, status_message="workspace shutdown sync completed")
+
+
+def sync_workspace_snapshot(task, param):
+ __poor_lock.append(time())
+ if len(__poor_lock) != 1:
+ # someone is already in, we should leave
+ __poor_lock.pop(-1)
+
+ try:
+ return _sync_workspace_snapshot(task, param)
+ finally:
+ __poor_lock.pop(-1)
+
+
+def restore_workspace(task, param):
+ if not param.get("store_workspace"):
+ # check if we have something to restore, show warning
+ if artifact_workspace_name in task.artifacts:
+ print("WARNING: Found workspace snapshot, but ignoring since store_workspace is 'None'")
+ return
+
+ # add sync callback, timeout 5 min
+ print("Setting workspace snapshot sync callback on session end")
+ task.register_abort_callback(
+ partial(sync_workspace_snapshot, task, param),
+ callback_execution_timeout=60*5)
+
+ try:
+ workspace_folder = Path(os.path.expandvars(param.get("store_workspace"))).expanduser()
+ workspace_folder.mkdir(parents=True, exist_ok=True)
+ except Exception as ex:
+ print("ERROR: Could not create workspace folder {}: {}".format(
+ param.get("store_workspace"), ex))
+ return
+
+ if artifact_workspace_name not in task.artifacts:
+ print("No workspace snapshot was found, a new workspace snapshot [{}] "
+ "will be created when session ends".format(workspace_folder))
+ return
+
+ print("Fetching previous workspace snapshot")
+ artifact_zip_file = task.artifacts[artifact_workspace_name].get_local_copy(extract_archive=False)
+ print("Restoring workspace snapshot")
+ try:
+ shutil.unpack_archive(artifact_zip_file, extract_dir=workspace_folder.as_posix())
+ except Exception as ex:
+ print("ERROR: restoring workspace snapshot failed: {}".format(ex))
+ return
+
+ # remove the workspace from the cache
+ try:
+ os.unlink(artifact_zip_file)
+ except Exception as ex:
+ print("WARNING: temp workspace zip could not be removed: {}".format(ex))
+
+ print("Successfully restored workspace checkpoint to {}".format(workspace_folder))
+ # set time stamp
+ # noinspection PyProtectedMember
+ task._set_runtime_properties(runtime_properties={sync_runtime_property: time()})
+
+
def main():
param = {
"user_base_directory": "~/",
@@ -1014,11 +1193,19 @@ def main():
"public_ip": False,
"ssh_ports": None,
"force_dropbear": False,
+ "store_workspace": None,
}
task = init_task(param, default_ssh_fingerprint)
run_user_init_script(task)
+ # restore workspace if exists
+ # notice: if "store_workspace" is not set we will not restore the workspace
+ try:
+ restore_workspace(task, param)
+ except Exception as ex:
+ print("ERROR: Failed restoring workspace: {}".format(ex))
+
hostname, hostnames = get_host_name(task, param)
env = setup_user_env(param, task)
@@ -1029,7 +1216,11 @@ def main():
start_jupyter_server(hostname, hostnames, param, task, env)
- print('We are done')
+ print('We are done - sync workspace if needed')
+
+ sync_workspace_snapshot(task, param)
+
+ print('Goodbye')
if __name__ == '__main__':
diff --git a/requirements.txt b/requirements.txt
index 071b61e..16ae438 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,3 @@
-clearml >= 1.1.5
+clearml >= 1.9
pexpect ; sys_platform != 'win32'
wexpect_venv ; sys_platform == 'win32'
-pillow>=10.0.1 # not directly required, pinned by Snyk to avoid a vulnerability