From 88b53fa5c936dc93fba9e7803c850846a4c4f3f2 Mon Sep 17 00:00:00 2001
From: pollfly <75068813+pollfly@users.noreply.github.com>
Date: Tue, 10 May 2022 09:49:37 +0300
Subject: [PATCH] Add link support to clearml-data (#246)

---
 docs/clearml_data/clearml_data_cli.md | 15 +++++++-----
 docs/clearml_data/clearml_data_sdk.md | 35 ++++++++++++++++++++++++---
 2 files changed, 41 insertions(+), 9 deletions(-)
diff --git a/docs/clearml_data/clearml_data_cli.md b/docs/clearml_data/clearml_data_cli.md
index a7c63d0f..cdeedfc7 100644
--- a/docs/clearml_data/clearml_data_cli.md
+++ b/docs/clearml_data/clearml_data_cli.md
@@ -16,7 +16,7 @@ The following page provides a reference to `clearml-data`'s CLI commands.
 Creates a new dataset. 
 
 ```bash
-clearml-data create --project <project_name> --name <dataset_name> --parents <existing_dataset_id>
+clearml-data create [-h] [--parents [PARENTS [PARENTS ...]]] [--project PROJECT] --name NAME [--tags [TAGS [TAGS ...]]]
 ```
 
 **Parameters**
@@ -48,7 +48,9 @@ do not require the `--id` flag.
 Add individual files or complete folders to the dataset.
 
 ```bash
-clearml-data add --id <dataset_id> --files <filenames/folders_to_add>
+clearml-data add [-h] [--id ID] [--dataset-folder DATASET_FOLDER]
+                 [--files [FILES [FILES ...]]] [--links [LINKS [LINKS ...]]] 
+                 [--non-recursive] [--verbose]
 ```
 
 **Parameters**
@@ -58,7 +60,8 @@ clearml-data add --id <dataset_id> --files <filenames/folders_to_add>
 |Name|Description|Optional|
 |---|---|---|
 |`--id` | Dataset's ID. Default: previously created / accessed dataset| <img src="/docs/latest/icons/ico-optional-yes.svg" alt="Yes" className="icon size-md center-md" /> |
-|`--files`|Files / folders to add. Wildcard selection is supported, for example: `~/data/*.jpg ~/data/json` | <img src="/docs/latest/icons/ico-optional-no.svg" alt="No" className="icon size-md center-md" /> |
+|`--files`| Files / folders to add. Wildcard selection is supported, for example: `~/data/*.jpg ~/data/json`. Items will be uploaded to the dataset's designated storage. | <img src="/docs/latest/icons/ico-optional-yes.svg" alt="Yes" className="icon size-md center-md" /> |
+|`--links`| Links to files / folders to add. Supports S3, GS, and Azure links. Example: `s3://bucket/data` `azure://bucket/folder`. Items remain in their original location. | <img src="/docs/latest/icons/ico-optional-yes.svg" alt="Yes" className="icon size-md center-md" /> |
 |`--dataset-folder` | Dataset base folder to add the files to in the dataset. Default: dataset root| <img src="/docs/latest/icons/ico-optional-yes.svg" alt="Yes" className="icon size-md center-md" /> |
 |`--non-recursive` | Disable recursive scan of files | <img src="/docs/latest/icons/ico-optional-yes.svg" alt="Yes" className="icon size-md center-md" /> |
 |`--verbose` | Verbose reporting | <img src="/docs/latest/icons/ico-optional-yes.svg" alt="Yes" className="icon size-md center-md" />|
@@ -69,10 +72,10 @@ clearml-data add --id <dataset_id> --files <filenames/folders_to_add>
 
 ## remove
 
-Remove files from the dataset.
+Remove files/links from the dataset.
 
 ```bash
-clearml-data remove --id <dataset_id_to_remove_from> --files <filenames/folders_to_remove>
+clearml-data remove [-h] [--id ID] [--files [FILES [FILES ...]]] [--non-recursive] [--verbose]
 ```
 
 **Parameters**
@@ -82,7 +85,7 @@ clearml-data remove --id <dataset_id_to_remove_from> --files <filenames/folders_
 |Name|Description|Optional|
 |---|---|---|
 |`--id` | Dataset's ID. Default: previously created / accessed dataset| <img src="/docs/latest/icons/ico-optional-yes.svg" alt="Yes" className="icon size-md center-md" /> |
-|`--files` |  Files / folders to remove (wildcard selection is supported, for example: `~/data/*.jpg ~/data/json`). Notice: file path is the path within the dataset, not the local path.| <img src="/docs/latest/icons/ico-optional-no.svg" alt="No" className="icon size-md center-md" /> |
+|`--files` |  Files / folders to remove (wildcard selection is supported, for example: `~/data/*.jpg ~/data/json`). Notice: file path is the path within the dataset, not the local path. For links, you can specify their URL (e.g. `s3://bucket/data`). | <img src="/docs/latest/icons/ico-optional-no.svg" alt="No" className="icon size-md center-md" /> |
 |`--non-recursive` | Disable recursive scan of files | <img src="/docs/latest/icons/ico-optional-yes.svg" alt="Yes" className="icon size-md center-md" /> |
 |`--verbose` | Verbose reporting | <img src="/docs/latest/icons/ico-optional-yes.svg" alt="Yes" className="icon size-md center-md" />|
 
diff --git a/docs/clearml_data/clearml_data_sdk.md b/docs/clearml_data/clearml_data_sdk.md
index 3dac8093..57be73bd 100644
--- a/docs/clearml_data/clearml_data_sdk.md
+++ b/docs/clearml_data/clearml_data_sdk.md
@@ -95,8 +95,10 @@ add updated files or remove unnecessary files.
 
 ### add_files()
 
-To add files or folders into the current dataset, use the [`Dataset.add_files`](../references/sdk/dataset.md#add_files) 
-method. If a file is already in a dataset, but it has been modified, it can be added again, and ClearML will 
+To add local files or folders into the current dataset, use the [`Dataset.add_files`](../references/sdk/dataset.md#add_files) 
+method. 
+
+If a file is already in a dataset, but it has been modified, it can be added again, and ClearML will 
 upload the file diff.
 
 ```python
@@ -117,9 +119,36 @@ dataset.add_files(
 )
 ```
  
+### add_external_files()
+
+To add files or folders to the current dataset while leaving them in their original location, use the [`Dataset.add_external_files`](../references/sdk/dataset.md#add_external_files) 
+method. Specify the source in the `source_url` argument, which can be a link to cloud storage (`s3://`, `gs://`, `azure://`) 
+or to local / network storage (`file://`). 
+
+```python
+dataset = Dataset.create()
+dataset.add_external_files(
+  source_url="s3://my/bucket/path_to_folder_or_file", 
+  dataset_path="/my_dataset/new_folder/"
+) 
+```
+
+To add only the files that match a wildcard pattern, pass a single wildcard string or a list of wildcard strings in the 
+`wildcard` parameter. Use the `recursive` parameter to specify whether matching files are searched for recursively.
+
+```python
+# Add all jpg files located under the S3 path "s3://my/bucket/" to the dataset:
+dataset.add_external_files(
+  source_url="s3://my/bucket/", 
+  wildcard = "*.jpg",
+  dataset_path="/my_dataset/new_folder/"
+)
+```
+
 ### remove_files()
 To remove files from a current dataset, use the [`Dataset.remove_files`](../references/sdk/dataset.md#remove_files) method.
-Input the path to the folder or file to be removed in the `dataset_path` parameter. The path is relative to the dataset. 
+Input the path to the folder or file to be removed in the `dataset_path` parameter. The path is relative to the dataset.
+To remove links, specify their URL (e.g. `s3://bucket/file`).
 
 There is also an option to input a wildcard into `dataset_path` in order to remove a set of files matching the wildcard. 
 Set the `recursive` parameter to `True` in order to match all wildcard files recursively