From 6df1cd956192c09ee147e7682d0819cb912d48c6 Mon Sep 17 00:00:00 2001
From: pollfly <75068813+pollfly@users.noreply.github.com>
Date: Sun, 22 Jan 2023 14:46:30 +0200
Subject: [PATCH] Add Dataset `alias` explanation (#449)

---
 docs/clearml_data/best_practices.md                   |  6 ++++++
 .../data_man_cifar_classification.md                  | 11 +++++++++--
 docs/guides/datasets/data_man_cifar_classification.md | 10 +++++++++-
 3 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/docs/clearml_data/best_practices.md b/docs/clearml_data/best_practices.md
index a4861d08..50b6134f 100644
--- a/docs/clearml_data/best_practices.md
+++ b/docs/clearml_data/best_practices.md
@@ -32,6 +32,12 @@ Organizing your datasets into projects by use-case makes it easier to access the
 If only a project is specified when using [`Dataset.get`](../references/sdk/dataset.md#datasetget), the method returns
 the most recent dataset in a project. The same is true with tags; if a tag is specified, the method will return the
 most recent dataset that is labeled with that tag.
+
+In cases where you use a dataset in a task (e.g. consuming a dataset), you can easily track which dataset the task is
+using with `Dataset.get`'s `alias` parameter. Pass `alias=<dataset_alias_string>`, and the task using the dataset will
+store the dataset's ID in the `dataset_alias_string` parameter under the task's
+**CONFIGURATION > HYPERPARAMETERS > Datasets** section.
+
 ## Document your Datasets
 
 Attach informative metrics or debug samples to the Dataset itself. Use the [`get_logger`](../references/sdk/dataset.md#get_logger)
diff --git a/docs/clearml_data/data_management_examples/data_man_cifar_classification.md b/docs/clearml_data/data_management_examples/data_man_cifar_classification.md
index 9ee4e2cf..5d6b46f6 100644
--- a/docs/clearml_data/data_management_examples/data_man_cifar_classification.md
+++ b/docs/clearml_data/data_management_examples/data_man_cifar_classification.md
@@ -85,7 +85,8 @@ from clearml import Dataset
 
 dataset_path = Dataset.get(
     dataset_name=dataset_name,
-    dataset_project=dataset_project
+    dataset_project=dataset_project,
+    alias="Cifar dataset"
 ).get_local_copy()
 
 trainset = datasets.CIFAR10(
@@ -95,7 +96,13 @@ trainset = datasets.CIFAR10(
     transform=transform
 )
 ```
+
+In cases like this, where you use a dataset in a task, you can store the dataset's ID in the task's
+hyperparameters. Passing `alias=<dataset_alias_string>` stores the dataset's ID in the
+`dataset_alias_string` parameter in the experiment's **CONFIGURATION > HYPERPARAMETERS > Datasets** section. This way
+you can easily track which dataset the task is using.
+
 The Dataset's [`get_local_copy`](../../references/sdk/dataset.md#get_local_copy) method will return a path to the cached,
-downloaded dataset. Then we provide the path to Pytorch's dataset object.
+downloaded dataset. Then we provide the path to PyTorch's dataset object.
 
 The script then trains a neural network to classify images using the dataset created above.
\ No newline at end of file
diff --git a/docs/guides/datasets/data_man_cifar_classification.md b/docs/guides/datasets/data_man_cifar_classification.md
index 1e0dea2c..aab4d4c1 100644
--- a/docs/guides/datasets/data_man_cifar_classification.md
+++ b/docs/guides/datasets/data_man_cifar_classification.md
@@ -85,7 +85,9 @@ from clearml import Dataset
 
 dataset_path = Dataset.get(
     dataset_name=dataset_name,
-    dataset_project=dataset_project).get_local_copy()
+    dataset_project=dataset_project,
+    alias="Cifar dataset"
+).get_local_copy()
 
 trainset = datasets.CIFAR10(
     root=dataset_path,
@@ -94,6 +96,12 @@ trainset = datasets.CIFAR10(
     transform=transform
 )
 ```
+
+In cases like this, where you use a dataset in a task, you can store the dataset's ID in the task's
+hyperparameters. Passing `alias=<dataset_alias_string>` stores the dataset's ID in the
+`dataset_alias_string` parameter in the experiment's **CONFIGURATION > HYPERPARAMETERS > Datasets** section. This way
+you can easily track which dataset the task is using.
+
 The Dataset's [`get_local_copy`](../../references/sdk/dataset.md#get_local_copy) method will return a path to the cached,
 downloaded dataset. Then we provide the path to Pytorch's dataset object.
 
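
For reference, a minimal sketch of the `alias` usage this patch documents, assuming illustrative project, task, and dataset names (`Image Example`, `cifar train`, `cifar_dataset`, and `CIFAR project` are placeholders, not taken from the patch):

```python
# Sketch only: the project, task, and dataset names below are illustrative placeholders.
from clearml import Dataset, Task

# The task that consumes the dataset.
task = Task.init(project_name="Image Example", task_name="cifar train")

# Passing `alias` makes the task record the dataset's ID under its
# CONFIGURATION > HYPERPARAMETERS > Datasets section.
dataset_path = Dataset.get(
    dataset_name="cifar_dataset",
    dataset_project="CIFAR project",
    alias="Cifar dataset"
).get_local_copy()

print(dataset_path)  # local path to the cached copy of the dataset
```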