From 6df1cd956192c09ee147e7682d0819cb912d48c6 Mon Sep 17 00:00:00 2001
From: pollfly <75068813+pollfly@users.noreply.github.com>
Date: Sun, 22 Jan 2023 14:46:30 +0200
Subject: [PATCH] Add Dataset `alias` explanation (#449)

---
 docs/clearml_data/best_practices.md                   |  6 ++++++
 .../data_man_cifar_classification.md                  | 11 +++++++++--
 docs/guides/datasets/data_man_cifar_classification.md | 10 +++++++++-
 3 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/docs/clearml_data/best_practices.md b/docs/clearml_data/best_practices.md
index a4861d08..50b6134f 100644
--- a/docs/clearml_data/best_practices.md
+++ b/docs/clearml_data/best_practices.md
@@ -32,6 +32,12 @@ Organizing your datasets into projects by use-case makes it easier to access the
 If only a project is specified when using [`Dataset.get`](../references/sdk/dataset.md#datasetget), the method returns
 the most recent dataset in a project. The same is true with tags; if a tag is specified, the method will return the
 most recent dataset that is labeled with that tag.
+
+In cases where you use a dataset in a task (e.g. consuming a dataset), you can easily track which dataset the task is
+using with `Dataset.get`'s `alias` parameter. Pass `alias=<dataset_alias_string>`, and the task using the dataset will
+store the dataset's ID in the `dataset_alias_string` parameter under the task's
+**CONFIGURATION > HYPERPARAMETERS > Datasets** section.
+
 ## Document your Datasets
 
 Attach informative metrics or debug samples to the Dataset itself. Use the [`get_logger`](../references/sdk/dataset.md#get_logger)
diff --git a/docs/clearml_data/data_management_examples/data_man_cifar_classification.md b/docs/clearml_data/data_management_examples/data_man_cifar_classification.md
index 9ee4e2cf..5d6b46f6 100644
--- a/docs/clearml_data/data_management_examples/data_man_cifar_classification.md
+++ b/docs/clearml_data/data_management_examples/data_man_cifar_classification.md
@@ -85,7 +85,8 @@ from clearml import Dataset
 
 dataset_path = Dataset.get(
     dataset_name=dataset_name,
-    dataset_project=dataset_project
+    dataset_project=dataset_project,
+    alias="Cifar dataset"
 ).get_local_copy()
 
 trainset = datasets.CIFAR10(
@@ -95,7 +96,13 @@ trainset = datasets.CIFAR10(
     transform=transform
 )
 ```
+
+In cases like this, where you use a dataset in a task, you can store the dataset's ID in the task's
+hyperparameters. Passing `alias=<dataset_alias_string>` stores the dataset's ID in the
+`dataset_alias_string` parameter in the experiment's **CONFIGURATION > HYPERPARAMETERS > Datasets** section. This way
+you can easily track which dataset the task is using.
+
 The Dataset's [`get_local_copy`](../../references/sdk/dataset.md#get_local_copy) method will return a path to the cached,
-downloaded dataset. Then we provide the path to Pytorch's dataset object.
+downloaded dataset. Then we provide the path to PyTorch's dataset object.
 
 The script then trains a neural network to classify images using the dataset created above.
\ No newline at end of file
diff --git a/docs/guides/datasets/data_man_cifar_classification.md b/docs/guides/datasets/data_man_cifar_classification.md
index 1e0dea2c..aab4d4c1 100644
--- a/docs/guides/datasets/data_man_cifar_classification.md
+++ b/docs/guides/datasets/data_man_cifar_classification.md
@@ -85,7 +85,9 @@ from clearml import Dataset
 
 dataset_path = Dataset.get(
     dataset_name=dataset_name,
-    dataset_project=dataset_project).get_local_copy()
+    dataset_project=dataset_project,
+    alias="Cifar dataset"
+).get_local_copy()
 
 trainset = datasets.CIFAR10(
     root=dataset_path,
@@ -94,6 +96,12 @@ trainset = datasets.CIFAR10(
     transform=transform
 )
 ```
+
+In cases like this, where you use a dataset in a task, you can store the dataset's ID in the task's
+hyperparameters. Passing `alias=<dataset_alias_string>` stores the dataset's ID in the
+`dataset_alias_string` parameter in the experiment's **CONFIGURATION > HYPERPARAMETERS > Datasets** section. This way
+you can easily track which dataset the task is using.
+
 The Dataset's [`get_local_copy`](../../references/sdk/dataset.md#get_local_copy) method will return a path to the cached,
 downloaded dataset. Then we provide the path to Pytorch's dataset object.
 
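
For reference, a minimal sketch of the `alias` usage this patch documents, assuming illustrative project, task, and dataset names (`Image Example`, `cifar train`, `cifar_dataset`, and `CIFAR project` are placeholders, not taken from the patch):

```python
# Sketch only: the project, task, and dataset names below are illustrative placeholders.
from clearml import Dataset, Task

# The task that consumes the dataset.
task = Task.init(project_name="Image Example", task_name="cifar train")

# Passing `alias` makes the task record the dataset's ID under its
# CONFIGURATION > HYPERPARAMETERS > Datasets section.
dataset_path = Dataset.get(
    dataset_name="cifar_dataset",
    dataset_project="CIFAR project",
    alias="Cifar dataset"
).get_local_copy()

print(dataset_path)  # local path to the cached copy of the dataset
```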