From c253af58c0d8781a113242c8511b6dae7fc7d5fb Mon Sep 17 00:00:00 2001 From: pollfly <75068813+pollfly@users.noreply.github.com> Date: Sun, 28 Jan 2024 11:39:01 +0200 Subject: [PATCH] Add Hyper-Datasets intro (#766) --- docs/hyper_datasets.md | 46 ++++++++++++++++++++++++++++++++++++++++++ sidebars.js | 12 ++++++++++- 2 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 docs/hyper_datasets.md diff --git a/docs/hyper_datasets.md b/docs/hyper_datasets.md new file mode 100644 index 00000000..d2ce9b9c --- /dev/null +++ b/docs/hyper_datasets.md @@ -0,0 +1,46 @@ +--- +title: Hyper-Datasets +--- + +:::important Enterprise Feature +Hyper-Datasets are available under the ClearML Enterprise plan. +::: + +
+ +
+ +
+ +ClearML's **Hyper-Datasets** are an MLOps-oriented abstraction of your data, which facilitates traceable, reproducible model development +through parameterized data access and metadata version control. + +Hyper-Datasets is a data management system specifically tailored for handling unstructured data, like text, audio, or +visual data. You can create, manage, and version your datasets. Datasets can be set up to inherit from other datasets, so +data lineages can be created, and users can track when and how their data changes. In the ClearML Enterprise WebApp, +you can view a dataset's version history, as well as its contents, including annotations, metadata, masks, and other +information. + +![Frame viewer](img/hyperdatasets/web-app/dataset_example_frame_editor.png) + +The basic premise of Hyper-Datasets is that a user-formed query is a full representation of the dataset used by the ML/DL +process. Hyper-Datasets decouple metadata from raw data files, allowing you to manipulate metadata through sophisticated +queries and parameters that can be tracked through the experiment manager. You can clone experiments using different +data manipulations--or **DataViews**--without changing any of the hard-coded values, making these manipulations part of +the experiment. + +ClearML **Enterprise**'s Hyper-Datasets supports rapid prototyping, creating new opportunities such as: +* Hyperparameter optimization of the data itself +* QA/QC pipelining +* CD/CT (continuous training) during deployment +* Complex applications like collaborative (federated) learning + + +For more information, see [Hyper-Datasets](hyperdatasets/overview.md). 
+ diff --git a/sidebars.js b/sidebars.js index 66f31027..e2d2aa93 100644 --- a/sidebars.js +++ b/sidebars.js @@ -54,7 +54,17 @@ module.exports = { ] }, {'ClearML Data': ['clearml_data/clearml_data', 'clearml_data/clearml_data_cli', 'clearml_data/clearml_data_sdk', 'clearml_data/best_practices', - {'Workflows': ['clearml_data/data_management_examples/workflows', 'clearml_data/data_management_examples/data_man_simple', 'clearml_data/data_management_examples/data_man_folder_sync', 'clearml_data/data_management_examples/data_man_cifar_classification', 'clearml_data/data_management_examples/data_man_python']},]}, + {'Workflows': [ + 'clearml_data/data_management_examples/workflows', + 'clearml_data/data_management_examples/data_man_simple', + 'clearml_data/data_management_examples/data_man_folder_sync', + 'clearml_data/data_management_examples/data_man_cifar_classification', + 'clearml_data/data_management_examples/data_man_python' + ] + }, + ] + }, + 'hyper_datasets', 'model_registry', 'apps/clearml_session', {'ClearML Serving':['clearml_serving/clearml_serving', 'clearml_serving/clearml_serving_setup', 'clearml_serving/clearml_serving_cli', 'clearml_serving/clearml_serving_tutorial']},