Add hyperdataset examples (#823)

* Add hyperdataset examples

Co-authored-by: Erez Schnaider <erez@clear.ml>
erezalg 2022-11-20 16:28:42 +02:00 committed by GitHub
parent 2aba12cf52
commit f6b9efe54e
10 changed files with 480 additions and 2 deletions


@@ -6,8 +6,6 @@ assignees: ''
---
Thank you for helping us make ClearML better!
## Describe the bug
A clear and concise description of what the bug is.

BIN docs/screenshots/hpd.png (new file, 1.2 MiB)


@@ -0,0 +1,18 @@
# ClearML Hyper-Datasets #

Hyper-Datasets is a data management system designed for unstructured data such as text, audio, or visual data. It is part of the ClearML enterprise offering, which means it includes quite a few upgrades over the open-source clearml-data.

The main conceptual difference between the two is that Hyper-Datasets decouples the metadata from the raw data files. This lets you manipulate the metadata in all kinds of ways, while abstracting away the logistics of dealing with large amounts of data.
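
As a rough illustration of this decoupling, here is a minimal sketch using the frame-level API that the registration examples below rely on (the file path, label, and metadata field are made up for illustration):

```python
from allegroai import SingleFrame

# the frame only references the raw file; annotations and metadata live alongside it
frame = SingleFrame(source='/data/images/000001.jpg')

# attach an ROI annotation and frame-level metadata without touching the image itself
frame.add_annotation(poly2d_xy=[10, 10, 50, 10, 50, 50, 10, 50], labels=['aeroplane'])
frame.metadata['dangerous'] = 'no'
```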
To leverage the power of Hyper-Datasets, users define Dataviews: sophisticated queries that connect specific data from one or more datasets to an experiment in the Experiment Manager. Essentially, a Dataview creates and manages local views of remote Datasets.

![Dataview in the UI](../../docs/screenshots/hpd.png)
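
For example, a minimal Dataview query might look roughly like the following sketch, based on the dataview examples below (the dataset name, version, and ROI label are placeholders):

```python
from allegroai import DataView

# build a view over one dataset version, filtered to a single ROI label
dataview = DataView(iteration_order='random')
dataview.add_query(dataset_name='sample-dataset', version_name='Current', roi_query=["aeroplane"])

# iterate over the matching frames; raw files are fetched to a local cache on access
for frame in dataview:
    print(frame.get_local_source())
```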
## Examples Overview ##

- Hyper-Dataset registration into ClearML Enterprise
- Hyper-Dataset usage examples: retrieving frames using the Dataview class and connecting them to a PyTorch DataLoader

## Further Resources ##

- Learn about ClearML [HyperDatasets](https://clear.ml/docs/latest/docs/hyperdatasets/overview)
- Watch the video [Tutorial](https://www.youtube.com/watch?v=1VliYRexeLU)


@@ -0,0 +1,29 @@
from allegroai import Task, DataView

task = Task.init(project_name="examples", task_name="dataview example with masks")

# simple query
dataview = DataView(iteration_order='random')
dataview.set_iteration_parameters(random_seed=123)
dataview.add_query(dataset_name='sample-dataset-masks', version_name='Current')

# print the number of frames the query returns
print("count", dataview.get_count())

# generate a list of FrameGroups from the query
# Note that the metadata is cached locally, so the next time we call to_list() it will return faster.
list_frame_groups = dataview.to_list()

# A FrameGroup is a dictionary of SingleFrames - you can access each object with the key it was registered with ("000002")
print([frame_group["000002"].get_local_source() for frame_group in list_frame_groups])

print("now in iterator form")
# iterator version of the same code; notice that this time the metadata is not cached locally
for frame_group in dataview:
    for key in frame_group.keys():
        print(frame_group[key].get_local_source(), frame_group[key].get_local_mask_source())

print("done")


@@ -0,0 +1,38 @@
"""
How to access and go over data
The general flow:
- Create new dataview.
- Query your dataview.
- Two ways to go over the frames:
- dataview.get_iterator()
- dataview.to_list()
"""
from allegroai import Task, DataView
task = Task.init(project_name="examples", task_name="dataview example")
# simple query
dataview = DataView(iteration_order='random')
dataview.set_iteration_parameters(random_seed=123)
# We can query our dataset(s) with `add_query` function, for all the data use roi_query="*" or
# use only dataset and version.
# This is a general example, you can change the parameters of the `add_query` function
dataview.add_query(dataset_name='sample-dataset', version_name='Current', roi_query=["aeroplane"])
# print the number of frames the queries return
print("count", dataview.get_count())
# generate a list of FrameGroups from the query
# Note that the metadata is cached locally, it means the next time we call to_list() it will return faster.
list_single_frames = dataview.to_list()
print([f.get_local_source() for f in list_single_frames])
print("now in iterator form")
# iterator version of the same code, notice this time metadata is not locally cached
for f in dataview:
print(f.get_local_source())
print("done")


@@ -0,0 +1,51 @@
import numpy as np
import torch.utils.data
from allegroai import DataView, SingleFrame, Task
from PIL import Image
from torch.utils.data import DataLoader


class ExampleDataset(torch.utils.data.Dataset):
    def __init__(self, dv):
        # automatically adjust dataset to balance all queries
        self.frames = dv.to_list()

    def __getitem__(self, idx):
        frame = self.frames[idx]  # type: SingleFrame
        img_path = frame.get_local_source()
        img = Image.open(img_path).convert("RGB").resize((256, 256))
        return np.array(img)

    def __len__(self):
        return len(self.frames)


task = Task.init(project_name='examples', task_name='PyTorch Sample Dataset')

# Create DataView with example query
dataview = DataView()
dataview.add_query(dataset_name='sample-dataset', version_name='Current')

# if we want all files to be downloaded in the background, we can call prefetch
# dataview.prefetch_files()

# create PyTorch Dataset
dataset = ExampleDataset(dataview)

# do your thing here :)
print('Fake PyTorch stuff below:')
print('Dataset length', len(dataset))

torch.manual_seed(0)
data_loader = DataLoader(
    dataset,
    batch_size=2,
    num_workers=1,
    pin_memory=True,
    prefetch_factor=2,
)

for i, data in enumerate(data_loader):
    print('{}] {}'.format(i, data))

print('done')


@@ -0,0 +1,55 @@
import numpy as np
import torch.utils.data
from allegroai import DataView, FrameGroup, Task
from PIL import Image
from torch.utils.data import DataLoader


class ExampleDataset(torch.utils.data.Dataset):
    def __init__(self, dv):
        # automatically adjust dataset to balance all queries
        self.frames = dv.to_list()

    def __getitem__(self, idx):
        frame_group = self.frames[idx]  # type: FrameGroup
        img_path = frame_group["000002"].get_local_source()
        img = Image.open(img_path).convert("RGB").resize((256, 256))
        mask_path = frame_group["000002"].get_local_mask_source()
        mask = Image.open(mask_path).resize((256, 256))
        return np.array(img), np.array(mask)

    def __len__(self):
        return len(self.frames)


task = Task.init(project_name='examples', task_name='PyTorch Sample Dataset with Masks')

# Create DataView with example query
dataview = DataView()
dataview.add_query(dataset_name='sample-dataset-masks', version_name='Current')
# dataview.add_query(dataset_name='sample-dataset', version_name='Current', roi_query=["aeroplane"])

# if we want all files to be downloaded in the background, we can call prefetch
# dataview.prefetch_files()

# create PyTorch Dataset
dataset = ExampleDataset(dataview)

# do your thing here :)
print('Fake PyTorch stuff below:')
print('Dataset length', len(dataset))

torch.manual_seed(0)
data_loader = DataLoader(
    dataset,
    batch_size=2,
    num_workers=1,
    pin_memory=True,
    prefetch_factor=2,
)

for i, data in enumerate(data_loader):
    print('{}] {}'.format(i, data))

print('done')


@@ -0,0 +1,136 @@
"""
How to register data with ROIs and metadata from a json file.
Create a list of ROI's for each image in the metadata format required by a frame.
Notice: This is a custom parser for a specific dataset. Each dataset requires a different parser.
You can run this example from this dir with:
python registration_with_roi_and_meta.py
--path data/sample_ds --ext jpg --ds_name my_uploaded_dataset --version_name my_version
"""
import glob
import json
import os
from argparse import ArgumentParser

from allegroai import DatasetVersion, FrameGroup, SingleFrame, Task
from clearml import StorageManager


def get_frames_with_masks(data_path, ext="png", mask_ext="_mask.png"):
    frame_groups = {}
    # Go over each image file in the base path
    for file in glob.glob(os.path.join(data_path, "*.{}".format(ext))):
        full_path = os.path.abspath(file)

        # if this is a mask file skip it, we will manually add it later to the images it belongs to
        if full_path.endswith(mask_ext):
            continue

        # let's check if we have a mask file
        full_path_mask = full_path.replace(f".{ext}", mask_ext)
        if not os.path.exists(full_path_mask):
            # we do not have a mask file, so let's skip this one
            continue

        # now we need to add the actual frame
        print("Getting files from: " + full_path)

        # let's split the file name based on '_' and use the first part as ID
        file_parts_key = os.path.split(full_path)[-1].split("_")
        # this is used just so we can easily collect (group) the frames together
        frame_group_id = file_parts_key[0]

        # find the correct FrameGroup based on the filename
        if frame_group_id not in frame_groups:
            # a FrameGroup acts like a dict: the keys are strings and the values are SingleFrames
            frame_group = FrameGroup()
            frame_groups[frame_group_id] = frame_group
        else:
            frame_group = frame_groups[frame_group_id]

        # add the frame and the mask to the frame group;
        # we have to give it a name (inside the FrameGroup), so we use the second part of the filename
        source_id = file_parts_key[1]
        frame_group[source_id] = SingleFrame(source=full_path, mask_source=full_path_mask)

    # return a list of FrameGroups
    return list(frame_groups.values())


def read_mask_class_values(local_dataset_path):
    json_file_path = os.path.join(local_dataset_path, "_mask_legend.json")
    with open(json_file_path, "r") as json_file:
        data = json.load(json_file)
    # now we need to convert it to a mapping from pixel RGB value to class
    label_mapping = {tuple(value): [key] for key, value in data.items()}
    return label_mapping


def create_version_with_frames(new_frames, masks_lookup, ds_name, ver_name, local_dataset_path):
    # Get the dataset (it will create a new one if we don't have it)
    ds = DatasetVersion.create_new_dataset(dataset_name=ds_name)

    # create a specific dataset version, or just use the latest version
    dv = ds.create_version(version_name=ver_name) if ver_name else \
        DatasetVersion.get_current(dataset_name=ds_name)
    dv.set_masks_labels(masks_lookup)

    # Add and upload frames to the created version
    dv.add_frames(
        new_frames,
        # where to upload the files; we use the default destination here, you can also use s3://bucket/ etc.
        auto_upload_destination=Task.current_task().get_output_destination(),
        # The local root, this will make sure we keep the same
        # files structure in the upload destination as we have on the local machine
        local_dataset_root_path=local_dataset_path
    )
    dv.commit_version()


if __name__ == '__main__':
    parser = ArgumentParser(description='Register allegro dataset with frame groups and masks')
    parser.add_argument(
        '--ext', type=str, help='Image file extension to upload from the dir. Default: "png"',
        default="png")
    parser.add_argument(
        '--mask-ext', type=str, help='Mask file suffix. Default: "_mask.png"',
        default="_mask.png")
    parser.add_argument(
        '--ds_name', type=str, help='Dataset name for the data',
        default="sample-dataset-masks")
    parser.add_argument(
        '--version_name', type=str, help='Version name for the data (default is current version)',
        default="initial")
    args = parser.parse_args()

    example_dataset_path = 's3://clearml-public/datasets/hyperdataset_example/ds_with_masks'
    local_img_path = StorageManager.download_folder(example_dataset_path)
    # this folder contains the images, masks and json files for the data
    base_path = os.path.abspath('{}/datasets/hyperdataset_example/ds_with_masks'.format(local_img_path))

    dataset_name = args.ds_name
    version_name = args.version_name

    task = Task.init(
        project_name="uploading_datasets", task_name="upload_sample_dataset_with_masks",
        task_type=Task.TaskTypes.data_processing,
        # This will make sure we have a valid output destination for our local files to be uploaded to
        output_uri=True
    )

    frames = get_frames_with_masks(data_path=base_path, ext=args.ext, mask_ext=args.mask_ext)
    mask_class_lookup = read_mask_class_values(base_path)
    create_version_with_frames(frames, mask_class_lookup, dataset_name, version_name, base_path)
    print("We are done :)")


@@ -0,0 +1,148 @@
"""
How to register data with ROIs and metadata from a json file.
Create a list of ROI's for each image in the metadata format required by a frame.
Notice: This is a custom parser for a specific dataset. Each dataset requires a different parser.
You can run this example from this dir with:
python registration_with_roi_and_meta.py
--path data/sample_ds --ext jpg --ds_name my_uploaded_dataset --version_name my_version
"""
import glob
import json
import os
from argparse import ArgumentParser

from allegroai import DatasetVersion, SingleFrame, Task
from clearml import StorageManager


def get_json_file(filename):
    """
    Get the data from the json file

    :param filename: Full file path
    :type filename: str
    :return: json data parsed as a python dictionary
    """
    json_file_path = filename.replace('.jpg', '.json')
    with open(json_file_path, "r") as json_file:
        data = json.load(json_file)
    return data


def get_frames_with_roi_meta(data_path, ext):
    """
    Create a ready-to-register list of SingleFrame(s)

    :param data_path: Path to the folder you would like to register
    :type data_path: str
    :param ext: File extension to upload from the dir
    :type ext: str
    :return: List[SingleFrame] containing all the SingleFrames that should be registered
    """
    frames_to_reg = []
    # Go over each jpg file in the base path
    for file in glob.glob(os.path.join(data_path, "*.{}".format(ext))):
        full_path = os.path.abspath(file)
        print("Getting files from: " + full_path)

        # read the json file next to the image
        data = get_json_file(full_path)

        # Create the SingleFrame object
        a_frame = SingleFrame(source=full_path)

        # Iterate over the ROIs in the json and add them to the frame
        for roi in data['rois']:
            a_frame.add_annotation(
                poly2d_xy=roi["poly"],
                labels=roi['labels'],
                metadata={'alive': roi['meta']['alive']},
                confidence=roi['confidence']
            )

        # add generic meta-data to the frame
        a_frame.width = data['size']['x']
        a_frame.height = data['size']['y']
        a_frame.metadata['dangerous'] = data['meta']['dangerous']

        # add to our SingleFrame collection
        frames_to_reg.append(a_frame)
    return frames_to_reg


def create_version_with_frames(new_frames, ds_name, ver_name, local_dataset_path):
    """
    Create a DatasetVersion with new_frames as its frames

    :param new_frames: list with all the frames to be registered
    :type new_frames: List[SingleFrame]
    :param ds_name: The dataset name
    :type ds_name: str
    :param ver_name: The version name
    :type ver_name: str
    :param local_dataset_path: Path to the folder you register
    :type local_dataset_path: str
    """
    # Get the dataset (it will create a new one if we don't have it)
    ds = DatasetVersion.create_new_dataset(dataset_name=ds_name)

    # create a specific dataset version, or just use the latest version
    dv = ds.create_version(version_name=ver_name) if ver_name else \
        DatasetVersion.get_current(dataset_name=ds_name)

    # Add and upload frames to the created version
    dv.add_frames(
        new_frames,
        # where to upload the files; we use the default destination here, you can also use s3://bucket/ etc.
        auto_upload_destination=Task.current_task().get_output_destination(),
        # The local root, this will make sure we keep the same
        # files structure in the upload destination as we have on the local machine
        local_dataset_root_path=local_dataset_path
    )
    dv.commit_version()


if __name__ == '__main__':
    parser = ArgumentParser(description='Register allegro dataset with rois and meta')
    parser.add_argument(
        '--ext', type=str, help='Files extension to upload from the dir. Default: "jpg"',
        default="jpg")
    parser.add_argument(
        '--ds_name', type=str, help='Dataset name for the data. Default: "sample-dataset"',
        default="sample-dataset")
    parser.add_argument(
        '--version_name', type=str, help='Version name for the data (default is current version)',
        default="initial")
    args = parser.parse_args()

    example_dataset_path = 's3://clearml-public/datasets/hyperdataset_example/ds_with_rois'
    local_img_path = StorageManager.download_folder(example_dataset_path)
    # this folder contains the images and json files for the data
    base_path = os.path.abspath('{}/datasets/hyperdataset_example/ds_with_rois'.format(local_img_path))

    dataset_name = args.ds_name
    version_name = args.version_name

    task = Task.init(
        project_name="uploading_datasets", task_name="upload_sample",
        task_type=Task.TaskTypes.data_processing,
        # This will make sure we have a valid output destination for our local files to be uploaded to.
        # Other storage types are also supported:
        # - A shared folder: ``/mnt/share/folder``
        # - S3: ``s3://bucket/folder``
        # - Google Cloud Storage: ``gs://bucket-name/folder``
        # - Azure Storage: ``azure://company.blob.core.windows.net/folder/``
        output_uri=True
    )

    frames = get_frames_with_roi_meta(base_path, args.ext)
    create_version_with_frames(frames, dataset_name, version_name, base_path)
    print("We are done :)")


@@ -0,0 +1,5 @@
allegroai~=3.5.7
clearml
numpy
Pillow
torch