Add hyperdataset examples (#823)
* Add hyperdataset examples Co-authored-by: Erez Schnaider <erez@clear.ml>
@@ -0,0 +1,136 @@
"""
How to register data with masks (FrameGroups) from a sample dataset.

Create a FrameGroup for each image and its matching mask file, in the format required by a frame.

Notice: This is a custom parser for a specific dataset. Each dataset requires a different parser.

You can run this example from this dir; the --ext, --mask-ext, --ds_name and
--version_name arguments are optional and default to values matching the sample dataset.
"""

import glob
import json
import os
from argparse import ArgumentParser

from allegroai import DatasetVersion, FrameGroup, SingleFrame, Task
from clearml import StorageManager

def get_frames_with_masks(data_path, ext="png", mask_ext="_mask.png"):
    frame_groups = {}

    # Go over each image file with the given extension in the base path
    for file in glob.glob(os.path.join(data_path, "*.{}".format(ext))):
        full_path = os.path.abspath(file)

        # If this is a mask file skip it, we will manually add it later to the image it belongs to
        if full_path.endswith(mask_ext):
            continue

        # Let's check if we have a matching mask file
        full_path_mask = full_path.replace(f".{ext}", mask_ext)
        if not os.path.exists(full_path_mask):
            # We do not have a mask file, so let's skip this one
            continue

        # Now we can register the image together with its mask
        print("Getting files from: " + full_path)

        # Let's split the file name on '_' and use the first part as the frame group ID
        file_parts_key = os.path.split(full_path)[-1].split("_")

        # This is used just so we can easily collect (group) the frames together
        frame_group_id = file_parts_key[0]
        # Find the correct FrameGroup based on the filename
        if frame_group_id not in frame_groups:
            # A FrameGroup acts like a dict whose keys are strings and whose values are SingleFrames
            frame_group = FrameGroup()
            frame_groups[frame_group_id] = frame_group
        else:
            frame_group = frame_groups[frame_group_id]

        # Add the frame and the mask to the frame group. Each SingleFrame needs a name
        # (the key inside the FrameGroup), so we use the second part of the filename
        source_id = file_parts_key[1]
        frame_group[source_id] = SingleFrame(source=full_path, mask_source=full_path_mask)

    # Return a list of FrameGroups
    return list(frame_groups.values())
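
As a quick illustration of the file-naming convention the parser above assumes
(the file names below are hypothetical):

# data/sample_ds/0001_left.png        image, frame group "0001", source id "left.png"
# data/sample_ds/0001_left_mask.png   its mask (skipped by the loop, attached via mask_source)
# data/sample_ds/0001_right.png       image, same frame group "0001", source id "right.png"
# data/sample_ds/0001_right_mask.png  its mask
frames = get_frames_with_masks("data/sample_ds")
# -> a single FrameGroup ("0001") holding two SingleFrames, each with its mask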

def read_mask_class_values(local_dataset_path):
    json_file_path = os.path.join(local_dataset_path, "_mask_legend.json")

    with open(json_file_path, "r") as json_file:
        data = json.load(json_file)

    # Map each RGB pixel value (as a tuple) to its list of class labels
    label_mapping = {tuple(value): [key] for key, value in data.items()}

    return label_mapping
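
For reference, a minimal sketch of what _mask_legend.json might contain
(class names and RGB values are hypothetical) and the mapping built from it:

legend = {"background": [0, 0, 0], "person": [255, 0, 0]}  # hypothetical legend contents
label_mapping = {tuple(value): [key] for key, value in legend.items()}
assert label_mapping == {(0, 0, 0): ["background"], (255, 0, 0): ["person"]}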

def create_version_with_frames(new_frames, masks_lookup, ds_name, ver_name, local_dataset_path):
    # Get the dataset (it will create a new one if we don't have it)
    ds = DatasetVersion.create_new_dataset(dataset_name=ds_name)

    # Create a specific dataset version, or just use the latest version
    dv = ds.create_version(version_name=ver_name) if ver_name else \
        DatasetVersion.get_current(dataset_name=ds_name)

    dv.set_masks_labels(masks_lookup)

    # Add and upload frames to the created version
    dv.add_frames(
        new_frames,
        # Where to upload the files; here we use the task's default output destination,
        # but this can also be s3://bucket/folder etc.
        auto_upload_destination=Task.current_task().get_output_destination(),
        # The local root; this makes sure we keep the same file structure
        # in the upload destination as we have on the local machine
        local_dataset_root_path=local_dataset_path
    )
    dv.commit_version()
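
If the task's default output destination is not wanted, the comment above notes that
add_frames can upload to an explicit object store instead; a sketch, with a
hypothetical bucket name:

# dv.add_frames(
#     new_frames,
#     auto_upload_destination="s3://my-bucket/hyperdatasets/sample",  # hypothetical bucket
#     local_dataset_root_path=local_dataset_path,
# )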

if __name__ == '__main__':
    parser = ArgumentParser(description='Register allegro dataset with frame groups and masks')

    parser.add_argument(
        '--ext', type=str, help='Image files extension to upload from the dir. Default "png"',
        default="png")
    parser.add_argument(
        '--mask-ext', type=str, help='Mask files suffix. Default "_mask.png"',
        default="_mask.png")

    parser.add_argument(
        '--ds_name', type=str, help='Dataset name for the data',
        default="sample-dataset-masks")
    parser.add_argument(
        '--version_name', type=str, help='Version name for the data (default is current version)',
        default="initial")

    args = parser.parse_args()

    example_dataset_path = 's3://clearml-public/datasets/hyperdataset_example/ds_with_masks'
    local_img_path = StorageManager.download_folder(example_dataset_path)
    # This folder contains the images, masks and mask legend json for the data
    base_path = os.path.abspath('{}/datasets/hyperdataset_example/ds_with_masks'.format(local_img_path))
    dataset_name = args.ds_name
    version_name = args.version_name

    task = Task.init(
        project_name="uploading_datasets", task_name="upload_sample_dataset_with_masks",
        task_type=Task.TaskTypes.data_processing,
        # This will make sure we have a valid output destination for our local files to be uploaded to
        output_uri=True
    )

    frames = get_frames_with_masks(data_path=base_path, ext=args.ext, mask_ext=args.mask_ext)
    mask_class_lookup = read_mask_class_values(base_path)
    create_version_with_frames(frames, mask_class_lookup, dataset_name, version_name, base_path)

    print("We are done :)")
@@ -0,0 +1,148 @@
"""
How to register data with ROIs and metadata from a json file.

Create a list of ROIs for each image, in the metadata format required by a frame.

Notice: This is a custom parser for a specific dataset. Each dataset requires a different parser.

You can run this example from this dir with:

    python registration_with_roi_and_meta.py --ext jpg --ds_name my_uploaded_dataset --version_name my_version
"""

import glob
import json
import os
from argparse import ArgumentParser

from allegroai import DatasetVersion, SingleFrame, Task
from clearml import StorageManager

def get_json_file(filename):
    """
    Get the data from the json file

    :param filename: Full file path
    :type filename: str
    :return: JSON data parsed as a python dictionary
    """
    # Note: assumes .jpg images, matching the sample dataset and the default --ext
    json_file_path = filename.replace('.jpg', '.json')

    with open(json_file_path, "r") as json_file:
        data = json.load(json_file)

    return data

def get_frames_with_roi_meta(data_path, ext):
    """
    Create a ready-to-register list of SingleFrame(s)

    :param data_path: Path to the folder you'd like to register
    :type data_path: str
    :param ext: Files extension to upload from the dir
    :type ext: str
    :return: List[SingleFrame] containing all the SingleFrames that should be registered
    """
    frames_to_reg = []
    # Go over each image file with the given extension in the base path
    for file in glob.glob(os.path.join(data_path, "*.{}".format(ext))):
        full_path = os.path.abspath(file)
        print("Getting files from: " + full_path)
        # Read the json file next to the image
        data = get_json_file(full_path)

        # Create the SingleFrame object
        a_frame = SingleFrame(source=full_path)

        # Iterate over the rois in the json and add them as annotations
        for roi in data['rois']:
            a_frame.add_annotation(
                poly2d_xy=roi["poly"],
                labels=roi['labels'],
                metadata={'alive': roi['meta']['alive']},
                confidence=roi['confidence']
            )
        # Add generic metadata to the frame
        a_frame.width = data['size']['x']
        a_frame.height = data['size']['y']
        a_frame.metadata['dangerous'] = data['meta']['dangerous']

        # Add to our SingleFrame collection
        frames_to_reg.append(a_frame)
    return frames_to_reg
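
The fields read in the loop above imply a per-image json of roughly the following
shape; only the keys are taken from the code, all values are hypothetical:

example_roi_json = {
    "size": {"x": 640, "y": 480},
    "meta": {"dangerous": "yes"},
    "rois": [
        {
            "poly": [10, 10, 100, 10, 100, 100, 10, 100],
            "labels": ["cat"],
            "confidence": 1.0,
            "meta": {"alive": "yes"},
        }
    ],
}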

def create_version_with_frames(new_frames, ds_name, ver_name, local_dataset_path):
    """
    Create a DatasetVersion with new_frames as frames

    :param new_frames: List with all the frames to be registered
    :type new_frames: List[SingleFrame]
    :param ds_name: The dataset name
    :type ds_name: str
    :param ver_name: The version name
    :type ver_name: str
    :param local_dataset_path: Path to the folder you register
    :type local_dataset_path: str
    """
    # Get the dataset (it will create a new one if we don't have it)
    ds = DatasetVersion.create_new_dataset(dataset_name=ds_name)

    # Create a specific dataset version, or just use the latest version
    dv = ds.create_version(version_name=ver_name) if ver_name else \
        DatasetVersion.get_current(dataset_name=ds_name)

    # Add and upload frames to the created version
    dv.add_frames(
        new_frames,
        # Where to upload the files; here we use the task's default output destination,
        # but this can also be s3://bucket/folder etc.
        auto_upload_destination=Task.current_task().get_output_destination(),
        # The local root; this makes sure we keep the same file structure
        # in the upload destination as we have on the local machine
        local_dataset_root_path=local_dataset_path
    )
    dv.commit_version()

if __name__ == '__main__':
    parser = ArgumentParser(description='Register allegro dataset with rois and meta')

    parser.add_argument(
        '--ext', type=str, help='Files extension to upload from the dir. Default: "jpg"',
        default="jpg")
    parser.add_argument(
        '--ds_name', type=str, help='Dataset name for the data. Default: "sample-dataset"',
        default="sample-dataset")
    parser.add_argument(
        '--version_name', type=str, help='Version name for the data (default is current version)',
        default="initial")

    args = parser.parse_args()

    example_dataset_path = 's3://clearml-public/datasets/hyperdataset_example/ds_with_rois'
    local_img_path = StorageManager.download_folder(example_dataset_path)

    # This folder contains the images and json files for the data
    base_path = os.path.abspath('{}/datasets/hyperdataset_example/ds_with_rois'.format(local_img_path))
    dataset_name = args.ds_name
    version_name = args.version_name

    task = Task.init(
        project_name="uploading_datasets", task_name="upload_sample",
        task_type=Task.TaskTypes.data_processing,
        # This will make sure we have a valid output destination for our local files to be uploaded to.
        # Other storage types are also supported:
        # - A shared folder: ``/mnt/share/folder``
        # - S3: ``s3://bucket/folder``
        # - Google Cloud Storage: ``gs://bucket-name/folder``
        # - Azure Storage: ``azure://company.blob.core.windows.net/folder/``
        output_uri=True
    )

    frames = get_frames_with_roi_meta(base_path, args.ext)

    create_version_with_frames(frames, dataset_name, version_name, base_path)

    print("We are done :)")
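
Once registered, the frames can be queried back with an allegroai DataView;
a minimal sketch, assuming the dataset and version names used above:

from allegroai import DataView

dataview = DataView()
dataview.add_query(dataset_name="sample-dataset", version_name="initial")
print("registered frames:", len(dataview.to_list()))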