Add hyperdataset examples (#823)

* Add hyperdataset examples

Co-authored-by: Erez Schnaider <erez@clear.ml>
erezalg 2022-11-20 16:28:42 +02:00 committed by GitHub
parent 2aba12cf52
commit f6b9efe54e
10 changed files with 480 additions and 2 deletions


@@ -6,8 +6,6 @@ assignees: ''
---
Thank you for helping us make ClearML better!
## Describe the bug
A clear and concise description of what the bug is.

BIN docs/screenshots/hpd.png (new file, 1.2 MiB)


@@ -0,0 +1,18 @@
# ClearML Hyper-Datasets #

Hyper-Datasets is a data management system designed for unstructured data such as text, audio, or visual data. It is part of the ClearML enterprise offering, which means it includes quite a few upgrades over the open-source clearml-data.

The main conceptual difference between the two is that Hyper-Datasets decouples the metadata from the raw data files. This lets you manipulate the metadata in all kinds of ways, while abstracting away the logistics of dealing with large amounts of data.
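
As a rough illustration of this decoupling, here is a minimal sketch using the frame-level API that the registration examples below rely on (the file path, label, and metadata field are made up for illustration):

```python
from allegroai import SingleFrame

# the frame only references the raw file; annotations and metadata live alongside it
frame = SingleFrame(source='/data/images/000001.jpg')

# attach an ROI annotation and frame-level metadata without touching the image itself
frame.add_annotation(poly2d_xy=[10, 10, 50, 10, 50, 50, 10, 50], labels=['aeroplane'])
frame.metadata['dangerous'] = 'no'
```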
To leverage the power of Hyper-Datasets, users define Dataviews: sophisticated queries that connect specific data from one or more datasets to an experiment in the Experiment Manager. Essentially, a Dataview creates and manages local views of remote Datasets.

![Dataview in the UI](../../docs/screenshots/hpd.png)
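
For example, a minimal Dataview query might look roughly like the following sketch, based on the dataview examples below (the dataset name, version, and ROI label are placeholders):

```python
from allegroai import DataView

# build a view over one dataset version, filtered to a single ROI label
dataview = DataView(iteration_order='random')
dataview.add_query(dataset_name='sample-dataset', version_name='Current', roi_query=["aeroplane"])

# iterate over the matching frames; raw files are fetched to a local cache on access
for frame in dataview:
    print(frame.get_local_source())
```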
## Examples Overview ##

- Hyper-Dataset registration into ClearML Enterprise
- Hyper-Dataset usage examples: retrieving frames using the Dataview class and connecting them to a PyTorch DataLoader

## Further Resources ##

- Learn about ClearML [HyperDatasets](https://clear.ml/docs/latest/docs/hyperdatasets/overview)
- Watch the video [Tutorial](https://www.youtube.com/watch?v=1VliYRexeLU)


@@ -0,0 +1,29 @@
from allegroai import Task, DataView

task = Task.init(project_name="examples", task_name="dataview example with masks")

# simple query
dataview = DataView(iteration_order='random')
dataview.set_iteration_parameters(random_seed=123)
dataview.add_query(dataset_name='sample-dataset-masks', version_name='Current')

# print the number of frames the query returns
print("count", dataview.get_count())

# generate a list of FrameGroups from the query
# Note that the metadata is cached locally, so the next time we call to_list() it will return faster.
list_frame_groups = dataview.to_list()

# A FrameGroup is a dictionary of SingleFrames - you can access each object with the key it was registered with ("000002")
print([frame_group["000002"].get_local_source() for frame_group in list_frame_groups])

print("now in iterator form")
# iterator version of the same code; notice that this time the metadata is not cached locally
for frame_group in dataview:
    for key in frame_group.keys():
        print(frame_group[key].get_local_source(), frame_group[key].get_local_mask_source())

print("done")


@@ -0,0 +1,38 @@
"""
How to access and go over data
The general flow:
- Create new dataview.
- Query your dataview.
- Two ways to go over the frames:
- dataview.get_iterator()
- dataview.to_list()
"""
from allegroai import Task, DataView
task = Task.init(project_name="examples", task_name="dataview example")
# simple query
dataview = DataView(iteration_order='random')
dataview.set_iteration_parameters(random_seed=123)
# We can query our dataset(s) with `add_query` function, for all the data use roi_query="*" or
# use only dataset and version.
# This is a general example, you can change the parameters of the `add_query` function
dataview.add_query(dataset_name='sample-dataset', version_name='Current', roi_query=["aeroplane"])
# print the number of frames the queries return
print("count", dataview.get_count())
# generate a list of FrameGroups from the query
# Note that the metadata is cached locally, it means the next time we call to_list() it will return faster.
list_single_frames = dataview.to_list()
print([f.get_local_source() for f in list_single_frames])
print("now in iterator form")
# iterator version of the same code, notice this time metadata is not locally cached
for f in dataview:
print(f.get_local_source())
print("done")


@@ -0,0 +1,51 @@
import numpy as np
import torch.utils.data
from allegroai import DataView, SingleFrame, Task
from PIL import Image
from torch.utils.data import DataLoader


class ExampleDataset(torch.utils.data.Dataset):
    def __init__(self, dv):
        # automatically adjust dataset to balance all queries
        self.frames = dv.to_list()

    def __getitem__(self, idx):
        frame = self.frames[idx]  # type: SingleFrame
        img_path = frame.get_local_source()
        img = Image.open(img_path).convert("RGB").resize((256, 256))
        return np.array(img)

    def __len__(self):
        return len(self.frames)


task = Task.init(project_name='examples', task_name='PyTorch Sample Dataset')

# Create DataView with example query
dataview = DataView()
dataview.add_query(dataset_name='sample-dataset', version_name='Current')

# if we want all files to be downloaded in the background, we can call prefetch
# dataview.prefetch_files()

# create PyTorch Dataset
dataset = ExampleDataset(dataview)

# do your thing here :)
print('Fake PyTorch stuff below:')
print('Dataset length', len(dataset))

torch.manual_seed(0)
data_loader = DataLoader(
    dataset,
    batch_size=2,
    num_workers=1,
    pin_memory=True,
    prefetch_factor=2,
)

for i, data in enumerate(data_loader):
    print('{}] {}'.format(i, data))

print('done')


@@ -0,0 +1,55 @@
import numpy as np
import torch.utils.data
from allegroai import DataView, FrameGroup, Task
from PIL import Image
from torch.utils.data import DataLoader


class ExampleDataset(torch.utils.data.Dataset):
    def __init__(self, dv):
        # automatically adjust dataset to balance all queries
        self.frames = dv.to_list()

    def __getitem__(self, idx):
        frame_group = self.frames[idx]  # type: FrameGroup
        img_path = frame_group["000002"].get_local_source()
        img = Image.open(img_path).convert("RGB").resize((256, 256))
        mask_path = frame_group["000002"].get_local_mask_source()
        mask = Image.open(mask_path).resize((256, 256))
        return np.array(img), np.array(mask)

    def __len__(self):
        return len(self.frames)


task = Task.init(project_name='examples', task_name='PyTorch Sample Dataset with Masks')

# Create DataView with example query
dataview = DataView()
dataview.add_query(dataset_name='sample-dataset-masks', version_name='Current')
# dataview.add_query(dataset_name='sample-dataset', version_name='Current', roi_query=["aeroplane"])

# if we want all files to be downloaded in the background, we can call prefetch
# dataview.prefetch_files()

# create PyTorch Dataset
dataset = ExampleDataset(dataview)

# do your thing here :)
print('Fake PyTorch stuff below:')
print('Dataset length', len(dataset))

torch.manual_seed(0)
data_loader = DataLoader(
    dataset,
    batch_size=2,
    num_workers=1,
    pin_memory=True,
    prefetch_factor=2,
)

for i, data in enumerate(data_loader):
    print('{}] {}'.format(i, data))

print('done')


@@ -0,0 +1,136 @@
"""
How to register data with ROIs and metadata from a json file.
Create a list of ROI's for each image in the metadata format required by a frame.
Notice: This is a custom parser for a specific dataset. Each dataset requires a different parser.
You can run this example from this dir with:
python registration_with_roi_and_meta.py
--path data/sample_ds --ext jpg --ds_name my_uploaded_dataset --version_name my_version
"""
import glob
import json
import os
from argparse import ArgumentParser

from allegroai import DatasetVersion, FrameGroup, SingleFrame, Task
from clearml import StorageManager


def get_frames_with_masks(data_path, ext="png", mask_ext="_mask.png"):
    frame_groups = {}
    # Go over each image file in the base path
    for file in glob.glob(os.path.join(data_path, "*.{}".format(ext))):
        full_path = os.path.abspath(file)

        # if this is a mask file skip it, we will manually add it later to the images it belongs to
        if full_path.endswith(mask_ext):
            continue

        # let's check if we have a mask file
        full_path_mask = full_path.replace(f".{ext}", mask_ext)
        if not os.path.exists(full_path_mask):
            # we do not have a mask file, so let's skip this one
            continue

        # now we need to add the actual frame
        print("Getting files from: " + full_path)

        # let's split the file name based on '_' and use the first part as ID
        file_parts_key = os.path.split(full_path)[-1].split("_")
        # this is used just so we can easily collect (group) the frames together
        frame_group_id = file_parts_key[0]

        # find the correct FrameGroup based on the filename
        if frame_group_id not in frame_groups:
            # a FrameGroup acts like a dict: the keys are strings and the values are SingleFrames
            frame_group = FrameGroup()
            frame_groups[frame_group_id] = frame_group
        else:
            frame_group = frame_groups[frame_group_id]

        # add the frame and the mask to the frame group;
        # we have to give it a name (inside the FrameGroup), so we use the second part of the filename
        source_id = file_parts_key[1]
        frame_group[source_id] = SingleFrame(source=full_path, mask_source=full_path_mask)

    # return a list of FrameGroups
    return list(frame_groups.values())


def read_mask_class_values(local_dataset_path):
    json_file_path = os.path.join(local_dataset_path, "_mask_legend.json")
    with open(json_file_path, "r") as json_file:
        data = json.load(json_file)
    # now we need to convert it to a mapping from pixel RGB value to class
    label_mapping = {tuple(value): [key] for key, value in data.items()}
    return label_mapping


def create_version_with_frames(new_frames, masks_lookup, ds_name, ver_name, local_dataset_path):
    # Get the dataset (it will create a new one if we don't have it)
    ds = DatasetVersion.create_new_dataset(dataset_name=ds_name)

    # create a specific dataset version, or just use the latest version
    dv = ds.create_version(version_name=ver_name) if ver_name else \
        DatasetVersion.get_current(dataset_name=ds_name)
    dv.set_masks_labels(masks_lookup)

    # Add and upload frames to the created version
    dv.add_frames(
        new_frames,
        # where to upload the files; we use the default destination here, you can also use s3://bucket/ etc.
        auto_upload_destination=Task.current_task().get_output_destination(),
        # The local root, this will make sure we keep the same
        # files structure in the upload destination as we have on the local machine
        local_dataset_root_path=local_dataset_path
    )
    dv.commit_version()


if __name__ == '__main__':
    parser = ArgumentParser(description='Register allegro dataset with frame groups and masks')
    parser.add_argument(
        '--ext', type=str, help='Image file extension to upload from the dir. Default: "png"',
        default="png")
    parser.add_argument(
        '--mask-ext', type=str, help='Mask file suffix. Default: "_mask.png"',
        default="_mask.png")
    parser.add_argument(
        '--ds_name', type=str, help='Dataset name for the data',
        default="sample-dataset-masks")
    parser.add_argument(
        '--version_name', type=str, help='Version name for the data (default is current version)',
        default="initial")
    args = parser.parse_args()

    example_dataset_path = 's3://clearml-public/datasets/hyperdataset_example/ds_with_masks'
    local_img_path = StorageManager.download_folder(example_dataset_path)
    # this folder contains the images, masks and json files for the data
    base_path = os.path.abspath('{}/datasets/hyperdataset_example/ds_with_masks'.format(local_img_path))

    dataset_name = args.ds_name
    version_name = args.version_name

    task = Task.init(
        project_name="uploading_datasets", task_name="upload_sample_dataset_with_masks",
        task_type=Task.TaskTypes.data_processing,
        # This will make sure we have a valid output destination for our local files to be uploaded to
        output_uri=True
    )

    frames = get_frames_with_masks(data_path=base_path, ext=args.ext, mask_ext=args.mask_ext)
    mask_class_lookup = read_mask_class_values(base_path)
    create_version_with_frames(frames, mask_class_lookup, dataset_name, version_name, base_path)
    print("We are done :)")


@@ -0,0 +1,148 @@
"""
How to register data with ROIs and metadata from a json file.
Create a list of ROI's for each image in the metadata format required by a frame.
Notice: This is a custom parser for a specific dataset. Each dataset requires a different parser.
You can run this example from this dir with:
python registration_with_roi_and_meta.py
--path data/sample_ds --ext jpg --ds_name my_uploaded_dataset --version_name my_version
"""
import glob
import json
import os
from argparse import ArgumentParser

from allegroai import DatasetVersion, SingleFrame, Task
from clearml import StorageManager


def get_json_file(filename):
    """
    Get the data from the json file

    :param filename: Full file path
    :type filename: str
    :return: json data parsed as a python dictionary
    """
    json_file_path = filename.replace('.jpg', '.json')
    with open(json_file_path, "r") as json_file:
        data = json.load(json_file)
    return data


def get_frames_with_roi_meta(data_path, ext):
    """
    Create a ready-to-register list of SingleFrame(s)

    :param data_path: Path to the folder you would like to register
    :type data_path: str
    :param ext: File extension to upload from the dir
    :type ext: str
    :return: List[SingleFrame] containing all the SingleFrames that should be registered
    """
    frames_to_reg = []
    # Go over each jpg file in the base path
    for file in glob.glob(os.path.join(data_path, "*.{}".format(ext))):
        full_path = os.path.abspath(file)
        print("Getting files from: " + full_path)

        # read the json file next to the image
        data = get_json_file(full_path)

        # Create the SingleFrame object
        a_frame = SingleFrame(source=full_path)

        # Iterate over the ROIs in the json and add them to the frame
        for roi in data['rois']:
            a_frame.add_annotation(
                poly2d_xy=roi["poly"],
                labels=roi['labels'],
                metadata={'alive': roi['meta']['alive']},
                confidence=roi['confidence']
            )

        # add generic meta-data to the frame
        a_frame.width = data['size']['x']
        a_frame.height = data['size']['y']
        a_frame.metadata['dangerous'] = data['meta']['dangerous']

        # add to our SingleFrame collection
        frames_to_reg.append(a_frame)
    return frames_to_reg


def create_version_with_frames(new_frames, ds_name, ver_name, local_dataset_path):
    """
    Create a DatasetVersion with new_frames as its frames

    :param new_frames: list with all the frames to be registered
    :type new_frames: List[SingleFrame]
    :param ds_name: The dataset name
    :type ds_name: str
    :param ver_name: The version name
    :type ver_name: str
    :param local_dataset_path: Path to the folder you register
    :type local_dataset_path: str
    """
    # Get the dataset (it will create a new one if we don't have it)
    ds = DatasetVersion.create_new_dataset(dataset_name=ds_name)

    # create a specific dataset version, or just use the latest version
    dv = ds.create_version(version_name=ver_name) if ver_name else \
        DatasetVersion.get_current(dataset_name=ds_name)

    # Add and upload frames to the created version
    dv.add_frames(
        new_frames,
        # where to upload the files; we use the default destination here, you can also use s3://bucket/ etc.
        auto_upload_destination=Task.current_task().get_output_destination(),
        # The local root, this will make sure we keep the same
        # files structure in the upload destination as we have on the local machine
        local_dataset_root_path=local_dataset_path
    )
    dv.commit_version()


if __name__ == '__main__':
    parser = ArgumentParser(description='Register allegro dataset with rois and meta')
    parser.add_argument(
        '--ext', type=str, help='Files extension to upload from the dir. Default: "jpg"',
        default="jpg")
    parser.add_argument(
        '--ds_name', type=str, help='Dataset name for the data. Default: "sample-dataset"',
        default="sample-dataset")
    parser.add_argument(
        '--version_name', type=str, help='Version name for the data (default is current version)',
        default="initial")
    args = parser.parse_args()

    example_dataset_path = 's3://clearml-public/datasets/hyperdataset_example/ds_with_rois'
    local_img_path = StorageManager.download_folder(example_dataset_path)
    # this folder contains the images and json files for the data
    base_path = os.path.abspath('{}/datasets/hyperdataset_example/ds_with_rois'.format(local_img_path))

    dataset_name = args.ds_name
    version_name = args.version_name

    task = Task.init(
        project_name="uploading_datasets", task_name="upload_sample",
        task_type=Task.TaskTypes.data_processing,
        # This will make sure we have a valid output destination for our local files to be uploaded to.
        # Other storage types are also supported:
        # - A shared folder: ``/mnt/share/folder``
        # - S3: ``s3://bucket/folder``
        # - Google Cloud Storage: ``gs://bucket-name/folder``
        # - Azure Storage: ``azure://company.blob.core.windows.net/folder/``
        output_uri=True
    )

    frames = get_frames_with_roi_meta(base_path, args.ext)
    create_version_with_frames(frames, dataset_name, version_name, base_path)
    print("We are done :)")


@@ -0,0 +1,5 @@
allegroai~=3.5.7
clearml
numpy
Pillow
torch