Add hyperdataset examples (#823)

* Add hyperdataset examples Co-authored-by: Erez Schnaider <erez@clear.ml>
2025-06-26 18:16:07 +00:00 · 2022-11-20 16:28:42 +02:00
parent 2aba12cf52
commit f6b9efe54e
10 changed files with 480 additions and 2 deletions
--- a/examples/hyperdatasets/data-ingestion/dataview_example_framegroup.py
+++ b/examples/hyperdatasets/data-ingestion/dataview_example_framegroup.py
@@ -0,0 +1,29 @@
+from allegroai import Task, DataView
+
+
+task = Task.init(project_name="examples", task_name="dataview example with masks")
+
+# simple query: iterate in random order, using a fixed random seed
+dataview = DataView(iteration_order='random')
+dataview.set_iteration_parameters(random_seed=123)
+
+dataview.add_query(dataset_name='sample-dataset-masks', version_name='Current')
+
+# print the number of frames the queries return
+print("count", dataview.get_count())
+
+# generate a list of FrameGroups from the query
+# Note that the metadata is cached locally, so the next call to to_list() will return faster.
+list_frame_groups = dataview.to_list()
+
+# A FrameGroup is a dictionary of SingleFrames - you can access each object with the key it was registered with ("000002")
+print([frame_group["000002"].get_local_source() for frame_group in list_frame_groups])
+
+print("now in iterator form")
+
+# iterator version over every key of each FrameGroup; note that this time the metadata is not cached locally
+for frame_group in dataview:
+    for key in frame_group.keys():
+        print(frame_group[key].get_local_source(), frame_group[key].get_local_mask_source())
+
+print("done")
--- a/examples/hyperdatasets/data-ingestion/dataview_example_singleframe.py
+++ b/examples/hyperdatasets/data-ingestion/dataview_example_singleframe.py
@@ -0,0 +1,38 @@
+"""
+How to access and go over data
+The general flow:
+ - Create new dataview.
+ - Query your dataview.
+ - Two ways to go over the frames:
+   - dataview.get_iterator()
+   - dataview.to_list()
+"""
+from allegroai import Task, DataView
+
+
+task = Task.init(project_name="examples", task_name="dataview example")
+
+# simple query: iterate in random order, using a fixed random seed
+dataview = DataView(iteration_order='random')
+dataview.set_iteration_parameters(random_seed=123)
+
+# We can query our dataset(s) with the `add_query` function; for all the data use roi_query="*" or
+# pass only the dataset and version.
+# This is a general example, you can change the parameters of the `add_query` function
+dataview.add_query(dataset_name='sample-dataset', version_name='Current', roi_query=["aeroplane"])
+
+# print the number of frames the queries return
+print("count", dataview.get_count())
+
+# generate a list of SingleFrames from the query
+# Note that the metadata is cached locally, so the next call to to_list() will return faster.
+list_single_frames = dataview.to_list()
+print([f.get_local_source() for f in list_single_frames])
+
+print("now in iterator form")
+
+# iterator version of the same code; note that this time the metadata is not cached locally
+for f in dataview:
+    print(f.get_local_source())
+
+print("done")
--- a/examples/hyperdatasets/data-ingestion/pytorch_dataset_example.py
+++ b/examples/hyperdatasets/data-ingestion/pytorch_dataset_example.py
@@ -0,0 +1,51 @@
+import numpy as np
+import torch.utils.data
+from allegroai import DataView, SingleFrame, Task
+from PIL import Image
+from torch.utils.data import DataLoader
+
+
+class ExampleDataset(torch.utils.data.Dataset):
+    """Map-style dataset that yields one 256x256 RGB array per frame of the given DataView."""
+
+    def __init__(self, dv):
+        # automatically adjust dataset to balance all queries
+        self.frames = dv.to_list()
+
+    def __getitem__(self, idx):
+        frame = self.frames[idx]  # type: SingleFrame
+        with Image.open(frame.get_local_source()) as src:  # close the file once decoded
+            return np.array(src.convert("RGB").resize((256, 256)))
+
+    def __len__(self):
+        return len(self.frames)
+
+
+task = Task.init(project_name='examples', task_name='PyTorch Sample Dataset')
+
+# Create DataView with example query
+dataview = DataView()
+dataview.add_query(dataset_name='sample-dataset', version_name='Current')
+
+# if we want all files to be downloaded in the background, we can call prefetch
+# dataview.prefetch_files()
+
+# create the PyTorch Dataset (ExampleDataset, defined above) from the DataView
+dataset = ExampleDataset(dataview)
+
+# do your thing here :)
+print('Fake PyTorch stuff below:')
+print('Dataset length', len(dataset))
+
+torch.manual_seed(0)
+data_loader = DataLoader(  # batches of 2 items, loaded with num_workers=1
+    dataset,
+    batch_size=2,
+    num_workers=1,
+    pin_memory=True,
+    prefetch_factor=2,
+)
+for i, data in enumerate(data_loader):  # print every batch together with its index
+    print('{}] {}'.format(i, data))
+
+print('done')
--- a/examples/hyperdatasets/data-ingestion/pytorch_dataset_example_with_masks.py
+++ b/examples/hyperdatasets/data-ingestion/pytorch_dataset_example_with_masks.py
@@ -0,0 +1,55 @@
+import numpy as np
+import torch.utils.data
+from allegroai import DataView, FrameGroup, Task
+from PIL import Image
+from torch.utils.data import DataLoader
+
+
+class ExampleDataset(torch.utils.data.Dataset):
+    """Map-style dataset that yields an (image, mask) array pair per FrameGroup of the given DataView."""
+
+    def __init__(self, dv):
+        # automatically adjust dataset to balance all queries
+        self.frames = dv.to_list()
+
+    def __getitem__(self, idx):
+        frame_group = self.frames[idx]  # type: FrameGroup
+        with Image.open(frame_group["000002"].get_local_source()) as src:  # close the file once decoded
+            img = np.array(src.convert("RGB").resize((256, 256)))
+        with Image.open(frame_group["000002"].get_local_mask_source()) as src:  # same for the mask file
+            mask = np.array(src.resize((256, 256)))
+        return img, mask
+
+    def __len__(self):
+        return len(self.frames)
+
+
+task = Task.init(project_name='examples', task_name='PyTorch Sample Dataset with Masks')
+
+# Create DataView with example query
+dataview = DataView()
+dataview.add_query(dataset_name='sample-dataset-masks', version_name='Current')
+# dataview.add_query(dataset_name='sample-dataset', version_name='Current', roi_query=["aeroplane"])
+
+# if we want all files to be downloaded in the background, we can call prefetch
+# dataview.prefetch_files()
+
+# create the PyTorch Dataset (ExampleDataset, defined above) from the DataView
+dataset = ExampleDataset(dataview)
+
+# do your thing here :)
+print('Fake PyTorch stuff below:')
+print('Dataset length', len(dataset))
+
+torch.manual_seed(0)
+data_loader = DataLoader(  # batches of 2 items, loaded with num_workers=1
+    dataset,
+    batch_size=2,
+    num_workers=1,
+    pin_memory=True,
+    prefetch_factor=2,
+)
+for i, data in enumerate(data_loader):  # print every batch together with its index
+    print('{}] {}'.format(i, data))
+
+print('done')