clearml/examples/datasets/multi_parent_child_dataset.py

42 lines
1.4 KiB
Python
Raw Permalink Normal View History

2023-02-28 15:19:08 +00:00
from pathlib2 import Path
from clearml import Dataset, StorageManager
def main():
    """Build a ClearML dataset that inherits from two parent datasets.

    Steps performed:
      1. Fetch a local copy of the MNIST archive through ``StorageManager``.
      2. Create, upload and finalize a dataset from its ``TRAIN`` sub-folder.
      3. Create, upload and finalize a dataset from its ``TEST`` sub-folder.
      4. Create a child dataset whose parents are the two datasets above,
         then upload and finalize it.

    Raises:
        RuntimeError: if no local copy of the MNIST archive could be obtained.
    """
    manager = StorageManager()

    print("STEP1 : Downloading mnist dataset")
    mnist_local_copy = manager.get_local_copy(
        remote_url="https://allegro-datasets.s3.amazonaws.com/datasets/MNIST.zip", name="MNIST"
    )
    # get_local_copy() is documented to return None when the remote object
    # cannot be retrieved; fail with a clear message instead of letting
    # Path(None) raise an unrelated-looking TypeError.
    if not mnist_local_copy:
        raise RuntimeError("Failed to obtain a local copy of the MNIST dataset archive")
    mnist_dataset = Path(mnist_local_copy)
    mnist_dataset_train = mnist_dataset / "TRAIN"
    mnist_dataset_test = mnist_dataset / "TEST"

    print("STEP2 : Creating the training dataset")
    train_dataset = Dataset.create(dataset_project="dataset_examples/MNIST", dataset_name="MNIST Training Dataset")
    train_dataset.add_files(path=mnist_dataset_train, dataset_path="TRAIN")
    train_dataset.upload()
    train_dataset.finalize()

    print("STEP3 : Creating the testing dataset")
    test_dataset = Dataset.create(dataset_project="dataset_examples/MNIST", dataset_name="MNIST Testing Dataset")
    test_dataset.add_files(path=mnist_dataset_test, dataset_path="TEST")
    test_dataset.upload()
    test_dataset.finalize()

    print("STEP4 : Create a child dataset with both mnist train and test data")
    # No files are added to the child directly: it only references the two
    # datasets created above through parent_datasets.
    child_dataset = Dataset.create(
        dataset_project="dataset_examples/MNIST",
        dataset_name="MNIST Complete Dataset",
        parent_datasets=[train_dataset.id, test_dataset.id],
    )
    child_dataset.upload()
    child_dataset.finalize()

    print("We are done, have a great day :)")
# Run the example only when executed as a script, not when imported.
if __name__ == "__main__":
    main()