Add ClearML Datasets examples

This commit is contained in:
allegroai 2023-02-28 17:19:08 +02:00
parent 22df336860
commit 627135c68a
5 changed files with 192 additions and 0 deletions

View File

@ -0,0 +1,42 @@
# CLI Examples
---
Install `clearml` in your environment
$ pip3 install clearml
---
## 1. Create a simple dataset from a file
- Creation
`clearml-data create --project Datasets_Examples_CLI --name Simple_CSV_dataset_CLI`
- Adding the file
`clearml-data add --files YOUR_CSV_DATASET.csv`
- Upload and finalize
`clearml-data close --verbose`
## 2. Creating a dataset from a folder
- Creation
`clearml-data create --project Datasets_Examples_CLI --name Dataset_From_Folder_CLI`
- Adding the folder
`clearml-data add --files ./YOUR_DATASET_FOLDER`
- Upload and finalize
`clearml-data close --verbose`
## 3. Create, add all the files of the directory structure, upload and finalize/close in one command
- Create, add all the files of the directory structure, upload and finalize/close in one command !
`clearml-data sync --folder ./DATA/MNIST/TRAIN --project Datasets_Examples_CLI --name MNIST_training_dataset_CLI`
## 4. Creating a child dataset from parent datasets
- Create the train and test parent datasets (each synced, uploaded and finalized in one command)
`clearml-data sync --folder ./YOUR_DATASET/TRAIN --project Datasets_Examples_CLI --name MNIST_training_dataset_CLI_2`
`clearml-data sync --folder ./YOUR_DATASET/TEST --project Datasets_Examples_CLI --name MNIST_testing_dataset_CLI_2`
- Create the child version
`clearml-data create --project Datasets_Examples_CLI --name MNIST_complete_dataset_CLI_2 --parents ID_OF_TRAIN_DATASET ID_OF_TEST_DATASET`
`clearml-data close --verbose`

View File

@ -0,0 +1,27 @@
from clearml import StorageManager, Dataset
def main():
manager = StorageManager()
print("STEP1 : Downloading CSV dataset")
csv_file_path = manager.get_local_copy(
remote_url="https://allegro-datasets.s3.amazonaws.com/datasets/Iris_Species.csv")
print("STEP2 : Creating a dataset")
# By default, clearml data uploads to the clearml fileserver. Adding output_uri argument to the create() method
# allows you to specify custom storage like s3 \ gcs \ azure \ local storage
simple_dataset = Dataset.create(dataset_project="dataset_examples", dataset_name="CSV_Dataset")
print("STEP3 : Adding CSV file to the Dataset")
simple_dataset.add_files(path=csv_file_path)
print("STEP4 : Upload and finalize")
simple_dataset.upload()
simple_dataset.finalize()
print("We are done, have a great day :)")
if __name__ == '__main__':
main()

View File

@ -0,0 +1,50 @@
import shutil
from uuid import uuid4
from pathlib2 import Path
from clearml import Dataset, StorageManager
def download_mnist_dataset():
manager = StorageManager()
mnist_dataset = Path(manager.get_local_copy(
remote_url="https://allegro-datasets.s3.amazonaws.com/datasets/MNIST.zip", name="MNIST"))
mnist_dataset_train = mnist_dataset / "TRAIN"
mnist_dataset_test = mnist_dataset / "TEST"
return mnist_dataset_train, mnist_dataset_test
def main():
print("STEP1 : Downloading mnist dataset")
mnist_dataset_train, mnist_dataset_test = download_mnist_dataset()
print("STEP2 : Preparing mnist dataset folder")
mnist_path = Path(f"MNIST_{uuid4().hex}")
mnist_train_path = mnist_path / "TRAIN"
mnist_test_path = mnist_path / "TEST"
mnist_path.mkdir()
print("STEP3 : Creating the dataset")
mnist_dataset = Dataset.create(
dataset_project="dataset_examples", dataset_name="MNIST Complete Dataset (Syncing Example)")
print("STEP4 : Syncing train dataset")
shutil.copytree(mnist_dataset_train, mnist_train_path) # Populating dataset folder with TRAIN images
mnist_dataset.sync_folder(mnist_path)
mnist_dataset.upload()
print("STEP5 : Syncing test dataset")
shutil.copytree(mnist_dataset_train, mnist_test_path) # Populating dataset folder with TEST images
mnist_dataset.sync_folder(mnist_path)
mnist_dataset.upload()
print("STEP6 : Finalizing dataset")
mnist_dataset.finalize()
print("We are done, have a great day :)")
if __name__ == '__main__':
main()

View File

@ -0,0 +1,40 @@
from pathlib2 import Path
from clearml import Dataset, StorageManager
def main():
manager = StorageManager()
print("STEP1 : Downloading mnist dataset")
mnist_dataset = Path(manager.get_local_copy(
remote_url="https://allegro-datasets.s3.amazonaws.com/datasets/MNIST.zip", name="MNIST"))
mnist_dataset_train = mnist_dataset / "TRAIN"
mnist_dataset_test = mnist_dataset / "TEST"
print("STEP2 : Creating the training dataset")
train_dataset = Dataset.create(
dataset_project="dataset_examples/MNIST", dataset_name="MNIST Training Dataset")
train_dataset.add_files(path=mnist_dataset_train, dataset_path="TRAIN")
train_dataset.upload()
train_dataset.finalize()
print("STEP3 : Creating the testing dataset")
test_dataset = Dataset.create(
dataset_project="dataset_examples/MNIST", dataset_name="MNIST Testing Dataset")
test_dataset.add_files(path=mnist_dataset_test, dataset_path="TEST")
test_dataset.upload()
test_dataset.finalize()
print("STEP4 : Create a child dataset with both mnist train and test data")
child_dataset = Dataset.create(
dataset_project="dataset_examples/MNIST", dataset_name="MNIST Complete Dataset",
parent_datasets=[train_dataset.id, test_dataset.id])
child_dataset.upload()
child_dataset.finalize()
print("We are done, have a great day :)")
if __name__ == "__main__":
main()

View File

@ -0,0 +1,33 @@
from pathlib2 import Path
from clearml import Dataset, StorageManager
def main():
manager = StorageManager()
print("STEP1 : Downloading mnist dataset")
mnist_dataset = Path(manager.get_local_copy(
remote_url="https://allegro-datasets.s3.amazonaws.com/datasets/MNIST.zip", name="MNIST"))
mnist_dataset_train = mnist_dataset / "TRAIN"
mnist_dataset_test = mnist_dataset / "TEST"
print("STEP2 : Creating the training dataset")
mnist_dataset = Dataset.create(
dataset_project="dataset_examples", dataset_name="MNIST Training Dataset")
mnist_dataset.add_files(path=mnist_dataset_train, dataset_path="TRAIN")
mnist_dataset.upload()
mnist_dataset.finalize()
print("STEP3 : Create a child dataset of mnist dataset using TEST Dataset")
child_dataset = Dataset.create(
dataset_project="dataset_examples", dataset_name="MNIST Complete Dataset", parent_datasets=[mnist_dataset.id])
child_dataset.add_files(path=mnist_dataset_test, dataset_path="TEST")
child_dataset.upload()
child_dataset.finalize()
print("We are done, have a great day :)")
if __name__ == '__main__':
main()