diff --git a/examples/datasets/README.md b/examples/datasets/README.md
new file mode 100644
index 00000000..f26e5c65
--- /dev/null
+++ b/examples/datasets/README.md
@@ -0,0 +1,42 @@
+# CLI Examples
+
+---
+Install `clearml` in your environment
+
+    $ pip3 install clearml
+
+---
+
+## 1. Create a simple dataset from a file
+
+- Creation
+  `clearml-data create --project Datasets_Examples_CLI --name Simple_CSV_dataset_CLI`
+- Adding the file
+  `clearml-data add --files YOUR_CSV_DATASET.csv`
+- Upload and finalize
+  `clearml-data close --verbose`
+
+## 2. Creating a dataset from a folder
+
+- Creation
+  `clearml-data create --project Datasets_Examples_CLI --name Dataset_From_Folder_CLI`
+- Adding the folder
+  `clearml-data add --files ./YOUR_DATASET_FOLDER`
+- Upload and finalize
+  `clearml-data close --verbose`
+
+## 3. Create, add all the files of the directory structure, upload and finalize/close in one command
+
+- Create, add all the files of the directory structure, upload and finalize/close in one command!
+  `clearml-data sync --folder ./DATA/MNIST/TRAIN --project Datasets_Examples_CLI --name MNIST_training_dataset_CLI`
+
+## 4. Creating a child dataset from parent datasets
+
+- Create the parent datasets (train and test), each in one command
+  `clearml-data sync --folder ./YOUR_DATASET/TRAIN --project Datasets_Examples_CLI --name MNIST_training_dataset_CLI_2`
+  `clearml-data sync --folder ./YOUR_DATASET/TEST --project Datasets_Examples_CLI --name MNIST_testing_dataset_CLI_2`
+
+- Create the child version
+  `clearml-data create --project Datasets_Examples_CLI --name MNIST_complete_dataset_CLI_2 --parents ID_OF_TRAIN_DATASET ID_OF_TEST_DATASET`
+  `clearml-data close --verbose`
+
diff --git a/examples/datasets/csv_dataset_creation.py b/examples/datasets/csv_dataset_creation.py
new file mode 100644
index 00000000..8dbb4999
--- /dev/null
+++ b/examples/datasets/csv_dataset_creation.py
@@ -0,0 +1,28 @@
+from clearml import StorageManager, Dataset
+
+
+def main():
+    """Download the Iris CSV, register it as a new dataset, then upload and finalize it."""
+    manager = StorageManager()
+
+    print("STEP1 : Downloading CSV dataset")
+    csv_file_path = manager.get_local_copy(
+        remote_url="https://allegro-datasets.s3.amazonaws.com/datasets/Iris_Species.csv")
+
+    print("STEP2 : Creating a dataset")
+    # By default, clearml data uploads to the clearml fileserver. Adding output_uri argument to the create() method
+    # allows you to specify custom storage like s3 \ gcs \ azure \ local storage
+    simple_dataset = Dataset.create(dataset_project="dataset_examples", dataset_name="CSV_Dataset")
+
+    print("STEP3 : Adding CSV file to the Dataset")
+    simple_dataset.add_files(path=csv_file_path)
+
+    print("STEP4 : Upload and finalize")
+    simple_dataset.upload()
+    simple_dataset.finalize()
+
+    print("We are done, have a great day :)")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/datasets/dataset_folder_syncing.py b/examples/datasets/dataset_folder_syncing.py
new file mode 100644
index 00000000..a039642a
--- /dev/null
+++ b/examples/datasets/dataset_folder_syncing.py
@@ -0,0 +1,52 @@
+import shutil
+from uuid import uuid4
+
+from pathlib2 import Path
+
+from clearml import Dataset, StorageManager
+
+
+def download_mnist_dataset():
+    """Fetch a local copy of the MNIST archive and return its TRAIN and TEST folder paths."""
+    manager = StorageManager()
+    mnist_dataset = Path(manager.get_local_copy(
+        remote_url="https://allegro-datasets.s3.amazonaws.com/datasets/MNIST.zip", name="MNIST"))
+    mnist_dataset_train = mnist_dataset / "TRAIN"
+    mnist_dataset_test = mnist_dataset / "TEST"
+
+    return mnist_dataset_train, mnist_dataset_test
+
+
+def main():
+    """Create one dataset and sync a local folder into it twice: with TRAIN only, then with TEST added."""
+    print("STEP1 : Downloading mnist dataset")
+    mnist_dataset_train, mnist_dataset_test = download_mnist_dataset()
+
+    print("STEP2 : Preparing mnist dataset folder")
+    mnist_path = Path(f"MNIST_{uuid4().hex}")
+    mnist_train_path = mnist_path / "TRAIN"
+    mnist_test_path = mnist_path / "TEST"
+    mnist_path.mkdir()
+
+    print("STEP3 : Creating the dataset")
+    mnist_dataset = Dataset.create(
+        dataset_project="dataset_examples", dataset_name="MNIST Complete Dataset (Syncing Example)")
+
+    print("STEP4 : Syncing train dataset")
+    shutil.copytree(mnist_dataset_train, mnist_train_path)  # Populating dataset folder with TRAIN images
+    mnist_dataset.sync_folder(mnist_path)
+    mnist_dataset.upload()
+
+    print("STEP5 : Syncing test dataset")
+    shutil.copytree(mnist_dataset_test, mnist_test_path)  # Populating dataset folder with TEST images
+    mnist_dataset.sync_folder(mnist_path)
+    mnist_dataset.upload()
+
+    print("STEP6 : Finalizing dataset")
+    mnist_dataset.finalize()
+
+    print("We are done, have a great day :)")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/datasets/multi_parent_child_dataset.py b/examples/datasets/multi_parent_child_dataset.py
new file mode 100644
index 00000000..51f83207
--- /dev/null
+++ b/examples/datasets/multi_parent_child_dataset.py
@@ -0,0 +1,41 @@
+from pathlib2 import Path
+
+from clearml import Dataset, StorageManager
+
+
+def main():
+    """Publish MNIST TRAIN and TEST as two datasets, then a child dataset with both as parents."""
+    manager = StorageManager()
+
+    print("STEP1 : Downloading mnist dataset")
+    mnist_dataset = Path(manager.get_local_copy(
+        remote_url="https://allegro-datasets.s3.amazonaws.com/datasets/MNIST.zip", name="MNIST"))
+    mnist_dataset_train = mnist_dataset / "TRAIN"
+    mnist_dataset_test = mnist_dataset / "TEST"
+
+    print("STEP2 : Creating the training dataset")
+    train_dataset = Dataset.create(
+        dataset_project="dataset_examples/MNIST", dataset_name="MNIST Training Dataset")
+    train_dataset.add_files(path=mnist_dataset_train, dataset_path="TRAIN")
+    train_dataset.upload()
+    train_dataset.finalize()
+
+    print("STEP3 : Creating the testing dataset")
+    test_dataset = Dataset.create(
+        dataset_project="dataset_examples/MNIST", dataset_name="MNIST Testing Dataset")
+    test_dataset.add_files(path=mnist_dataset_test, dataset_path="TEST")
+    test_dataset.upload()
+    test_dataset.finalize()
+
+    print("STEP4 : Create a child dataset with both mnist train and test data")
+    child_dataset = Dataset.create(
+        dataset_project="dataset_examples/MNIST", dataset_name="MNIST Complete Dataset",
+        parent_datasets=[train_dataset.id, test_dataset.id])
+    child_dataset.upload()
+    child_dataset.finalize()
+
+    print("We are done, have a great day :)")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/datasets/single_parent_child_dataset.py
b/examples/datasets/single_parent_child_dataset.py
new file mode 100644
index 00000000..664cd91c
--- /dev/null
+++ b/examples/datasets/single_parent_child_dataset.py
@@ -0,0 +1,42 @@
+from pathlib2 import Path
+
+from clearml import Dataset, StorageManager
+
+
+def main():
+    """Publish MNIST TRAIN as a parent dataset, then a child dataset that adds TEST on top of it."""
+    storage = StorageManager()
+
+    print("STEP1 : Downloading mnist dataset")
+    local_copy = storage.get_local_copy(
+        remote_url="https://allegro-datasets.s3.amazonaws.com/datasets/MNIST.zip",
+        name="MNIST",
+    )
+    mnist_root = Path(local_copy)
+    train_folder = mnist_root / "TRAIN"
+    test_folder = mnist_root / "TEST"
+
+    print("STEP2 : Creating the training dataset")
+    parent_dataset = Dataset.create(
+        dataset_project="dataset_examples",
+        dataset_name="MNIST Training Dataset",
+    )
+    parent_dataset.add_files(path=train_folder, dataset_path="TRAIN")
+    parent_dataset.upload()
+    parent_dataset.finalize()
+
+    print("STEP3 : Create a child dataset of mnist dataset using TEST Dataset")
+    complete_dataset = Dataset.create(
+        dataset_project="dataset_examples",
+        dataset_name="MNIST Complete Dataset",
+        parent_datasets=[parent_dataset.id],
+    )
+    complete_dataset.add_files(path=test_folder, dataset_path="TEST")
+    complete_dataset.upload()
+    complete_dataset.finalize()
+
+    print("We are done, have a great day :)")
+
+
+if __name__ == '__main__':
+    main()