mirror of https://github.com/clearml/clearml
Add ClearML Datasets examples
parent 22df336860
commit 627135c68a
42  examples/datasets/README.md  Normal file
@@ -0,0 +1,42 @@
# CLI Examples

---

Install `clearml` in your environment

`$ pip3 install clearml`

---

## 1. Create a simple dataset from a file

- Creation

`clearml-data create --project Datasets_Examples_CLI --name Simple_CSV_dataset_CLI`

- Adding the file

`clearml-data add --files YOUR_CSV_DATASET.csv`

- Upload and finalize

`clearml-data close --verbose`

## 2. Create a dataset from a folder

- Creation

`clearml-data create --project Datasets_Examples_CLI --name Dataset_From_Folder_CLI`

- Adding the folder

`clearml-data add --files ./YOUR_DATASET_FOLDER`

- Upload and finalize

`clearml-data close --verbose`

## 3. Create, add a whole directory structure, upload, and finalize in one command

`clearml-data sync --folder ./DATA/MNIST/TRAIN --project Datasets_Examples_CLI --name MNIST_training_dataset_CLI`

## 4. Create a dataset with child versions

- Create the parent datasets (each `sync` call creates, adds, uploads, and closes in one command)

`clearml-data sync --folder ./YOUR_DATASET/TRAIN --project Datasets_Examples_CLI --name MNIST_training_dataset_CLI_2`

`clearml-data sync --folder ./YOUR_DATASET/TEST --project Datasets_Examples_CLI --name MNIST_testing_dataset_CLI_2`

- Create the child version from both parents, then upload and finalize

`clearml-data create --project Datasets_Examples_CLI --name MNIST_complete_dataset_CLI_2 --parents ID_OF_TRAIN_DATASET ID_OF_TEST_DATASET`

`clearml-data close --verbose`
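Once a dataset has been closed, it can also be consumed from Python with the `clearml` SDK. A minimal sketch (an editorial addition, not part of the CLI examples above), assuming the project and dataset name from example 1:

from clearml import Dataset

# Look up the finalized dataset by project and name (assumes example 1 was run)
dataset = Dataset.get(dataset_project="Datasets_Examples_CLI", dataset_name="Simple_CSV_dataset_CLI")

# Download (or reuse from the local cache) a read-only copy of the dataset files
local_path = dataset.get_local_copy()
print("Dataset files are available under:", local_path)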
27  examples/datasets/csv_dataset_creation.py  Normal file
@@ -0,0 +1,27 @@
from clearml import StorageManager, Dataset


def main():
    manager = StorageManager()

    print("STEP1 : Downloading CSV dataset")
    csv_file_path = manager.get_local_copy(
        remote_url="https://allegro-datasets.s3.amazonaws.com/datasets/Iris_Species.csv")

    print("STEP2 : Creating a dataset")
    # By default, clearml-data uploads to the clearml fileserver. Passing the output_uri argument to the create()
    # method lets you specify custom storage such as s3 / gcs / azure / local storage.
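    # For example (hypothetical bucket name, shown only as a sketch of the output_uri option):
    # simple_dataset = Dataset.create(dataset_project="dataset_examples", dataset_name="CSV_Dataset",
    #                                 output_uri="s3://my-bucket/datasets")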
    simple_dataset = Dataset.create(dataset_project="dataset_examples", dataset_name="CSV_Dataset")

    print("STEP3 : Adding CSV file to the Dataset")
    simple_dataset.add_files(path=csv_file_path)

    print("STEP4 : Upload and finalize")
    simple_dataset.upload()
    simple_dataset.finalize()

    print("We are done, have a great day :)")


if __name__ == '__main__':
    main()
50  examples/datasets/dataset_folder_syncing.py  Normal file
@@ -0,0 +1,50 @@
import shutil
from uuid import uuid4

from pathlib2 import Path

from clearml import Dataset, StorageManager


def download_mnist_dataset():
    manager = StorageManager()
    mnist_dataset = Path(manager.get_local_copy(
        remote_url="https://allegro-datasets.s3.amazonaws.com/datasets/MNIST.zip", name="MNIST"))
    mnist_dataset_train = mnist_dataset / "TRAIN"
    mnist_dataset_test = mnist_dataset / "TEST"

    return mnist_dataset_train, mnist_dataset_test


def main():
    print("STEP1 : Downloading mnist dataset")
    mnist_dataset_train, mnist_dataset_test = download_mnist_dataset()

    print("STEP2 : Preparing mnist dataset folder")
    mnist_path = Path(f"MNIST_{uuid4().hex}")
    mnist_train_path = mnist_path / "TRAIN"
    mnist_test_path = mnist_path / "TEST"
    mnist_path.mkdir()

    print("STEP3 : Creating the dataset")
    mnist_dataset = Dataset.create(
        dataset_project="dataset_examples", dataset_name="MNIST Complete Dataset (Syncing Example)")

    print("STEP4 : Syncing train dataset")
    shutil.copytree(mnist_dataset_train, mnist_train_path)  # Populating dataset folder with TRAIN images
    mnist_dataset.sync_folder(mnist_path)
    mnist_dataset.upload()

    print("STEP5 : Syncing test dataset")
    shutil.copytree(mnist_dataset_test, mnist_test_path)  # Populating dataset folder with TEST images
    mnist_dataset.sync_folder(mnist_path)
    mnist_dataset.upload()

    print("STEP6 : Finalizing dataset")
    mnist_dataset.finalize()

    print("We are done, have a great day :)")


if __name__ == '__main__':
    main()
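As a quick sanity check (an editorial sketch, not part of the committed script), the finalized dataset could be inspected afterwards, assuming the project and dataset name used in STEP3:

from clearml import Dataset

# After both sync steps the dataset should register files under TRAIN/ and TEST/
synced_dataset = Dataset.get(
    dataset_project="dataset_examples", dataset_name="MNIST Complete Dataset (Syncing Example)")
print(f"{len(synced_dataset.list_files())} files registered in the dataset")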
40  examples/datasets/multi_parent_child_dataset.py  Normal file
@@ -0,0 +1,40 @@
from pathlib2 import Path

from clearml import Dataset, StorageManager


def main():
    manager = StorageManager()

    print("STEP1 : Downloading mnist dataset")
    mnist_dataset = Path(manager.get_local_copy(
        remote_url="https://allegro-datasets.s3.amazonaws.com/datasets/MNIST.zip", name="MNIST"))
    mnist_dataset_train = mnist_dataset / "TRAIN"
    mnist_dataset_test = mnist_dataset / "TEST"

    print("STEP2 : Creating the training dataset")
    train_dataset = Dataset.create(
        dataset_project="dataset_examples/MNIST", dataset_name="MNIST Training Dataset")
    train_dataset.add_files(path=mnist_dataset_train, dataset_path="TRAIN")
    train_dataset.upload()
    train_dataset.finalize()

    print("STEP3 : Creating the testing dataset")
    test_dataset = Dataset.create(
        dataset_project="dataset_examples/MNIST", dataset_name="MNIST Testing Dataset")
    test_dataset.add_files(path=mnist_dataset_test, dataset_path="TEST")
    test_dataset.upload()
    test_dataset.finalize()

    print("STEP4 : Create a child dataset with both mnist train and test data")
    child_dataset = Dataset.create(
        dataset_project="dataset_examples/MNIST", dataset_name="MNIST Complete Dataset",
        parent_datasets=[train_dataset.id, test_dataset.id])
    child_dataset.upload()
    child_dataset.finalize()

    print("We are done, have a great day :)")


if __name__ == "__main__":
    main()
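To make the merge concrete (an editorial sketch, not part of the committed script): a local copy of the child dataset should contain both the TRAIN and TEST folders, since the child inherits the contents of both parents:

from clearml import Dataset

# Fetch the merged child dataset and pull a cached, read-only local copy
complete_dataset = Dataset.get(dataset_project="dataset_examples/MNIST", dataset_name="MNIST Complete Dataset")
print(complete_dataset.get_local_copy())  # folder containing both TRAIN/ and TEST/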
33  examples/datasets/single_parent_child_dataset.py  Normal file
@@ -0,0 +1,33 @@
from pathlib2 import Path

from clearml import Dataset, StorageManager


def main():
    manager = StorageManager()

    print("STEP1 : Downloading mnist dataset")
    mnist_dataset = Path(manager.get_local_copy(
        remote_url="https://allegro-datasets.s3.amazonaws.com/datasets/MNIST.zip", name="MNIST"))
    mnist_dataset_train = mnist_dataset / "TRAIN"
    mnist_dataset_test = mnist_dataset / "TEST"

    print("STEP2 : Creating the training dataset")
    mnist_dataset = Dataset.create(
        dataset_project="dataset_examples", dataset_name="MNIST Training Dataset")
    mnist_dataset.add_files(path=mnist_dataset_train, dataset_path="TRAIN")
    mnist_dataset.upload()
    mnist_dataset.finalize()

    print("STEP3 : Create a child dataset of mnist dataset using TEST Dataset")
    child_dataset = Dataset.create(
        dataset_project="dataset_examples", dataset_name="MNIST Complete Dataset", parent_datasets=[mnist_dataset.id])
    child_dataset.add_files(path=mnist_dataset_test, dataset_path="TEST")
    child_dataset.upload()
    child_dataset.finalize()

    print("We are done, have a great day :)")


if __name__ == '__main__':
    main()
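Dataset versions can keep chaining in the same way (an editorial sketch using only the calls shown above; the new dataset name and the extra-files folder are hypothetical):

from clearml import Dataset

# Build a further version on top of the completed dataset and add more files to it
parent = Dataset.get(dataset_project="dataset_examples", dataset_name="MNIST Complete Dataset")
new_version = Dataset.create(
    dataset_project="dataset_examples", dataset_name="MNIST Complete Dataset v2",  # hypothetical name
    parent_datasets=[parent.id])
new_version.add_files(path="EXTRA_FILES_FOLDER")  # hypothetical folder with additional samples
new_version.upload()
new_version.finalize()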