Add dataset reporting example (#1180)

2025-06-26 18:16:07 +00:00 · 2024-01-25 19:41:05 +02:00 · 2024-01-25 19:41:05 +02:00 · 64c37fe97f
commit 64c37fe97f
parent 39652cac94
2 changed files with 259 additions and 0 deletions
--- a/examples/datasets/urbansounds_dataset_preprocessing.py
+++ b/examples/datasets/urbansounds_dataset_preprocessing.py
@ -0,0 +1,163 @@
+import os.path
+from pathlib import Path
+
+import matplotlib as mpl
+import numpy as np
+from tqdm import tqdm
+import torchaudio
+import torch
+from clearml import Task, Dataset
+
+# Register this script run as a ClearML task; the classes below connect their settings to it
+task = Task.init(project_name="examples/Urbansounds", task_name="preprocessing")
+
+# Let's preprocess the data and create a new ClearML dataset from it, so we can keep track of it.
+# The cool thing is that we can easily debug by using, you guessed it: debug samples! We log both
+# the original sound and its processed mel spectrogram as debug samples, so we can manually check
+# that everything went as planned.
+
+
+class PreProcessor:
+    """Convert raw waveforms into fixed-size log-scale Mel spectrograms.
+
+    The settings are kept in ``self.configuration`` and connected to the ClearML task.
+    A ``resample_freq`` of 0 (or less) disables resampling.
+    """
+
+    def __init__(self):
+        self.configuration = {"number_of_mel_filters": 64, "resample_freq": 22050}
+        task.connect(self.configuration)
+
+    def preprocess_sample(self, sample, original_sample_freq):
+        """Turn one audio sample into a log-scale Mel spectrogram of fixed length.
+
+        :param sample: waveform tensor whose first dimension is averaged away as channels
+            (assumed to be (channels, time) as returned by ``torchaudio.load`` -- TODO confirm)
+        :param original_sample_freq: sample rate of ``sample`` in Hz
+        :return: spectrogram tensor padded or cropped along its last dimension to
+            ``3 * (sample rate // 200)`` frames
+        """
+        resample_freq = self.configuration["resample_freq"]
+        if resample_freq > 0:
+            resample_transform = torchaudio.transforms.Resample(
+                orig_freq=original_sample_freq,
+                new_freq=resample_freq,
+            )
+            sample = resample_transform(sample)
+            sample_freq = resample_freq
+        else:
+            # Resampling is disabled, so the audio is still at its original rate. The Mel
+            # transform and the fixed length must use that rate instead of the non-positive
+            # configuration value. Note that the output size is then only uniform across
+            # samples that share the same original sample rate.
+            sample_freq = original_sample_freq
+
+        # This will convert audio files with two channels into one
+        sample = torch.mean(sample, dim=0, keepdim=True)
+
+        # Convert audio to log-scale Mel spectrogram
+        melspectrogram_transform = torchaudio.transforms.MelSpectrogram(
+            sample_rate=sample_freq,
+            n_mels=self.configuration["number_of_mel_filters"],
+        )
+        melspectrogram = melspectrogram_transform(sample)
+        melspectrogram_db = torchaudio.transforms.AmplitudeToDB()(melspectrogram)
+
+        # Make sure all spectrograms are the same size: pad short ones at the end of the
+        # last dimension, crop long ones
+        fixed_length = 3 * (sample_freq // 200)
+        current_length = melspectrogram_db.shape[2]
+        if current_length < fixed_length:
+            melspectrogram_db = torch.nn.functional.pad(
+                melspectrogram_db, (0, fixed_length - current_length)
+            )
+        else:
+            melspectrogram_db = melspectrogram_db[:, :, :fixed_length]
+
+        return melspectrogram_db
+
+
+class DataSetBuilder:
+    """Build the preprocessed UrbanSounds dataset as a child of the raw ClearML dataset.
+
+    On construction the dataset tagged ``raw`` is fetched, its ``metadata`` artifact is loaded
+    and a mutable local copy of its files is downloaded. ``build_dataset`` then writes a
+    spectrogram next to every audio file and finalizes the result as a new dataset version.
+    """
+
+    def __init__(self):
+        self.configuration = {"dataset_path": "dataset"}
+        task.connect(self.configuration)
+
+        self.original_dataset = Dataset.get(
+            dataset_project="examples/Urbansounds",
+            dataset_name="UrbanSounds example",
+            dataset_tags=["raw"],
+            alias="Raw Dataset",
+        )
+        # This will return the pandas dataframe we added in the previous task
+        self.metadata = (
+            Task.get_task(task_id=self.original_dataset._task.id)
+            .artifacts["metadata"]
+            .get()
+        )
+        # This will download the data and return a local path to the data
+        self.original_dataset_path = Path(
+            self.original_dataset.get_mutable_local_copy(
+                self.configuration["dataset_path"], overwrite=True
+            )
+        )
+
+        # Prepare a preprocessor that will handle each sample one by one
+        self.preprocessor = PreProcessor()
+
+        # Get ready for the new one; it is created in build_dataset()
+        self.preprocessed_dataset = None
+
+    def log_dataset_statistics(self):
+        """Report the metadata table and the label histogram on the preprocessed dataset.
+
+        Reads ``self.preprocessed_dataset``, which is only set once ``build_dataset`` has
+        created the new dataset.
+        """
+        logger = self.preprocessed_dataset.get_logger()
+        histogram_data = self.metadata["label"].value_counts()
+        logger.report_table(
+            title="Raw Dataset Metadata",
+            series="Raw Dataset Metadata",
+            table_plot=self.metadata,
+        )
+        logger.report_histogram(
+            title="Class distribution",
+            series="Class distribution",
+            values=histogram_data,
+            iteration=0,
+            xlabels=histogram_data.index.tolist(),
+            yaxis="Amount of samples",
+        )
+
+    def build_dataset(self):
+        """Preprocess every sample and publish the result as a child dataset."""
+        # Let's create a new dataset that is a child of the original one
+        # We'll add the preprocessed samples to the original dataset, leading to a new version
+        # Providing the parent dataset allows us to keep a clear lineage of our data
+        self.preprocessed_dataset = Dataset.create(
+            dataset_name="UrbanSounds example",
+            dataset_project="examples/Urbansounds",
+            dataset_tags=["preprocessed"],
+            parent_datasets=[self.original_dataset.id],
+        )
+        logger = self.preprocessed_dataset.get_logger()
+
+        # loop through the metadata entries and preprocess each sample, then add some of them as debug samples to
+        # manually double check in the UI that everything has worked (you can watch the spectrogram and listen to the
+        # audio side by side in the debug sample UI)
+        # tqdm wraps the row iterator directly and gets an explicit total, so it can show real progress
+        rows = tqdm(self.metadata.iterrows(), total=len(self.metadata))
+        for i, (_, data) in enumerate(rows):
+            # Look the path up by column name instead of relying on the column order
+            audio_file_path = data["filepath"]
+            audio_path = self.original_dataset_path / audio_file_path
+            sample, sample_freq = torchaudio.load(audio_path, normalize=True)
+            spectrogram = self.preprocessor.preprocess_sample(sample, sample_freq)
+            spectrogram_array = spectrogram.detach().numpy()
+
+            # Same folder and file name as the audio file, but with a .npy extension,
+            # because we are saving the raw spectrogram array (not an image)
+            spectrogram_path = audio_path.with_suffix(".npy")
+            # Save the numpy array to disk
+            np.save(spectrogram_path, spectrogram_array)
+
+            # Log every 10th sample as a debug sample to the UI, so we can manually check it
+            if i % 10 == 0:
+                # A colormap expects floats in [0, 1]; the spectrogram holds dB values, so
+                # rescale it to that range first, otherwise almost every pixel gets clipped
+                channel = spectrogram_array[0, :, :]
+                lowest = channel.min()
+                value_range = channel.max() - lowest
+                if value_range > 0:
+                    normalized = (channel - lowest) / value_range
+                else:
+                    # Constant spectrogram: nothing to scale, show a uniform image
+                    normalized = np.zeros_like(channel)
+                # bytes=True returns uint8 RGBA values; keep only the RGB channels
+                rgb_image = mpl.colormaps["viridis"](normalized, bytes=True)[:, :, :3]
+                title = Path(audio_file_path).stem
+
+                # Report the image and the original sound, so they can be viewed side by side
+                logger.report_image(
+                    title=title, series="spectrogram", image=rgb_image
+                )
+                logger.report_media(
+                    title=title,
+                    series="original_audio",
+                    local_path=str(audio_path),
+                )
+        # The original data path will now also have the spectrograms in its filetree.
+        # So that's why we add it here to fill up the new dataset with.
+        self.preprocessed_dataset.add_files(self.original_dataset_path)
+        # Again add some visualizations to the task
+        self.log_dataset_statistics()
+        # We still want the metadata
+        self.preprocessed_dataset._task.upload_artifact(
+            name="metadata", artifact_object=self.metadata
+        )
+        self.preprocessed_dataset.finalize(auto_upload=True)
+
+
+if __name__ == "__main__":
+    # Running the script directly builds and finalizes the preprocessed dataset
+    DataSetBuilder().build_dataset()
--- a/examples/datasets/urbansounds_get_data.py
+++ b/examples/datasets/urbansounds_get_data.py
@ -0,0 +1,96 @@
+import pandas as pd
+from pathlib import Path
+from clearml import Task, Dataset, StorageManager
+
+# Register this script run as a ClearML task
+task = Task.init(project_name="examples/Urbansounds", task_name="download data")
+
+# Classes to keep when building the dataset; build_clearml_dataset() filters the CSV on this list
+configuration = {
+    "selected_classes": [
+        "air_conditioner",
+        "car_horn",
+        "children_playing",
+        "dog_bark",
+        "drilling",
+        "engine_idling",
+        "gun_shot",
+        "jackhammer",
+        "siren",
+        "street_music",
+    ]
+}
+# Connect the dictionary to the task so the selection is tracked together with this run
+task.connect(configuration)
+
+
+def get_urbansound8k():
+    """Fetch the UrbanSound8K archive and locate its metadata CSV and audio folder.
+
+    :return: tuple ``(csv_path, audio_dir)`` of ``pathlib.Path`` objects
+    """
+    # UrbanSound8K dataset: https://urbansounddataset.weebly.com/urbansound8k.html
+    # The clearml StorageManager fetches the zip and extracts it for us
+    local_copy = StorageManager.get_local_copy(
+        "https://allegro-datasets.s3.amazonaws.com/clearml/UrbanSound8K.zip",
+        extract_archive=True,
+    )
+    # Everything we need lives under the archive's top-level "UrbanSound8K" folder
+    dataset_root = Path(local_copy) / "UrbanSound8K"
+    csv_path = dataset_root / "metadata" / "UrbanSound8K.csv"
+    audio_dir = dataset_root / "audio"
+
+    return csv_path, audio_dir
+
+
+def log_dataset_statistics(dataset, metadata):
+    """Report the metadata table and a per-class sample count histogram on the dataset.
+
+    :param dataset: object whose ``get_logger()`` result receives the reports
+    :param metadata: pandas dataframe with a ``class`` column
+    """
+    class_counts = metadata["class"].value_counts()
+
+    # First the full table, so it can be browsed in the UI
+    table_title = "Raw Dataset Metadata"
+    dataset.get_logger().report_table(
+        title=table_title, series=table_title, table_plot=metadata
+    )
+
+    # Then one bar per class, labelled with the class name
+    histogram_title = "Class distribution"
+    class_names = class_counts.index.tolist()
+    dataset.get_logger().report_histogram(
+        title=histogram_title,
+        series=histogram_title,
+        values=class_counts,
+        iteration=0,
+        xlabels=class_names,
+        yaxis="Amount of samples",
+    )
+
+
+def build_clearml_dataset():
+    """Download UrbanSound8K and register it as the ``raw`` ClearML dataset.
+
+    Filters the metadata CSV to the configured classes, derives a compact metadata
+    dataframe (fold, filepath, label), then creates, fills, annotates and finalizes
+    the dataset.
+    """
+    # Local copies of the label CSV and the audio folder
+    csv_path, audio_dir = get_urbansound8k()
+    all_rows = pd.read_csv(csv_path)
+
+    # Keep only the rows whose class was selected in the configuration
+    is_selected = all_rows["class"].isin(configuration["selected_classes"])
+    selected_rows = all_rows[is_selected]
+
+    # Labels and other info we need later (fold is for the train test split);
+    # filepath is relative to the audio folder: "fold<N>/<slice_file_name>"
+    fold = selected_rows.loc[:, "fold"]
+    file_name = selected_rows.loc[:, "slice_file_name"].astype(str)
+    relative_path = "fold" + fold.astype(str) + "/" + file_name
+    metadata = pd.DataFrame(
+        {
+            "fold": fold,
+            "filepath": relative_path,
+            "label": selected_rows.loc[:, "classID"],
+        }
+    )
+
+    # A clearml dataset lets us version our changes and makes it much easier to get the
+    # right data in other tasks as well as on different machines
+    raw_dataset = Dataset.create(
+        dataset_name="UrbanSounds example",
+        dataset_project="examples/Urbansounds",
+        dataset_tags=["raw"],
+    )
+
+    # The audio files we downloaded earlier
+    raw_dataset.add_files(audio_dir)
+    # The metadata in pandas format, visible in the webUI and easily accessible later
+    raw_dataset._task.upload_artifact(name="metadata", artifact_object=metadata)
+    # Some graphs as statistics in the plots section
+    log_dataset_statistics(raw_dataset, selected_rows)
+    # Finalize and upload the data and labels of the dataset
+    raw_dataset.finalize(auto_upload=True)
+
+
+# Only build the dataset when this file is executed as a script
+if __name__ == "__main__":
+    build_clearml_dataset()