diff --git a/examples/reporting/model_config.py b/examples/reporting/model_config.py deleted file mode 100644 index 5c606092..00000000 --- a/examples/reporting/model_config.py +++ /dev/null @@ -1,48 +0,0 @@ -# ClearML - Example of manual model configuration -import json -import yaml -from clearml import Task, OutputModel - - -# Connecting ClearML with the current process, -task = Task.init(project_name='examples', task_name='Model configuration example') - -# Connect a local configuration file in json format -config_file_json = 'data_samples/sample.json' -# In the web UI, this file will appear in the CONFIGURATION OBJECTS tab, -# under the "json file" subsection because of the `name` parameter entered here -task.connect_configuration(name="json file", configuration=config_file_json) - -# Read configuration as usual, the backend will contain a copy of it. -# When executing remotely, the returned `config_file_json` will be a temporary file -# that contains a new copy of the configuration retrieved form the backend -model_config_dictionary_json = json.load(open(config_file_json, 'rt')) - - -# Connecting a local configuration file in yaml format -config_file_yaml = 'data_samples/config_yaml.yaml' -task.connect_configuration(configuration=config_file_yaml, name="yaml file") -# Read configuration as usual -model_config_dictionary_yaml = yaml.load(open(config_file_yaml), Loader=yaml.FullLoader) - -# Connecting a dictionary of definitions for a specific network design -model_config_dict = { - 'CHANGE ME': 13.37, - 'dict': {'sub_value': 'string', 'sub_integer': 11}, - 'list_of_ints': [1, 2, 3, 4], -} -model_config_dict = task.connect_configuration(name='dictionary', configuration=model_config_dict) - -# Update the dictionary after connecting it, and the changes will be tracked as well. -model_config_dict['new value'] = 10 -model_config_dict['CHANGE ME'] *= model_config_dict['new value'] - -# Connecting label enumeration -labels = {'background': 0, 'cat': 1, 'dog': 2} -task.connect_label_enumeration(labels) - -# Manually log a local model file, which will have the labels connected above -OutputModel().update_weights('my_best_model.bin') - -# Any saved model (keras / pytorch / tensorflow / etc.) will have the task network configuration and label enumeration -print('Any model stored from this point onwards, will contain both model_config and label_enumeration') \ No newline at end of file diff --git a/examples/reporting/model_reporting.py b/examples/reporting/model_reporting.py new file mode 100644 index 00000000..5a177c03 --- /dev/null +++ b/examples/reporting/model_reporting.py @@ -0,0 +1,16 @@ +# ClearML - Example of manual model reporting +from clearml import Task, OutputModel + +# Connecting ClearML with the current process, +task = Task.init(project_name="examples", task_name="Model reporting example") + +# Create output model and connect it to the task +output_model = OutputModel(task=task) + +labels = {"background": 0, "cat": 1, "dog": 2} +output_model.update_labels(labels) + +model_url = "https://allegro-examples.s3.amazonaws.com/clearml-public-resources/v1.0/clearml-examples-open/newexamples/examples/pytorch%20lightning%20mnist%20example.fb969db720e241e5859d522aa5226b81/models/training.pt" + +# Manually log a model file, which will have the labels connected above +output_model.update_weights(register_uri=model_url) diff --git a/examples/reporting/model_update_pytorch.py b/examples/reporting/model_update_pytorch.py new file mode 100644 index 00000000..d6bf2777 --- /dev/null +++ b/examples/reporting/model_update_pytorch.py @@ -0,0 +1,250 @@ +from pathlib import Path + +import matplotlib.pyplot as plt +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import torchvision.datasets as datasets +import torchvision.transforms as transforms +from ignite.contrib.handlers import TensorboardLogger +from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator +from ignite.handlers import global_step_from_engine +from ignite.metrics import Accuracy, Loss, Recall +from ignite.utils import setup_logger +from torch.utils.tensorboard import SummaryWriter +from tqdm import tqdm + +from clearml import Task, StorageManager, OutputModel + + +def main(): + # Connecting ClearML with the current process, + # from here on everything is logged automatically + task = Task.init( + project_name="examples", + task_name="Model update pytorch", + auto_connect_frameworks={"pytorch": False}, + ) + params = { + "number_of_epochs": 1, + "batch_size": 64, + "dropout": 0.25, + "base_lr": 0.001, + "momentum": 0.9, + "loss_report": 100, + } + + params = task.connect(params) # enabling configuration override by clearml + print(params) # printing actual configuration (after override in remote mode) + + model = OutputModel(task=task, framework="pytorch") + model_config_dict = { + "list_of_ints": [1, 2, 3, 4], + "dict": { + "sub_value": "string", + "sub_integer": 11 + }, + "value": 13.37 + } + model.update_design(config_dict=model_config_dict) + + manager = StorageManager() + + dataset_path = Path( + manager.get_local_copy( + remote_url="https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz" + ) + ) + + # Dataset and Dataloader initializations + transform = transforms.Compose([transforms.ToTensor()]) + + trainset = datasets.CIFAR10( + root=dataset_path, train=True, download=False, transform=transform + ) + trainloader = torch.utils.data.DataLoader( + trainset, batch_size=params.get("batch_size", 4), shuffle=True, num_workers=10 + ) + + testset = datasets.CIFAR10( + root=dataset_path, train=False, download=False, transform=transform + ) + testloader = torch.utils.data.DataLoader( + testset, batch_size=params.get("batch_size", 4), shuffle=False, num_workers=10 + ) + + run( + params["number_of_epochs"], + params["base_lr"], + params["momentum"], + 10, + params, + trainloader, + testloader, + model, + ) + + +# Helper function to store predictions and scores using matplotlib +def predictions_gt_images_handler(engine, logger, *args, **kwargs): + x, _ = engine.state.batch + y_pred, y = engine.state.output + + num_x = num_y = 4 + le = num_x * num_y + fig = plt.figure(figsize=(20, 20)) + trans = transforms.ToPILImage() + classes = ( + "plane", + "car", + "bird", + "cat", + "deer", + "dog", + "frog", + "horse", + "ship", + "truck", + ) + enumeration = {k: v for v, k in enumerate(classes, 1)} + Task.current_task().connect_label_enumeration(enumeration) + + for idx in range(le): + preds = torch.argmax(F.softmax(y_pred[idx], dim=0)) + probs = torch.max(F.softmax(y_pred[idx], dim=0)) + ax = fig.add_subplot(num_x, num_y, idx + 1, xticks=[], yticks=[]) + ax.imshow(trans(x[idx])) + ax.set_title( + "{0} {1:.1f}% (label: {2})".format( + classes[preds], probs * 100, classes[y[idx]] + ), + color=("green" if preds == y[idx] else "red"), + ) + logger.writer.add_figure( + "predictions vs actuals", figure=fig, global_step=engine.state.epoch + ) + + +class Net(nn.Module): + def __init__(self, params): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(3, 6, 3) + self.conv2 = nn.Conv2d(6, 16, 3) + self.pool = nn.MaxPool2d(2, 2) + self.fc1 = nn.Linear(16 * 6 * 6, 120) + self.fc2 = nn.Linear(120, 84) + self.dorpout = nn.Dropout(p=params.get("dropout", 0.25)) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + x = self.pool(F.relu(self.conv1(x))) + x = self.pool(F.relu(self.conv2(x))) + x = x.view(-1, 16 * 6 * 6) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(self.dorpout(x)) + return x + + +# Training +def run(epochs, lr, momentum, log_interval, params, trainloader, testloader, model): + device = "cuda" if torch.cuda.is_available() else "cpu" + net = Net(params).to(device) + criterion = nn.CrossEntropyLoss() + optimizer = optim.SGD(net.parameters(), lr=lr, momentum=momentum) + + trainer = create_supervised_trainer(net, optimizer, criterion, device=device) + trainer.logger = setup_logger("trainer") + + val_metrics = {"accuracy": Accuracy(), "loss": Loss(criterion), "recall": Recall()} + evaluator = create_supervised_evaluator(net, metrics=val_metrics, device=device) + evaluator.logger = setup_logger("evaluator") + + # Attach handler to plot trainer's loss every 100 iterations + tb_logger = TensorboardLogger(log_dir="cifar-output") + tb_logger.attach_output_handler( + trainer, + event_name=Events.ITERATION_COMPLETED(every=params.get("loss_report")), + tag="training", + output_transform=lambda loss: {"loss": loss}, + ) + + # Attach handler to dump evaluator's metrics every epoch completed + for tag, evaluator in [("training", trainer), ("validation", evaluator)]: + tb_logger.attach_output_handler( + evaluator, + event_name=Events.EPOCH_COMPLETED, + tag=tag, + metric_names="all", + global_step_transform=global_step_from_engine(trainer), + ) + + # Attach function to build debug images and report every epoch end + tb_logger.attach( + evaluator, + log_handler=predictions_gt_images_handler, + event_name=Events.EPOCH_COMPLETED(once=1), + ) + + desc = "ITERATION - loss: {:.2f}" + pbar = tqdm(initial=0, leave=False, total=len(trainloader), desc=desc.format(0)) + + @trainer.on(Events.ITERATION_COMPLETED(every=log_interval)) + def log_training_loss(engine): + pbar.desc = desc.format(engine.state.output) + pbar.update(log_interval) + + @trainer.on(Events.EPOCH_COMPLETED) + def log_training_results(engine): + pbar.refresh() + evaluator.run(trainloader) + metrics = evaluator.state.metrics + avg_accuracy = metrics["accuracy"] + avg_nll = metrics["loss"] + tqdm.write( + "Training Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}".format( + engine.state.epoch, avg_accuracy, avg_nll + ) + ) + + @trainer.on(Events.EPOCH_COMPLETED) + def log_validation_results(engine): + evaluator.run(testloader) + metrics = evaluator.state.metrics + avg_accuracy = metrics["accuracy"] + avg_nll = metrics["loss"] + tqdm.write( + "Validation Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}".format( + engine.state.epoch, avg_accuracy, avg_nll + ) + ) + + pbar.n = pbar.last_print_n = 0 + + @trainer.on(Events.EPOCH_COMPLETED | Events.COMPLETED) + def log_time(): + tqdm.write( + "{} took {} seconds".format( + trainer.last_event_name.name, + trainer.state.times[trainer.last_event_name.name], + ) + ) + + trainer.run(trainloader, max_epochs=epochs) + pbar.close() + + PATH = "./cifar_net.pth" + + # CONDITION depicts a custom condition for when to save the model. The model is saved and then updated in ClearML + CONDITION = True + + if CONDITION: + torch.save(net.state_dict(), PATH) + model.update_weights(weights_filename=PATH) + print("Finished Training") + print("Task ID number is: {}".format(Task.current_task().id)) + + +if __name__ == "__main__": + main()