From 8658198f8bac976e5c2766190d8c593c2308bfa1 Mon Sep 17 00:00:00 2001
From: pollfly <75068813+pollfly@users.noreply.github.com>
Date: Sun, 18 Apr 2021 13:40:52 +0300
Subject: [PATCH] Add using_artifacts_example (#334)

* add example for Task, multiple Tasks created in same code

* fixes to clearml-task.md

* add example of remote_execution

* add using_artifacts_example

* Rename remote_execution_example.py to execute_remotely_example.py

* change name to execute_remotely_example.py

* add header to using_artifacts_example.py

* add newline to using_artifacts_example.py
---
 examples/advanced/execute_remotely_example.py | 135 ++++++++++++++++++
 examples/reporting/using_artifacts_example.py |  30 ++++
 2 files changed, 165 insertions(+)
 create mode 100644 examples/advanced/execute_remotely_example.py
 create mode 100644 examples/reporting/using_artifacts_example.py

diff --git a/examples/advanced/execute_remotely_example.py b/examples/advanced/execute_remotely_example.py
new file mode 100644
index 00000000..01e0904f
--- /dev/null
+++ b/examples/advanced/execute_remotely_example.py
@@ -0,0 +1,135 @@
+# ClearML - Example of execute_remotely() with PyTorch MNIST training
+"""The task.execute_remotely() option is used when you need to run part of the code locally and then move the
+full execution to a remote machine. When running locally, task.execute_remotely() will close the currently running
+task and enqueue it in the chosen queue. When running under an agent, the call is ignored and the code proceeds to
+execute. This feature is especially helpful if you want to run the first epoch locally on your machine to debug and
+make sure the code doesn't crash, and then move to a stronger machine for the entire training.
+"""
+
+from __future__ import print_function
+import argparse
+import os
+from tempfile import gettempdir
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from torchvision import datasets, transforms
+from clearml import Task, Logger
+
+
+class Net(nn.Module):
+    def __init__(self):
+        super(Net, self).__init__()
+        self.conv1 = nn.Conv2d(1, 20, 5, 1)
+        self.conv2 = nn.Conv2d(20, 50, 5, 1)
+        self.fc1 = nn.Linear(4 * 4 * 50, 500)
+        self.fc2 = nn.Linear(500, 10)
+
+    def forward(self, x):
+        x = F.relu(self.conv1(x))
+        x = F.max_pool2d(x, 2, 2)
+        x = F.relu(self.conv2(x))
+        x = F.max_pool2d(x, 2, 2)
+        x = x.view(-1, 4 * 4 * 50)
+        x = F.relu(self.fc1(x))
+        x = self.fc2(x)
+        return F.log_softmax(x, dim=1)
+
+
+def train(args, model, device, train_loader, optimizer, epoch):
+    model.train()
+    for batch_idx, (data, target) in enumerate(train_loader):
+        data, target = data.to(device), target.to(device)
+        optimizer.zero_grad()
+        output = model(data)
+        loss = F.nll_loss(output, target)
+        loss.backward()
+        optimizer.step()
+        if batch_idx % args.log_interval == 0:
+            Logger.current_logger().report_scalar(
+                "train", "loss", iteration=(epoch * len(train_loader) + batch_idx), value=loss.item())
+            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
+                epoch, batch_idx * len(data), len(train_loader.dataset),
+                100. * batch_idx / len(train_loader), loss.item()))
+
+
+def test(args, model, device, test_loader, epoch):
+    model.eval()
+    test_loss = 0
+    correct = 0
+    with torch.no_grad():
+        for data, target in test_loader:
+            data, target = data.to(device), target.to(device)
+            output = model(data)
+            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
+            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
+            correct += pred.eq(target.view_as(pred)).sum().item()
+    test_loss /= len(test_loader.dataset)
+    Logger.current_logger().report_scalar(
+        "test", "loss", iteration=epoch, value=test_loss)
+    Logger.current_logger().report_scalar(
+        "test", "accuracy", iteration=epoch, value=(correct / len(test_loader.dataset)))
+    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
+        test_loss, correct, len(test_loader.dataset),
+        100. * correct / len(test_loader.dataset)))
+
+
+def main():
+    # Connecting ClearML with the current process,
+    # from here on everything is logged automatically
+    task = Task.init(project_name='examples', task_name='remote_execution pytorch mnist train')
+    # Training settings
+    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
+    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
+                        help='input batch size for training (default: 64)')
+    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
+                        help='input batch size for testing (default: 1000)')
+    parser.add_argument('--epochs', type=int, default=10, metavar='N',
+                        help='number of epochs to train (default: 10)')
+    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
+                        help='learning rate (default: 0.01)')
+    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
+                        help='SGD momentum (default: 0.5)')
+    parser.add_argument('--no-cuda', action='store_true', default=False,
+                        help='disables CUDA training')
+    parser.add_argument('--seed', type=int, default=1, metavar='S',
+                        help='random seed (default: 1)')
+    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
+                        help='how many batches to wait before logging training status')
+    parser.add_argument('--save-model', action='store_true', default=True,
+                        help='for saving the current model')
+    args = parser.parse_args()
+    use_cuda = not args.no_cuda and torch.cuda.is_available()
+    torch.manual_seed(args.seed)
+    device = torch.device("cuda" if use_cuda else "cpu")
+    kwargs = {'num_workers': 4, 'pin_memory': True} if use_cuda else {}
+    train_loader = torch.utils.data.DataLoader(
+        datasets.MNIST(os.path.join('..', 'data'), train=True, download=True,
+                       transform=transforms.Compose([
+                           transforms.ToTensor(),
+                           transforms.Normalize((0.1307,), (0.3081,))
+                       ])),
+        batch_size=args.batch_size, shuffle=True, **kwargs)
+    test_loader = torch.utils.data.DataLoader(
+        datasets.MNIST(os.path.join('..', 'data'), train=False, transform=transforms.Compose([
+            transforms.ToTensor(),
+            transforms.Normalize((0.1307,), (0.3081,))
+        ])),
+        batch_size=args.test_batch_size, shuffle=True, **kwargs)
+    model = Net().to(device)
+    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
+
+    for epoch in range(1, args.epochs + 1):
+        if epoch > 1:
+            # We run training for 1 epoch to make sure nothing crashes, then local execution will be terminated.
+            # Execution will switch to remote execution by the agent listening to the specified queue
+            task.execute_remotely(queue_name="default")
+        train(args, model, device, train_loader, optimizer, epoch)
+        test(args, model, device, test_loader, epoch)
+    if args.save_model:
+        torch.save(model.state_dict(), os.path.join(gettempdir(), "mnist_cnn.pt"))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/reporting/using_artifacts_example.py b/examples/reporting/using_artifacts_example.py
new file mode 100644
index 00000000..c3353d8c
--- /dev/null
+++ b/examples/reporting/using_artifacts_example.py
@@ -0,0 +1,30 @@
+# Using artifacts example
+"""
+One Task uploads an artifact, and a different Task then accesses and uses the data from that artifact.
+"""
+from time import sleep
+
+from clearml import Task
+
+task1 = Task.init(project_name='examples', task_name='create artifact')
+# upload a data file to the initialized task, specifying a name and the file location
+task1.upload_artifact(name='data file', artifact_object='data_samples/sample.json')
+# close the task so that a new task can be initialized
+task1.close()
+
+# initialize another task to use some other task's artifacts
+task2 = Task.init(project_name='examples', task_name='use artifact from other task')
+# get an instance of the Task that created the artifact (task1), using the Task's project and name (or its ID)
+preprocess_task = Task.get_task(project_name='examples', task_name='create artifact')
+# access the artifact from task1, using the artifact's name
+# get_local_copy() caches the file for later use and returns a path to the cached file
+local_json = preprocess_task.artifacts['data file'].get_local_copy()
+
+# do some work with the file from the other Task in the current Task
+with open(local_json) as data_file:
+    file_text = data_file.read()
+
+print(file_text)
+# simulate the work of a Task
+sleep(1.0)
+print('Finished doing stuff with some data :)')
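
A note on the artifact API shown in using_artifacts_example.py: upload_artifact() also accepts Python objects (for example a dict, numpy array, or pandas DataFrame), not just file paths, and an artifact's get() method returns the deserialized object directly rather than a cached file path. A minimal sketch of that variant, with hypothetical task and artifact names:

    from clearml import Task

    # first Task: upload a plain dict as an artifact
    task_a = Task.init(project_name='examples', task_name='create object artifact')
    task_a.upload_artifact(name='stats', artifact_object={'samples': 1000, 'classes': 10})
    task_a.close()

    # second Task: fetch the dict back without handling files yourself
    task_b = Task.init(project_name='examples', task_name='use object artifact')
    source = Task.get_task(project_name='examples', task_name='create object artifact')
    stats = source.artifacts['stats'].get()  # returns the Python object itself
    print(stats['classes'])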
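
On the execute_remotely() side, the control flow that execute_remotely_example.py builds around the call reduces to a small skeleton: do a sanity-check portion of the work locally, then hand the rest to an agent. A minimal sketch, assuming a queue named 'default' exists and using a placeholder training function:

    from clearml import Task


    def train_one_epoch(epoch):
        print('training epoch', epoch)  # placeholder for the real training step


    task = Task.init(project_name='examples', task_name='execute_remotely skeleton')

    for epoch in range(1, 11):
        if epoch > 1:
            # A local run only reaches this call after epoch 1 succeeded; the call
            # closes the local task, enqueues it in 'default', and exits the process.
            # Under a clearml-agent the call is ignored and the loop continues.
            task.execute_remotely(queue_name='default')
        train_one_epoch(epoch)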