From 8658198f8bac976e5c2766190d8c593c2308bfa1 Mon Sep 17 00:00:00 2001
From: pollfly <75068813+pollfly@users.noreply.github.com>
Date: Sun, 18 Apr 2021 13:40:52 +0300
Subject: [PATCH] Add using_artifacts_example (#334)

* add example for Task, multiple Tasks created in same code

* fixes to clearml-task.md

* add example of remote_execution

* add using_artifacts_example

* Rename remote_execution_example.py to execute_remotely_example.py

* change name to execute_remotely_example.py

* add header to using_artifacts_example.py

* add newline to using_artifacts_example.py
---
 examples/advanced/execute_remotely_example.py | 135 ++++++++++++++++++
 examples/reporting/using_artifacts_example.py |  30 ++++
 2 files changed, 165 insertions(+)
 create mode 100644 examples/advanced/execute_remotely_example.py
 create mode 100644 examples/reporting/using_artifacts_example.py

diff --git a/examples/advanced/execute_remotely_example.py b/examples/advanced/execute_remotely_example.py
new file mode 100644
index 00000000..01e0904f
--- /dev/null
+++ b/examples/advanced/execute_remotely_example.py
@@ -0,0 +1,135 @@
+# ClearML - Example of execute_remotely() with PyTorch MNIST training
+"""The task.execute_remotely() option is used when you need to run part of the code locally and then move the
+full execution to a remote machine. When running locally, task.execute_remotely() will close the currently running
+task and enqueue it in the chosen queue. When running under an agent, the call is ignored and the code proceeds to
+execute. This feature is especially helpful if you want to run the first epoch locally on your machine to debug and
+make sure the code doesn't crash, and then move to a stronger machine for the entire training.
+"""
+
+from __future__ import print_function
+import argparse
+import os
+from tempfile import gettempdir
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from torchvision import datasets, transforms
+from clearml import Task, Logger
+
+
+class Net(nn.Module):
+    def __init__(self):
+        super(Net, self).__init__()
+        self.conv1 = nn.Conv2d(1, 20, 5, 1)
+        self.conv2 = nn.Conv2d(20, 50, 5, 1)
+        self.fc1 = nn.Linear(4 * 4 * 50, 500)
+        self.fc2 = nn.Linear(500, 10)
+
+    def forward(self, x):
+        x = F.relu(self.conv1(x))
+        x = F.max_pool2d(x, 2, 2)
+        x = F.relu(self.conv2(x))
+        x = F.max_pool2d(x, 2, 2)
+        x = x.view(-1, 4 * 4 * 50)
+        x = F.relu(self.fc1(x))
+        x = self.fc2(x)
+        return F.log_softmax(x, dim=1)
+
+
+def train(args, model, device, train_loader, optimizer, epoch):
+    model.train()
+    for batch_idx, (data, target) in enumerate(train_loader):
+        data, target = data.to(device), target.to(device)
+        optimizer.zero_grad()
+        output = model(data)
+        loss = F.nll_loss(output, target)
+        loss.backward()
+        optimizer.step()
+        if batch_idx % args.log_interval == 0:
+            Logger.current_logger().report_scalar(
+                "train", "loss", iteration=(epoch * len(train_loader) + batch_idx), value=loss.item())
+            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
+                epoch, batch_idx * len(data), len(train_loader.dataset),
+                100. * batch_idx / len(train_loader), loss.item()))
+
+
+def test(args, model, device, test_loader, epoch):
+    model.eval()
+    test_loss = 0
+    correct = 0
+    with torch.no_grad():
+        for data, target in test_loader:
+            data, target = data.to(device), target.to(device)
+            output = model(data)
+            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
+            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
+            correct += pred.eq(target.view_as(pred)).sum().item()
+    test_loss /= len(test_loader.dataset)
+    Logger.current_logger().report_scalar(
+        "test", "loss", iteration=epoch, value=test_loss)
+    Logger.current_logger().report_scalar(
+        "test", "accuracy", iteration=epoch, value=(correct / len(test_loader.dataset)))
+    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
+        test_loss, correct, len(test_loader.dataset),
+        100. * correct / len(test_loader.dataset)))
+
+
+def main():
+    # Connecting ClearML with the current process,
+    # from here on everything is logged automatically
+    task = Task.init(project_name='examples', task_name='remote_execution pytorch mnist train')
+    # Training settings
+    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
+    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
+                        help='input batch size for training (default: 64)')
+    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
+                        help='input batch size for testing (default: 1000)')
+    parser.add_argument('--epochs', type=int, default=10, metavar='N',
+                        help='number of epochs to train (default: 10)')
+    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
+                        help='learning rate (default: 0.01)')
+    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
+                        help='SGD momentum (default: 0.5)')
+    parser.add_argument('--no-cuda', action='store_true', default=False,
+                        help='disables CUDA training')
+    parser.add_argument('--seed', type=int, default=1, metavar='S',
+                        help='random seed (default: 1)')
+    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
+                        help='how many batches to wait before logging training status')
+    parser.add_argument('--save-model', action='store_true', default=True,
+                        help='for saving the current model')
+    args = parser.parse_args()
+    use_cuda = not args.no_cuda and torch.cuda.is_available()
+    torch.manual_seed(args.seed)
+    device = torch.device("cuda" if use_cuda else "cpu")
+    kwargs = {'num_workers': 4, 'pin_memory': True} if use_cuda else {}
+    train_loader = torch.utils.data.DataLoader(
+        datasets.MNIST(os.path.join('..', 'data'), train=True, download=True,
+                       transform=transforms.Compose([
+                           transforms.ToTensor(),
+                           transforms.Normalize((0.1307,), (0.3081,))
+                       ])),
+        batch_size=args.batch_size, shuffle=True, **kwargs)
+    test_loader = torch.utils.data.DataLoader(
+        datasets.MNIST(os.path.join('..', 'data'), train=False, transform=transforms.Compose([
+            transforms.ToTensor(),
+            transforms.Normalize((0.1307,), (0.3081,))
+        ])),
+        batch_size=args.test_batch_size, shuffle=True, **kwargs)
+    model = Net().to(device)
+    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
+
+    for epoch in range(1, args.epochs + 1):
+        if epoch > 1:
+            # We run training for 1 epoch to make sure nothing crashes, then local execution will be terminated.
+            # Execution will switch to remote execution by the agent listening to the specified queue
+            task.execute_remotely(queue_name="default")
+        train(args, model, device, train_loader, optimizer, epoch)
+        test(args, model, device, test_loader, epoch)
+    if args.save_model:
+        torch.save(model.state_dict(), os.path.join(gettempdir(), "mnist_cnn.pt"))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/reporting/using_artifacts_example.py b/examples/reporting/using_artifacts_example.py
new file mode 100644
index 00000000..c3353d8c
--- /dev/null
+++ b/examples/reporting/using_artifacts_example.py
@@ -0,0 +1,30 @@
+# Using artifacts example
+"""
+One Task uploads an artifact, and a different Task then accesses and uses the data from that artifact.
+"""
+from time import sleep
+
+from clearml import Task
+
+task1 = Task.init(project_name='examples', task_name='create artifact')
+# upload a data file to the initialized task, specifying a name and the file location
+task1.upload_artifact(name='data file', artifact_object='data_samples/sample.json')
+# close the task so that a new task can be initialized
+task1.close()
+
+# initialize another task to use some other task's artifacts
+task2 = Task.init(project_name='examples', task_name='use artifact from other task')
+# get an instance of the Task that created the artifact (task1), using the Task's project and name (or its ID)
+preprocess_task = Task.get_task(project_name='examples', task_name='create artifact')
+# access the artifact from task1, using the artifact's name
+# get_local_copy() caches the file for later use and returns a path to the cached file
+local_json = preprocess_task.artifacts['data file'].get_local_copy()
+
+# do some work with the file from the other Task in the current Task
+with open(local_json) as data_file:
+    file_text = data_file.read()
+
+print(file_text)
+# simulate the work of a Task
+sleep(1.0)
+print('Finished doing stuff with some data :)')
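
A note on the artifact API shown in using_artifacts_example.py: upload_artifact() also accepts Python objects (for example a dict, numpy array, or pandas DataFrame), not just file paths, and an artifact's get() method returns the deserialized object directly rather than a cached file path. A minimal sketch of that variant, with hypothetical task and artifact names:

    from clearml import Task

    # first Task: upload a plain dict as an artifact
    task_a = Task.init(project_name='examples', task_name='create object artifact')
    task_a.upload_artifact(name='stats', artifact_object={'samples': 1000, 'classes': 10})
    task_a.close()

    # second Task: fetch the dict back without handling files yourself
    task_b = Task.init(project_name='examples', task_name='use object artifact')
    source = Task.get_task(project_name='examples', task_name='create object artifact')
    stats = source.artifacts['stats'].get()  # returns the Python object itself
    print(stats['classes'])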
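
On the execute_remotely() side, the control flow that execute_remotely_example.py builds around the call reduces to a small skeleton: do a sanity-check portion of the work locally, then hand the rest to an agent. A minimal sketch, assuming a queue named 'default' exists and using a placeholder training function:

    from clearml import Task


    def train_one_epoch(epoch):
        print('training epoch', epoch)  # placeholder for the real training step


    task = Task.init(project_name='examples', task_name='execute_remotely skeleton')

    for epoch in range(1, 11):
        if epoch > 1:
            # A local run only reaches this call after epoch 1 succeeded; the call
            # closes the local task, enqueues it in 'default', and exits the process.
            # Under a clearml-agent the call is ignored and the loop continues.
            task.execute_remotely(queue_name='default')
        train_one_epoch(epoch)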