@ -0,0 +1,135 @@
# ClearML - Example of execute_remotely with PyTorch MNIST training
"""
The task.execute_remotely() option is used when part of the code needs to run locally before the task is moved to a
remote machine for full execution. When running locally, task.execute_remotely() will complete the currently running
task and enqueue it to a chosen queue. When running in an agent, the call to task.execute_remotely() is ignored and
the code simply proceeds. This feature is especially helpful if you want to run the first epoch locally on your
machine to debug and make sure the code doesn't crash, and then move to a stronger machine for the entire training.
"""
from __future__ import print_function
import argparse
import os
from tempfile import gettempdir
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from clearml import Task, Logger
class Net(nn.Module):
    """Small convolutional classifier producing log-probabilities over 10 classes.

    Two 5x5 convolutions, each followed by ReLU and 2x2 max-pooling, feed two
    fully connected layers. ``fc1`` expects ``4 * 4 * 50`` features per sample,
    which is what a single-channel 28x28 input (e.g. an MNIST digit) yields.
    """

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 20, 5, 1)
        self.conv2 = nn.Conv2d(20, 50, 5, 1)
        # 28x28 -> conv1 24x24 -> pool 12x12 -> conv2 8x8 -> pool 4x4 (50 channels)
        self.fc1 = nn.Linear(4 * 4 * 50, 500)
        self.fc2 = nn.Linear(500, 10)

    def forward(self, x):
        """Return per-class log-probabilities of shape ``(batch, 10)``.

        Args:
            x: Float tensor of shape ``(batch, 1, H, W)``.

        Raises:
            RuntimeError: If the spatial size does not reduce to 4x4 after the
                convolution/pooling stack.
        """
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, 2, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2, 2)
        # Flatten per sample, keeping the batch dimension fixed. The previous
        # ``view(-1, 4 * 4 * 50)`` could silently fold a wrong-sized input into
        # a different batch size instead of failing.
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)
def train(args, model, device, train_loader, optimizer, epoch):
    """Run one training epoch and report the training loss.

    Args:
        args: Parsed command-line namespace; only ``log_interval`` is read.
        model: Network to optimize.
        device: ``torch.device`` each batch is moved to before the forward pass.
        train_loader: Iterable of ``(data, target)`` batches that supports
            ``len()`` and exposes ``.dataset``.
        optimizer: Optimizer holding ``model``'s parameters.
        epoch: Current epoch number, used for the reported iteration and the
            console message.
    """
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            # Explicit scalar report; the iteration is a global step counter
            # derived from the epoch and the batch index.
            Logger.current_logger().report_scalar(
                "train", "loss", iteration=(epoch * len(train_loader) + batch_idx), value=loss.item())
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
def test(args, model, device, test_loader, epoch):
    """Evaluate the model on the test set and report loss and accuracy.

    Args:
        args: Parsed command-line namespace (not read here; kept for a
            signature parallel to ``train``).
        model: Network to evaluate.
        device: ``torch.device`` each batch is moved to before the forward pass.
        test_loader: Iterable of ``(data, target)`` batches exposing ``.dataset``.
        epoch: Current epoch number, used as the iteration of the reported scalars.
    """
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    dataset_size = len(test_loader.dataset)
    # Per-batch losses were summed, so divide once to get the per-sample average.
    test_loss /= dataset_size

    logger = Logger.current_logger()
    logger.report_scalar("test", "loss", iteration=epoch, value=test_loss)
    logger.report_scalar("test", "accuracy", iteration=epoch, value=(correct / dataset_size))
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, dataset_size,
        100. * correct / dataset_size))
def main():
    """Train the MNIST CNN: first epoch locally, remaining epochs on an agent.

    Initializes a ClearML task, parses the training settings, builds the MNIST
    data loaders, then runs the train/test loop. Before the second epoch the
    task is handed over to a ClearML queue for remote execution. The model
    weights are saved to the system temp directory at the end.
    """
    # Connecting ClearML with the current process,
    # from here on everything is logged automatically
    task = Task.init(project_name='examples', task_name='remote_execution pytorch mnist train')

    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    # NOTE(review): store_true with default=True means this flag can never be
    # turned off from the command line; kept as-is to preserve the CLI.
    parser.add_argument('--save-model', action='store_true', default=True,
                        help='For Saving the current Model')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    # Apply the parsed seed so runs are reproducible.
    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {'num_workers': 4, 'pin_memory': True} if use_cuda else {}
    mnist_root = os.path.join('..', 'data')
    # Same preprocessing for both splits: tensor conversion, then normalization.
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ])
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(mnist_root, train=True, download=True, transform=preprocess),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(mnist_root, train=False, transform=preprocess),
        batch_size=args.test_batch_size, shuffle=True, **kwargs)

    model = Net().to(device)
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    for epoch in range(1, args.epochs + 1):
        if epoch > 1:
            # We run training for 1 epoch to make sure nothing crashes then local execution will be terminated.
            # Execution will switch to remote execution by the agent listening to specified queue
            # NOTE(review): queue name "default" is assumed - confirm it matches your agent's queue.
            task.execute_remotely(queue_name="default")
        train(args, model, device, train_loader, optimizer, epoch)
        test(args, model, device, test_loader, epoch)

    if args.save_model:
        # Save to a concrete file inside the temp dir; joining with an empty
        # name would point at the directory itself.
        torch.save(model.state_dict(), os.path.join(gettempdir(), "mnist_cnn.pt"))
# Run the example only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

@ -0,0 +1,30 @@
# Using artifacts example
# Upload artifacts from a Task; a different Task can then access and use the data from that artifact.
from clearml import Task
from time import sleep
# --- Task 1: create a task and attach a data file to it as an artifact ---
task1 = Task.init(project_name='examples', task_name='create artifact')
# upload data file to the initialized task, inputting a name and file location
task1.upload_artifact(name='data file', artifact_object='data_samples/sample.json')
# close the task, to be able to initialize a new task
task1.close()

# --- Task 2: consume the artifact produced by task 1 ---
# initialize another task to use some other task's artifacts
task2 = Task.init(project_name='examples', task_name='use artifact from other task')
# get instance of Task that created artifact (task1), using Task's project and name. You could also use its ID number.
preprocess_task = Task.get_task(project_name='examples', task_name='create artifact')
# access artifact from task1, using the artifact's name
# get_local_copy() caches the files for later use and returns a path to the cached file
local_json = preprocess_task.artifacts['data file'].get_local_copy()

# Doing some stuff with file from other Task in current Task
with open(local_json) as data_file:
    file_text = data_file.read()

# Simulate the work of a Task
sleep(1.0)
print('Finished doing stuff with some data :)')