mirror of
https://github.com/clearml/clearml
synced 2025-06-26 18:16:07 +00:00
Refactor examples
This commit is contained in:
43
examples/frameworks/pytorch/manual_model_upload.py
Normal file
43
examples/frameworks/pytorch/manual_model_upload.py
Normal file
@@ -0,0 +1,43 @@
|
||||
# TRAINS - Example of manual model configuration and uploading
#
import os
from tempfile import gettempdir

import torch

from trains import Task


task = Task.init(project_name='examples', task_name='Model configuration and upload')

# create a model (instantiate the module: assigning the bare class
# `torch.nn.Module` would make torch.save pickle a class reference
# instead of a model object)
model = torch.nn.Module()

# Connect a local configuration file
config_file = os.path.join('..', '..', 'reporting', 'data_samples', 'sample.json')
config_file = task.connect_configuration(config_file)
# then read configuration as usual, the backend will contain a copy of it.
# later when executing remotely, the returned `config_file` will be a temporary file
# containing a new copy of the configuration retrieved from the backend
# # model_config_dict = json.load(open(config_file, 'rt'))

# Or store a dictionary of definitions for a specific network design
model_config_dict = {
    'value': 13.37,
    'dict': {'sub_value': 'string', 'sub_integer': 11},
    'list_of_ints': [1, 2, 3, 4],
}
model_config_dict = task.connect_configuration(model_config_dict)

# We now update the dictionary after connecting it, and the changes will be tracked as well.
model_config_dict['new value'] = 10
model_config_dict['value'] *= model_config_dict['new value']

# store the label enumeration of the training model
labels = {'background': 0, 'cat': 1, 'dog': 2}
task.connect_label_enumeration(labels)

# storing the model, it will have the task network configuration and label enumeration
print('Any model stored from this point onwards, will contain both model_config and label_enumeration')

torch.save(model, os.path.join(gettempdir(), "model"))
print('Model saved')
|
||||
@@ -0,0 +1,380 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "e-YsQrBjzNdX"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"! pip install -U pip\n",
|
||||
"! pip install -U torch==1.5.0\n",
|
||||
"! pip install -U torchaudio==0.5.0\n",
|
||||
"! pip install -U torchvision==0.6.0\n",
|
||||
"! pip install -U matplotlib==3.2.1\n",
|
||||
"! pip install -U trains==0.15.0\n",
|
||||
"! pip install -U pandas==1.0.4\n",
|
||||
"! pip install -U numpy==1.18.4\n",
|
||||
"! pip install -U tensorboard==2.2.1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "T7T0Rf26zNdm"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import PIL\n",
|
||||
"import io\n",
|
||||
"\n",
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"from pathlib2 import Path\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"\n",
|
||||
"import torch\n",
|
||||
"import torch.nn as nn\n",
|
||||
"import torch.nn.functional as F\n",
|
||||
"import torch.optim as optim\n",
|
||||
"from torch.utils.data import Dataset\n",
|
||||
"from torch.utils.tensorboard import SummaryWriter\n",
|
||||
"\n",
|
||||
"import torchaudio\n",
|
||||
"from torchvision.transforms import ToTensor\n",
|
||||
"\n",
|
||||
"from trains import Task\n",
|
||||
"\n",
|
||||
"%matplotlib inline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"task = Task.init(project_name='Audio Example', task_name='audio classifier')\n",
|
||||
"configuration_dict = {'number_of_epochs': 10, 'batch_size': 4, 'dropout': 0.25, 'base_lr': 0.001}\n",
|
||||
"configuration_dict = task.connect(configuration_dict) # enabling configuration override by trains\n",
|
||||
"print(configuration_dict) # printing actual configuration (after override in remote mode)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "msiz7QdvzNeA",
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Download UrbanSound8K dataset (https://urbansounddataset.weebly.com/urbansound8k.html)\n",
|
||||
"path_to_UrbanSound8K = './data/UrbanSound8K'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "wXtmZe7yzNeS"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class UrbanSoundDataset(Dataset):\n",
|
||||
"# wrapper for the UrbanSound8K dataset\n",
|
||||
" def __init__(self, csv_path, file_path, folderList):\n",
|
||||
" self.file_path = file_path\n",
|
||||
" self.file_names = []\n",
|
||||
" self.labels = []\n",
|
||||
" self.folders = []\n",
|
||||
" \n",
|
||||
" #loop through the csv entries and only add entries from folders in the folder list\n",
|
||||
" csvData = pd.read_csv(csv_path)\n",
|
||||
" for i in range(0,len(csvData)):\n",
|
||||
" if csvData.iloc[i, 5] in folderList:\n",
|
||||
" self.file_names.append(csvData.iloc[i, 0])\n",
|
||||
" self.labels.append(csvData.iloc[i, 6])\n",
|
||||
" self.folders.append(csvData.iloc[i, 5])\n",
|
||||
" \n",
|
||||
" def __getitem__(self, index):\n",
|
||||
" #format the file path and load the file\n",
|
||||
" path = self.file_path / (\"fold\" + str(self.folders[index])) / self.file_names[index]\n",
|
||||
" sound, sample_rate = torchaudio.load(path, out = None, normalization = True)\n",
|
||||
"\n",
|
||||
" # UrbanSound8K uses two channels, this will convert them to one\n",
|
||||
" soundData = torch.mean(sound, dim=0, keepdim=True)\n",
|
||||
" \n",
|
||||
" #Make sure all files are the same size\n",
|
||||
" if soundData.numel() < 160000:\n",
|
||||
" fixedsize_data = torch.nn.functional.pad(soundData, (0, 160000 - soundData.numel()))\n",
|
||||
" else:\n",
|
||||
" fixedsize_data = soundData[0,:160000].reshape(1,160000)\n",
|
||||
" \n",
|
||||
" #downsample the audio\n",
|
||||
" downsample_data = fixedsize_data[::5]\n",
|
||||
" \n",
|
||||
" melspectogram_transform = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate)\n",
|
||||
" melspectogram = melspectogram_transform(downsample_data)\n",
|
||||
" melspectogram_db = torchaudio.transforms.AmplitudeToDB()(melspectogram)\n",
|
||||
"\n",
|
||||
" return fixedsize_data, sample_rate, melspectogram_db, self.labels[index]\n",
|
||||
" \n",
|
||||
" def __len__(self):\n",
|
||||
" return len(self.file_names)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"csv_path = Path(path_to_UrbanSound8K) / 'metadata' / 'UrbanSound8K.csv'\n",
|
||||
"file_path = Path(path_to_UrbanSound8K) / 'audio'\n",
|
||||
"\n",
|
||||
"train_set = UrbanSoundDataset(csv_path, file_path, range(1,10))\n",
|
||||
"test_set = UrbanSoundDataset(csv_path, file_path, [10])\n",
|
||||
"print(\"Train set size: \" + str(len(train_set)))\n",
|
||||
"print(\"Test set size: \" + str(len(test_set)))\n",
|
||||
"\n",
|
||||
"train_loader = torch.utils.data.DataLoader(train_set, batch_size = configuration_dict.get('batch_size', 4), \n",
|
||||
" shuffle = True, pin_memory=True, num_workers=1)\n",
|
||||
"test_loader = torch.utils.data.DataLoader(test_set, batch_size = configuration_dict.get('batch_size', 4), \n",
|
||||
" shuffle = False, pin_memory=True, num_workers=1)\n",
|
||||
"\n",
|
||||
"classes = ('air_conditioner', 'car_horn', 'children_playing', 'dog_bark', 'drilling', 'engine_idling', \n",
|
||||
" 'gun_shot', 'jackhammer', 'siren', 'street_music')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "ylblw-k1zNeZ"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class Net(nn.Module):\n",
|
||||
" def __init__(self, num_classes, dropout_value):\n",
|
||||
" super(Net,self).__init__()\n",
|
||||
" self.num_classes = num_classes\n",
|
||||
" self.dropout_value = dropout_value\n",
|
||||
" \n",
|
||||
" self.C1 = nn.Conv2d(1,16,3)\n",
|
||||
" self.C2 = nn.Conv2d(16,32,3)\n",
|
||||
" self.C3 = nn.Conv2d(32,64,3)\n",
|
||||
" self.C4 = nn.Conv2d(64,128,3)\n",
|
||||
" self.maxpool1 = nn.MaxPool2d(2,2) \n",
|
||||
" self.fc1 = nn.Linear(128*29*197,128)\n",
|
||||
" self.fc2 = nn.Linear(128,self.num_classes)\n",
|
||||
" self.dropout = nn.Dropout(self.dropout_value)\n",
|
||||
" \n",
|
||||
" def forward(self,x):\n",
|
||||
" # add sequence of convolutional and max pooling layers\n",
|
||||
" x = F.relu(self.C1(x))\n",
|
||||
" x = self.maxpool1(F.relu(self.C2(x)))\n",
|
||||
" x = F.relu(self.C3(x))\n",
|
||||
" x = self.maxpool1(F.relu(self.C4(x)))\n",
|
||||
" # flatten image input\n",
|
||||
" x = x.view(-1,128*29*197)\n",
|
||||
" x = F.relu(self.fc1(self.dropout(x)))\n",
|
||||
" x = self.fc2(self.dropout(x))\n",
|
||||
" return x\n",
|
||||
" \n",
|
||||
" \n",
|
||||
"model = Net(len(classes), configuration_dict.get('dropout', 0.25))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "3yKYru14zNef"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"optimizer = optim.SGD(model.parameters(), lr = configuration_dict.get('base_lr', 0.001), momentum = 0.9)\n",
|
||||
"scheduler = optim.lr_scheduler.StepLR(optimizer, step_size = 3, gamma = 0.1)\n",
|
||||
"criterion = nn.CrossEntropyLoss()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"device = torch.cuda.current_device() if torch.cuda.is_available() else torch.device('cpu')\n",
|
||||
"print('Device to use: {}'.format(device))\n",
|
||||
"model.to(device)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tensorboard_writer = SummaryWriter('./tensorboard_logs')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def plot_signal(signal, title, cmap=None):\n",
|
||||
" fig = plt.figure()\n",
|
||||
" if signal.ndim == 1:\n",
|
||||
" plt.plot(signal)\n",
|
||||
" else:\n",
|
||||
" plt.imshow(signal, cmap=cmap) \n",
|
||||
" plt.title(title)\n",
|
||||
" \n",
|
||||
" plot_buf = io.BytesIO()\n",
|
||||
" plt.savefig(plot_buf, format='jpeg')\n",
|
||||
" plot_buf.seek(0)\n",
|
||||
" plt.close(fig)\n",
|
||||
" return ToTensor()(PIL.Image.open(plot_buf))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "Vdthqz3JzNem"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def train(model, epoch):\n",
|
||||
" model.train()\n",
|
||||
" for batch_idx, (sounds, sample_rate, inputs, labels) in enumerate(train_loader):\n",
|
||||
" inputs = inputs.to(device)\n",
|
||||
" labels = labels.to(device)\n",
|
||||
"\n",
|
||||
" # zero the parameter gradients\n",
|
||||
" optimizer.zero_grad()\n",
|
||||
"\n",
|
||||
" # forward + backward + optimize\n",
|
||||
" outputs = model(inputs)\n",
|
||||
" _, predicted = torch.max(outputs, 1)\n",
|
||||
" loss = criterion(outputs, labels)\n",
|
||||
" loss.backward()\n",
|
||||
" optimizer.step()\n",
|
||||
" \n",
|
||||
" iteration = epoch * len(train_loader) + batch_idx\n",
|
||||
" if batch_idx % log_interval == 0: #print training stats\n",
|
||||
" print('Train Epoch: {} [{}/{} ({:.0f}%)]\\tLoss: {:.6f}'\n",
|
||||
" .format(epoch, batch_idx * len(inputs), len(train_loader.dataset), \n",
|
||||
" 100. * batch_idx / len(train_loader), loss))\n",
|
||||
" tensorboard_writer.add_scalar('training loss/loss', loss, iteration)\n",
|
||||
" tensorboard_writer.add_scalar('learning rate/lr', optimizer.param_groups[0]['lr'], iteration)\n",
|
||||
" \n",
|
||||
" \n",
|
||||
" if batch_idx % debug_interval == 0: # report debug image every 500 mini-batches\n",
|
||||
" for n, (inp, pred, label) in enumerate(zip(inputs, predicted, labels)):\n",
|
||||
" series = 'label_{}_pred_{}'.format(classes[label.cpu()], classes[pred.cpu()])\n",
|
||||
" tensorboard_writer.add_image('Train MelSpectrogram samples/{}'.format(n), \n",
|
||||
" plot_signal(inp.cpu().numpy().squeeze(), series, 'hot'), iteration)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "LBWoj7u5zNes"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def test(model, epoch):\n",
|
||||
" model.eval()\n",
|
||||
" class_correct = list(0. for i in range(10))\n",
|
||||
" class_total = list(0. for i in range(10))\n",
|
||||
" with torch.no_grad():\n",
|
||||
" for idx, (sounds, sample_rate, inputs, labels) in enumerate(test_loader):\n",
|
||||
" inputs = inputs.to(device)\n",
|
||||
" labels = labels.to(device)\n",
|
||||
"\n",
|
||||
" outputs = model(inputs)\n",
|
||||
"\n",
|
||||
" _, predicted = torch.max(outputs, 1)\n",
|
||||
" c = (predicted == labels)\n",
|
||||
" for i in range(len(inputs)):\n",
|
||||
" label = labels[i].item()\n",
|
||||
" class_correct[label] += c[i].item()\n",
|
||||
" class_total[label] += 1\n",
|
||||
" \n",
|
||||
" iteration = (epoch + 1) * len(train_loader)\n",
|
||||
" if idx % debug_interval == 0: # report debug image every 100 mini-batches\n",
|
||||
" for n, (sound, inp, pred, label) in enumerate(zip(sounds, inputs, predicted, labels)):\n",
|
||||
" series = 'label_{}_pred_{}'.format(classes[label.cpu()], classes[pred.cpu()])\n",
|
||||
" tensorboard_writer.add_audio('Test audio samples/{}'.format(n), \n",
|
||||
" sound, iteration, int(sample_rate[n]))\n",
|
||||
" tensorboard_writer.add_image('Test MelSpectrogram samples/{}_{}'.format(idx, n), \n",
|
||||
" plot_signal(inp.cpu().numpy().squeeze(), series, 'hot'), iteration)\n",
|
||||
"\n",
|
||||
" total_accuracy = 100 * sum(class_correct)/sum(class_total)\n",
|
||||
" print('[Iteration {}] Accuracy on the {} test images: {}%\\n'.format(epoch, sum(class_total), total_accuracy))\n",
|
||||
" tensorboard_writer.add_scalar('accuracy/total', total_accuracy, iteration)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"colab": {},
|
||||
"colab_type": "code",
|
||||
"id": "X5lx3g_5zNey",
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"log_interval = 100\n",
|
||||
"debug_interval = 200\n",
|
||||
"for epoch in range(configuration_dict.get('number_of_epochs', 10)):\n",
|
||||
" train(model, epoch)\n",
|
||||
" test(model, epoch)\n",
|
||||
" scheduler.step()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"name": "audio_classifier_tutorial.ipynb",
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 1
|
||||
}
|
||||
@@ -0,0 +1,128 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"! pip install -U pip\n",
|
||||
"! pip install -U torch==1.5.0\n",
|
||||
"! pip install -U torchaudio==0.5.0\n",
|
||||
"! pip install -U matplotlib==3.2.1\n",
|
||||
"! pip install -U trains==0.15.0\n",
|
||||
"! pip install -U tensorboard==2.2.1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import torch\n",
|
||||
"import torchaudio\n",
|
||||
"from torch.utils.tensorboard import SummaryWriter\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"\n",
|
||||
"from trains import Task\n",
|
||||
"\n",
|
||||
"%matplotlib inline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"task = Task.init(project_name='Audio Example', task_name='data pre-processing')\n",
|
||||
"configuration_dict = {'number_of_smaples': 3}\n",
|
||||
"configuration_dict = task.connect(configuration_dict) # enabling configuration override by trains\n",
|
||||
"print(configuration_dict) # printing actual configuration (after override in remote mode)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tensorboard_writer = SummaryWriter('./tensorboard_logs')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if not os.path.isdir('./data'):\n",
|
||||
" os.mkdir('./data')\n",
|
||||
"yesno_data = torchaudio.datasets.YESNO('./data', download=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def plot_signal(signal, title, cmap=None):\n",
|
||||
" plt.figure()\n",
|
||||
" if signal.ndim == 1:\n",
|
||||
" plt.plot(signal)\n",
|
||||
" else:\n",
|
||||
" plt.imshow(signal, cmap=cmap) \n",
|
||||
" plt.title(title)\n",
|
||||
" plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true,
|
||||
"pycharm": {
|
||||
"name": "#%%\n"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for n in range(configuration_dict.get('number_of_smaples', 3)):\n",
|
||||
" waveform, sample_rate, labels = yesno_data[n]\n",
|
||||
" melspectogram_transform = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate)\n",
|
||||
" plot_signal(waveform[0,:], 'Original waveform')\n",
|
||||
" tensorboard_writer.add_audio('Audio samples/{}'.format(n), waveform, n, sample_rate)\n",
|
||||
" plot_signal(melspectogram_transform(waveform.squeeze()).numpy(), 'Mel spectogram', 'hot')\n",
|
||||
" plot_signal(torchaudio.transforms.AmplitudeToDB()(melspectogram_transform(waveform.squeeze())).numpy(), 'Mel spectogram DB', 'hot')"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 1
|
||||
}
|
||||
@@ -0,0 +1,136 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# execute this in command line on all machines to be used as workers before initiating the hyperparameter search \n",
|
||||
"# ! pip install -U trains-agent==0.15.0\n",
|
||||
"# ! trains-agent daemon --queue default\n",
|
||||
"\n",
|
||||
"# pip install with locked versions\n",
|
||||
"! pip install -U pandas==1.0.3\n",
|
||||
"! pip install -U trains==0.15.0\n",
|
||||
"! pip install -U hpbandster==0.7.4 # Needed only for Bayesian optimization Hyper-Band"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from trains.automation import UniformParameterRange, UniformIntegerParameterRange\n",
|
||||
"from trains.automation import RandomSearch, HyperParameterOptimizer\n",
|
||||
"from trains.automation.hpbandster import OptimizerBOHB # Needed only for Bayesian optimization Hyper-Band\n",
|
||||
"\n",
|
||||
"from trains import Task"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"task = Task.init(project_name='Hyper-Parameter Search', task_name='Hyper-Parameter Optimization')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#####################################################################\n",
|
||||
"### Don't forget to replace this default id with your own task id ###\n",
|
||||
"#####################################################################\n",
|
||||
"TEMPLATE_TASK_ID = 'd8e928460f98437c998f3597768597f8'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"optimizer = HyperParameterOptimizer(\n",
|
||||
" base_task_id=TEMPLATE_TASK_ID, # This is the experiment we want to optimize\n",
|
||||
" # here we define the hyper-parameters to optimize\n",
|
||||
" hyper_parameters=[\n",
|
||||
" UniformIntegerParameterRange('number_of_epochs', min_value=5, max_value=15, step_size=1),\n",
|
||||
" UniformIntegerParameterRange('batch_size', min_value=2, max_value=12, step_size=2),\n",
|
||||
" UniformParameterRange('dropout', min_value=0, max_value=0.5, step_size=0.05),\n",
|
||||
" UniformParameterRange('base_lr', min_value=0.0005, max_value=0.01, step_size=0.0005),\n",
|
||||
" ],\n",
|
||||
" # this is the objective metric we want to maximize/minimize\n",
|
||||
" objective_metric_title='accuracy',\n",
|
||||
" objective_metric_series='total',\n",
|
||||
" objective_metric_sign='max', # maximize or minimize the objective metric\n",
|
||||
" max_number_of_concurrent_tasks=3, # number of concurrent experiments\n",
|
||||
" # setting optimizer - trains supports GridSearch, RandomSearch or OptimizerBOHB\n",
|
||||
" optimizer_class=OptimizerBOHB, # can be replaced with OptimizerBOHB\n",
|
||||
" execution_queue='default', # queue to schedule the experiments for execution\n",
|
||||
" optimization_time_limit=30., # time limit for each experiment (optional, ignored by OptimizerBOHB)\n",
|
||||
" pool_period_min=1, # Check the experiments every x minutes\n",
|
||||
" # set the maximum number of experiments for the optimization.\n",
|
||||
" # OptimizerBOHB sets the total number of iteration as total_max_jobs * max_iteration_per_job\n",
|
||||
" total_max_jobs=12,\n",
|
||||
" # setting OptimizerBOHB configuration (ignored by other optimizers)\n",
|
||||
" min_iteration_per_job=15000, # minimum number of iterations per experiment, till early stopping\n",
|
||||
" max_iteration_per_job=150000, # maximum number of iterations per experiment\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"optimizer.set_time_limit(in_minutes=120.0) # set the time limit for the optimization process\n",
|
||||
"optimizer.start() \n",
|
||||
"optimizer.wait() # wait until process is done\n",
|
||||
"optimizer.stop() # make sure background optimization stopped"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# optimization is completed, print the top performing experiments id\n",
|
||||
"k = 3\n",
|
||||
"top_exp = optimizer.get_top_experiments(top_k=k)\n",
|
||||
"print('Top {} experiments are:'.format(k))\n",
|
||||
"for n, t in enumerate(top_exp, 1):\n",
|
||||
" print('Rank {}: task id={} |result={}'\n",
|
||||
" .format(n, t.id, t.get_last_scalar_metrics()['accuracy']['total']['last']))"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
@@ -0,0 +1,243 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# execute this in command line before initiating the notebook: \n",
|
||||
"# pip install -U pip\n",
|
||||
"# pip install -U ipywidgets==7.5.1\n",
|
||||
"# jupyter nbextension enable --py widgetsnbextension\n",
|
||||
"\n",
|
||||
"# pip install with locked versions\n",
|
||||
"! pip install -U torch==1.5.0\n",
|
||||
"! pip install -U torchvision==0.6.0\n",
|
||||
"! pip install -U numpy==1.18.4\n",
|
||||
"! pip install -U trains==0.15.0\n",
|
||||
"! pip install -U tensorboard==2.2.1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"import torch.nn as nn\n",
|
||||
"import torch.nn.functional as F\n",
|
||||
"import torch.optim as optim\n",
|
||||
"from torch.utils.tensorboard import SummaryWriter\n",
|
||||
"\n",
|
||||
"import torchvision.datasets as datasets\n",
|
||||
"import torchvision.transforms as transforms\n",
|
||||
"\n",
|
||||
"from trains import Task"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"task = Task.init(project_name='Hyper-Parameter Search', task_name='image_classification_CIFAR10')\n",
|
||||
"configuration_dict = {'number_of_epochs': 3, 'batch_size': 4, 'dropout': 0.25, 'base_lr': 0.001}\n",
|
||||
"configuration_dict = task.connect(configuration_dict) # enabling configuration override by trains\n",
|
||||
"print(configuration_dict) # printing actual configuration (after override in remote mode)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"transform = transforms.Compose([transforms.ToTensor()])\n",
|
||||
"\n",
|
||||
"trainset = datasets.CIFAR10(root='./data', train=True,\n",
|
||||
" download=True, transform=transform)\n",
|
||||
"trainloader = torch.utils.data.DataLoader(trainset, batch_size=configuration_dict.get('batch_size', 4),\n",
|
||||
" shuffle=True, num_workers=2)\n",
|
||||
"\n",
|
||||
"testset = datasets.CIFAR10(root='./data', train=False,\n",
|
||||
" download=True, transform=transform)\n",
|
||||
"testloader = torch.utils.data.DataLoader(testset, batch_size=configuration_dict.get('batch_size', 4),\n",
|
||||
" shuffle=False, num_workers=2)\n",
|
||||
"\n",
|
||||
"classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')\n",
|
||||
"\n",
|
||||
"device = torch.cuda.current_device() if torch.cuda.is_available() else torch.device('cpu')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class Net(nn.Module):\n",
|
||||
" def __init__(self):\n",
|
||||
" super(Net, self).__init__()\n",
|
||||
" self.conv1 = nn.Conv2d(3, 6, 5)\n",
|
||||
" self.conv2 = nn.Conv2d(3, 6, 5)\n",
|
||||
" self.pool = nn.MaxPool2d(2, 2)\n",
|
||||
" self.conv2 = nn.Conv2d(6, 16, 5)\n",
|
||||
" self.fc1 = nn.Linear(16 * 5 * 5, 120)\n",
|
||||
" self.fc2 = nn.Linear(120, 84)\n",
|
||||
" self.dorpout = nn.Dropout(p=configuration_dict.get('dropout', 0.25))\n",
|
||||
" self.fc3 = nn.Linear(84, 10)\n",
|
||||
"\n",
|
||||
" def forward(self, x):\n",
|
||||
" x = self.pool(F.relu(self.conv1(x)))\n",
|
||||
" x = self.pool(F.relu(self.conv2(x)))\n",
|
||||
" x = x.view(-1, 16 * 5 * 5)\n",
|
||||
" x = F.relu(self.fc1(x))\n",
|
||||
" x = F.relu(self.fc2(x))\n",
|
||||
" x = self.fc3(self.dorpout(x))\n",
|
||||
" return x"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"net = Net().to(device)\n",
|
||||
"criterion = nn.CrossEntropyLoss()\n",
|
||||
"optimizer = optim.SGD(net.parameters(), lr=configuration_dict.get('base_lr', 0.001), momentum=0.9)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tensorboard_writer = SummaryWriter('./tensorboard_logs')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def test_model(test_dataloader, iteration):\n",
|
||||
" class_correct = list(0. for i in range(10))\n",
|
||||
" class_total = list(0. for i in range(10))\n",
|
||||
" with torch.no_grad():\n",
|
||||
" for j, data in enumerate(test_dataloader, 1):\n",
|
||||
" images, labels = data\n",
|
||||
" images = images.to(device)\n",
|
||||
" labels = labels.to(device)\n",
|
||||
" \n",
|
||||
" outputs = net(images)\n",
|
||||
" _, predicted = torch.max(outputs, 1)\n",
|
||||
" c = (predicted == labels).squeeze()\n",
|
||||
" for i in range(len(images)):\n",
|
||||
" label = labels[i].item()\n",
|
||||
" class_correct[label] += c[i].item()\n",
|
||||
" class_total[label] += 1\n",
|
||||
" \n",
|
||||
" if j % 500 == 0: # report debug image every 500 mini-batches\n",
|
||||
" for n, (img, pred, label) in enumerate(zip(images, predicted, labels)):\n",
|
||||
" tensorboard_writer.add_image(\"testing/{}-{}_GT_{}_pred_{}\"\n",
|
||||
" .format(j, n, classes[label], classes[pred]), img, iteration)\n",
|
||||
"\n",
|
||||
" for i in range(len(classes)):\n",
|
||||
" class_accuracy = 100 * class_correct[i] / class_total[i]\n",
|
||||
" print('[Iteration {}] Accuracy of {} : {}%'.format(iteration, classes[i], class_accuracy))\n",
|
||||
" tensorboard_writer.add_scalar('accuracy per class/{}'.format(classes[i]), class_accuracy, iteration)\n",
|
||||
"\n",
|
||||
" total_accuracy = 100 * sum(class_correct)/sum(class_total)\n",
|
||||
" print('[Iteration {}] Accuracy on the {} test images: {}%\\n'.format(iteration, sum(class_total), total_accuracy))\n",
|
||||
" tensorboard_writer.add_scalar('accuracy/total', total_accuracy, iteration)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for epoch in range(configuration_dict.get('number_of_epochs', 3)): # loop over the dataset multiple times\n",
|
||||
"\n",
|
||||
" running_loss = 0.0\n",
|
||||
" for i, data in enumerate(trainloader, 1):\n",
|
||||
" # get the inputs; data is a list of [inputs, labels]\n",
|
||||
" inputs, labels = data\n",
|
||||
" inputs = inputs.to(device)\n",
|
||||
" labels = labels.to(device)\n",
|
||||
"\n",
|
||||
" # zero the parameter gradients\n",
|
||||
" optimizer.zero_grad()\n",
|
||||
"\n",
|
||||
" # forward + backward + optimize\n",
|
||||
" outputs = net(inputs)\n",
|
||||
" loss = criterion(outputs, labels)\n",
|
||||
" loss.backward()\n",
|
||||
" optimizer.step()\n",
|
||||
"\n",
|
||||
" # print statistics\n",
|
||||
" running_loss += loss.item()\n",
|
||||
" \n",
|
||||
" iteration = epoch * len(trainloader) + i\n",
|
||||
" if i % 2000 == 0: # report loss every 2000 mini-batches\n",
|
||||
" print('[Epoch %d, Iteration %5d] loss: %.3f' %(epoch + 1, i + 1, running_loss / 2000))\n",
|
||||
" tensorboard_writer.add_scalar('training loss', running_loss / 2000, iteration)\n",
|
||||
" running_loss = 0.0\n",
|
||||
" \n",
|
||||
" test_model(testloader, iteration)\n",
|
||||
"\n",
|
||||
"print('Finished Training')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"PATH = './cifar_net.pth'\n",
|
||||
"torch.save(net.state_dict(), PATH)\n",
|
||||
"tensorboard_writer.close()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print('Task ID number is: {}'.format(task.id))"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
180
examples/frameworks/pytorch/pytorch_distributed_example.py
Normal file
180
examples/frameworks/pytorch/pytorch_distributed_example.py
Normal file
@@ -0,0 +1,180 @@
|
||||
# TRAINS - example of TRAINS torch distributed support
|
||||
# notice all nodes will be reporting to the master Task (experiment)
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from argparse import ArgumentParser
|
||||
from math import ceil
|
||||
from random import Random
|
||||
|
||||
import torch as th
|
||||
import torch.nn as nn
|
||||
import torch.distributed as dist
|
||||
import torch.nn.functional as F
|
||||
from torch import optim
|
||||
from torchvision import datasets, transforms
|
||||
|
||||
from trains import Task
|
||||
|
||||
|
||||
local_dataset_path = './MNIST_data'
|
||||
|
||||
|
||||
class Net(nn.Module):
    """Small CNN for MNIST classification: two conv layers, dropout, two FC layers."""

    def __init__(self):
        super(Net, self).__init__()
        # feature extractor
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        # regularization
        self.dropout1 = nn.Dropout2d(0.25)
        self.dropout2 = nn.Dropout2d(0.5)
        # classifier head (9216 = 64 channels * 12 * 12 after conv+pool on 28x28 input)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of 1x28x28 images."""
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(self.conv2(x), 2)
        x = self.dropout1(x)
        x = th.flatten(x, 1)
        x = self.dropout2(F.relu(self.fc1(x)))
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)
|
||||
|
||||
|
||||
class Partition(object):
    """View of a dataset restricted to a fixed list of indices."""

    def __init__(self, data, index):
        self.data = data
        self.index = index

    def __len__(self):
        return len(self.index)

    def __getitem__(self, index):
        # translate the local index into an index of the full dataset
        return self.data[self.index[index]]
|
||||
|
||||
|
||||
class DataPartitioner(object):
    """Randomly split a dataset into disjoint Partition views.

    The split is reproducible: every process seeding the same ``seed``
    shuffles the indices identically, so all ranks agree on the ownership
    of every sample.
    """

    def __init__(self, data, sizes=(0.7, 0.2, 0.1), seed=1234):
        self.data = data
        self.partitions = []
        rng = Random()
        rng.seed(seed)
        data_len = len(data)
        indexes = list(range(data_len))
        rng.shuffle(indexes)

        # carve consecutive slices of the shuffled index list, one per fraction
        for frac in sizes:
            part_len = int(frac * data_len)
            self.partitions.append(indexes[:part_len])
            indexes = indexes[part_len:]

    def use(self, partition):
        """Return the Partition view for split number `partition`."""
        return Partition(self.data, self.partitions[partition])
|
||||
|
||||
|
||||
def partition_dataset(num_workers=4):
    """Build this rank's MNIST DataLoader for synchronous data-parallel training.

    The global batch size of 128 is divided evenly across the world, and the
    training set is split into equal random partitions, one per rank.
    Returns ``(train_set, bsz)`` where ``bsz`` is the per-rank batch size.
    """
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ])
    dataset = datasets.MNIST(root=local_dataset_path, train=True, download=True,
                             transform=transform)
    world_size = dist.get_world_size()
    # keep the *global* batch size constant at 128 regardless of world size
    bsz = int(128 / float(world_size))
    partition_sizes = [1.0 / world_size for _ in range(world_size)]
    partitioner = DataPartitioner(dataset, partition_sizes)
    local_part = partitioner.use(dist.get_rank())
    train_set = th.utils.data.DataLoader(
        local_part, num_workers=num_workers, batch_size=bsz, shuffle=True)
    return train_set, bsz
|
||||
|
||||
|
||||
def run(num_workers):
    """ Distributed Synchronous SGD Example.

    Trains the small CNN for two (truncated) epochs on this rank's MNIST
    partition, averaging gradients across all ranks before each step, and
    reports loss scalars into the shared master Task.
    """
    # identical seed on every rank so all model replicas start from the same weights
    th.manual_seed(1234)
    train_set, bsz = partition_dataset(num_workers)
    model = Net()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

    num_batches = ceil(len(train_set.dataset) / float(bsz))

    from random import randint
    # per-rank hyperparameter entry; all ranks connect into the same master task
    param = {'worker_{}_stuff'.format(dist.get_rank()): 'some stuff ' + str(randint(0, 100))}
    Task.current_task().connect(param)
    Task.current_task().upload_artifact(
        'temp {:02d}'.format(dist.get_rank()), artifact_object={'worker_rank': dist.get_rank()})

    for epoch in range(2):
        epoch_loss = 0.0
        for i, (data, target) in enumerate(train_set):
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            epoch_loss += loss.item()
            loss.backward()
            # all-reduce the gradients so every rank takes the same SGD step
            average_gradients(model)
            optimizer.step()
            if i % 10 == 0:
                print('{}] Train Epoch {} - {} \tLoss {:.6f}'.format(dist.get_rank(), epoch, i, loss))
                Task.current_task().get_logger().report_scalar(
                    'loss', 'worker {:02d}'.format(dist.get_rank()), value=loss.item(), iteration=i)
            # keep the example short: stop after ~100 mini-batches per epoch
            if i > 100:
                break
        print('Rank ', dist.get_rank(), ', epoch ',
              epoch, ': ', epoch_loss / num_batches)
|
||||
|
||||
|
||||
def average_gradients(model):
    """Average the gradients of *model* across all ranks, in place.

    Each parameter's gradient is summed over the whole world with
    ``all_reduce`` and divided by the world size, so every rank holds
    identical averaged gradients before the optimizer step.
    """
    size = float(dist.get_world_size())
    for param in model.parameters():
        # `dist.reduce_op` was deprecated and later removed from PyTorch;
        # `dist.ReduceOp` is the supported spelling of the same enum.
        dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
        param.grad.data /= size
|
||||
|
||||
|
||||
if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument('--nodes', help='number of nodes', type=int, default=10)
    parser.add_argument('--workers_in_node', help='number of workers per node', type=int, default=3)
    # this argument we will not be logging, see below Task.init
    parser.add_argument('--rank', help='current rank', type=int)

    args = parser.parse_args()

    # We have to initialize the task in the master process,
    # it will make sure that any sub-process calling Task.init will get the master task object
    # notice that we exclude the `rank` argument, so we can launch multiple sub-processes with trains-agent
    # otherwise, the `rank` will always be set to the original value.
    task = Task.init("examples", "test torch distributed", auto_connect_arg_parser={'rank': False})

    if os.environ.get('MASTER_ADDR'):
        # MASTER_ADDR is set -> this is a spawned worker: join the group and train
        dist.init_process_group(backend='gloo', rank=args.rank, world_size=args.nodes)
        run(args.workers_in_node)
    else:
        # master path: first let's download the dataset; if we have multiple machines,
        # they will take care of it when they get there
        datasets.MNIST(root=local_dataset_path, train=True, download=True)

        # rendezvous address the spawned workers will use to form the process group
        os.environ['MASTER_ADDR'] = '127.0.0.1'
        os.environ['MASTER_PORT'] = '29500'

        print(os.getpid(), 'ARGS:', args)
        processes = []
        # re-launch this same script once per rank; each child takes the worker
        # branch above because MASTER_ADDR is now set in the environment
        for rank in range(args.nodes):
            cmd = [sys.executable, sys.argv[0],
                   '--nodes', str(args.nodes),
                   '--workers_in_node', str(args.workers_in_node),
                   '--rank', str(rank)]
            print(cmd)
            p = subprocess.Popen(cmd, cwd=os.getcwd(), pass_fds=[], close_fds=True)
            processes.append(p)

        # block until every worker process has finished
        for p in processes:
            p.wait()
|
||||
482
examples/frameworks/pytorch/pytorch_matplotlib.py
Normal file
482
examples/frameworks/pytorch/pytorch_matplotlib.py
Normal file
@@ -0,0 +1,482 @@
|
||||
# TRAINS - Example of Pytorch and matplotlib integration and reporting
|
||||
#
|
||||
"""
|
||||
Neural Transfer Using PyTorch
|
||||
=============================
|
||||
**Author**: `Alexis Jacq <https://alexis-jacq.github.io>`_
|
||||
|
||||
**Edited by**: `Winston Herring <https://github.com/winston6>`_
|
||||
Introduction
|
||||
------------
|
||||
This tutorial explains how to implement the `Neural-Style algorithm <https://arxiv.org/abs/1508.06576>`__
|
||||
developed by Leon A. Gatys, Alexander S. Ecker and Matthias Bethge.
|
||||
Neural-Style, or Neural-Transfer, allows you to take an image and
|
||||
reproduce it with a new artistic style. The algorithm takes three images,
|
||||
an input image, a content-image, and a style-image, and changes the input
|
||||
to resemble the content of the content-image and the artistic style of the style-image.
|
||||
|
||||
.. figure:: /_static/img/neural-style/neuralstyle.png
|
||||
:alt: content1
|
||||
"""
|
||||
|
||||
######################################################################
|
||||
# Underlying Principle
|
||||
# --------------------
|
||||
#
|
||||
# The principle is simple: we define two distances, one for the content
|
||||
# (:math:`D_C`) and one for the style (:math:`D_S`). :math:`D_C` measures how different the content
|
||||
# is between two images while :math:`D_S` measures how different the style is
|
||||
# between two images. Then, we take a third image, the input, and
|
||||
# transform it to minimize both its content-distance with the
|
||||
# content-image and its style-distance with the style-image. Now we can
|
||||
# import the necessary packages and begin the neural transfer.
|
||||
#
|
||||
# Importing Packages and Selecting a Device
|
||||
# -----------------------------------------
|
||||
# Below is a list of the packages needed to implement the neural transfer.
|
||||
#
|
||||
# - ``torch``, ``torch.nn``, ``numpy`` (indispensables packages for
|
||||
# neural networks with PyTorch)
|
||||
# - ``torch.optim`` (efficient gradient descents)
|
||||
# - ``PIL``, ``PIL.Image``, ``matplotlib.pyplot`` (load and display
|
||||
# images)
|
||||
# - ``torchvision.transforms`` (transform PIL images into tensors)
|
||||
# - ``torchvision.models`` (train or load pre-trained models)
|
||||
# - ``copy`` (to deep copy the models; system package)
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import os
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torch.optim as optim
|
||||
|
||||
from PIL import Image
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
import torchvision.transforms as transforms
|
||||
import torchvision.models as models
|
||||
|
||||
import copy
|
||||
from trains import Task
|
||||
|
||||
|
||||
task = Task.init(project_name='examples', task_name='pytorch with matplotlib example', task_type=Task.TaskTypes.testing)
|
||||
|
||||
|
||||
######################################################################
|
||||
# Next, we need to choose which device to run the network on and import the
|
||||
# content and style images. Running the neural transfer algorithm on large
|
||||
# images takes longer and will go much faster when running on a GPU. We can
|
||||
# use ``torch.cuda.is_available()`` to detect if there is a GPU available.
|
||||
# Next, we set the ``torch.device`` for use throughout the tutorial. Also the ``.to(device)``
|
||||
# method is used to move tensors or modules to a desired device.
|
||||
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
######################################################################
|
||||
# Loading the Images
|
||||
# ------------------
|
||||
#
|
||||
# Now we will import the style and content images. The original PIL images have values between 0 and 255, but when
|
||||
# transformed into torch tensors, their values are converted to be between
|
||||
# 0 and 1. The images also need to be resized to have the same dimensions.
|
||||
# An important detail to note is that neural networks from the
|
||||
# torch library are trained with tensor values ranging from 0 to 1. If you
|
||||
# try to feed the networks with 0 to 255 tensor images, then the activated
|
||||
# feature maps will be unable sense the intended content and style.
|
||||
# However, pre-trained networks from the Caffe library are trained with 0
|
||||
# to 255 tensor images.
|
||||
#
|
||||
#
|
||||
# .. Note::
|
||||
# Here are links to download the images required to run the tutorial:
|
||||
# `picasso.jpg <https://pytorch.org/tutorials/_static/img/neural-style/picasso.jpg>`__ and
|
||||
# `dancing.jpg <https://pytorch.org/tutorials/_static/img/neural-style/dancing.jpg>`__.
|
||||
# Download these two images and add them to a directory
|
||||
# with name ``images`` in your current working directory.
|
||||
|
||||
# desired size of the output image
|
||||
imsize = 512 if torch.cuda.is_available() else 128 # use small size if no gpu
|
||||
|
||||
loader = transforms.Compose([
|
||||
transforms.Resize(imsize), # scale imported image
|
||||
transforms.ToTensor()]) # transform it into a torch tensor
|
||||
|
||||
|
||||
def image_loader(image_name):
    """Load an image file and return it as a 4-D float tensor on `device`."""
    img = Image.open(image_name)
    tensor = loader(img)
    # fake batch dimension required to fit the network's [B x C x H x W] input
    tensor = tensor.unsqueeze(0)
    return tensor.to(device, torch.float)
|
||||
|
||||
|
||||
style_img = image_loader(os.path.join("..", "..", "reporting", "data_samples", "picasso.jpg"))
|
||||
content_img = image_loader(os.path.join("..", "..", "reporting", "data_samples", "dancing.jpg"))
|
||||
|
||||
assert style_img.size() == content_img.size(), \
|
||||
"we need to import style and content images of the same size"
|
||||
|
||||
######################################################################
|
||||
# Now, let's create a function that displays an image by reconverting a
|
||||
# copy of it to PIL format and displaying the copy using
|
||||
# ``plt.imshow``. We will try displaying the content and style images
|
||||
# to ensure they were imported correctly.
|
||||
|
||||
unloader = transforms.ToPILImage() # reconvert into PIL image
|
||||
|
||||
plt.ion()
|
||||
|
||||
|
||||
def imshow(tensor, title=None):
    """Display a [1 x C x H x W] image tensor via matplotlib."""
    # clone so the squeeze below cannot modify the caller's tensor
    img = tensor.cpu().clone()
    img = img.squeeze(0)  # drop the fake batch dimension
    pil_img = unloader(img)
    plt.imshow(pil_img)
    if title is not None:
        plt.title(title)
    # pause a bit so that plots are updated in interactive mode
    plt.pause(0.001)
|
||||
|
||||
|
||||
plt.figure()
|
||||
imshow(style_img, title='Style Image')
|
||||
|
||||
plt.figure()
|
||||
imshow(content_img, title='Content Image')
|
||||
|
||||
|
||||
######################################################################
|
||||
# Loss Functions
|
||||
# --------------
|
||||
# Content Loss
|
||||
# ~~~~~~~~~~~~
|
||||
#
|
||||
# The content loss is a function that represents a weighted version of the
|
||||
# content distance for an individual layer. The function takes the feature
|
||||
# maps :math:`F_{XL}` of a layer :math:`L` in a network processing input :math:`X` and returns the
|
||||
# weighted content distance :math:`w_{CL}.D_C^L(X,C)` between the image :math:`X` and the
|
||||
# content image :math:`C`. The feature maps of the content image(:math:`F_{CL}`) must be
|
||||
# known by the function in order to calculate the content distance. We
|
||||
# implement this function as a torch module with a constructor that takes
|
||||
# :math:`F_{CL}` as an input. The distance :math:`\|F_{XL} - F_{CL}\|^2` is the mean square error
|
||||
# between the two sets of feature maps, and can be computed using ``nn.MSELoss``.
|
||||
#
|
||||
# We will add this content loss module directly after the convolution
|
||||
# layer(s) that are being used to compute the content distance. This way
|
||||
# each time the network is fed an input image the content losses will be
|
||||
# computed at the desired layers and because of auto grad, all the
|
||||
# gradients will be computed. Now, in order to make the content loss layer
|
||||
# transparent we must define a ``forward`` method that computes the content
|
||||
# loss and then returns the layer's input. The computed loss is saved as a
|
||||
# parameter of the module.
|
||||
#
|
||||
|
||||
class ContentLoss(nn.Module):
    """Transparent layer that records the content loss against a fixed target.

    ``forward`` passes its input through unchanged while storing in ``.loss``
    the MSE between the input feature maps and the detached target maps.
    """

    def __init__(self, target, ):
        super(ContentLoss, self).__init__()
        # 'detach' the target content from the autograd graph: it is a stated
        # value, not a variable; otherwise the criterion's forward would
        # throw an error.
        self.target = target.detach()

    def forward(self, input):
        self.loss = F.mse_loss(input, self.target)
        return input
|
||||
|
||||
|
||||
######################################################################
|
||||
# .. Note::
|
||||
# **Important detail**: although this module is named ``ContentLoss``, it
|
||||
# is not a true PyTorch Loss function. If you want to define your content
|
||||
# loss as a PyTorch Loss function, you have to create a PyTorch autograd function
|
||||
# to recompute/implement the gradient manually in the ``backward``
|
||||
# method.
|
||||
|
||||
######################################################################
|
||||
# Style Loss
|
||||
# ~~~~~~~~~~
|
||||
#
|
||||
# The style loss module is implemented similarly to the content loss
|
||||
# module. It will act as a transparent layer in a
|
||||
# network that computes the style loss of that layer. In order to
|
||||
# calculate the style loss, we need to compute the gram matrix :math:`G_{XL}`. A gram
|
||||
# matrix is the result of multiplying a given matrix by its transposed
|
||||
# matrix. In this application the given matrix is a reshaped version of
|
||||
# the feature maps :math:`F_{XL}` of a layer :math:`L`. :math:`F_{XL}` is reshaped to form :math:`\hat{F}_{XL}`, a :math:`K`\ x\ :math:`N`
|
||||
# matrix, where :math:`K` is the number of feature maps at layer :math:`L` and :math:`N` is the
|
||||
# length of any vectorized feature map :math:`F_{XL}^k`. For example, the first line
|
||||
# of :math:`\hat{F}_{XL}` corresponds to the first vectorized feature map :math:`F_{XL}^1`.
|
||||
#
|
||||
# Finally, the gram matrix must be normalized by dividing each element by
|
||||
# the total number of elements in the matrix. This normalization is to
|
||||
# counteract the fact that :math:`\hat{F}_{XL}` matrices with a large :math:`N` dimension yield
|
||||
# larger values in the Gram matrix. These larger values will cause the
|
||||
# first layers (before pooling layers) to have a larger impact during the
|
||||
# gradient descent. Style features tend to be in the deeper layers of the
|
||||
# network so this normalization step is crucial.
|
||||
#
|
||||
|
||||
def gram_matrix(input):
    """Return the normalized Gram matrix of a batch of feature maps.

    `input` has shape [a x b x c x d] where a = batch size (=1),
    b = number of feature maps and (c, d) are the feature-map dimensions.
    """
    a, b, c, d = input.size()
    # flatten every feature map into a row vector: \hat F_XL is (a*b) x (c*d)
    flat = input.view(a * b, c * d)
    gram = torch.mm(flat, flat.t())
    # normalize by the element count so deep (large c*d) layers don't dominate
    return gram.div(a * b * c * d)
|
||||
|
||||
|
||||
######################################################################
|
||||
# Now the style loss module looks almost exactly like the content loss
|
||||
# module. The style distance is also computed using the mean square
|
||||
# error between :math:`G_{XL}` and :math:`G_{SL}`.
|
||||
#
|
||||
|
||||
class StyleLoss(nn.Module):
    """Transparent layer that records the style loss against a fixed target.

    ``forward`` passes its input through unchanged while storing in ``.loss``
    the MSE between the Gram matrix of the input and the detached Gram matrix
    of the target feature maps.
    """

    def __init__(self, target_feature):
        super(StyleLoss, self).__init__()
        self.target = gram_matrix(target_feature).detach()

    def forward(self, input):
        # style is compared via Gram matrices rather than raw activations
        self.loss = F.mse_loss(gram_matrix(input), self.target)
        return input
|
||||
|
||||
|
||||
######################################################################
|
||||
# Importing the Model
|
||||
# -------------------
|
||||
#
|
||||
# Now we need to import a pre-trained neural network. We will use a 19
|
||||
# layer VGG network like the one used in the paper.
|
||||
#
|
||||
# PyTorch's implementation of VGG is a module divided into two child
|
||||
# ``Sequential`` modules: ``features`` (containing convolution and pooling layers),
|
||||
# and ``classifier`` (containing fully connected layers). We will use the
|
||||
# ``features`` module because we need the output of the individual
|
||||
# convolution layers to measure content and style loss. Some layers have
|
||||
# different behavior during training than evaluation, so we must set the
|
||||
# network to evaluation mode using ``.eval()``.
|
||||
#
|
||||
|
||||
cnn = models.vgg19(pretrained=True).features.to(device).eval()
|
||||
|
||||
######################################################################
|
||||
# Additionally, VGG networks are trained on images with each channel
|
||||
# normalized by mean=[0.485, 0.456, 0.406] and std=[0.229, 0.224, 0.225].
|
||||
# We will use them to normalize the image before sending it into the network.
|
||||
#
|
||||
|
||||
cnn_normalization_mean = torch.tensor([0.485, 0.456, 0.406]).to(device)
|
||||
cnn_normalization_std = torch.tensor([0.229, 0.224, 0.225]).to(device)
|
||||
|
||||
|
||||
# create a module to normalize input image so we can easily put it in a
|
||||
# nn.Sequential
|
||||
class Normalization(nn.Module):
    """Normalize an image tensor with per-channel mean/std.

    Wrapping the normalization in a module lets it sit at the front of an
    ``nn.Sequential`` together with the VGG layers.
    """

    def __init__(self, mean, std):
        super(Normalization, self).__init__()
        # .view the mean and std as [C x 1 x 1] so that they broadcast
        # directly against an image tensor of shape [B x C x H x W].
        # torch.as_tensor accepts both tensors and plain sequences, and avoids
        # the "copy construct from a tensor" UserWarning that
        # torch.tensor(tensor) emits.
        self.mean = torch.as_tensor(mean).view(-1, 1, 1)
        self.std = torch.as_tensor(std).view(-1, 1, 1)

    def forward(self, img):
        # normalize img channel-wise
        return (img - self.mean) / self.std
|
||||
|
||||
|
||||
######################################################################
|
||||
# A ``Sequential`` module contains an ordered list of child modules. For
|
||||
# instance, ``vgg19.features`` contains a sequence (Conv2d, ReLU, MaxPool2d,
|
||||
# Conv2d, ReLU...) aligned in the right order of depth. We need to add our
|
||||
# content loss and style loss layers immediately after the convolution
|
||||
# layer they are detecting. To do this we must create a new ``Sequential``
|
||||
# module that has content loss and style loss modules correctly inserted.
|
||||
#
|
||||
|
||||
# desired depth layers to compute style/content losses :
|
||||
content_layers_default = ['conv_4']
|
||||
style_layers_default = ['conv_1', 'conv_2', 'conv_3', 'conv_4', 'conv_5']
|
||||
|
||||
|
||||
def get_style_model_and_losses(cnn, normalization_mean, normalization_std,
                               style_img, content_img,
                               content_layers=content_layers_default,
                               style_layers=style_layers_default):
    """Rebuild `cnn` as a Sequential with ContentLoss/StyleLoss probes inserted.

    Copies the VGG feature extractor layer by layer, renaming layers as
    conv_i / relu_i / pool_i / bn_i, and inserts a ContentLoss or StyleLoss
    module right after each layer listed in `content_layers` / `style_layers`.
    Returns ``(model, style_losses, content_losses)`` where the loss lists
    give iterable access to the inserted probe modules.
    """
    cnn = copy.deepcopy(cnn)

    # normalization module
    normalization = Normalization(normalization_mean, normalization_std).to(device)

    # just in order to have an iterable access to the list of content/style
    # losses
    content_losses = []
    style_losses = []

    # assuming that cnn is a nn.Sequential, so we make a new nn.Sequential
    # to put in modules that are supposed to be activated sequentially
    model = nn.Sequential(normalization)

    i = 0  # increment every time we see a conv
    for layer in cnn.children():
        if isinstance(layer, nn.Conv2d):
            i += 1
            name = 'conv_{}'.format(i)
        elif isinstance(layer, nn.ReLU):
            name = 'relu_{}'.format(i)
            # The in-place version doesn't play very nicely with the ContentLoss
            # and StyleLoss we insert below. So we replace with out-of-place
            # ones here.
            layer = nn.ReLU(inplace=False)
        elif isinstance(layer, nn.MaxPool2d):
            name = 'pool_{}'.format(i)
        elif isinstance(layer, nn.BatchNorm2d):
            name = 'bn_{}'.format(i)
        else:
            raise RuntimeError('Unrecognized layer: {}'.format(layer.__class__.__name__))

        model.add_module(name, layer)

        if name in content_layers:
            # add content loss: the target is the content image's activations
            # at this depth, computed with the partially-built model
            target = model(content_img).detach()
            content_loss = ContentLoss(target)
            model.add_module("content_loss_{}".format(i), content_loss)
            content_losses.append(content_loss)

        if name in style_layers:
            # add style loss: target is the style image's activations here
            target_feature = model(style_img).detach()
            style_loss = StyleLoss(target_feature)
            model.add_module("style_loss_{}".format(i), style_loss)
            style_losses.append(style_loss)

    # now we trim off the layers after the last content and style losses,
    # since anything deeper contributes nothing to either loss
    for i in range(len(model) - 1, -1, -1):
        if isinstance(model[i], ContentLoss) or isinstance(model[i], StyleLoss):
            break

    model = model[:(i + 1)]

    return model, style_losses, content_losses
|
||||
|
||||
|
||||
######################################################################
|
||||
# Next, we select the input image. You can use a copy of the content image
|
||||
# or white noise.
|
||||
#
|
||||
|
||||
input_img = content_img.clone()
|
||||
# if you want to use white noise instead uncomment the below line:
|
||||
# input_img = torch.randn(content_img.data.size(), device=device)
|
||||
|
||||
# add the original input image to the figure:
|
||||
plt.figure()
|
||||
imshow(input_img, title='Input Image')
|
||||
|
||||
|
||||
######################################################################
|
||||
# Gradient Descent
|
||||
# ----------------
|
||||
#
|
||||
# As Leon Gatys, the author of the algorithm, suggested `here <https://discuss.pytorch.org/t/pytorch-tutorial-for-neural-transfert-of-artistic-style/336/20?u=alexis-jacq>`__, we will use
|
||||
# L-BFGS algorithm to run our gradient descent. Unlike training a network,
|
||||
# we want to train the input image in order to minimise the content/style
|
||||
# losses. We will create a PyTorch L-BFGS optimizer ``optim.LBFGS`` and pass
|
||||
# our image to it as the tensor to optimize.
|
||||
#
|
||||
|
||||
def get_input_optimizer(input_img):
    """Create an L-BFGS optimizer that optimizes the input image itself."""
    # marking the image as requiring gradients shows that the *input* is the
    # parameter being trained, not any network weights
    input_img.requires_grad_()
    return optim.LBFGS([input_img])
|
||||
|
||||
|
||||
######################################################################
|
||||
# Finally, we must define a function that performs the neural transfer. For
|
||||
# each iteration of the networks, it is fed an updated input and computes
|
||||
# new losses. We will run the ``backward`` methods of each loss module to
|
||||
# dynamicaly compute their gradients. The optimizer requires a "closure"
|
||||
# function, which reevaluates the modul and returns the loss.
|
||||
#
|
||||
# We still have one final constraint to address. The network may try to
|
||||
# optimize the input with values that exceed the 0 to 1 tensor range for
|
||||
# the image. We can address this by correcting the input values to be
|
||||
# between 0 to 1 each time the network is run.
|
||||
#
|
||||
|
||||
def run_style_transfer(cnn, normalization_mean, normalization_std,
                       content_img, style_img, input_img, num_steps=300,
                       style_weight=1000000, content_weight=1):
    """Run the style transfer.

    Optimizes `input_img` in place with L-BFGS until roughly `num_steps`
    loss evaluations have been made, minimizing the weighted sum of the
    style and content losses, and returns the optimized image clamped to
    the valid [0, 1] range.
    """
    print('Building the style transfer model..')
    model, style_losses, content_losses = get_style_model_and_losses(cnn,
        normalization_mean, normalization_std, style_img,
        content_img)
    optimizer = get_input_optimizer(input_img)

    print('Optimizing..')
    # one-element list so the nested closure can mutate the step counter
    run = [0]
    while run[0] <= num_steps:

        def closure():
            # correct the values of the updated input image: the optimizer may
            # step outside the valid [0, 1] image range
            input_img.data.clamp_(0, 1)

            optimizer.zero_grad()
            # forward pass populates .loss on every inserted probe module
            model(input_img)
            style_score = 0
            content_score = 0

            for sl in style_losses:
                style_score += sl.loss
            for cl in content_losses:
                content_score += cl.loss

            style_score *= style_weight
            content_score *= content_weight

            loss = style_score + content_score
            loss.backward()

            run[0] += 1
            if run[0] % 50 == 0:
                print("run {}:".format(run))
                print('Style Loss : {:4f} Content Loss: {:4f}'.format(
                    style_score.item(), content_score.item()))
                print()

            return style_score + content_score

        # L-BFGS may evaluate the closure several times per step
        optimizer.step(closure)

    # a last correction...
    input_img.data.clamp_(0, 1)

    return input_img
|
||||
|
||||
|
||||
######################################################################
|
||||
# Finally, we can run the algorithm.
|
||||
#
|
||||
|
||||
output = run_style_transfer(cnn, cnn_normalization_mean, cnn_normalization_std,
|
||||
content_img, style_img, input_img)
|
||||
|
||||
plt.figure()
|
||||
imshow(output, title='Output Image')
|
||||
|
||||
# sphinx_gallery_thumbnail_number = 4
|
||||
plt.ioff()
|
||||
plt.show()
|
||||
134
examples/frameworks/pytorch/pytorch_mnist.py
Normal file
134
examples/frameworks/pytorch/pytorch_mnist.py
Normal file
@@ -0,0 +1,134 @@
|
||||
# TRAINS - Example of Pytorch mnist training integration
|
||||
#
|
||||
from __future__ import print_function
|
||||
import argparse
|
||||
import os
|
||||
from tempfile import gettempdir
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torch.optim as optim
|
||||
from torchvision import datasets, transforms
|
||||
|
||||
from trains import Task, Logger
|
||||
|
||||
|
||||
class Net(nn.Module):
    """LeNet-style CNN for MNIST: two conv/pool stages followed by two FC layers."""

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 20, 5, 1)
        self.conv2 = nn.Conv2d(20, 50, 5, 1)
        # after two 5x5 convs and two 2x2 pools a 28x28 input is 50 x 4 x 4
        self.fc1 = nn.Linear(4 * 4 * 50, 500)
        self.fc2 = nn.Linear(500, 10)

    def forward(self, x):
        """Return log-probabilities over the 10 digit classes."""
        x = F.max_pool2d(F.relu(self.conv1(x)), 2, 2)
        x = F.max_pool2d(F.relu(self.conv2(x)), 2, 2)
        x = x.view(-1, 4 * 4 * 50)
        x = self.fc2(F.relu(self.fc1(x)))
        return F.log_softmax(x, dim=1)
|
||||
|
||||
|
||||
def train(args, model, device, train_loader, optimizer, epoch):
    """Train `model` for one epoch, reporting loss every `args.log_interval` batches."""
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            # global step = batches seen across all epochs so far
            step = epoch * len(train_loader) + batch_idx
            Logger.current_logger().report_scalar(
                "train", "loss", iteration=step, value=loss.item())
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
|
||||
|
||||
|
||||
def test(args, model, device, test_loader, epoch):
    """Evaluate `model` on `test_loader`, reporting average loss and accuracy."""
    model.eval()
    test_loss = 0
    correct = 0
    # no gradients needed during evaluation
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            # sum (not mean) the batch losses so we can average over the dataset
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            # the index of the max log-probability is the predicted class
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    Logger.current_logger().report_scalar(
        "test", "loss", iteration=epoch, value=test_loss)
    Logger.current_logger().report_scalar(
        "test", "accuracy", iteration=epoch, value=(correct / len(test_loader.dataset)))
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
|
||||
|
||||
|
||||
def main():
    """Parse CLI arguments, build the MNIST loaders, then train and evaluate."""
    task = Task.init(project_name='examples', task_name='pytorch mnist train')

    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=True,
                        help='For Saving the current Model')
    args = parser.parse_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")

    # Pinned memory + workers only help when feeding a GPU.
    loader_kwargs = {'num_workers': 4, 'pin_memory': True} if use_cuda else {}
    mnist_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ])
    data_root = os.path.join('..', 'data')
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(data_root, train=True, download=True, transform=mnist_transform),
        batch_size=args.batch_size, shuffle=True, **loader_kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(data_root, train=False, transform=mnist_transform),
        batch_size=args.test_batch_size, shuffle=True, **loader_kwargs)

    model = Net().to(device)
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(args, model, device, test_loader, epoch)

    if args.save_model:
        torch.save(model.state_dict(), os.path.join(gettempdir(), "mnist_cnn.pt"))
|
||||
|
||||
|
||||
# Standard script entry-point guard.
if __name__ == '__main__':
    main()
|
||||
136
examples/frameworks/pytorch/pytorch_tensorboard.py
Normal file
136
examples/frameworks/pytorch/pytorch_tensorboard.py
Normal file
@@ -0,0 +1,136 @@
|
||||
# TRAINS - Example of pytorch with tensorboard>=v1.14
|
||||
#
|
||||
from __future__ import print_function
|
||||
|
||||
import argparse
|
||||
import os
|
||||
from tempfile import gettempdir
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import torch.optim as optim
|
||||
from torchvision import datasets, transforms
|
||||
from torch.autograd import Variable
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
|
||||
from trains import Task
|
||||
|
||||
|
||||
class Net(nn.Module):
    """Small CNN for 28x28 MNIST digits: two conv blocks then two FC layers."""

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images."""
        features = F.relu(F.max_pool2d(self.conv1(x), 2))
        features = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(features)), 2))
        flat = features.view(-1, 320)  # 20 channels * 4 * 4 spatial positions
        hidden = F.dropout(F.relu(self.fc1(flat)), training=self.training)
        return F.log_softmax(self.fc2(hidden), dim=1)
|
||||
|
||||
|
||||
def train(model, epoch, train_loader, args, optimizer, writer):
    """Train for one epoch, logging the loss to stdout and TensorBoard.

    Args:
        model: network being trained (switched to train mode here).
        epoch: 1-based epoch index, used to build the global step.
        train_loader: iterable of (data, target) mini-batches.
        args: parsed CLI arguments (uses ``cuda`` and ``log_interval``).
        optimizer: optimizer stepped once per batch.
        writer: TensorBoard ``SummaryWriter`` receiving the scalar loss.
    """
    model.train()
    batches_per_epoch = len(train_loader)
    for batch_idx, (data, target) in enumerate(train_loader):
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data), Variable(target)
        optimizer.zero_grad()
        loss = F.nll_loss(model(data), target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval != 0:
            continue
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, batch_idx * len(data), len(train_loader.dataset),
            100. * batch_idx / batches_per_epoch, loss.data.item()))
        # Global step so successive epochs continue on the same x-axis.
        writer.add_scalar('Train/Loss', loss.data.item(), epoch * batches_per_epoch + batch_idx)
|
||||
|
||||
|
||||
def test(model, test_loader, args, optimizer, writer):
|
||||
model.eval()
|
||||
test_loss = 0
|
||||
correct = 0
|
||||
for niter, (data, target) in enumerate(test_loader):
|
||||
if args.cuda:
|
||||
data, target = data.cuda(), target.cuda()
|
||||
data, target = Variable(data), Variable(target)
|
||||
output = model(data)
|
||||
test_loss += F.nll_loss(output, target, reduction='sum').data.item() # sum up batch loss
|
||||
pred = output.data.max(1)[1] # get the index of the max log-probability
|
||||
pred = pred.eq(target.data).cpu().sum()
|
||||
writer.add_scalar('Test/Loss', pred, niter)
|
||||
correct += pred
|
||||
if niter % 100 == 0:
|
||||
writer.add_image('test', data[0, :, :, :], niter)
|
||||
|
||||
test_loss /= len(test_loader.dataset)
|
||||
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
|
||||
test_loss, correct, len(test_loader.dataset),
|
||||
100. * correct / len(test_loader.dataset)))
|
||||
|
||||
|
||||
def main():
    """Parse arguments, set up TensorBoard + TRAINS, then train/test on MNIST."""
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    args = parser.parse_args()

    task = Task.init(project_name='examples', task_name='pytorch with tensorboard')  # noqa: F841
    writer = SummaryWriter('runs')
    writer.add_text('TEXT', 'This is some text', 0)
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    # Pinned memory + workers only help when feeding a GPU.
    loader_kwargs = {'num_workers': 4, 'pin_memory': True} if args.cuda else {}
    mnist_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ])
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True, transform=mnist_transform),
        batch_size=args.batch_size, shuffle=True, **loader_kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False, transform=mnist_transform),
        batch_size=args.batch_size, shuffle=True, **loader_kwargs)

    model = Net()
    if args.cuda:
        model.cuda()
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    for epoch in range(1, args.epochs + 1):
        train(model, epoch, train_loader, args, optimizer, writer)
        # Checkpoint after every epoch; TRAINS picks these up as output models.
        torch.save(model, os.path.join(gettempdir(), 'model{}'.format(epoch)))
        test(model, test_loader, args, optimizer, writer)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Hack for supporting Windows OS - https://pytorch.org/docs/stable/notes/windows.html#usage-multiprocessing
    main()
|
||||
1
examples/frameworks/pytorch/pytorch_tensorboardx.py
Symbolic link
1
examples/frameworks/pytorch/pytorch_tensorboardx.py
Symbolic link
@@ -0,0 +1 @@
|
||||
../tensorboardx/pytorch_tensorboardX.py
|
||||
6
examples/frameworks/pytorch/requirements.txt
Normal file
6
examples/frameworks/pytorch/requirements.txt
Normal file
@@ -0,0 +1,6 @@
|
||||
matplotlib
|
||||
# tensorboardX
|
||||
tensorboard>=1.14.0
|
||||
torch>=1.1.0
|
||||
torchvision>=0.3.0
|
||||
trains
|
||||
29
examples/frameworks/pytorch/tensorboard_toy_pytorch.py
Normal file
29
examples/frameworks/pytorch/tensorboard_toy_pytorch.py
Normal file
@@ -0,0 +1,29 @@
|
||||
import os
|
||||
from tempfile import gettempdir
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
|
||||
from trains import Task
|
||||
# Register this toy run with TRAINS so the TensorBoard output is captured.
task = Task.init(project_name='examples', task_name='pytorch tensorboard toy example')


writer = SummaryWriter(log_dir=os.path.join(gettempdir(), 'tensorboard_logs'))

# Load the sample image and build three variants, each shaped as
# 4d [batch, col, row, RGB-channels].
sample_path = os.path.join("..", "..", "reporting", "data_samples", "picasso.jpg")
image = np.asarray(Image.open(sample_path))

# Single-channel variant: keep only the first (red) channel.
image_gray = image[:, :, 0][np.newaxis, :, :, np.newaxis]
# RGBA variant: append a fully-opaque alpha channel.
alpha = 255 * np.atleast_3d(np.ones(shape=image.shape[:2], dtype=np.uint8))
image_rgba = np.concatenate((image, alpha), axis=2)[np.newaxis, :, :, :]
image = image[np.newaxis, :, :, :]

# Log one sample of each variant; dataformats='HWC' matches the numpy layout.
writer.add_image("test/first", image[0], dataformats='HWC')
writer.add_image("test_gray/second", image_gray[0], dataformats='HWC')
writer.add_image("test_rgba/third", image_rgba[0], dataformats='HWC')

print('Done!')
|
||||
Reference in New Issue
Block a user