From f74c89a25dd5e835eaf0fb9aa981fb98de62ef9a Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Mon, 10 Aug 2020 08:04:58 +0300 Subject: [PATCH] Update pytorch examples --- .../pytorch/notebooks/audio/README.md | 1 + ...nb => audio_classifier_UrbanSound8K.ipynb} | 28 +- .../audio/audio_preprocessing_example.ipynb | 5 +- .../image/hyperparameter_search.ipynb | 16 +- .../image/image_classification_CIFAR10.ipynb | 10 +- .../table/download_and_preprocessing.ipynb | 310 ++++++++++++++++++ .../table/train_tabular_predictor.ipynb | 295 +++++++++++++++++ .../text/text_classification_AG_NEWS.ipynb | 8 +- 8 files changed, 640 insertions(+), 33 deletions(-) create mode 100644 examples/frameworks/pytorch/notebooks/audio/README.md rename examples/frameworks/pytorch/notebooks/audio/{audio_classification_UrbanSound8K.ipynb => audio_classifier_UrbanSound8K.ipynb} (94%) create mode 100644 examples/frameworks/pytorch/notebooks/table/download_and_preprocessing.ipynb create mode 100644 examples/frameworks/pytorch/notebooks/table/train_tabular_predictor.ipynb diff --git a/examples/frameworks/pytorch/notebooks/audio/README.md b/examples/frameworks/pytorch/notebooks/audio/README.md new file mode 100644 index 00000000..18d23aa4 --- /dev/null +++ b/examples/frameworks/pytorch/notebooks/audio/README.md @@ -0,0 +1 @@ +The `audio_classifier_UrbanSound8K.ipynb` example uses a small dataset based on [UrbanSound8K dataset](https://urbansounddataset.weebly.com/urbansound8k.html). \ No newline at end of file diff --git a/examples/frameworks/pytorch/notebooks/audio/audio_classification_UrbanSound8K.ipynb b/examples/frameworks/pytorch/notebooks/audio/audio_classifier_UrbanSound8K.ipynb similarity index 94% rename from examples/frameworks/pytorch/notebooks/audio/audio_classification_UrbanSound8K.ipynb rename to examples/frameworks/pytorch/notebooks/audio/audio_classifier_UrbanSound8K.ipynb index 782d2d72..6220025b 100644 --- a/examples/frameworks/pytorch/notebooks/audio/audio_classification_UrbanSound8K.ipynb +++ b/examples/frameworks/pytorch/notebooks/audio/audio_classifier_UrbanSound8K.ipynb @@ -35,7 +35,6 @@ "import io\n", "\n", "import pandas as pd\n", - "import numpy as np\n", "from pathlib2 import Path\n", "import matplotlib.pyplot as plt\n", "\n", @@ -50,6 +49,7 @@ "from torchvision.transforms import ToTensor\n", "\n", "from trains import Task\n", + "from trains.storage import StorageManager\n", "\n", "%matplotlib inline" ] @@ -60,7 +60,7 @@ "metadata": {}, "outputs": [], "source": [ - "task = Task.init(project_name='Audio Example', task_name='audio classifier')\n", + "task = Task.init(project_name='Audio Example', task_name='audio classification UrbanSound8K')\n", "configuration_dict = {'number_of_epochs': 10, 'batch_size': 4, 'dropout': 0.25, 'base_lr': 0.001}\n", "configuration_dict = task.connect(configuration_dict) # enabling configuration override by trains\n", "print(configuration_dict) # printing actual configuration (after override in remote mode)" @@ -77,8 +77,8 @@ }, "outputs": [], "source": [ - "# Download UrbanSound8K dataset (https://urbansounddataset.weebly.com/urbansound8k.html)\n", - "path_to_UrbanSound8K = './data/UrbanSound8K'" + "# Download a sample dataset (https://allegro-datasets.s3.amazonaws.com/trains/UrbanSound8K.zip)based on UrbanSound8K dataset (https://urbansounddataset.weebly.com/urbansound8k.html)\n", + "path_to_UrbanSound8K = StorageManager.get_local_copy(\"https://allegro-datasets.s3.amazonaws.com/trains/UrbanSound8K.zip\", extract_archive=True, )" ] }, { @@ -134,8 +134,8 @@ " return 
len(self.file_names)\n", "\n", "\n", - "csv_path = Path(path_to_UrbanSound8K) / 'metadata' / 'UrbanSound8K.csv'\n", - "file_path = Path(path_to_UrbanSound8K) / 'audio'\n", + "csv_path = Path(path_to_UrbanSound8K) / 'UrbanSound8K' / 'metadata' / 'UrbanSound8K.csv'\n", + "file_path = Path(path_to_UrbanSound8K) / 'UrbanSound8K' / 'audio'\n", "\n", "train_set = UrbanSoundDataset(csv_path, file_path, range(1,10))\n", "test_set = UrbanSoundDataset(csv_path, file_path, [10])\n", @@ -338,18 +338,24 @@ "metadata": { "colab": {}, "colab_type": "code", - "id": "X5lx3g_5zNey", - "scrolled": false + "id": "X5lx3g_5zNey" }, "outputs": [], "source": [ - "log_interval = 100\n", - "debug_interval = 200\n", + "log_interval = 10\n", + "debug_interval = 20\n", "for epoch in range(configuration_dict.get('number_of_epochs', 10)):\n", " train(model, epoch)\n", " test(model, epoch)\n", " scheduler.step()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -376,5 +382,5 @@ } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 4 } diff --git a/examples/frameworks/pytorch/notebooks/audio/audio_preprocessing_example.ipynb b/examples/frameworks/pytorch/notebooks/audio/audio_preprocessing_example.ipynb index 6baae9fe..cd56102e 100644 --- a/examples/frameworks/pytorch/notebooks/audio/audio_preprocessing_example.ipynb +++ b/examples/frameworks/pytorch/notebooks/audio/audio_preprocessing_example.ipynb @@ -23,7 +23,6 @@ "outputs": [], "source": [ "import os\n", - "import torch\n", "import torchaudio\n", "from torch.utils.tensorboard import SummaryWriter\n", "import matplotlib.pyplot as plt\n", @@ -87,10 +86,10 @@ "cell_type": "code", "execution_count": null, "metadata": { + "scrolled": true, "pycharm": { "name": "#%%\n" - }, - "scrolled": true + } }, "outputs": [], "source": [ diff --git a/examples/frameworks/pytorch/notebooks/image/hyperparameter_search.ipynb b/examples/frameworks/pytorch/notebooks/image/hyperparameter_search.ipynb index 8baf115a..9cf3aeb8 100644 --- a/examples/frameworks/pytorch/notebooks/image/hyperparameter_search.ipynb +++ b/examples/frameworks/pytorch/notebooks/image/hyperparameter_search.ipynb @@ -12,8 +12,8 @@ "\n", "# pip install with locked versions\n", "! pip install -U pandas==1.0.3\n", - "! pip install -U trains==0.15.0\n", - "! pip install -U hpbandster==0.7.4 # Needed only for Bayesian optimization Hyper-Band" + "! pip install -U trains>=0.15.0\n", + "! 
pip install -U optuna==2.0.0rc0" ] }, { @@ -23,8 +23,8 @@ "outputs": [], "source": [ "from trains.automation import UniformParameterRange, UniformIntegerParameterRange\n", - "from trains.automation import RandomSearch, HyperParameterOptimizer\n", - "from trains.automation.hpbandster import OptimizerBOHB # Needed only for Bayesian optimization Hyper-Band\n", + "from trains.automation import HyperParameterOptimizer\n", + "from trains.automation.optuna import OptimizerOptuna\n", "\n", "from trains import Task" ] @@ -35,7 +35,7 @@ "metadata": {}, "outputs": [], "source": [ - "task = Task.init(project_name='Hyper-Parameter Search', task_name='Hyper-Parameter Optimization')" + "task = Task.init(project_name='Hyper-Parameter Search', task_name='Hyper-Parameter Optimization')\n" ] }, { @@ -47,7 +47,7 @@ "#####################################################################\n", "### Don't forget to replace this default id with your own task id ###\n", "#####################################################################\n", - "TEMPLATE_TASK_ID = 'd8e928460f98437c998f3597768597f8'" + "TEMPLATE_TASK_ID = 'd551a9990cb5451c9c744cc58201c612'" ] }, { @@ -71,7 +71,7 @@ " objective_metric_sign='max', # maximize or minimize the objective metric\n", " max_number_of_concurrent_tasks=3, # number of concurrent experiments\n", " # setting optimizer - trains supports GridSearch, RandomSearch or OptimizerBOHB\n", - " optimizer_class=OptimizerBOHB, # can be replaced with OptimizerBOHB\n", + " optimizer_class=OptimizerOptuna, # can be replaced with OptimizerBOHB\n", " execution_queue='default', # queue to schedule the experiments for execution\n", " optimization_time_limit=30., # time limit for each experiment (optional, ignored by OptimizerBOHB)\n", " pool_period_min=1, # Check the experiments every x minutes\n", @@ -90,7 +90,7 @@ "metadata": {}, "outputs": [], "source": [ - "optimizer.set_time_limit(in_minutes=120.0) # set the time limit for the optimization process\n", + "optimizer.set_time_limit(in_minutes=90.0) # set the time limit for the optimization process\n", "optimizer.start() \n", "optimizer.wait() # wait until process is done\n", "optimizer.stop() # make sure background optimization stopped" diff --git a/examples/frameworks/pytorch/notebooks/image/image_classification_CIFAR10.ipynb b/examples/frameworks/pytorch/notebooks/image/image_classification_CIFAR10.ipynb index c4c41703..00329805 100644 --- a/examples/frameworks/pytorch/notebooks/image/image_classification_CIFAR10.ipynb +++ b/examples/frameworks/pytorch/notebooks/image/image_classification_CIFAR10.ipynb @@ -12,10 +12,10 @@ "# jupyter nbextension enable --py widgetsnbextension\n", "\n", "# pip install with locked versions\n", - "! pip install -U torch==1.5.0\n", - "! pip install -U torchvision==0.6.0\n", + "! pip install -U torch==1.5.1\n", + "! pip install -U torchvision==0.6.1\n", "! pip install -U numpy==1.18.4\n", - "! pip install -U trains==0.15.0\n", + "! pip install -U trains>=0.15.0\n", "! 
pip install -U tensorboard==2.2.1"
    ]
   },
@@ -45,7 +45,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "task = Task.init(project_name='Image Example', task_name='image_classification_CIFAR10')\n",
+    "task = Task.init(project_name='Image Example', task_name='image classification CIFAR10')\n",
     "configuration_dict = {'number_of_epochs': 3, 'batch_size': 4, 'dropout': 0.25, 'base_lr': 0.001}\n",
     "configuration_dict = task.connect(configuration_dict)  # enabling configuration override by trains\n",
     "print(configuration_dict)  # printing actual configuration (after override in remote mode)"
@@ -240,4 +240,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
\ No newline at end of file
+}
diff --git a/examples/frameworks/pytorch/notebooks/table/download_and_preprocessing.ipynb b/examples/frameworks/pytorch/notebooks/table/download_and_preprocessing.ipynb
new file mode 100644
index 00000000..f57a3ccc
--- /dev/null
+++ b/examples/frameworks/pytorch/notebooks/table/download_and_preprocessing.ipynb
@@ -0,0 +1,310 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! pip install -U pip\n",
+    "! pip install -U torch==1.5.1\n",
+    "! pip install -U trains>=0.15.1\n",
+    "! pip install -U pandas==1.0.4\n",
+    "! pip install -U numpy==1.18.4\n",
+    "! pip install -U pathlib2==2.3.5\n",
+    "! pip install -U scikit-learn==0.23.1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from collections import Counter\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "import torch\n",
+    "from datetime import datetime\n",
+    "from pathlib2 import Path\n",
+    "from trains import Task"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "task = Task.init(project_name='Table Example', task_name='tabular preprocessing')\n",
+    "logger = task.get_logger()\n",
+    "configuration_dict = {'test_size': 0.1, 'split_random_state': 0}\n",
+    "configuration_dict = task.connect(configuration_dict)  # enabling configuration override by trains\n",
+    "print(configuration_dict)  # printing actual configuration (after override in remote mode)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Download the shelter-animal-outcomes dataset (https://www.kaggle.com/c/shelter-animal-outcomes)\n",
+    "# This dataset aims to improve understanding of trends in animal outcomes,\n",
+    "# which could help shelters focus their energy on specific animals who need extra help finding a new home. 
\n", + "path_to_ShelterAnimal = './data'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_set = pd.read_csv(Path(path_to_ShelterAnimal) / 'train.csv')\n", + "logger.report_table(title='Trainset - raw',series='pandas DataFrame',iteration=0, table_plot=train_set.head())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# **Pre-processing**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Remove hour and year from DateTime data\n", + "timestamp = pd.to_datetime(train_set['DateTime'])\n", + "months = [d.month for d in timestamp]\n", + "train_set['Month'] = pd.DataFrame(months).astype('object')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "age = train_set['AgeuponOutcome']\n", + "months_age = []\n", + "for val in age:\n", + " if pd.isnull(val):\n", + " months_age.append(val)\n", + " else:\n", + " amount, time_type = val.split(' ')\n", + " if 'day' in time_type:\n", + " mult = 1./30\n", + " if 'week' in time_type:\n", + " mult = 1./4\n", + " if 'month' in time_type:\n", + " mult = 1.\n", + " if 'year' in time_type:\n", + " mult = 12.\n", + " months_age.append(int(amount) * mult)\n", + "train_set['Age'] = pd.DataFrame(months_age).astype(np.float32)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sex_neutered = train_set['SexuponOutcome']\n", + "sex = []\n", + "neutered = []\n", + "for val in sex_neutered:\n", + " if pd.isnull(val):\n", + " sex.append(val)\n", + " neutered.append(val)\n", + " elif 'Unknown' in val:\n", + " sex.append(np.nan)\n", + " neutered.append(np.nan)\n", + " else:\n", + " n, s = val.split(' ')\n", + " if n in ['Neutered', 'Spayed']:\n", + " neutered.append('Yes')\n", + " else:\n", + " neutered.append('No')\n", + " sex.append(s)\n", + "\n", + "train_set['Sex'] = pd.DataFrame(sex)\n", + "train_set['Neutered'] = pd.DataFrame(neutered)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Remove irrelevant columns\n", + "train_set.drop(columns= ['Name', 'OutcomeSubtype', 'AnimalID', 'DateTime', 'AgeuponOutcome', 'SexuponOutcome'], inplace=True)\n", + "logger.report_table(title='Trainset - after preprocessing',series='pandas DataFrame',iteration=0, table_plot=train_set.head())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## *Fill NA Values*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "object_columns = train_set.select_dtypes(include=['object']).copy()\n", + "numerical_columns = train_set.select_dtypes(include=['number']).copy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for col in object_columns.columns:\n", + " if object_columns[col].isnull().sum() > 0:\n", + " most_common = Counter(object_columns[col]).most_common(1)[0][0]\n", + " print('Column \"{}\": replacing null values with \"{}\"'.format(col, most_common))\n", + " train_set[col].fillna(most_common, inplace=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for col in numerical_columns.columns:\n", + " if numerical_columns[col].isnull().sum() > 0:\n", + " median_val = numerical_columns[col].median()\n", + " 
print('Column \"{}\": replacing null values with \"{}\"'.format(col, median_val))\n", + " train_set[col].fillna(median_val, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "logger.report_table(title='Trainset - after filling missing values',series='pandas DataFrame',iteration=0, table_plot=train_set.head())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## *Labels Encoding*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "out_encoding = train_set['OutcomeType'].astype('category').cat.categories\n", + "outcome_dict = {key: val for val,key in enumerate(out_encoding)}\n", + "task.upload_artifact('Outcome dictionary', outcome_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for col in object_columns.columns:\n", + " train_set[col] = train_set[col].astype('category').cat.codes\n", + "logger.report_table(title='Trainset - after labels encoding',series='pandas DataFrame',iteration=0, table_plot=train_set.head())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## *Splitting dataset*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X = train_set.drop(columns= ['OutcomeType'])\n", + "Y = train_set['OutcomeType']\n", + "X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=configuration_dict.get('test_size', 0.1), \n", + " random_state=configuration_dict.get('split_random_state', 0))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# making all variables categorical\n", + "object_columns_names = object_columns.drop(columns= ['OutcomeType']).columns\n", + "for col in object_columns_names:\n", + " X[col] = X[col].astype('category')\n", + "columns_categries = {col: len(X[col].cat.categories) for col in object_columns_names}\n", + "task.upload_artifact('Categries per column', columns_categries)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_df = X_train.join(Y_train)\n", + "train_df.to_csv(Path(path_to_ShelterAnimal) / 'train_processed.csv', index=False)\n", + "val_df = X_val.join(Y_val)\n", + "val_df.to_csv(Path(path_to_ShelterAnimal) / 'val_processed.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "paths = {'train_data': str(Path(path_to_ShelterAnimal) / 'train_processed.csv'), 'val_data': str(Path(path_to_ShelterAnimal) / 'val_processed.csv')}\n", + "task.upload_artifact('Processed data', paths)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/frameworks/pytorch/notebooks/table/train_tabular_predictor.ipynb b/examples/frameworks/pytorch/notebooks/table/train_tabular_predictor.ipynb new file mode 100644 index 00000000..78c49e0b --- /dev/null +++ b/examples/frameworks/pytorch/notebooks/table/train_tabular_predictor.ipynb @@ -0,0 +1,295 
@@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! pip install -U pip\n", + "! pip install -U torch==1.5.1\n", + "! pip install -U trains>=0.15.1\n", + "! pip install -U pandas==1.0.4\n", + "! pip install -U numpy==1.18.4\n", + "! pip install -U tensorboard==2.2.1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "from torch.utils.data import Dataset\n", + "from torch.utils.tensorboard import SummaryWriter\n", + "\n", + "from trains import Task" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "task = Task.init(project_name='Table Example', task_name='tabular prediction')\n", + "logger = task.get_logger()\n", + "configuration_dict = {'number_of_epochs': 30, 'batch_size': 100, 'dropout': 0.3, 'base_lr': 0.1}\n", + "configuration_dict = task.connect(configuration_dict) # enabling configuration override by trains\n", + "print(configuration_dict) # printing actual configuration (after override in remote mode)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "previous_task = Task.get_task('ed7570e1e12d41e5a06557c81fdf1046')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "preprocessed_data = previous_task.artifacts['Processed data'].get()\n", + "train_set = pd.read_csv(preprocessed_data['train_data'])\n", + "test_set = pd.read_csv(preprocessed_data['val_data'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "columns_categories = previous_task.artifacts['Categries per column'].get()\n", + "columns_categories_ordered = {key: columns_categories[key] for key in train_set.columns if key in columns_categories.keys()}\n", + "columns_numerical = [key for key in train_set.drop(columns= ['OutcomeType']).drop(columns=columns_categories_ordered).keys()]\n", + "embedding_sizes = [(n_categories, min(32, (n_categories+1)//2)) for _,n_categories in columns_categories_ordered.items()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "outcome_dict = previous_task.artifacts['Outcome dictionary'].get()\n", + "reveresed_outcome_dict = {val: key for key, val in outcome_dict.items()}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class ShelterDataset(Dataset):\n", + " def __init__(self, X, Y, embedded_col_names):\n", + " X = X.copy()\n", + " self.X1 = X.loc[:,embedded_col_names].copy().values.astype(np.int64) #categorical columns\n", + " self.X2 = X.drop(columns=embedded_col_names).copy().values.astype(np.float32) #numerical columns\n", + " self.y = Y\n", + " \n", + " def __len__(self):\n", + " return len(self.y)\n", + " \n", + " def __getitem__(self, idx):\n", + " return self.X1[idx], self.X2[idx], self.y[idx]\n", + "\n", + "#creating train and valid datasets\n", + "train_ds = ShelterDataset(train_set.drop(columns= ['OutcomeType']), train_set['OutcomeType'], columns_categories_ordered.keys())\n", + "valid_ds = ShelterDataset(test_set.drop(columns= ['OutcomeType']), test_set['OutcomeType'], columns_categories_ordered.keys())" + ] + }, + { 
+ "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class ShelterModel(nn.Module):\n", + " def __init__(self, embedding_sizes, n_cont):\n", + " super().__init__()\n", + " self.embeddings = nn.ModuleList([nn.Embedding(categories, size) for categories,size in embedding_sizes])\n", + " n_emb = sum(e.embedding_dim for e in self.embeddings)\n", + " self.n_emb, self.n_cont = n_emb, n_cont\n", + " self.lin1 = nn.Linear(self.n_emb + self.n_cont, 200)\n", + " self.lin2 = nn.Linear(200, 70)\n", + " self.lin3 = nn.Linear(70, 5)\n", + " self.bn1 = nn.BatchNorm1d(self.n_cont)\n", + " self.bn2 = nn.BatchNorm1d(200)\n", + " self.bn3 = nn.BatchNorm1d(70)\n", + " self.emb_drop = nn.Dropout(0.6)\n", + " self.drops = nn.Dropout(configuration_dict.get('dropout', 0.25))\n", + "\n", + " def forward(self, x_cat, x_cont):\n", + " x = [e(x_cat[:,i]) for i,e in enumerate(self.embeddings)]\n", + " x = torch.cat(x, 1)\n", + " x = self.emb_drop(x)\n", + " x2 = self.bn1(x_cont)\n", + " x = torch.cat([x, x2], 1)\n", + " x = F.relu(self.lin1(x))\n", + " x = self.drops(x)\n", + " x = self.bn2(x)\n", + " x = F.relu(self.lin2(x))\n", + " x = self.drops(x)\n", + " x = self.bn3(x)\n", + " x = self.lin3(x)\n", + " return x\n", + "\n", + "model = ShelterModel(embedding_sizes, 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "optimizer = torch.optim.SGD(model.parameters(), lr = configuration_dict.get('base_lr', 0.1), momentum = 0.9)\n", + "scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size = configuration_dict.get('number_of_epochs', 15)//3, gamma = 0.1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "device = torch.cuda.current_device() if torch.cuda.is_available() else torch.device('cpu')\n", + "print('Device to use: {}'.format(device))\n", + "model.to(device)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tensorboard_writer = SummaryWriter('./tensorboard_logs')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def train_model(model, optim, train_dl):\n", + " model.train()\n", + " total = 0\n", + " sum_loss = 0\n", + " for x1, x2, y in train_dl:\n", + " batch = y.shape[0]\n", + " output = model(x1.to(device), x2.to(device))\n", + " loss = F.cross_entropy(output, y.to(device)) \n", + " optim.zero_grad()\n", + " loss.backward()\n", + " optim.step()\n", + " total += batch\n", + " sum_loss += batch*(loss.item())\n", + " return sum_loss/total" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def val_loss(model, valid_dl, epoch):\n", + " model.eval()\n", + " total = 0\n", + " sum_loss = 0\n", + " correct = 0\n", + " with torch.no_grad():\n", + " for x1, x2, y in valid_dl:\n", + " current_batch_size = y.shape[0]\n", + " out = model(x1.to(device), x2.to(device))\n", + " loss = F.cross_entropy(out, y.to(device))\n", + " sum_loss += current_batch_size*(loss.item())\n", + " total += current_batch_size\n", + " pred = torch.max(out, 1)[1]\n", + " correct += (pred.cpu() == y).float().sum().item()\n", + " print(\"\\t valid loss %.3f and accuracy %.3f\" % (sum_loss/total, correct/total))\n", + " tensorboard_writer.add_scalar('accuracy/total', correct/total, epoch)\n", + " \n", + " debug_categories = pd.DataFrame(x1.numpy(), 
columns=columns_categories_ordered.keys())\n",
+    "        debug_numerical = pd.DataFrame(x2.numpy(), columns=columns_numerical)\n",
+    "        debug_gt = pd.DataFrame(np.array([reveresed_outcome_dict[int(e)] for e in y]), columns=['GT'])\n",
+    "        debug_pred = pd.DataFrame(np.array([reveresed_outcome_dict[int(e)] for e in pred.cpu()]), columns=['Pred'])\n",
+    "        debug_table = debug_categories.join([debug_numerical, debug_gt, debug_pred])\n",
+    "        logger.report_table(title='Validation batch - ground truth vs. predictions',series='pandas DataFrame',iteration=epoch, table_plot=debug_table.head())\n",
+    "    return sum_loss/total, correct/total"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def train_loop(model, epochs):\n",
+    "    for i in range(epochs): \n",
+    "        loss = train_model(model, optimizer, train_dl)\n",
+    "        print(\"Epoch {}: training loss {}\".format(i, loss))\n",
+    "        tensorboard_writer.add_scalar('training loss/loss', loss, i)\n",
+    "        tensorboard_writer.add_scalar('learning rate/lr', optimizer.param_groups[0]['lr'], i)\n",
+    "        \n",
+    "        val_loss(model, valid_dl, i)\n",
+    "        scheduler.step()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_dl = torch.utils.data.DataLoader(train_ds, batch_size=configuration_dict.get('batch_size', 100), shuffle=True, pin_memory=True, num_workers=1)\n",
+    "valid_dl = torch.utils.data.DataLoader(valid_ds, batch_size=configuration_dict.get('batch_size', 100), shuffle=False, pin_memory=True, num_workers=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_loop(model, epochs=configuration_dict.get('number_of_epochs', 30))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/examples/frameworks/pytorch/notebooks/text/text_classification_AG_NEWS.ipynb b/examples/frameworks/pytorch/notebooks/text/text_classification_AG_NEWS.ipynb
index 1990f94d..5ab63d2b 100644
--- a/examples/frameworks/pytorch/notebooks/text/text_classification_AG_NEWS.ipynb
+++ b/examples/frameworks/pytorch/notebooks/text/text_classification_AG_NEWS.ipynb
@@ -25,10 +25,7 @@
     "\n",
     "import torch\n",
     "import torch.nn as nn\n",
-    "import torch.nn.functional as F\n",
-    "import torchtext\n",
     "from torchtext.datasets import text_classification\n",
-    "from torch.utils.data import DataLoader\n",
     "from torch.utils.tensorboard import SummaryWriter\n",
     "\n",
     "from trains import Task\n",
@@ -264,7 +261,6 @@
    },
    "outputs": [],
    "source": [
-    "import re\n",
     "from torchtext.data.utils import ngrams_iterator\n",
     "from torchtext.data.utils import get_tokenizer\n",
     "\n",
@@ -313,5 +309,5 @@
   }
  },
  "nbformat": 4,
- "nbformat_minor": 1
-}
\ No newline at end of file
+ "nbformat_minor": 4
+}