Mirror of https://github.com/clearml/clearml, synced 2025-04-02 20:11:00 +00:00
Update pytorch examples

This commit is contained in: parent 73bd8c2714, commit f74c89a25d
examples/frameworks/pytorch/notebooks/audio/README.md (new file, 1 line)
@@ -0,0 +1 @@
The `audio_classifier_UrbanSound8K.ipynb` example uses a small dataset based on the [UrbanSound8K dataset](https://urbansounddataset.weebly.com/urbansound8k.html).
@@ -35,7 +35,6 @@
"import io\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"from pathlib2 import Path\n",
"import matplotlib.pyplot as plt\n",
"\n",
@@ -50,6 +49,7 @@
"from torchvision.transforms import ToTensor\n",
"\n",
"from trains import Task\n",
+"from trains.storage import StorageManager\n",
"\n",
"%matplotlib inline"
]
@@ -60,7 +60,7 @@
"metadata": {},
"outputs": [],
"source": [
-"task = Task.init(project_name='Audio Example', task_name='audio classifier')\n",
+"task = Task.init(project_name='Audio Example', task_name='audio classification UrbanSound8K')\n",
"configuration_dict = {'number_of_epochs': 10, 'batch_size': 4, 'dropout': 0.25, 'base_lr': 0.001}\n",
"configuration_dict = task.connect(configuration_dict) # enabling configuration override by trains\n",
"print(configuration_dict) # printing actual configuration (after override in remote mode)"
@@ -77,8 +77,8 @@
},
"outputs": [],
"source": [
-"# Download UrbanSound8K dataset (https://urbansounddataset.weebly.com/urbansound8k.html)\n",
-"path_to_UrbanSound8K = './data/UrbanSound8K'"
+"# Download a sample dataset (https://allegro-datasets.s3.amazonaws.com/trains/UrbanSound8K.zip) based on the UrbanSound8K dataset (https://urbansounddataset.weebly.com/urbansound8k.html)\n",
+"path_to_UrbanSound8K = StorageManager.get_local_copy(\"https://allegro-datasets.s3.amazonaws.com/trains/UrbanSound8K.zip\", extract_archive=True)"
]
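The hunk above swaps a hard-coded local path for `StorageManager.get_local_copy`. A minimal sketch of the call as it appears in the changed cell, assuming network access to the sample bucket; the returned path points into the local trains cache:

```python
from trains.storage import StorageManager

# Downloads once, caches locally, extracts the archive, and returns the
# extracted folder; subsequent calls hit the cache instead of the network.
path_to_UrbanSound8K = StorageManager.get_local_copy(
    "https://allegro-datasets.s3.amazonaws.com/trains/UrbanSound8K.zip",
    extract_archive=True)
print(path_to_UrbanSound8K)
```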
},
{
@@ -134,8 +134,8 @@
" return len(self.file_names)\n",
"\n",
"\n",
-"csv_path = Path(path_to_UrbanSound8K) / 'metadata' / 'UrbanSound8K.csv'\n",
-"file_path = Path(path_to_UrbanSound8K) / 'audio'\n",
+"csv_path = Path(path_to_UrbanSound8K) / 'UrbanSound8K' / 'metadata' / 'UrbanSound8K.csv'\n",
+"file_path = Path(path_to_UrbanSound8K) / 'UrbanSound8K' / 'audio'\n",
"\n",
"train_set = UrbanSoundDataset(csv_path, file_path, range(1,10))\n",
"test_set = UrbanSoundDataset(csv_path, file_path, [10])\n",
@@ -338,18 +338,24 @@
"metadata": {
"colab": {},
"colab_type": "code",
-"id": "X5lx3g_5zNey",
-"scrolled": false
+"id": "X5lx3g_5zNey"
},
"outputs": [],
"source": [
-"log_interval = 100\n",
-"debug_interval = 200\n",
+"log_interval = 10\n",
+"debug_interval = 20\n",
"for epoch in range(configuration_dict.get('number_of_epochs', 10)):\n",
" train(model, epoch)\n",
" test(model, epoch)\n",
" scheduler.step()"
]
},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": []
+}
],
"metadata": {
@@ -376,5 +382,5 @@
}
},
"nbformat": 4,
-"nbformat_minor": 1
+"nbformat_minor": 4
}
@@ -23,7 +23,6 @@
"outputs": [],
"source": [
"import os\n",
"import torch\n",
"import torchaudio\n",
"from torch.utils.tensorboard import SummaryWriter\n",
"import matplotlib.pyplot as plt\n",
@@ -87,10 +86,10 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
+"scrolled": true,
"pycharm": {
"name": "#%%\n"
-},
-"scrolled": true
+}
},
"outputs": [],
"source": [
@@ -12,8 +12,8 @@
"\n",
"# pip install with locked versions\n",
"! pip install -U pandas==1.0.3\n",
-"! pip install -U trains==0.15.0\n",
-"! pip install -U hpbandster==0.7.4 # Needed only for Bayesian optimization Hyper-Band"
+"! pip install -U trains>=0.15.0\n",
+"! pip install -U optuna==2.0.0rc0"
]
},
{
@@ -23,8 +23,8 @@
"outputs": [],
"source": [
"from trains.automation import UniformParameterRange, UniformIntegerParameterRange\n",
-"from trains.automation import RandomSearch, HyperParameterOptimizer\n",
-"from trains.automation.hpbandster import OptimizerBOHB # Needed only for Bayesian optimization Hyper-Band\n",
+"from trains.automation import HyperParameterOptimizer\n",
+"from trains.automation.optuna import OptimizerOptuna\n",
"\n",
"from trains import Task"
]
@@ -35,7 +35,7 @@
"metadata": {},
"outputs": [],
"source": [
-"task = Task.init(project_name='Hyper-Parameter Search', task_name='Hyper-Parameter Optimization')"
+"task = Task.init(project_name='Hyper-Parameter Search', task_name='Hyper-Parameter Optimization')\n"
]
},
{
@@ -47,7 +47,7 @@
"#####################################################################\n",
"### Don't forget to replace this default id with your own task id ###\n",
"#####################################################################\n",
-"TEMPLATE_TASK_ID = 'd8e928460f98437c998f3597768597f8'"
+"TEMPLATE_TASK_ID = 'd551a9990cb5451c9c744cc58201c612'"
]
},
{
@@ -71,7 +71,7 @@
" objective_metric_sign='max', # maximize or minimize the objective metric\n",
" max_number_of_concurrent_tasks=3, # number of concurrent experiments\n",
" # setting optimizer - trains supports GridSearch, RandomSearch or OptimizerBOHB\n",
-" optimizer_class=OptimizerBOHB, # can be replaced with OptimizerBOHB\n",
+" optimizer_class=OptimizerOptuna, # can be replaced with OptimizerBOHB\n",
" execution_queue='default', # queue to schedule the experiments for execution\n",
" optimization_time_limit=30., # time limit for each experiment (optional, ignored by OptimizerBOHB)\n",
" pool_period_min=1, # Check the experiments every x minutes\n",
@@ -90,7 +90,7 @@
"metadata": {},
"outputs": [],
"source": [
-"optimizer.set_time_limit(in_minutes=120.0) # set the time limit for the optimization process\n",
+"optimizer.set_time_limit(in_minutes=90.0) # set the time limit for the optimization process\n",
"optimizer.start()\n",
"optimizer.wait() # wait until process is done\n",
"optimizer.stop() # make sure background optimization stopped"
@@ -12,10 +12,10 @@
"# jupyter nbextension enable --py widgetsnbextension\n",
"\n",
"# pip install with locked versions\n",
-"! pip install -U torch==1.5.0\n",
-"! pip install -U torchvision==0.6.0\n",
+"! pip install -U torch==1.5.1\n",
+"! pip install -U torchvision==0.6.1\n",
"! pip install -U numpy==1.18.4\n",
-"! pip install -U trains==0.15.0\n",
+"! pip install -U trains>=0.15.0\n",
"! pip install -U tensorboard==2.2.1"
]
},
@@ -45,7 +45,7 @@
"metadata": {},
"outputs": [],
"source": [
-"task = Task.init(project_name='Image Example', task_name='image_classification_CIFAR10')\n",
+"task = Task.init(project_name='Image Example', task_name='image classification CIFAR10')\n",
"configuration_dict = {'number_of_epochs': 3, 'batch_size': 4, 'dropout': 0.25, 'base_lr': 0.001}\n",
"configuration_dict = task.connect(configuration_dict) # enabling configuration override by trains\n",
"print(configuration_dict) # printing actual configuration (after override in remote mode)"
@@ -240,4 +240,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
}
@@ -0,0 +1,310 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"! pip install -U pip\n",
"! pip install -U torch==1.5.1\n",
"! pip install -U trains>=0.15.1\n",
"! pip install -U pandas==1.0.4\n",
"! pip install -U numpy==1.18.4\n",
"! pip install -U pathlib2==2.3.5\n",
"! pip install -U scikit-learn==0.23.1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from collections import Counter\n",
"from sklearn.model_selection import train_test_split\n",
"import torch\n",
"from datetime import datetime\n",
"from pathlib2 import Path\n",
"from trains import Task"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"task = Task.init(project_name='Table Example', task_name='tabular preprocessing')\n",
"logger = task.get_logger()\n",
"configuration_dict = {'test_size': 0.1, 'split_random_state': 0}\n",
"configuration_dict = task.connect(configuration_dict) # enabling configuration override by trains\n",
"print(configuration_dict) # printing actual configuration (after override in remote mode)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Download shelter-animal-outcomes dataset (https://www.kaggle.com/c/shelter-animal-outcomes)\n",
"# This dataset aims to improve understanding of trends in animal outcomes,\n",
"# which could help shelters focus their energy on specific animals who need extra help finding a new home.\n",
"path_to_ShelterAnimal = './data'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_set = pd.read_csv(Path(path_to_ShelterAnimal) / 'train.csv')\n",
"logger.report_table(title='Trainset - raw', series='pandas DataFrame', iteration=0, table_plot=train_set.head())"
]
},
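`logger.report_table` above attaches the DataFrame head to the task. A self-contained sketch, assuming a reachable trains server and a fresh interpreter; the demo task name and toy data are assumptions:

```python
import pandas as pd
from trains import Task

task = Task.init(project_name='Table Example', task_name='report_table demo')  # demo names
logger = task.get_logger()

df = pd.DataFrame({'AnimalType': ['Dog', 'Cat'], 'AgeuponOutcome': ['2 years', '3 weeks']})
# Renders the DataFrame as an interactive table under the task's plots.
logger.report_table(title='Trainset - raw', series='pandas DataFrame',
                    iteration=0, table_plot=df)
```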
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# **Pre-processing**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Remove hour and year from DateTime data\n",
"timestamp = pd.to_datetime(train_set['DateTime'])\n",
"months = [d.month for d in timestamp]\n",
"train_set['Month'] = pd.DataFrame(months).astype('object')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"age = train_set['AgeuponOutcome']\n",
"months_age = []\n",
"for val in age:\n",
" if pd.isnull(val):\n",
" months_age.append(val)\n",
" else:\n",
" amount, time_type = val.split(' ')\n",
" if 'day' in time_type:\n",
" mult = 1./30\n",
" if 'week' in time_type:\n",
" mult = 1./4\n",
" if 'month' in time_type:\n",
" mult = 1.\n",
" if 'year' in time_type:\n",
" mult = 12.\n",
" months_age.append(int(amount) * mult)\n",
"train_set['Age'] = pd.DataFrame(months_age).astype(np.float32)"
]
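The loop above normalizes free-text ages ('3 weeks', '2 years') into months. A hedged sketch of the same conversion as a reusable helper, with a couple of spot checks:

```python
import pandas as pd

def age_to_months(val):
    # '3 weeks' -> 0.75, '2 years' -> 24.0; NaN passes through unchanged
    if pd.isnull(val):
        return val
    amount, time_type = val.split(' ')
    multipliers = {'day': 1. / 30, 'week': 1. / 4, 'month': 1., 'year': 12.}
    for unit, mult in multipliers.items():
        if unit in time_type:
            return int(amount) * mult
    raise ValueError('unrecognized age string: {}'.format(val))

assert age_to_months('3 weeks') == 0.75
assert age_to_months('2 years') == 24.0
```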
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sex_neutered = train_set['SexuponOutcome']\n",
"sex = []\n",
"neutered = []\n",
"for val in sex_neutered:\n",
" if pd.isnull(val):\n",
" sex.append(val)\n",
" neutered.append(val)\n",
" elif 'Unknown' in val:\n",
" sex.append(np.nan)\n",
" neutered.append(np.nan)\n",
" else:\n",
" n, s = val.split(' ')\n",
" if n in ['Neutered', 'Spayed']:\n",
" neutered.append('Yes')\n",
" else:\n",
" neutered.append('No')\n",
" sex.append(s)\n",
"\n",
"train_set['Sex'] = pd.DataFrame(sex)\n",
"train_set['Neutered'] = pd.DataFrame(neutered)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Remove irrelevant columns\n",
"train_set.drop(columns= ['Name', 'OutcomeSubtype', 'AnimalID', 'DateTime', 'AgeuponOutcome', 'SexuponOutcome'], inplace=True)\n",
"logger.report_table(title='Trainset - after preprocessing', series='pandas DataFrame', iteration=0, table_plot=train_set.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## *Fill NA Values*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"object_columns = train_set.select_dtypes(include=['object']).copy()\n",
"numerical_columns = train_set.select_dtypes(include=['number']).copy()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for col in object_columns.columns:\n",
" if object_columns[col].isnull().sum() > 0:\n",
" most_common = Counter(object_columns[col]).most_common(1)[0][0]\n",
" print('Column \"{}\": replacing null values with \"{}\"'.format(col, most_common))\n",
" train_set[col].fillna(most_common, inplace=True)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for col in numerical_columns.columns:\n",
" if numerical_columns[col].isnull().sum() > 0:\n",
" median_val = numerical_columns[col].median()\n",
" print('Column \"{}\": replacing null values with \"{}\"'.format(col, median_val))\n",
" train_set[col].fillna(median_val, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"logger.report_table(title='Trainset - after filling missing values', series='pandas DataFrame', iteration=0, table_plot=train_set.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## *Labels Encoding*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"out_encoding = train_set['OutcomeType'].astype('category').cat.categories\n",
"outcome_dict = {key: val for val, key in enumerate(out_encoding)}\n",
"task.upload_artifact('Outcome dictionary', outcome_dict)"
]
},
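`upload_artifact` above stores the label mapping with the task so the companion prediction notebook can fetch it later. A minimal sketch of the round trip; the task id string is a placeholder, not a real id:

```python
from trains import Task

# Producer side (this notebook): attach the mapping to the current task.
task.upload_artifact('Outcome dictionary', outcome_dict)

# Consumer side (another task/notebook): fetch the artifact by the producer's id.
producer = Task.get_task('<preprocessing task id>')  # placeholder id
restored = producer.artifacts['Outcome dictionary'].get()
print(restored)
```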
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for col in object_columns.columns:\n",
" train_set[col] = train_set[col].astype('category').cat.codes\n",
"logger.report_table(title='Trainset - after labels encoding', series='pandas DataFrame', iteration=0, table_plot=train_set.head())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## *Splitting dataset*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X = train_set.drop(columns= ['OutcomeType'])\n",
"Y = train_set['OutcomeType']\n",
"X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=configuration_dict.get('test_size', 0.1),\n",
" random_state=configuration_dict.get('split_random_state', 0))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# making all variables categorical\n",
"object_columns_names = object_columns.drop(columns= ['OutcomeType']).columns\n",
"for col in object_columns_names:\n",
" X[col] = X[col].astype('category')\n",
"columns_categories = {col: len(X[col].cat.categories) for col in object_columns_names}\n",
"task.upload_artifact('Categories per column', columns_categories)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_df = X_train.join(Y_train)\n",
"train_df.to_csv(Path(path_to_ShelterAnimal) / 'train_processed.csv', index=False)\n",
"val_df = X_val.join(Y_val)\n",
"val_df.to_csv(Path(path_to_ShelterAnimal) / 'val_processed.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"paths = {'train_data': str(Path(path_to_ShelterAnimal) / 'train_processed.csv'), 'val_data': str(Path(path_to_ShelterAnimal) / 'val_processed.csv')}\n",
"task.upload_artifact('Processed data', paths)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
@@ -0,0 +1,295 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"! pip install -U pip\n",
"! pip install -U torch==1.5.1\n",
"! pip install -U trains>=0.15.1\n",
"! pip install -U pandas==1.0.4\n",
"! pip install -U numpy==1.18.4\n",
"! pip install -U tensorboard==2.2.1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"from torch.utils.data import Dataset\n",
"from torch.utils.tensorboard import SummaryWriter\n",
"\n",
"from trains import Task"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"task = Task.init(project_name='Table Example', task_name='tabular prediction')\n",
"logger = task.get_logger()\n",
"configuration_dict = {'number_of_epochs': 30, 'batch_size': 100, 'dropout': 0.3, 'base_lr': 0.1}\n",
"configuration_dict = task.connect(configuration_dict) # enabling configuration override by trains\n",
"print(configuration_dict) # printing actual configuration (after override in remote mode)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"previous_task = Task.get_task('ed7570e1e12d41e5a06557c81fdf1046')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"preprocessed_data = previous_task.artifacts['Processed data'].get()\n",
"train_set = pd.read_csv(preprocessed_data['train_data'])\n",
"test_set = pd.read_csv(preprocessed_data['val_data'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"columns_categories = previous_task.artifacts['Categories per column'].get()\n",
"columns_categories_ordered = {key: columns_categories[key] for key in train_set.columns if key in columns_categories.keys()}\n",
"columns_numerical = [key for key in train_set.drop(columns= ['OutcomeType']).drop(columns=columns_categories_ordered).keys()]\n",
"embedding_sizes = [(n_categories, min(32, (n_categories+1)//2)) for _, n_categories in columns_categories_ordered.items()]"
]
},
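The rule above sizes each embedding as `min(32, (n_categories + 1) // 2)`: roughly half the cardinality, capped at 32. A quick worked check of the formula:

```python
# (n_categories, embedding_dim) for a few category counts
for n_categories in [2, 5, 44, 1380]:
    print(n_categories, min(32, (n_categories + 1) // 2))
# 2 -> 1, 5 -> 3, 44 -> 22, 1380 -> 32 (capped)
```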
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"outcome_dict = previous_task.artifacts['Outcome dictionary'].get()\n",
"reversed_outcome_dict = {val: key for key, val in outcome_dict.items()}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class ShelterDataset(Dataset):\n",
" def __init__(self, X, Y, embedded_col_names):\n",
" X = X.copy()\n",
" self.X1 = X.loc[:, embedded_col_names].copy().values.astype(np.int64) # categorical columns\n",
" self.X2 = X.drop(columns=embedded_col_names).copy().values.astype(np.float32) # numerical columns\n",
" self.y = Y\n",
"\n",
" def __len__(self):\n",
" return len(self.y)\n",
"\n",
" def __getitem__(self, idx):\n",
" return self.X1[idx], self.X2[idx], self.y[idx]\n",
"\n",
"# creating train and valid datasets\n",
"train_ds = ShelterDataset(train_set.drop(columns= ['OutcomeType']), train_set['OutcomeType'], columns_categories_ordered.keys())\n",
"valid_ds = ShelterDataset(test_set.drop(columns= ['OutcomeType']), test_set['OutcomeType'], columns_categories_ordered.keys())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class ShelterModel(nn.Module):\n",
" def __init__(self, embedding_sizes, n_cont):\n",
" super().__init__()\n",
" self.embeddings = nn.ModuleList([nn.Embedding(categories, size) for categories, size in embedding_sizes])\n",
" n_emb = sum(e.embedding_dim for e in self.embeddings)\n",
" self.n_emb, self.n_cont = n_emb, n_cont\n",
" self.lin1 = nn.Linear(self.n_emb + self.n_cont, 200)\n",
" self.lin2 = nn.Linear(200, 70)\n",
" self.lin3 = nn.Linear(70, 5)\n",
" self.bn1 = nn.BatchNorm1d(self.n_cont)\n",
" self.bn2 = nn.BatchNorm1d(200)\n",
" self.bn3 = nn.BatchNorm1d(70)\n",
" self.emb_drop = nn.Dropout(0.6)\n",
" self.drops = nn.Dropout(configuration_dict.get('dropout', 0.25))\n",
"\n",
" def forward(self, x_cat, x_cont):\n",
" x = [e(x_cat[:, i]) for i, e in enumerate(self.embeddings)]\n",
" x = torch.cat(x, 1)\n",
" x = self.emb_drop(x)\n",
" x2 = self.bn1(x_cont)\n",
" x = torch.cat([x, x2], 1)\n",
" x = F.relu(self.lin1(x))\n",
" x = self.drops(x)\n",
" x = self.bn2(x)\n",
" x = F.relu(self.lin2(x))\n",
" x = self.drops(x)\n",
" x = self.bn3(x)\n",
" x = self.lin3(x)\n",
" return x\n",
"\n",
"model = ShelterModel(embedding_sizes, 1)"
]
},
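A hedged smoke test for the model defined above, run after the cells above so `ShelterModel` and `configuration_dict` exist; the column counts and batch size are made up for the demo:

```python
import torch

# Dummy batch: 4 rows, two categorical columns (5 and 3 categories), one continuous column.
sizes = [(5, 3), (3, 2)]             # (n_categories, embedding_dim) pairs, assumed for the demo
demo_model = ShelterModel(sizes, n_cont=1)
x_cat = torch.randint(0, 3, (4, 2))  # category codes per column
x_cont = torch.randn(4, 1)
out = demo_model(x_cat, x_cont)
print(out.shape)                     # torch.Size([4, 5]) - logits over the 5 outcome classes
```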
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"optimizer = torch.optim.SGD(model.parameters(), lr=configuration_dict.get('base_lr', 0.1), momentum=0.9)\n",
"scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=configuration_dict.get('number_of_epochs', 15)//3, gamma=0.1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"device = torch.cuda.current_device() if torch.cuda.is_available() else torch.device('cpu')\n",
"print('Device to use: {}'.format(device))\n",
"model.to(device)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tensorboard_writer = SummaryWriter('./tensorboard_logs')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def train_model(model, optim, train_dl):\n",
" model.train()\n",
" total = 0\n",
" sum_loss = 0\n",
" for x1, x2, y in train_dl:\n",
" batch = y.shape[0]\n",
" output = model(x1.to(device), x2.to(device))\n",
" loss = F.cross_entropy(output, y.to(device))\n",
" optim.zero_grad()\n",
" loss.backward()\n",
" optim.step()\n",
" total += batch\n",
" sum_loss += batch*(loss.item())\n",
" return sum_loss/total"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def val_loss(model, valid_dl, epoch):\n",
" model.eval()\n",
" total = 0\n",
" sum_loss = 0\n",
" correct = 0\n",
" with torch.no_grad():\n",
" for x1, x2, y in valid_dl:\n",
" current_batch_size = y.shape[0]\n",
" out = model(x1.to(device), x2.to(device))\n",
" loss = F.cross_entropy(out, y.to(device))\n",
" sum_loss += current_batch_size*(loss.item())\n",
" total += current_batch_size\n",
" pred = torch.max(out, 1)[1]\n",
" correct += (pred.cpu() == y).float().sum().item()\n",
" print(\"\\t valid loss %.3f and accuracy %.3f\" % (sum_loss/total, correct/total))\n",
" tensorboard_writer.add_scalar('accuracy/total', correct/total, epoch)\n",
"\n",
" debug_categories = pd.DataFrame(x1.numpy(), columns=columns_categories_ordered.keys())\n",
" debug_numerical = pd.DataFrame(x2.numpy(), columns=columns_numerical)\n",
" debug_gt = pd.DataFrame(np.array([reversed_outcome_dict[int(e)] for e in y]), columns=['GT'])\n",
" debug_pred = pd.DataFrame(np.array([reversed_outcome_dict[int(e)] for e in pred.cpu()]), columns=['Pred'])\n",
" debug_table = debug_categories.join([debug_numerical, debug_gt, debug_pred])\n",
" logger.report_table(title='Valset - predictions sample', series='pandas DataFrame', iteration=epoch, table_plot=debug_table.head())\n",
" return sum_loss/total, correct/total"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def train_loop(model, epochs):\n",
" for i in range(epochs):\n",
" loss = train_model(model, optimizer, train_dl)\n",
" print(\"Epoch {}: training loss {}\".format(i, loss))\n",
" tensorboard_writer.add_scalar('training loss/loss', loss, i)\n",
" tensorboard_writer.add_scalar('learning rate/lr', optimizer.param_groups[0]['lr'], i)\n",
"\n",
" val_loss(model, valid_dl, i)\n",
" scheduler.step()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_dl = torch.utils.data.DataLoader(train_ds, batch_size=configuration_dict.get('batch_size', 100), shuffle=True, pin_memory=True, num_workers=1)\n",
"valid_dl = torch.utils.data.DataLoader(valid_ds, batch_size=configuration_dict.get('batch_size', 100), shuffle=False, pin_memory=True, num_workers=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_loop(model, epochs=configuration_dict.get('number_of_epochs', 30))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
@@ -25,10 +25,7 @@
"\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"import torchtext\n",
"from torchtext.datasets import text_classification\n",
"from torch.utils.data import DataLoader\n",
"from torch.utils.tensorboard import SummaryWriter\n",
"\n",
"from trains import Task\n",
@@ -264,7 +261,6 @@
},
"outputs": [],
"source": [
"import re\n",
"from torchtext.data.utils import ngrams_iterator\n",
"from torchtext.data.utils import get_tokenizer\n",
"\n",
@@ -313,5 +309,5 @@
}
},
"nbformat": 4,
-"nbformat_minor": 1
+"nbformat_minor": 4
}