mirror of
https://github.com/clearml/clearml
synced 2025-03-03 10:42:00 +00:00
Update tabular example scripts for the ML pipeline blog
This commit is contained in:
parent
8ec6bba4d9
commit
519677d987
@ -0,0 +1,135 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"! pip install -U pip\n",
|
||||||
|
"! pip install -U trains==0.16.2rc0\n",
|
||||||
|
"! pip install -U pandas==1.0.4\n",
|
||||||
|
"! pip install -U scikit-learn==0.23.1\n",
|
||||||
|
"! pip install -U pathlib2==2.3.5"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"from pathlib2 import Path\n",
|
||||||
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
|
"\n",
|
||||||
|
"from trains import Task"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"task = Task.init(project_name='Tabular Example', task_name='Download and split tabular dataset')\n",
|
||||||
|
"logger = task.get_logger()\n",
|
||||||
|
"configuration_dict = {'test_size': 0.1, 'split_random_state': 0}\n",
|
||||||
|
"configuration_dict = task.connect(configuration_dict) # enabling configuration override by trains\n",
|
||||||
|
"print(configuration_dict) # printing actual configuration (after override in remote mode)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# **Downloading**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Download the shelter-animal-outcomes dataset (https://www.kaggle.com/c/shelter-animal-outcomes)\n",
|
||||||
|
"# and save it to your cloud storage or your mounted local storage\n",
|
||||||
|
"# If the data is on your cloud storage, you can use trains' storage manager to get a local copy of it:\n",
|
||||||
|
"# from trains.storage import StorageManager\n",
|
||||||
|
"# path_to_ShelterAnimal = StorageManager.get_local_copy(\"s3://<your-bucket>/shelter-animal-outcomes.zip\", \n",
|
||||||
|
"# extract_archive=True)\n",
|
||||||
|
"path_to_ShelterAnimal = '/home/sam/Datasets/shelter-animal-outcomes'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"train_set = pd.read_csv(Path(path_to_ShelterAnimal) / 'train.csv')\n",
|
||||||
|
"logger.report_table(title='Trainset - raw',series='pandas DataFrame',iteration=0, table_plot=train_set.head())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# **Splitting into train and validation sets**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"X = train_set.drop(columns= ['OutcomeType'])\n",
|
||||||
|
"Y = train_set['OutcomeType']\n",
|
||||||
|
"X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=configuration_dict.get('test_size', 0.1), \n",
|
||||||
|
" random_state=configuration_dict.get('split_random_state', 0))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"train_df = X_train.join(Y_train)\n",
|
||||||
|
"val_df = X_val.join(Y_val)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"task.upload_artifact('train_data', artifact_object=train_df)\n",
|
||||||
|
"task.upload_artifact('val_data', artifact_object=val_df)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.7.4"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 4
|
||||||
|
}
|
@ -0,0 +1,96 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"! pip install -U pip\n",
|
||||||
|
"! pip install -U trains==0.16.2rc0"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from trains import Task, OutputModel"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"task = Task.init(project_name='Tabular Example', task_name='pick best model')\n",
|
||||||
|
"configuration_dict = {'train_tasks_ids': ['c9bff3d15309487a9e5aaa00358ff091', 'c9bff3d15309487a9e5aaa00358ff091']}\n",
|
||||||
|
"configuration_dict = task.connect(configuration_dict) # enabling configuration override by trains\n",
|
||||||
|
"print(configuration_dict) # printing actual configuration (after override in remote mode)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"results = {}\n",
|
||||||
|
"for task_id in configuration_dict.get('train_tasks_ids'):\n",
|
||||||
|
" train_task = Task.get_task(task_id)\n",
|
||||||
|
" results[task_id] = train_task.get_last_scalar_metrics()['accuracy']['total']['last']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"print(results)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"best_model_task_id = max(results.items(), key=lambda x: x[1])[0]\n",
|
||||||
|
"best_model_id = Task.get_task(best_model_task_id).output_model_id"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"OutputModel(base_model_id=best_model_id)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.7.4"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 4
|
||||||
|
}
|
@ -0,0 +1,300 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"! pip install -U pip\n",
|
||||||
|
"! pip install -U trains==0.16.2rc0\n",
|
||||||
|
"! pip install -U pandas==1.0.4\n",
|
||||||
|
"! pip install -U numpy==1.18.4"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"from collections import Counter\n",
|
||||||
|
"\n",
|
||||||
|
"from trains import Task"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"task = Task.init(project_name='Tabular Example', task_name='tabular preprocessing')\n",
|
||||||
|
"logger = task.get_logger()\n",
|
||||||
|
"configuration_dict = {'data_task_id': '39fbf86fc4a341359ac6df4aa70ff91b',\n",
|
||||||
|
" 'fill_categorical_NA': True, 'fill_numerical_NA': True}\n",
|
||||||
|
"configuration_dict = task.connect(configuration_dict) # enabling configuration override by trains\n",
|
||||||
|
"print(configuration_dict) # printing actual configuration (after override in remote mode)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"data_task = Task.get_task(configuration_dict.get('data_task_id'))\n",
|
||||||
|
"train_set = data_task.artifacts['train_data'].get().drop(columns=['Unnamed: 0'])\n",
|
||||||
|
"val_set = data_task.artifacts['val_data'].get().drop(columns=['Unnamed: 0'])\n",
|
||||||
|
"logger.report_table(title='Trainset - raw',series='pandas DataFrame',iteration=0, table_plot=train_set.head())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# **Pre-processing**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Replace the DateTime column with a categorical Month column (day, year and time of day are dropped)\n",
|
||||||
|
"def change_time_format(data_frame):\n",
|
||||||
|
" timestamp = pd.to_datetime(data_frame['DateTime'])\n",
|
||||||
|
" months = [d.month for d in timestamp]\n",
|
||||||
|
" data_frame['Month'] = pd.DataFrame(months).astype('object')\n",
|
||||||
|
" data_frame.drop(columns= ['DateTime'], inplace=True)\n",
|
||||||
|
" return data_frame\n",
|
||||||
|
"\n",
|
||||||
|
"train_set = change_time_format(train_set)\n",
|
||||||
|
"val_set = change_time_format(val_set)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def change_age_format(data_frame): \n",
|
||||||
|
" age = data_frame['AgeuponOutcome']\n",
|
||||||
|
" months_age = []\n",
|
||||||
|
" for val in age:\n",
|
||||||
|
" if pd.isnull(val):\n",
|
||||||
|
" months_age.append(val)\n",
|
||||||
|
" else:\n",
|
||||||
|
" amount, time_type = val.split(' ')\n",
|
||||||
|
" if 'day' in time_type:\n",
|
||||||
|
" mult = 1./30\n",
|
||||||
|
" if 'week' in time_type:\n",
|
||||||
|
" mult = 1./4\n",
|
||||||
|
" if 'month' in time_type:\n",
|
||||||
|
" mult = 1.\n",
|
||||||
|
" if 'year' in time_type:\n",
|
||||||
|
" mult = 12.\n",
|
||||||
|
" months_age.append(int(amount) * mult)\n",
|
||||||
|
" data_frame['Age'] = pd.DataFrame(months_age).astype(np.float32)\n",
|
||||||
|
" data_frame.drop(columns= ['AgeuponOutcome'], inplace=True)\n",
|
||||||
|
" return data_frame\n",
|
||||||
|
" \n",
|
||||||
|
"train_set = change_age_format(train_set)\n",
|
||||||
|
"val_set = change_age_format(val_set)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def change_sex_format(data_frame): \n",
|
||||||
|
" sex_neutered = data_frame['SexuponOutcome']\n",
|
||||||
|
" sex = []\n",
|
||||||
|
" neutered = []\n",
|
||||||
|
" for val in sex_neutered:\n",
|
||||||
|
" if pd.isnull(val):\n",
|
||||||
|
" sex.append(val)\n",
|
||||||
|
" neutered.append(val)\n",
|
||||||
|
" elif 'Unknown' in val:\n",
|
||||||
|
" sex.append(np.nan)\n",
|
||||||
|
" neutered.append(np.nan)\n",
|
||||||
|
" else:\n",
|
||||||
|
" n, s = val.split(' ')\n",
|
||||||
|
" if n in ['Neutered', 'Spayed']:\n",
|
||||||
|
" neutered.append('Yes')\n",
|
||||||
|
" else:\n",
|
||||||
|
" neutered.append('No')\n",
|
||||||
|
" sex.append(s)\n",
|
||||||
|
"\n",
|
||||||
|
" data_frame['Sex'] = pd.DataFrame(sex)\n",
|
||||||
|
" data_frame['Neutered'] = pd.DataFrame(neutered)\n",
|
||||||
|
" data_frame.drop(columns= ['SexuponOutcome'], inplace=True)\n",
|
||||||
|
" return data_frame\n",
|
||||||
|
"\n",
|
||||||
|
"train_set = change_sex_format(train_set)\n",
|
||||||
|
"val_set = change_sex_format(val_set)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Remove irrelevant columns\n",
|
||||||
|
"def remove_columns(data_frame, list_columns_names=None):\n",
|
||||||
|
" if list_columns_names is not None:\n",
|
||||||
|
" data_frame.drop(columns= list_columns_names, inplace=True)\n",
|
||||||
|
" return data_frame\n",
|
||||||
|
"\n",
|
||||||
|
"train_set = remove_columns(train_set, ['Name', 'OutcomeSubtype', 'AnimalID'])\n",
|
||||||
|
"val_set = remove_columns(val_set, ['Name', 'OutcomeSubtype', 'AnimalID'])\n",
|
||||||
|
"\n",
|
||||||
|
"logger.report_table(title='Trainset - after preprocessing',series='pandas DataFrame',iteration=0, table_plot=train_set.head())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## *Fill NA Values*"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"object_columns = train_set.select_dtypes(include=['object']).copy()\n",
|
||||||
|
"numerical_columns = train_set.select_dtypes(include=['number']).copy()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"if configuration_dict.get('fill_categorical_NA', True):\n",
|
||||||
|
" for col in object_columns.columns:\n",
|
||||||
|
" if object_columns[col].isnull().sum() > 0:\n",
|
||||||
|
" most_common = Counter(object_columns[col]).most_common(1)[0][0]\n",
|
||||||
|
" print('Column \"{}\": replacing null values with \"{}\"'.format(col, most_common))\n",
|
||||||
|
" train_set[col].fillna(most_common, inplace=True)\n",
|
||||||
|
" val_set[col].fillna(most_common, inplace=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"if configuration_dict.get('fill_numerical_NA', True):\n",
|
||||||
|
" for col in numerical_columns.columns:\n",
|
||||||
|
" if numerical_columns[col].isnull().sum() > 0:\n",
|
||||||
|
" median_val = numerical_columns[col].median()\n",
|
||||||
|
" print('Column \"{}\": replacing null values with \"{}\"'.format(col, median_val))\n",
|
||||||
|
" train_set[col].fillna(median_val, inplace=True)\n",
|
||||||
|
" val_set[col].fillna(median_val, inplace=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Drop any rows that still contain NA values (i.e. values that were chosen not to be filled)\n",
|
||||||
|
"train_set.dropna(inplace=True)\n",
|
||||||
|
"val_set.dropna(inplace=True)\n",
|
||||||
|
"if configuration_dict.get('fill_categorical_NA', True) or configuration_dict.get('fill_numerical_NA', True):\n",
|
||||||
|
" logger.report_table(title='Trainset - after filling missing values',series='pandas DataFrame',iteration=0, table_plot=train_set.head())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## *Labels Encoding*"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"all_data = pd.concat([train_set, val_set])\n",
|
||||||
|
"outcome_categories = all_data['OutcomeType'].astype('category').cat.categories\n",
|
||||||
|
"outcome_dict = {key: val for val,key in enumerate(outcome_categories)}\n",
|
||||||
|
"task.upload_artifact('Outcome dictionary', outcome_dict)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"for col in object_columns.columns:\n",
|
||||||
|
" all_data[col] = all_data[col].astype('category').cat.codes\n",
|
||||||
|
"train_set = all_data.iloc[:len(train_set.index), :]\n",
|
||||||
|
"val_set = all_data.iloc[len(train_set.index):, :]\n",
|
||||||
|
"logger.report_table(title='Trainset - after labels encoding',series='pandas DataFrame',iteration=0, table_plot=train_set.head())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Cast the categorical feature columns (every object column except the OutcomeType label) to category dtype\n",
|
||||||
|
"object_columns_names = object_columns.drop(columns= ['OutcomeType']).columns\n",
|
||||||
|
"for col in object_columns_names:\n",
|
||||||
|
" all_data[col] = all_data[col].astype('category')\n",
|
||||||
|
"columns_categries = {col: len(all_data[col].cat.categories) for col in object_columns_names}\n",
|
||||||
|
"task.upload_artifact('Categries per column', columns_categries)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"task.upload_artifact('train_data', artifact_object=train_set)\n",
|
||||||
|
"task.upload_artifact('val_data', artifact_object=val_set)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.7.4"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 4
|
||||||
|
}
|
@ -0,0 +1,97 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# pip install with locked versions\n",
|
||||||
|
"! pip install -U pip\n",
|
||||||
|
"! pip install -U trains==0.16.2rc0"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from trains import Task\n",
|
||||||
|
"from trains.automation.controller import PipelineController"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"task = Task.init(project_name='Tabular Example', task_name='tabular training pipeline', task_type=Task.TaskTypes.controller)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"pipe = PipelineController(default_execution_queue='dan_queue', add_pipeline_tags=True)\n",
|
||||||
|
"pipe.add_step(name='preprocessing_1', base_task_project='Tabular Example', base_task_name='tabular preprocessing',\n",
|
||||||
|
" parameter_override={'General/data_task_id': '39fbf86fc4a341359ac6df4aa70ff91b',\n",
|
||||||
|
" 'General/fill_categorical_NA': 'True',\n",
|
||||||
|
" 'General/fill_numerical_NA': 'True'})\n",
|
||||||
|
"pipe.add_step(name='preprocessing_2', base_task_project='Tabular Example', base_task_name='tabular preprocessing',\n",
|
||||||
|
" parameter_override={'General/data_task_id': '39fbf86fc4a341359ac6df4aa70ff91b',\n",
|
||||||
|
" 'General/fill_categorical_NA': 'False',\n",
|
||||||
|
" 'General/fill_numerical_NA': 'True'})\n",
|
||||||
|
" \n",
|
||||||
|
"pipe.add_step(name='train_1', parents=['preprocessing_1'],\n",
|
||||||
|
" base_task_project='Tabular Example', base_task_name='tabular prediction',\n",
|
||||||
|
" parameter_override={'General/data_task_id': '${preprocessing_1.id}'})\n",
|
||||||
|
"pipe.add_step(name='train_2', parents=['preprocessing_2'],\n",
|
||||||
|
" base_task_project='Tabular Example', base_task_name='tabular prediction',\n",
|
||||||
|
" parameter_override={'General/data_task_id': '${preprocessing_2.id}'})\n",
|
||||||
|
" \n",
|
||||||
|
"pipe.add_step(name='pick_best', parents=['train_1', 'train_2'],\n",
|
||||||
|
" base_task_project='Tabular Example', base_task_name='pick best model',\n",
|
||||||
|
" parameter_override={'General/train_tasks_ids': '[${train_1.id}, ${train_2.id}]'}) "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Starting the pipeline (in the background)\n",
|
||||||
|
"pipe.start()\n",
|
||||||
|
"# Wait until pipeline terminates\n",
|
||||||
|
"pipe.wait()\n",
|
||||||
|
"# cleanup everything\n",
|
||||||
|
"pipe.stop()"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.7.4"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 4
|
||||||
|
}
|
@ -8,7 +8,7 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"! pip install -U pip\n",
|
"! pip install -U pip\n",
|
||||||
"! pip install -U torch==1.5.1\n",
|
"! pip install -U torch==1.5.1\n",
|
||||||
"! pip install -U trains>=0.15.1\n",
|
"! pip install -U trains==0.16.2rc0\n",
|
||||||
"! pip install -U pandas==1.0.4\n",
|
"! pip install -U pandas==1.0.4\n",
|
||||||
"! pip install -U numpy==1.18.4\n",
|
"! pip install -U numpy==1.18.4\n",
|
||||||
"! pip install -U tensorboard==2.2.1"
|
"! pip install -U tensorboard==2.2.1"
|
||||||
@ -38,9 +38,10 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"task = Task.init(project_name='Table Example', task_name='tabular prediction')\n",
|
"task = Task.init(project_name='Tabular Example', task_name='tabular prediction')\n",
|
||||||
"logger = task.get_logger()\n",
|
"logger = task.get_logger()\n",
|
||||||
"configuration_dict = {'number_of_epochs': 30, 'batch_size': 100, 'dropout': 0.3, 'base_lr': 0.1}\n",
|
"configuration_dict = {'data_task_id': 'b605d76398f941e69fc91b43420151d2', \n",
|
||||||
|
" 'number_of_epochs': 15, 'batch_size': 100, 'dropout': 0.3, 'base_lr': 0.1}\n",
|
||||||
"configuration_dict = task.connect(configuration_dict) # enabling configuration override by trains\n",
|
"configuration_dict = task.connect(configuration_dict) # enabling configuration override by trains\n",
|
||||||
"print(configuration_dict) # printing actual configuration (after override in remote mode)"
|
"print(configuration_dict) # printing actual configuration (after override in remote mode)"
|
||||||
]
|
]
|
||||||
@ -51,7 +52,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"previous_task = Task.get_task('ed7570e1e12d41e5a06557c81fdf1046')"
|
"data_task = Task.get_task(configuration_dict.get('data_task_id'))"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -60,9 +61,8 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"preprocessed_data = previous_task.artifacts['Processed data'].get()\n",
|
"train_set = data_task.artifacts['train_data'].get().drop(columns=['Unnamed: 0'])\n",
|
||||||
"train_set = pd.read_csv(preprocessed_data['train_data'])\n",
|
"test_set = data_task.artifacts['val_data'].get().drop(columns=['Unnamed: 0'])"
|
||||||
"test_set = pd.read_csv(preprocessed_data['val_data'])"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -71,7 +71,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"columns_categories = previous_task.artifacts['Categries per column'].get()\n",
|
"columns_categories = data_task.artifacts['Categries per column'].get()\n",
|
||||||
"columns_categories_ordered = {key: columns_categories[key] for key in train_set.columns if key in columns_categories.keys()}\n",
|
"columns_categories_ordered = {key: columns_categories[key] for key in train_set.columns if key in columns_categories.keys()}\n",
|
||||||
"columns_numerical = [key for key in train_set.drop(columns= ['OutcomeType']).drop(columns=columns_categories_ordered).keys()]\n",
|
"columns_numerical = [key for key in train_set.drop(columns= ['OutcomeType']).drop(columns=columns_categories_ordered).keys()]\n",
|
||||||
"embedding_sizes = [(n_categories, min(32, (n_categories+1)//2)) for _,n_categories in columns_categories_ordered.items()]"
|
"embedding_sizes = [(n_categories, min(32, (n_categories+1)//2)) for _,n_categories in columns_categories_ordered.items()]"
|
||||||
@ -83,7 +83,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"outcome_dict = previous_task.artifacts['Outcome dictionary'].get()\n",
|
"outcome_dict = data_task.artifacts['Outcome dictionary'].get()\n",
|
||||||
"reveresed_outcome_dict = {val: key for key, val in outcome_dict.items()}"
|
"reveresed_outcome_dict = {val: key for key, val in outcome_dict.items()}"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@ -269,6 +269,17 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"train_loop(model, epochs=configuration_dict.get('number_of_epochs', 30))"
|
"train_loop(model, epochs=configuration_dict.get('number_of_epochs', 30))"
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"PATH = './model_checkpoint.pth'\n",
|
||||||
|
"torch.save(model.state_dict(), PATH)\n",
|
||||||
|
"tensorboard_writer.close()"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
Loading…
Reference in New Issue
Block a user