Refactor examples

This commit is contained in:
allegroai
2020-06-15 22:48:51 +03:00
parent bec31c7ac4
commit 99368abb1c
78 changed files with 3505 additions and 1294 deletions

View File

@@ -0,0 +1 @@
trains

View File

@@ -0,0 +1,86 @@
# TRAINS - Keras with Tensorboard example code, automatic logging model and Tensorboard outputs
#
# Train a simple deep NN on the MNIST dataset.
# Gets to 98.40% test accuracy after 20 epochs
# (there is *a lot* of margin for parameter tuning).
# 2 seconds per epoch on a K520 GPU.
from __future__ import print_function
import tempfile
import os
from keras.callbacks import TensorBoard, ModelCheckpoint
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import RMSprop
from keras.utils import np_utils
import tensorflow as tf # noqa: F401
from trains import Task, Logger
# Register this run with TRAINS under the project/task name that the
# hyper-parameter optimization example looks up as its template.
task = Task.init(
    project_name='examples',
    task_name='Keras HP optimization base',
)

# MNIST, already shuffled and split between train and test sets
nb_classes = 10
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# flatten every image into a 784-long float32 vector and divide by 255
X_train = X_train.reshape(60000, 784)
X_train = X_train.astype('float32') / 255.
X_test = X_test.reshape(10000, 784)
X_test = X_test.astype('float32') / 255.

for samples, caption in ((X_train, 'train samples'), (X_test, 'test samples')):
    print(samples.shape[0], caption)

# convert the integer class vectors into binary (one-hot) class matrices
Y_train, Y_test = (
    np_utils.to_categorical(class_vector, nb_classes)
    for class_vector in (y_train, y_test)
)
# Hyper-parameters of the experiment. Connecting the dictionary to the task
# hands back the values actually used for this run, so everything below reads
# them from `args` instead of using literals.
args = task.connect({
    'batch_size': 128,
    'epochs': 6,
    'layer_1': 512,
    'layer_2': 512,
    'layer_3': 10,
    'layer_4': 512,
})

# Main network: two ReLU hidden layers followed by a softmax output layer.
# (The Dropout(0.2) layers after each hidden activation are left disabled.)
model = Sequential([
    Dense(args['layer_1'], input_shape=(784,)),
    Activation('relu'),
    Dense(args['layer_2']),
    Activation('relu'),
    Dense(args['layer_3']),
    Activation('softmax'),
])

# Secondary single-layer network; it is built here but never trained below.
model2 = Sequential([
    Dense(args['layer_4'], input_shape=(784,)),
    Activation('relu'),
])

model.summary()
model.compile(
    optimizer=RMSprop(),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# Advanced: attach a label-name -> class-index enumeration to the model
labels = {'digit_%d' % digit: digit for digit in range(10)}
task.set_model_label_enumeration(labels)

# TensorBoard logs and the model checkpoint share one folder under the
# system temporary directory.
output_folder = os.path.join(tempfile.gettempdir(), 'keras_example')
checkpoint_path = os.path.join(output_folder, 'weight.hdf5')
board = TensorBoard(log_dir=output_folder, write_images=False)
model_store = ModelCheckpoint(filepath=checkpoint_path)

history = model.fit(
    X_train, Y_train,
    batch_size=args['batch_size'],
    epochs=args['epochs'],
    callbacks=[board, model_store],
    validation_data=(X_test, Y_test),
)

# score[0] is the loss, score[1] the accuracy metric requested in compile()
score = model.evaluate(X_test, Y_test, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

# Report the final evaluation explicitly, using the epoch count as iteration.
logger = Logger.current_logger()
for series, value in (('score', score[0]), ('accuracy', score[1])):
    logger.report_scalar(title='evaluate', series=series, value=value, iteration=args['epochs'])

View File

@@ -0,0 +1,114 @@
import logging
from trains import Task
from trains.automation import DiscreteParameterRange, HyperParameterOptimizer, RandomSearch, \
UniformIntegerParameterRange
# Pick the search strategy: BOHB when it can be imported, RandomSearch otherwise.
try:
    from trains.automation.hpbandster import OptimizerBOHB
except (ImportError, ValueError):
    # ValueError is what the original handler caught (presumably raised by the
    # trains wrapper when 'hpbandster' is missing - confirm against the trains
    # version in use). ImportError is handled as well so that a missing
    # submodule/package also falls back instead of aborting the script.
    logging.getLogger().warning(
        'Apologies, it seems you do not have \'hpbandster\' installed, '
        'we will be using RandomSearch strategy instead\n'
        'If you like to try ' '{{BOHB}: Robust and Efficient Hyperparameter Optimization at Scale},\n'
        'run: pip install hpbandster')
    Our_SearchStrategy = RandomSearch
else:
    Our_SearchStrategy = OptimizerBOHB
def job_complete_callback(
    job_id,                  # type: str
    objective_value,         # type: float
    objective_iteration,     # type: int
    job_parameters,          # type: dict
    top_performance_job_id   # type: str
):
    # type: (...) -> None
    """
    Print a summary line for a finished optimization job.

    An extra celebratory line is printed when the finished job is also the
    current top performing one.

    :param job_id: id of the job that just completed
    :param objective_value: objective value reported for that job
    :param objective_iteration: iteration at which the objective was reported
    :param job_parameters: hyper-parameters the job was executed with
    :param top_performance_job_id: id of the best performing job so far
    """
    print('Job completed!', job_id, objective_value, objective_iteration, job_parameters)
    if job_id != top_performance_job_id:
        return
    record_message = 'WOOT WOOT we broke the record! Objective reached {}'.format(objective_value)
    print(record_message)
# Connect to TRAINS: this script is registered as an optimizer-type task and
# always gets a fresh task instead of reusing the previous one.
task = Task.init(
    project_name='Hyper-Parameter Optimization',
    task_name='Automatic Hyper-Parameter Optimization',
    task_type=Task.TaskTypes.optimizer,
    reuse_last_task_id=False,
)

# Controller arguments:
#   template_task_id - experiment to use as the optimization template
#   run_as_service   - whether to enqueue this controller for remote execution
args = task.connect({
    'template_task_id': None,
    'run_as_service': False,
})

# No template given: fall back to the Keras base experiment, looked up by name
if not args['template_task_id']:
    template_task = Task.get_task(
        project_name='examples',
        task_name='Keras HP optimization base',
    )
    args['template_task_id'] = template_task.id
# Example use case.
# Hyper-parameters to explore; the names are the keys of the argument
# dictionary connected by the 'Keras HP optimization base' experiment.
search_space = [
    UniformIntegerParameterRange('layer_1', min_value=128, max_value=512, step_size=128),
    UniformIntegerParameterRange('layer_2', min_value=128, max_value=512, step_size=128),
    DiscreteParameterRange('batch_size', values=[96, 128, 160]),
    DiscreteParameterRange('epochs', values=[30]),
]

an_optimizer = HyperParameterOptimizer(
    # the experiment we want to optimize
    base_task_id=args['template_task_id'],
    # the hyper-parameters to optimize
    hyper_parameters=search_space,
    # the objective metric (title / series) we want to maximize or minimize
    objective_metric_title='val_acc',
    objective_metric_series='val_acc',
    # accuracy is something we maximize
    objective_metric_sign='max',
    # Limit the number of concurrent experiments so we do not bombard the
    # scheduler; with an auto-scaler connected this also, by proxy, limits
    # the number of machines.
    max_number_of_concurrent_tasks=2,
    # The optimizer class actually doing the optimization. Currently one of
    # GridSearch, RandomSearch or OptimizerBOHB (Bayesian optimization
    # Hyper-Band); more are coming soon...
    optimizer_class=Our_SearchStrategy,
    # Execution queue the experiments are scheduled on.
    # NOTE(review): 'moshik' looks like a site-specific queue name - point
    # this at a queue that exists on your server.
    execution_queue='moshik',
    # Optional: time limit of a single experiment, in minutes
    # (ignored when using OptimizerBOHB)
    time_limit_per_job=10.,
    # Checking the experiments every 6 seconds is way too often; with
    # experiments that usually run for hours this should be closer to 5 min.
    pool_period_min=0.1,
    # Maximum number of jobs to launch for the optimization, default (None)
    # is unlimited. With OptimizerBOHB it defines the maximum budget in terms
    # of full jobs: the cumulative number of iterations will not exceed
    # total_max_jobs * max_iteration_per_job.
    total_max_jobs=10,
    # Only applicable to OptimizerBOHB and ignored by the rest: the minimum
    # number of iterations of an experiment before early stopping.
    min_iteration_per_job=10,
    # Maximum number of iterations for an experiment to execute
    # (optional, unless using OptimizerBOHB where it is a must).
    max_iteration_per_job=30,
)
# If we are running as a service, enqueue ourselves into the services queue
# and let it run the optimization.
if args['run_as_service']:
    # When this code is executed by `trains-agent` the call does nothing.
    # When executed locally, the local process is terminated and a remote
    # copy is executed instead.
    task.execute_remotely(queue_name='services', exit_process=True)

# Report every 12 seconds. The period is passed in minutes (0.2 min == 12 s);
# the previous value of 2.2 did not match the documented interval.
# This is way too often for real use, but we are testing here.
an_optimizer.set_report_period(0.2)

# Start the optimization process; the callback is called every time an
# experiment is completed. This call returns immediately.
an_optimizer.start(job_complete_callback=job_complete_callback)

# set the time limit for the whole optimization process (2 hours)
an_optimizer.set_time_limit(in_minutes=120.0)

# Wait until the process is done (the optimization itself is running in the
# background and is only controlled from here).
an_optimizer.wait()

# optimization is completed, print the ids of the top performing experiments
top_exp = an_optimizer.get_top_experiments(top_k=3)
print([t.id for t in top_exp])

# make sure the background optimization stopped
an_optimizer.stop()

print('We are done, good bye')

View File

@@ -0,0 +1,3 @@
keras
tensorflow
trains

View File

@@ -0,0 +1,154 @@
import os
import socket
import subprocess
import sys
from copy import deepcopy
from tempfile import mkstemp
import psutil
# make sure we have jupyter in the auto requirements
from trains import Task
# Default docker image for this task; the trailing argument adds host
# networking to the container configuration.
os.environ.update(TRAINS_DOCKER_IMAGE="nvidia/cuda --network host")

# initialize TRAINS
task = Task.init(
    project_name="examples",
    task_name="Remote Jupyter NoteBook",
)
# TRAINS* variables that are kept for the child process; every other
# variable whose name starts with "TRAINS" is dropped from its environment.
preserve = (
    "TRAINS_API_HOST",
    "TRAINS_WEB_HOST",
    "TRAINS_FILES_HOST",
    "TRAINS_CONFIG_FILE",
    "TRAINS_API_ACCESS_KEY",
    "TRAINS_API_SECRET_KEY",
    "TRAINS_API_HOST_VERIFY_CERT",
)

# Build the child environment as a plain dict.
# A deep copy of os.environ is still an os._Environ object, so removing keys
# from it calls unsetenv() and strips the variables from *this* process too;
# filtering into a new dict leaves the running process untouched.
env = {
    key: value
    for key, value in os.environ.items()
    if key in preserve or not key.startswith("TRAINS")
}
# User-configurable settings, connected to the task:
#   jupyter_server_base_directory - folder the notebook server is started in
#   ssh_server / ssh_password     - whether to also install an SSH server,
#                                   and the root password to set for it
param = dict(
    jupyter_server_base_directory="~/",
    ssh_server=True,
    ssh_password="training",
)
task.connect(param)
# Work out how to name this machine in the messages printed below.
# noinspection PyBroadException
try:
    hostname = socket.gethostname()
    # a single address string resolved from the host name
    hostnames = socket.gethostbyname(hostname)
except Exception:
    # Name resolution failed: list the IPv4 address of every network
    # interface instead and use the first one as the host name.
    hostnames = [
        snic.address
        for snics in psutil.net_if_addrs().values()
        for snic in snics
        if snic.family == socket.AF_INET
    ]
    hostname = hostnames[0]
# Optionally install and start an SSH server (best effort: any failure is
# reported on stdout and the script carries on to launch Jupyter).
if param.get("ssh_server"):
    # local import: shlex is only needed when the SSH server is requested
    import shlex

    print("Installing SSH Server on {} [{}]".format(hostname, hostnames))
    ssh_password = param.get("ssh_password", "training")
    # noinspection PyBroadException
    try:
        # first port in [10022, 15000) that no current connection is bound to
        used_ports = {i.laddr.port for i in psutil.net_connections()}
        port = next(i for i in range(10022, 15000) if i not in used_ports)

        commands = [
            "apt-get install -y openssh-server",
            "mkdir -p /var/run/sshd",
            # The password is user supplied: quote it so that quotes or shell
            # meta-characters cannot break out of the command line.
            "echo {} | chpasswd".format(shlex.quote("root:{}".format(ssh_password))),
            "echo 'PermitRootLogin yes' >> /etc/ssh/sshd_config",
            "sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config",
            # raw string: the regex contains \s, which is not a valid Python escape
            r"sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd",
            'echo "export VISIBLE=now" >> /etc/profile',
        ]

        # Export the configuration file location to login shells only when it
        # is actually set; otherwise the literal "None" would be written.
        trains_config_file = os.environ.get("TRAINS_CONFIG_FILE")
        if trains_config_file:
            export_line = "export TRAINS_CONFIG_FILE={}".format(shlex.quote(trains_config_file))
            commands.append("echo {} >> /etc/profile".format(shlex.quote(export_line)))

        commands.append("/usr/sbin/sshd -p {}".format(port))

        # run the steps as one shell command line, stopping at the first failure
        result = os.system(" && ".join(commands))
        if result != 0:
            raise ValueError("SSH setup command exited with status {}".format(result))

        print(
            "\n#\n# SSH Server running on {} [{}] port {}\n# LOGIN u:root p:{}\n#\n".format(
                hostname, hostnames, port, ssh_password
            )
        )
    except Exception:
        print("\n#\n# Error: SSH server could not be launched\n#\n")
# Launch the Jupyter notebook server.
# Its stdout/stderr are redirected into a temporary file so that the output
# can be read back and printed by this script.
fd, local_filename = mkstemp()

# working directory: the configured base folder ("~" and environment
# variables expanded), or the current directory when none is configured
if param["jupyter_server_base_directory"]:
    cwd = os.path.expanduser(param["jupyter_server_base_directory"])
    cwd = os.path.expandvars(cwd)
else:
    cwd = os.getcwd()

print(
    "Running Jupyter Notebook Server on {} [{}] at {}".format(hostname, hostnames, cwd)
)

jupyter_command = [
    sys.executable,
    "-m",
    "jupyter",
    "notebook",
    "--no-browser",
    "--allow-root",
    "--ip",
    "0.0.0.0",
]
process = subprocess.Popen(
    jupyter_command,
    env=env,
    stdout=fd,
    stderr=fd,
    cwd=cwd,
)
# Relay the server's stdout/stderr until the process exits.
prev_line_count = 0
process_running = True
while process_running:
    # short wait until the first output shows up, longer waits afterwards
    wait_timeout = 15.0 if prev_line_count else 2.0
    try:
        process.wait(timeout=wait_timeout)
    except subprocess.TimeoutExpired:
        process_running = True
    else:
        # the process ended: this is the last pass over its output
        process_running = False

    # read whatever was written to the capture file since the last pass
    with open(local_filename, "rt") as f:
        new_lines = f.readlines()
    if not new_lines:
        continue

    print("".join(new_lines))

    # the first chunk of output: copy every line holding a link into the
    # task comment (jupyter notebook server links)
    if prev_line_count == 0:
        link_lines = [
            line for line in new_lines if "http://" in line or "https://" in line
        ]
        task.comment += "\n" + "".join(link_lines)
    prev_line_count += len(new_lines)

    # rewind and empty the capture file so the next pass sees only new output
    os.lseek(fd, 0, os.SEEK_SET)
    os.ftruncate(fd, 0)

# cleanup: close the capture file and remove it, ignoring removal failures
os.close(fd)
# noinspection PyBroadException
try:
    os.unlink(local_filename)
except Exception:
    pass