Refactor examples

2025-06-26 18:16:07 +00:00 · 2020-06-15 22:48:51 +03:00
parent bec31c7ac4
commit 99368abb1c
78 changed files with 3505 additions and 1294 deletions
--- a/examples/services/cleanup/cleanup_service.py
+++ b/examples/services/cleanup/cleanup_service.py
--- a/examples/services/cleanup/requirements.txt
+++ b/examples/services/cleanup/requirements.txt
@@ -0,0 +1 @@
+trains
--- a/examples/services/hyper-parameter-optimization/base_template_keras_simple.py
+++ b/examples/services/hyper-parameter-optimization/base_template_keras_simple.py
@@ -0,0 +1,86 @@
+# TRAINS - Keras with Tensorboard example code, automatic logging model and Tensorboard outputs
+#
+# Train a simple deep NN on the MNIST dataset.
+# Gets to 98.40% test accuracy after 20 epochs
+# (there is *a lot* of margin for parameter tuning).
+# 2 seconds per epoch on a K520 GPU.
+from __future__ import print_function
+
+import tempfile
+import os
+
+from keras.callbacks import TensorBoard, ModelCheckpoint
+from keras.datasets import mnist
+from keras.models import Sequential
+from keras.layers.core import Dense, Activation
+from keras.optimizers import RMSprop
+from keras.utils import np_utils
+import tensorflow as tf  # noqa: F401
+
+from trains import Task, Logger
+
+
+# Connecting TRAINS
+task = Task.init(project_name='examples', task_name='Keras HP optimization base')
+
+
+# the data, shuffled and split between train and test sets
+nb_classes = 10
+(X_train, y_train), (X_test, y_test) = mnist.load_data()
+
+X_train = X_train.reshape(60000, 784).astype('float32')/255.
+X_test = X_test.reshape(10000, 784).astype('float32')/255.
+print(X_train.shape[0], 'train samples')
+print(X_test.shape[0], 'test samples')
+
+# convert class vectors to binary class matrices
+Y_train = np_utils.to_categorical(y_train, nb_classes)
+Y_test = np_utils.to_categorical(y_test, nb_classes)
+
+args = {'batch_size': 128,
+        'epochs': 6,
+        'layer_1': 512,
+        'layer_2': 512,
+        'layer_3': 10,
+        'layer_4': 512,
+        }
+args = task.connect(args)
+
+model = Sequential()
+model.add(Dense(args['layer_1'], input_shape=(784,)))
+model.add(Activation('relu'))
+# model.add(Dropout(0.2))
+model.add(Dense(args['layer_2']))
+model.add(Activation('relu'))
+# model.add(Dropout(0.2))
+model.add(Dense(args['layer_3']))
+model.add(Activation('softmax'))
+
+model2 = Sequential()
+model2.add(Dense(args['layer_4'], input_shape=(784,)))
+model2.add(Activation('relu'))
+
+model.summary()
+
+model.compile(loss='categorical_crossentropy',
+              optimizer=RMSprop(),
+              metrics=['accuracy'])
+
+# Advanced: setting model class enumeration
+labels = dict(('digit_%d' % i, i) for i in range(10))
+task.set_model_label_enumeration(labels)
+
+output_folder = os.path.join(tempfile.gettempdir(), 'keras_example')
+
+board = TensorBoard(log_dir=output_folder, write_images=False)
+model_store = ModelCheckpoint(filepath=os.path.join(output_folder, 'weight.hdf5'))
+
+history = model.fit(X_train, Y_train,
+                    batch_size=args['batch_size'], epochs=args['epochs'],
+                    callbacks=[board, model_store],
+                    validation_data=(X_test, Y_test))
+score = model.evaluate(X_test, Y_test, verbose=0)
+print('Test score:', score[0])
+print('Test accuracy:', score[1])
+Logger.current_logger().report_scalar(title='evaluate', series='score', value=score[0], iteration=args['epochs'])
+Logger.current_logger().report_scalar(title='evaluate', series='accuracy', value=score[1], iteration=args['epochs'])
--- a/examples/services/hyper-parameter-optimization/hyper_parameter_optimizer.py
+++ b/examples/services/hyper-parameter-optimization/hyper_parameter_optimizer.py
@@ -0,0 +1,114 @@
+import logging
+
+from trains import Task
+from trains.automation import DiscreteParameterRange, HyperParameterOptimizer, RandomSearch, \
+    UniformIntegerParameterRange
+
+try:
+    from trains.automation.hpbandster import OptimizerBOHB
+    Our_SearchStrategy = OptimizerBOHB
+except ValueError:
+    logging.getLogger().warning(
+        'Apologies, it seems you do not have \'hpbandster\' installed, '
+        'we will be using RandomSearch strategy instead\n'
+        'If you like to try ' '{{BOHB}: Robust and Efficient Hyperparameter Optimization at Scale},\n'
+        'run: pip install hpbandster')
+Our_SearchStrategy = RandomSearch
+
+
+def job_complete_callback(
+    job_id,                 # type: str
+    objective_value,        # type: float
+    objective_iteration,    # type: int
+    job_parameters,         # type: dict
+    top_performance_job_id  # type: str
+):
+    print('Job completed!', job_id, objective_value, objective_iteration, job_parameters)
+    if job_id == top_performance_job_id:
+        print('WOOT WOOT we broke the record! Objective reached {}'.format(objective_value))
+
+
+# Connecting TRAINS
+task = Task.init(project_name='Hyper-Parameter Optimization',
+                 task_name='Automatic Hyper-Parameter Optimization',
+                 task_type=Task.TaskTypes.optimizer,
+                 reuse_last_task_id=False)
+
+# experiment template to optimize in the hyper-parameter optimization
+args = {
+    'template_task_id': None,
+    'run_as_service': False,
+}
+args = task.connect(args)
+
+# Get the template task experiment that we want to optimize
+if not args['template_task_id']:
+    args['template_task_id'] = Task.get_task(
+        project_name='examples', task_name='Keras HP optimization base').id
+
+# Example use case:
+an_optimizer = HyperParameterOptimizer(
+    # This is the experiment we want to optimize
+    base_task_id=args['template_task_id'],
+    # here we define the hyper-parameters to optimize
+    hyper_parameters=[
+        UniformIntegerParameterRange('layer_1', min_value=128, max_value=512, step_size=128),
+        UniformIntegerParameterRange('layer_2', min_value=128, max_value=512, step_size=128),
+        DiscreteParameterRange('batch_size', values=[96, 128, 160]),
+        DiscreteParameterRange('epochs', values=[30]),
+    ],
+    # this is the objective metric we want to maximize/minimize
+    objective_metric_title='val_acc',
+    objective_metric_series='val_acc',
+    # now we decide if we want to maximize it or minimize it (accuracy we maximize)
+    objective_metric_sign='max',
+    # let us limit the number of concurrent experiments,
+    # this in turn will make sure we do dont bombard the scheduler with experiments.
+    # if we have an auto-scaler connected, this, by proxy, will limit the number of machine
+    max_number_of_concurrent_tasks=2,
+    # this is the optimizer class (actually doing the optimization)
+    # Currently, we can choose from GridSearch, RandomSearch or OptimizerBOHB (Bayesian optimization Hyper-Band)
+    # more are coming soon...
+    optimizer_class=Our_SearchStrategy,
+    # Select an execution queue to schedule the experiments for execution
+    execution_queue='moshik',
+    # Optional: Limit the execution time of a single experiment, in minutes.
+    # (this is optional, and if using  OptimizerBOHB, it is ignored)
+    time_limit_per_job=10.,
+    # Check the experiments every 6 seconds is way too often, we should probably set it to 5 min,
+    # assuming a single experiment is usually hours...
+    pool_period_min=0.1,
+    # set the maximum number of jobs to launch for the optimization, default (None) unlimited
+    # If OptimizerBOHB is used, it defined the maximum budget in terms of full jobs
+    # basically the cumulative number of iterations will not exceed total_max_jobs * max_iteration_per_job
+    total_max_jobs=10,
+    # This is only applicable for OptimizerBOHB and ignore by the rest
+    # set the minimum number of iterations for an experiment, before early stopping
+    min_iteration_per_job=10,
+    # Set the maximum number of iterations for an experiment to execute
+    # (This is optional, unless using OptimizerBOHB where this is a must)
+    max_iteration_per_job=30,
+)
+
+# if we are running as a service, just enqueue ourselves into the services queue and let it run the optimization
+if args['run_as_service']:
+    # if this code is executed by `trains-agent` the function call does nothing.
+    # if executed locally, the local process will be terminated, and a remote copy will be executed instead
+    task.execute_remotely(queue_name='services', exit_process=True)
+
+# report every 12 seconds, this is way too often, but we are testing here J
+an_optimizer.set_report_period(2.2)
+# start the optimization process, callback function to be called every time an experiment is completed
+# this function returns immediately
+an_optimizer.start(job_complete_callback=job_complete_callback)
+# set the time limit for the optimization process (2 hours)
+an_optimizer.set_time_limit(in_minutes=120.0)
+# wait until process is done (notice we are controlling the optimization process in the background)
+an_optimizer.wait()
+# optimization is completed, print the top performing experiments id
+top_exp = an_optimizer.get_top_experiments(top_k=3)
+print([t.id for t in top_exp])
+# make sure background optimization stopped
+an_optimizer.stop()
+
+print('We are done, good bye')
--- a/examples/services/hyper-parameter-optimization/requirements.txt
+++ b/examples/services/hyper-parameter-optimization/requirements.txt
@@ -0,0 +1,3 @@
+keras
+tensorflow
+trains
--- a/examples/services/jupyter-service/execute_jupyter_notebook_server.py
+++ b/examples/services/jupyter-service/execute_jupyter_notebook_server.py
@@ -0,0 +1,154 @@
+import os
+import socket
+import subprocess
+import sys
+from copy import deepcopy
+from tempfile import mkstemp
+
+import psutil
+
+# make sure we have jupyter in the auto requirements
+from trains import Task
+
+# set default docker image, with network configuration
+os.environ["TRAINS_DOCKER_IMAGE"] = "nvidia/cuda --network host"
+
+# initialize TRAINS
+task = Task.init(project_name="examples", task_name="Remote Jupyter NoteBook")
+
+# get rid of all the runtime TRAINS
+preserve = (
+    "TRAINS_API_HOST",
+    "TRAINS_WEB_HOST",
+    "TRAINS_FILES_HOST",
+    "TRAINS_CONFIG_FILE",
+    "TRAINS_API_ACCESS_KEY",
+    "TRAINS_API_SECRET_KEY",
+    "TRAINS_API_HOST_VERIFY_CERT",
+)
+
+# setup os environment
+env = deepcopy(os.environ)
+for key in os.environ:
+    if key.startswith("TRAINS") and key not in preserve:
+        env.pop(key, None)
+
+# Add jupyter server base folder
+param = {
+    "jupyter_server_base_directory": "~/",
+    "ssh_server": True,
+    "ssh_password": "training",
+}
+task.connect(param)
+
+# noinspection PyBroadException
+try:
+    hostname = socket.gethostname()
+    hostnames = socket.gethostbyname(socket.gethostname())
+except Exception:
+
+    def get_ip_addresses(family):
+        for interface, snics in psutil.net_if_addrs().items():
+            for snic in snics:
+                if snic.family == family:
+                    yield snic.address
+
+    hostnames = list(get_ip_addresses(socket.AF_INET))
+    hostname = hostnames[0]
+
+# Optionally install and launch an SSH server (root login, password taken from the task parameters).
+# Any failure in this section is reported with a single printed message and the script carries on.
+if param.get("ssh_server"):
+    print("Installing SSH Server on {} [{}]".format(hostname, hostnames))
+    ssh_password = param.get("ssh_password", "training")
+    # noinspection PyBroadException
+    try:
+        # pick the first port in [10022, 15000) not used by any current connection;
+        # if none is free the [0] lookup raises IndexError, which is handled by the except below
+        used_ports = [i.laddr.port for i in psutil.net_connections()]
+        port = [i for i in range(10022, 15000) if i not in used_ports][0]
+
+        # One shell command line chained with '&&': install openssh-server, set the root password,
+        # allow root login, relax the pam_loginuid requirement, export variables in /etc/profile
+        # and finally start sshd on the selected port.
+        # NOTE(review): the password is interpolated into the shell command unescaped, and it is
+        # printed in clear text below - acceptable for an example only, confirm before reuse.
+        # NOTE(review): when TRAINS_CONFIG_FILE is not set, os.environ.get() returns None and
+        # the literal text "None" is written to /etc/profile.
+        result = os.system(
+            "apt-get install -y openssh-server && "
+            "mkdir -p /var/run/sshd && "
+            "echo 'root:{password}' | chpasswd && "
+            "echo 'PermitRootLogin yes' >> /etc/ssh/sshd_config && "
+            "sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && "
+            "sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd && "  # noqa: W605
+            'echo "export VISIBLE=now" >> /etc/profile && '
+            'echo "export TRAINS_CONFIG_FILE={trains_config_file}" >> /etc/profile && '
+            "/usr/sbin/sshd -p {port}".format(
+                password=ssh_password,
+                port=port,
+                trains_config_file=os.environ.get("TRAINS_CONFIG_FILE"),
+            )
+        )
+
+        if result == 0:
+            print(
+                "\n#\n# SSH Server running on {} [{}] port {}\n# LOGIN u:root p:{}\n#\n".format(
+                    hostname, hostnames, port, ssh_password
+                )
+            )
+        else:
+            # route a non-zero exit status to the common error message below
+            raise ValueError()
+    except Exception:
+        print("\n#\n# Error: SSH server could not be launched\n#\n")
+
+# execute jupyter notebook
+fd, local_filename = mkstemp()
+cwd = (
+    os.path.expandvars(os.path.expanduser(param["jupyter_server_base_directory"]))
+    if param["jupyter_server_base_directory"]
+    else os.getcwd()
+)
+print(
+    "Running Jupyter Notebook Server on {} [{}] at {}".format(hostname, hostnames, cwd)
+)
+process = subprocess.Popen(
+    [
+        sys.executable,
+        "-m",
+        "jupyter",
+        "notebook",
+        "--no-browser",
+        "--allow-root",
+        "--ip",
+        "0.0.0.0",
+    ],
+    env=env,
+    stdout=fd,
+    stderr=fd,
+    cwd=cwd,
+)
+
+# print stdout/stderr
+# Loop until the jupyter process exits, echoing whatever it wrote to the capture file.
+prev_line_count = 0
+process_running = True
+while process_running:
+    # assume the process has finished, unless wait() times out below
+    process_running = False
+    try:
+        # poll every 2 seconds until the first output was seen, every 15 seconds afterwards
+        process.wait(timeout=2.0 if prev_line_count == 0 else 15.0)
+    except subprocess.TimeoutExpired:
+        process_running = True
+
+    with open(local_filename, "rt") as f:
+        # read new lines
+        new_lines = f.readlines()
+        if not new_lines:
+            # nothing was written since the last pass; this also skips the rewind/truncate below
+            continue
+        output = "".join(new_lines)
+        print(output)
+        # update task comment with jupyter notebook server links
+        # (only lines from the first batch of output are considered)
+        if prev_line_count == 0:
+            task.comment += "\n" + "".join(
+                line for line in new_lines if "http://" in line or "https://" in line
+            )
+        prev_line_count += len(new_lines)
+
+    # rewind and empty the capture file, so the next pass only sees new output
+    os.lseek(fd, 0, 0)
+    os.ftruncate(fd, 0)
+
+# cleanup
+os.close(fd)
+# best effort: failing to delete the temporary file is deliberately ignored
+# noinspection PyBroadException
+try:
+    os.unlink(local_filename)
+except Exception:
+    pass