Add --docker-args and --debug options. Change --debugging to --debugging-session. Issue #1

2025-05-05 12:34:34 +00:00 · 2021-03-18 03:02:17 +02:00 · 2021-03-18 03:02:17 +02:00 · 60b49768ac
commit 60b49768ac
parent bf1851cd38
1 changed files with 62 additions and 23 deletions
--- a/clearml_session/main.py
+++ b/clearml_session/main.py
@ -6,7 +6,7 @@ import subprocess
 import sys
 from argparse import ArgumentParser, FileType
 from functools import reduce
-from io import TextIOBase
+from io import TextIOBase, StringIO
 from time import time, sleep

 if sys.platform == 'win32':
@ -343,7 +343,7 @@ def get_user_inputs(args, parser, state, client):
        queues_list = '\n'.join('{}] {}'.format(i, q) for i, q in enumerate(queues))
        while True:
            try:
-                choice = int(input(queues_list+'\nSelect a queue [0-{}] '.format(len(queues)-1)))
+                choice = int(input(queues_list+'\nSelect a queue [0-{}]: '.format(len(queues)-1)))
                assert 0 <= choice < len(queues)
                break
            except (TypeError, ValueError, AssertionError):
@ -364,7 +364,7 @@ def get_user_inputs(args, parser, state, client):
 def save_state(state, state_file):
    # if we are running in debugging mode,
    # only store the current task (do not change the defaults)
-    if state.get('debugging'):
+    if state.get('debugging_session'):
        # noinspection PyBroadException
        base_state = load_state(state_file)
        base_state['task_id'] = state.get('task_id')
@ -383,14 +383,16 @@ def load_state(state_file):
            state = json.load(f)
    except Exception:
        state = {}
+    # never reload --debug state
+    state.pop('debug', None)
    return state


 def clone_task(state, project_id):
    new_task = False
-    if state.get('debugging'):
-        print('Starting new debugging session to {}'.format(state.get('debugging')))
-        task = create_debugging_task(state, state.get('debugging'))
+    if state.get('debugging_session'):
+        print('Starting new debugging session to {}'.format(state.get('debugging_session')))
+        task = create_debugging_task(state, state.get('debugging_session'))
    elif state.get('base_task_id'):
        print('Cloning base session {}'.format(state['base_task_id']))
        task = Task.clone(source_task=state['base_task_id'], project=project_id, parent=state['base_task_id'])
@ -409,7 +411,10 @@ def clone_task(state, project_id):
    task_params['properties/jupyter_token'] = ''
    task_params['properties/jupyter_port'] = ''
    if state.get('remote_gateway') is not None:
-        task_params['properties/external_address'] = str(state.get('remote_gateway'))
+        remote_gateway_parts = str(state.get('remote_gateway')).split(':')
+        task_params['properties/external_address'] = remote_gateway_parts[0]
+        if len(remote_gateway_parts) > 1:
+            task_params['properties/external_ssh_port'] = remote_gateway_parts[1]
    task_params['{}/ssh_server'.format(section)] = str(True)
    task_params['{}/ssh_password'.format(section)] = state['password']
    task_params['{}/user_key'.format(section)] = config_obj.get("api.credentials.access_key")
@ -424,6 +429,8 @@ def clone_task(state, project_id):
        docker = default_docker_image
    if docker:
        task_params['{}/default_docker'.format(section)] = docker.replace('--network host', '').strip()
+        if state.get('docker_args'):
+            docker += ' {}'.format(state.get('docker_args'))
        task.set_base_docker(docker + (
            ' --network host' if not state.get('skip_docker_network') and '--network host' not in docker else ''))
    # set the bash init script
@ -477,6 +484,7 @@ def wait_for_machine(state, task):
            last_status = task._get_status()[1]
            print('Status [{}]{}'.format(status, ' - {}'.format(last_status) if last_status else ''))
        last_status = status
+
    print('Remote machine allocated')
    print('Setting remote environment [Task id={}]'.format(task.id))
    print('Setup process details: {}'.format(task.get_output_log_web_page()))
@ -500,8 +508,25 @@ def wait_for_machine(state, task):
    if state.get('vscode_server'):
        wait_properties += ['properties/vscode_port']

+    last_lines = []
+    period_counter = 0
    while any(bool(not task.get_parameter(p)) for p in wait_properties) and task.get_status() == 'in_progress':
-        print('.', end='', flush=True)
+        lines = task.get_reported_console_output(10) if state.get('debug') else []
+        if last_lines != lines:
+            # new line if we had '.' counter in the previous run
+            if period_counter:
+                print('')
+                period_counter = 0
+            try:
+                index = next(i for i, line in enumerate(lines) if last_lines and line == last_lines[-1])
+                print('> ' + ''.join(lines[index+1:]).rstrip().replace('\n', '\n> '))
+            except StopIteration:
+                print('> ' + ''.join(lines).rstrip().replace('\n', '\n> '))
+            last_lines = lines
+        else:
+            print('.', end='', flush=True)
+            period_counter += 1
+
        sleep(3.)
        task.reload()
    if task.get_status() != 'in_progress':
@ -512,7 +537,7 @@ def wait_for_machine(state, task):
    return task


-def start_ssh_tunnel(username, remote_address, ssh_port, ssh_password, local_remote_pair_list):
+def start_ssh_tunnel(username, remote_address, ssh_port, ssh_password, local_remote_pair_list, debug=False):
    print('Starting SSH tunnel')
    child = None
    args = ['-N', '-C',
@ -525,22 +550,26 @@ def start_ssh_tunnel(username, remote_address, ssh_port, ssh_password, local_rem
    for local, remote in local_remote_pair_list:
        args.extend(['-L', '{}:localhost:{}'.format(local, remote)])

+    # store SSH output
+    fd = StringIO() if debug else sys.stdout
+
    # noinspection PyBroadException
    try:
        child = pexpect.spawn(
            command=_check_ssh_executable(),
            args=args,
-            logfile=sys.stdout, timeout=20, encoding='utf-8')
+            logfile=fd, timeout=20, encoding='utf-8')
+
        i = child.expect([r'(?i)password:', r'\(yes\/no\)', r'.*[$#] ', pexpect.EOF])
        if i == 0:
            child.sendline(ssh_password)
            try:
                child.expect([r'(?i)password:'], timeout=5)
-                print('Error: incorrect password')
+                print('{}Error: incorrect password'.format(fd.read() + '\n' if debug else ''))
                ssh_password = input('Please enter password manually: ')
                child.sendline(ssh_password)
                child.expect([r'(?i)password:'], timeout=5)
-                print('Error: incorrect user input password')
+                print('{}Error: incorrect user input password'.format(fd.read() + '\n' if debug else ''))
                raise ValueError('Incorrect password')
            except pexpect.TIMEOUT:
                pass
@ -556,7 +585,7 @@ def start_ssh_tunnel(username, remote_address, ssh_port, ssh_password, local_rem
                    ssh_password = input('Please enter password manually: ')
                    child.sendline(ssh_password)
                    child.expect([r'(?i)password:'], timeout=5)
-                    print('Error: incorrect user input password')
+                    print('{}Error: incorrect user input password'.format(fd.read() + '\n' if debug else ''))
                    raise ValueError('Incorrect password')
                except pexpect.TIMEOUT:
                    pass
@ -638,7 +667,9 @@ def monitor_ssh_tunnel(state, task):
                ssh_process, ssh_password = start_ssh_tunnel(
                    state.get('username') or 'root',
                    remote_address, ssh_port, ssh_password,
-                    local_remote_pair_list=local_remote_pair_list)
+                    local_remote_pair_list=local_remote_pair_list,
+                    debug=state.get('debug', False),
+                )

                if ssh_process and ssh_process.isalive():
                    msg = \
@ -721,14 +752,17 @@ def setup_parser(parser):
                        help='Display the clearml-session utility version')
    parser.add_argument('--attach', default=False, nargs='?',
                        help='Attach to running interactive session (default: previous session)')
-    parser.add_argument('--debugging', type=str, default=None,
+    parser.add_argument('--debugging-session', type=str, default=None,
                        help='Pass existing Task id (experiment), create a copy of the experiment on a remote machine, '
-                             'and launch jupyter/ssh for interactive access. Example --debugging <task_id>')
+                             'and launch jupyter/ssh for interactive access. Example --debugging-session <task_id>')
    parser.add_argument('--queue', type=str, default=None,
                        help='Select the queue to launch the interactive session on (default: previously used queue)')
    parser.add_argument('--docker', type=str, default=None,
                        help='Select the docker image to use in the interactive session on '
                             '(default: previously used docker image or `{}`)'.format(default_docker_image))
+    parser.add_argument('--docker-args', type=str, default=None,
+                        help='Add additional arguments for the docker image to use in the interactive session on '
+                             '(default: previously used docker-args)')
    parser.add_argument('--public-ip', default=None, nargs='?', const='true', metavar='true/false',
                        type=lambda x: (str(x).strip().lower() in ('true', 'yes')),
                        help='If True register the public IP of the remote machine. Set if running on the cloud. '
@ -761,7 +795,7 @@ def setup_parser(parser):
                        help='Advanced: Change the configuration file used to store the previous state '
                             '(default: ~/.clearml_session.json)')
    parser.add_argument('--remote-gateway', default=None, nargs='?',
-                        help='Advanced: Specify gateway ip/address to be passed to interactive session '
+                        help='Advanced: Specify gateway ip/address:port to be passed to interactive session '
                             '(for use with k8s ingestion / ELB)')
    parser.add_argument('--base-task-id', type=str, default=None,
                        help='Advanced: Set the base task ID for the interactive session. '
@ -784,6 +818,8 @@ def setup_parser(parser):
    parser.add_argument('--username', type=str, default=None,
                        help='Advanced: Select ssh username for the interactive session '
                             '(default: `root` or previously used one)')
+    parser.add_argument('--debug', action='store_true', default=None,
+                        help='Advanced: If set, print debugging information')


 def get_version():
@ -818,6 +854,9 @@ def cli():
    state_file = os.path.abspath(os.path.expandvars(os.path.expanduser(args.config_file)))
    state = load_state(state_file)

+    if args.debug:
+        state['debug'] = args.debug
+
    client = APIClient()

    # get previous session, if it is running
@ -882,7 +921,7 @@ def _check_previous_session(client, args, state):
            task = None
        status = task.get_status() if task else None
        if status == 'in_progress':
-            if not args.debugging or task.parent == args.debugging:
+            if not args.debugging_session or task.parent == args.debugging_session:
                # only ask if we were not requested directly
                print('Using active session id={}'.format(task_id))
                return task
@ -894,10 +933,10 @@ def _check_previous_session(client, args, state):
    if not running_task_ids_created:
        return None

-    if args.debugging:
-        running_task_ids_created = [t for t in running_task_ids_created if t[2] == args.debugging]
+    if args.debugging_session:
+        running_task_ids_created = [t for t in running_task_ids_created if t[2] == args.debugging_session]
        if not running_task_ids_created:
-            print('No active task={} debugging session found'.format(args.debugging))
+            print('No active task={} debugging session found'.format(args.debugging_session))
            return None

    # a single running session
@ -920,8 +959,8 @@ def _check_previous_session(client, args, state):
        for i, (tid, dt, _) in enumerate(running_task_ids_created))
    while True:
        try:
-            choice = input(session_list+'\nConnect to session [0-{}] or \'N\' to skip'.format(
-                len(running_task_ids_created)-1))
+            choice = input(session_list+'\nConnect to session [{}] or \'N\' to skip: '.format(
+                '0' if len(running_task_ids_created) <= 1 else '0-{}'.format(len(running_task_ids_created)-1)))
            if choice.strip().lower().startswith('n'):
                choice = None
            elif default_i is not None and not choice.strip():