Add --docker-args --debug options. Change --debugging to --debugging-session. Issue #1

This commit is contained in:
allegroai 2021-03-18 03:02:17 +02:00
parent bf1851cd38
commit 60b49768ac

View File

@ -6,7 +6,7 @@ import subprocess
import sys import sys
from argparse import ArgumentParser, FileType from argparse import ArgumentParser, FileType
from functools import reduce from functools import reduce
from io import TextIOBase from io import TextIOBase, StringIO
from time import time, sleep from time import time, sleep
if sys.platform == 'win32': if sys.platform == 'win32':
@ -343,7 +343,7 @@ def get_user_inputs(args, parser, state, client):
queues_list = '\n'.join('{}] {}'.format(i, q) for i, q in enumerate(queues)) queues_list = '\n'.join('{}] {}'.format(i, q) for i, q in enumerate(queues))
while True: while True:
try: try:
choice = int(input(queues_list+'\nSelect a queue [0-{}] '.format(len(queues)-1))) choice = int(input(queues_list+'\nSelect a queue [0-{}]: '.format(len(queues)-1)))
assert 0 <= choice < len(queues) assert 0 <= choice < len(queues)
break break
except (TypeError, ValueError, AssertionError): except (TypeError, ValueError, AssertionError):
@ -364,7 +364,7 @@ def get_user_inputs(args, parser, state, client):
def save_state(state, state_file): def save_state(state, state_file):
# if we are running in debugging mode, # if we are running in debugging mode,
# only store the current task (do not change the defaults) # only store the current task (do not change the defaults)
if state.get('debugging'): if state.get('debugging_session'):
# noinspection PyBroadException # noinspection PyBroadException
base_state = load_state(state_file) base_state = load_state(state_file)
base_state['task_id'] = state.get('task_id') base_state['task_id'] = state.get('task_id')
@ -383,14 +383,16 @@ def load_state(state_file):
state = json.load(f) state = json.load(f)
except Exception: except Exception:
state = {} state = {}
# never reload --debug state
state.pop('debug', None)
return state return state
def clone_task(state, project_id): def clone_task(state, project_id):
new_task = False new_task = False
if state.get('debugging'): if state.get('debugging_session'):
print('Starting new debugging session to {}'.format(state.get('debugging'))) print('Starting new debugging session to {}'.format(state.get('debugging_session')))
task = create_debugging_task(state, state.get('debugging')) task = create_debugging_task(state, state.get('debugging_session'))
elif state.get('base_task_id'): elif state.get('base_task_id'):
print('Cloning base session {}'.format(state['base_task_id'])) print('Cloning base session {}'.format(state['base_task_id']))
task = Task.clone(source_task=state['base_task_id'], project=project_id, parent=state['base_task_id']) task = Task.clone(source_task=state['base_task_id'], project=project_id, parent=state['base_task_id'])
@ -409,7 +411,10 @@ def clone_task(state, project_id):
task_params['properties/jupyter_token'] = '' task_params['properties/jupyter_token'] = ''
task_params['properties/jupyter_port'] = '' task_params['properties/jupyter_port'] = ''
if state.get('remote_gateway') is not None: if state.get('remote_gateway') is not None:
task_params['properties/external_address'] = str(state.get('remote_gateway')) remote_gateway_parts = str(state.get('remote_gateway')).split(':')
task_params['properties/external_address'] = remote_gateway_parts[0]
if len(remote_gateway_parts) > 1:
task_params['properties/external_ssh_port'] = remote_gateway_parts[1]
task_params['{}/ssh_server'.format(section)] = str(True) task_params['{}/ssh_server'.format(section)] = str(True)
task_params['{}/ssh_password'.format(section)] = state['password'] task_params['{}/ssh_password'.format(section)] = state['password']
task_params['{}/user_key'.format(section)] = config_obj.get("api.credentials.access_key") task_params['{}/user_key'.format(section)] = config_obj.get("api.credentials.access_key")
@ -424,6 +429,8 @@ def clone_task(state, project_id):
docker = default_docker_image docker = default_docker_image
if docker: if docker:
task_params['{}/default_docker'.format(section)] = docker.replace('--network host', '').strip() task_params['{}/default_docker'.format(section)] = docker.replace('--network host', '').strip()
if state.get('docker_args'):
docker += ' {}'.format(state.get('docker_args'))
task.set_base_docker(docker + ( task.set_base_docker(docker + (
' --network host' if not state.get('skip_docker_network') and '--network host' not in docker else '')) ' --network host' if not state.get('skip_docker_network') and '--network host' not in docker else ''))
# set the bash init script # set the bash init script
@ -477,6 +484,7 @@ def wait_for_machine(state, task):
last_status = task._get_status()[1] last_status = task._get_status()[1]
print('Status [{}]{}'.format(status, ' - {}'.format(last_status) if last_status else '')) print('Status [{}]{}'.format(status, ' - {}'.format(last_status) if last_status else ''))
last_status = status last_status = status
print('Remote machine allocated') print('Remote machine allocated')
print('Setting remote environment [Task id={}]'.format(task.id)) print('Setting remote environment [Task id={}]'.format(task.id))
print('Setup process details: {}'.format(task.get_output_log_web_page())) print('Setup process details: {}'.format(task.get_output_log_web_page()))
@ -500,8 +508,25 @@ def wait_for_machine(state, task):
if state.get('vscode_server'): if state.get('vscode_server'):
wait_properties += ['properties/vscode_port'] wait_properties += ['properties/vscode_port']
last_lines = []
period_counter = 0
while any(bool(not task.get_parameter(p)) for p in wait_properties) and task.get_status() == 'in_progress': while any(bool(not task.get_parameter(p)) for p in wait_properties) and task.get_status() == 'in_progress':
print('.', end='', flush=True) lines = task.get_reported_console_output(10) if state.get('debug') else []
if last_lines != lines:
# new line if we had '.' counter in the previous run
if period_counter:
print('')
period_counter = 0
try:
index = next(i for i, line in enumerate(lines) if last_lines and line == last_lines[-1])
print('> ' + ''.join(lines[index+1:]).rstrip().replace('\n', '\n> '))
except StopIteration:
print('> ' + ''.join(lines).rstrip().replace('\n', '\n> '))
last_lines = lines
else:
print('.', end='', flush=True)
period_counter += 1
sleep(3.) sleep(3.)
task.reload() task.reload()
if task.get_status() != 'in_progress': if task.get_status() != 'in_progress':
@ -512,7 +537,7 @@ def wait_for_machine(state, task):
return task return task
def start_ssh_tunnel(username, remote_address, ssh_port, ssh_password, local_remote_pair_list): def start_ssh_tunnel(username, remote_address, ssh_port, ssh_password, local_remote_pair_list, debug=False):
print('Starting SSH tunnel') print('Starting SSH tunnel')
child = None child = None
args = ['-N', '-C', args = ['-N', '-C',
@ -525,22 +550,26 @@ def start_ssh_tunnel(username, remote_address, ssh_port, ssh_password, local_rem
for local, remote in local_remote_pair_list: for local, remote in local_remote_pair_list:
args.extend(['-L', '{}:localhost:{}'.format(local, remote)]) args.extend(['-L', '{}:localhost:{}'.format(local, remote)])
# store SSH output
fd = StringIO() if debug else sys.stdout
# noinspection PyBroadException # noinspection PyBroadException
try: try:
child = pexpect.spawn( child = pexpect.spawn(
command=_check_ssh_executable(), command=_check_ssh_executable(),
args=args, args=args,
logfile=sys.stdout, timeout=20, encoding='utf-8') logfile=fd, timeout=20, encoding='utf-8')
i = child.expect([r'(?i)password:', r'\(yes\/no\)', r'.*[$#] ', pexpect.EOF]) i = child.expect([r'(?i)password:', r'\(yes\/no\)', r'.*[$#] ', pexpect.EOF])
if i == 0: if i == 0:
child.sendline(ssh_password) child.sendline(ssh_password)
try: try:
child.expect([r'(?i)password:'], timeout=5) child.expect([r'(?i)password:'], timeout=5)
print('Error: incorrect password') print('{}Error: incorrect password'.format(fd.read() + '\n' if debug else ''))
ssh_password = input('Please enter password manually: ') ssh_password = input('Please enter password manually: ')
child.sendline(ssh_password) child.sendline(ssh_password)
child.expect([r'(?i)password:'], timeout=5) child.expect([r'(?i)password:'], timeout=5)
print('Error: incorrect user input password') print('{}Error: incorrect user input password'.format(fd.read() + '\n' if debug else ''))
raise ValueError('Incorrect password') raise ValueError('Incorrect password')
except pexpect.TIMEOUT: except pexpect.TIMEOUT:
pass pass
@ -556,7 +585,7 @@ def start_ssh_tunnel(username, remote_address, ssh_port, ssh_password, local_rem
ssh_password = input('Please enter password manually: ') ssh_password = input('Please enter password manually: ')
child.sendline(ssh_password) child.sendline(ssh_password)
child.expect([r'(?i)password:'], timeout=5) child.expect([r'(?i)password:'], timeout=5)
print('Error: incorrect user input password') print('{}Error: incorrect user input password'.format(fd.read() + '\n' if debug else ''))
raise ValueError('Incorrect password') raise ValueError('Incorrect password')
except pexpect.TIMEOUT: except pexpect.TIMEOUT:
pass pass
@ -638,7 +667,9 @@ def monitor_ssh_tunnel(state, task):
ssh_process, ssh_password = start_ssh_tunnel( ssh_process, ssh_password = start_ssh_tunnel(
state.get('username') or 'root', state.get('username') or 'root',
remote_address, ssh_port, ssh_password, remote_address, ssh_port, ssh_password,
local_remote_pair_list=local_remote_pair_list) local_remote_pair_list=local_remote_pair_list,
debug=state.get('debug', False),
)
if ssh_process and ssh_process.isalive(): if ssh_process and ssh_process.isalive():
msg = \ msg = \
@ -721,14 +752,17 @@ def setup_parser(parser):
help='Display the clearml-session utility version') help='Display the clearml-session utility version')
parser.add_argument('--attach', default=False, nargs='?', parser.add_argument('--attach', default=False, nargs='?',
help='Attach to running interactive session (default: previous session)') help='Attach to running interactive session (default: previous session)')
parser.add_argument('--debugging', type=str, default=None, parser.add_argument('--debugging-session', type=str, default=None,
help='Pass existing Task id (experiment), create a copy of the experiment on a remote machine, ' help='Pass existing Task id (experiment), create a copy of the experiment on a remote machine, '
'and launch jupyter/ssh for interactive access. Example --debugging <task_id>') 'and launch jupyter/ssh for interactive access. Example --debugging-session <task_id>')
parser.add_argument('--queue', type=str, default=None, parser.add_argument('--queue', type=str, default=None,
help='Select the queue to launch the interactive session on (default: previously used queue)') help='Select the queue to launch the interactive session on (default: previously used queue)')
parser.add_argument('--docker', type=str, default=None, parser.add_argument('--docker', type=str, default=None,
help='Select the docker image to use in the interactive session on ' help='Select the docker image to use in the interactive session on '
'(default: previously used docker image or `{}`)'.format(default_docker_image)) '(default: previously used docker image or `{}`)'.format(default_docker_image))
parser.add_argument('--docker-args', type=str, default=None,
help='Add additional arguments for the docker image to use in the interactive session on '
'(default: previously used docker-args)')
parser.add_argument('--public-ip', default=None, nargs='?', const='true', metavar='true/false', parser.add_argument('--public-ip', default=None, nargs='?', const='true', metavar='true/false',
type=lambda x: (str(x).strip().lower() in ('true', 'yes')), type=lambda x: (str(x).strip().lower() in ('true', 'yes')),
help='If True register the public IP of the remote machine. Set if running on the cloud. ' help='If True register the public IP of the remote machine. Set if running on the cloud. '
@ -761,7 +795,7 @@ def setup_parser(parser):
help='Advanced: Change the configuration file used to store the previous state ' help='Advanced: Change the configuration file used to store the previous state '
'(default: ~/.clearml_session.json)') '(default: ~/.clearml_session.json)')
parser.add_argument('--remote-gateway', default=None, nargs='?', parser.add_argument('--remote-gateway', default=None, nargs='?',
help='Advanced: Specify gateway ip/address to be passed to interactive session ' help='Advanced: Specify gateway ip/address:port to be passed to interactive session '
'(for use with k8s ingestion / ELB)') '(for use with k8s ingestion / ELB)')
parser.add_argument('--base-task-id', type=str, default=None, parser.add_argument('--base-task-id', type=str, default=None,
help='Advanced: Set the base task ID for the interactive session. ' help='Advanced: Set the base task ID for the interactive session. '
@ -784,6 +818,8 @@ def setup_parser(parser):
parser.add_argument('--username', type=str, default=None, parser.add_argument('--username', type=str, default=None,
help='Advanced: Select ssh username for the interactive session ' help='Advanced: Select ssh username for the interactive session '
'(default: `root` or previously used one)') '(default: `root` or previously used one)')
parser.add_argument('--debug', action='store_true', default=None,
help='Advanced: If set, print debugging information')
def get_version(): def get_version():
@ -818,6 +854,9 @@ def cli():
state_file = os.path.abspath(os.path.expandvars(os.path.expanduser(args.config_file))) state_file = os.path.abspath(os.path.expandvars(os.path.expanduser(args.config_file)))
state = load_state(state_file) state = load_state(state_file)
if args.debug:
state['debug'] = args.debug
client = APIClient() client = APIClient()
# get previous session, if it is running # get previous session, if it is running
@ -882,7 +921,7 @@ def _check_previous_session(client, args, state):
task = None task = None
status = task.get_status() if task else None status = task.get_status() if task else None
if status == 'in_progress': if status == 'in_progress':
if not args.debugging or task.parent == args.debugging: if not args.debugging_session or task.parent == args.debugging_session:
# only ask if we were not requested directly # only ask if we were not requested directly
print('Using active session id={}'.format(task_id)) print('Using active session id={}'.format(task_id))
return task return task
@ -894,10 +933,10 @@ def _check_previous_session(client, args, state):
if not running_task_ids_created: if not running_task_ids_created:
return None return None
if args.debugging: if args.debugging_session:
running_task_ids_created = [t for t in running_task_ids_created if t[2] == args.debugging] running_task_ids_created = [t for t in running_task_ids_created if t[2] == args.debugging_session]
if not running_task_ids_created: if not running_task_ids_created:
print('No active task={} debugging session found'.format(args.debugging)) print('No active task={} debugging session found'.format(args.debugging_session))
return None return None
# a single running session # a single running session
@ -920,8 +959,8 @@ def _check_previous_session(client, args, state):
for i, (tid, dt, _) in enumerate(running_task_ids_created)) for i, (tid, dt, _) in enumerate(running_task_ids_created))
while True: while True:
try: try:
choice = input(session_list+'\nConnect to session [0-{}] or \'N\' to skip'.format( choice = input(session_list+'\nConnect to session [{}] or \'N\' to skip: '.format(
len(running_task_ids_created)-1)) '0' if len(running_task_ids_created) <= 1 else '0-{}'.format(len(running_task_ids_created)-1)))
if choice.strip().lower().startswith('n'): if choice.strip().lower().startswith('n'):
choice = None choice = None
elif default_i is not None and not choice.strip(): elif default_i is not None and not choice.strip():