mirror of
https://github.com/clearml/clearml-session
synced 2025-05-05 20:44:57 +00:00
Add `--docker-args` and `--debug` options. Change `--debugging` to `--debugging-session`. Issue #1
This commit is contained in:
parent
bf1851cd38
commit
60b49768ac
@ -6,7 +6,7 @@ import subprocess
|
|||||||
import sys
|
import sys
|
||||||
from argparse import ArgumentParser, FileType
|
from argparse import ArgumentParser, FileType
|
||||||
from functools import reduce
|
from functools import reduce
|
||||||
from io import TextIOBase
|
from io import TextIOBase, StringIO
|
||||||
from time import time, sleep
|
from time import time, sleep
|
||||||
|
|
||||||
if sys.platform == 'win32':
|
if sys.platform == 'win32':
|
||||||
@ -343,7 +343,7 @@ def get_user_inputs(args, parser, state, client):
|
|||||||
queues_list = '\n'.join('{}] {}'.format(i, q) for i, q in enumerate(queues))
|
queues_list = '\n'.join('{}] {}'.format(i, q) for i, q in enumerate(queues))
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
choice = int(input(queues_list+'\nSelect a queue [0-{}] '.format(len(queues)-1)))
|
choice = int(input(queues_list+'\nSelect a queue [0-{}]: '.format(len(queues)-1)))
|
||||||
assert 0 <= choice < len(queues)
|
assert 0 <= choice < len(queues)
|
||||||
break
|
break
|
||||||
except (TypeError, ValueError, AssertionError):
|
except (TypeError, ValueError, AssertionError):
|
||||||
@ -364,7 +364,7 @@ def get_user_inputs(args, parser, state, client):
|
|||||||
def save_state(state, state_file):
|
def save_state(state, state_file):
|
||||||
# if we are running in debugging mode,
|
# if we are running in debugging mode,
|
||||||
# only store the current task (do not change the defaults)
|
# only store the current task (do not change the defaults)
|
||||||
if state.get('debugging'):
|
if state.get('debugging_session'):
|
||||||
# noinspection PyBroadException
|
# noinspection PyBroadException
|
||||||
base_state = load_state(state_file)
|
base_state = load_state(state_file)
|
||||||
base_state['task_id'] = state.get('task_id')
|
base_state['task_id'] = state.get('task_id')
|
||||||
@ -383,14 +383,16 @@ def load_state(state_file):
|
|||||||
state = json.load(f)
|
state = json.load(f)
|
||||||
except Exception:
|
except Exception:
|
||||||
state = {}
|
state = {}
|
||||||
|
# never reload --debug state
|
||||||
|
state.pop('debug', None)
|
||||||
return state
|
return state
|
||||||
|
|
||||||
|
|
||||||
def clone_task(state, project_id):
|
def clone_task(state, project_id):
|
||||||
new_task = False
|
new_task = False
|
||||||
if state.get('debugging'):
|
if state.get('debugging_session'):
|
||||||
print('Starting new debugging session to {}'.format(state.get('debugging')))
|
print('Starting new debugging session to {}'.format(state.get('debugging_session')))
|
||||||
task = create_debugging_task(state, state.get('debugging'))
|
task = create_debugging_task(state, state.get('debugging_session'))
|
||||||
elif state.get('base_task_id'):
|
elif state.get('base_task_id'):
|
||||||
print('Cloning base session {}'.format(state['base_task_id']))
|
print('Cloning base session {}'.format(state['base_task_id']))
|
||||||
task = Task.clone(source_task=state['base_task_id'], project=project_id, parent=state['base_task_id'])
|
task = Task.clone(source_task=state['base_task_id'], project=project_id, parent=state['base_task_id'])
|
||||||
@ -409,7 +411,10 @@ def clone_task(state, project_id):
|
|||||||
task_params['properties/jupyter_token'] = ''
|
task_params['properties/jupyter_token'] = ''
|
||||||
task_params['properties/jupyter_port'] = ''
|
task_params['properties/jupyter_port'] = ''
|
||||||
if state.get('remote_gateway') is not None:
|
if state.get('remote_gateway') is not None:
|
||||||
task_params['properties/external_address'] = str(state.get('remote_gateway'))
|
remote_gateway_parts = str(state.get('remote_gateway')).split(':')
|
||||||
|
task_params['properties/external_address'] = remote_gateway_parts[0]
|
||||||
|
if len(remote_gateway_parts) > 1:
|
||||||
|
task_params['properties/external_ssh_port'] = remote_gateway_parts[1]
|
||||||
task_params['{}/ssh_server'.format(section)] = str(True)
|
task_params['{}/ssh_server'.format(section)] = str(True)
|
||||||
task_params['{}/ssh_password'.format(section)] = state['password']
|
task_params['{}/ssh_password'.format(section)] = state['password']
|
||||||
task_params['{}/user_key'.format(section)] = config_obj.get("api.credentials.access_key")
|
task_params['{}/user_key'.format(section)] = config_obj.get("api.credentials.access_key")
|
||||||
@ -424,6 +429,8 @@ def clone_task(state, project_id):
|
|||||||
docker = default_docker_image
|
docker = default_docker_image
|
||||||
if docker:
|
if docker:
|
||||||
task_params['{}/default_docker'.format(section)] = docker.replace('--network host', '').strip()
|
task_params['{}/default_docker'.format(section)] = docker.replace('--network host', '').strip()
|
||||||
|
if state.get('docker_args'):
|
||||||
|
docker += ' {}'.format(state.get('docker_args'))
|
||||||
task.set_base_docker(docker + (
|
task.set_base_docker(docker + (
|
||||||
' --network host' if not state.get('skip_docker_network') and '--network host' not in docker else ''))
|
' --network host' if not state.get('skip_docker_network') and '--network host' not in docker else ''))
|
||||||
# set the bash init script
|
# set the bash init script
|
||||||
@ -477,6 +484,7 @@ def wait_for_machine(state, task):
|
|||||||
last_status = task._get_status()[1]
|
last_status = task._get_status()[1]
|
||||||
print('Status [{}]{}'.format(status, ' - {}'.format(last_status) if last_status else ''))
|
print('Status [{}]{}'.format(status, ' - {}'.format(last_status) if last_status else ''))
|
||||||
last_status = status
|
last_status = status
|
||||||
|
|
||||||
print('Remote machine allocated')
|
print('Remote machine allocated')
|
||||||
print('Setting remote environment [Task id={}]'.format(task.id))
|
print('Setting remote environment [Task id={}]'.format(task.id))
|
||||||
print('Setup process details: {}'.format(task.get_output_log_web_page()))
|
print('Setup process details: {}'.format(task.get_output_log_web_page()))
|
||||||
@ -500,8 +508,25 @@ def wait_for_machine(state, task):
|
|||||||
if state.get('vscode_server'):
|
if state.get('vscode_server'):
|
||||||
wait_properties += ['properties/vscode_port']
|
wait_properties += ['properties/vscode_port']
|
||||||
|
|
||||||
|
last_lines = []
|
||||||
|
period_counter = 0
|
||||||
while any(bool(not task.get_parameter(p)) for p in wait_properties) and task.get_status() == 'in_progress':
|
while any(bool(not task.get_parameter(p)) for p in wait_properties) and task.get_status() == 'in_progress':
|
||||||
print('.', end='', flush=True)
|
lines = task.get_reported_console_output(10) if state.get('debug') else []
|
||||||
|
if last_lines != lines:
|
||||||
|
# new line if we had '.' counter in the previous run
|
||||||
|
if period_counter:
|
||||||
|
print('')
|
||||||
|
period_counter = 0
|
||||||
|
try:
|
||||||
|
index = next(i for i, line in enumerate(lines) if last_lines and line == last_lines[-1])
|
||||||
|
print('> ' + ''.join(lines[index+1:]).rstrip().replace('\n', '\n> '))
|
||||||
|
except StopIteration:
|
||||||
|
print('> ' + ''.join(lines).rstrip().replace('\n', '\n> '))
|
||||||
|
last_lines = lines
|
||||||
|
else:
|
||||||
|
print('.', end='', flush=True)
|
||||||
|
period_counter += 1
|
||||||
|
|
||||||
sleep(3.)
|
sleep(3.)
|
||||||
task.reload()
|
task.reload()
|
||||||
if task.get_status() != 'in_progress':
|
if task.get_status() != 'in_progress':
|
||||||
@ -512,7 +537,7 @@ def wait_for_machine(state, task):
|
|||||||
return task
|
return task
|
||||||
|
|
||||||
|
|
||||||
def start_ssh_tunnel(username, remote_address, ssh_port, ssh_password, local_remote_pair_list):
|
def start_ssh_tunnel(username, remote_address, ssh_port, ssh_password, local_remote_pair_list, debug=False):
|
||||||
print('Starting SSH tunnel')
|
print('Starting SSH tunnel')
|
||||||
child = None
|
child = None
|
||||||
args = ['-N', '-C',
|
args = ['-N', '-C',
|
||||||
@ -525,22 +550,26 @@ def start_ssh_tunnel(username, remote_address, ssh_port, ssh_password, local_rem
|
|||||||
for local, remote in local_remote_pair_list:
|
for local, remote in local_remote_pair_list:
|
||||||
args.extend(['-L', '{}:localhost:{}'.format(local, remote)])
|
args.extend(['-L', '{}:localhost:{}'.format(local, remote)])
|
||||||
|
|
||||||
|
# store SSH output
|
||||||
|
fd = StringIO() if debug else sys.stdout
|
||||||
|
|
||||||
# noinspection PyBroadException
|
# noinspection PyBroadException
|
||||||
try:
|
try:
|
||||||
child = pexpect.spawn(
|
child = pexpect.spawn(
|
||||||
command=_check_ssh_executable(),
|
command=_check_ssh_executable(),
|
||||||
args=args,
|
args=args,
|
||||||
logfile=sys.stdout, timeout=20, encoding='utf-8')
|
logfile=fd, timeout=20, encoding='utf-8')
|
||||||
|
|
||||||
i = child.expect([r'(?i)password:', r'\(yes\/no\)', r'.*[$#] ', pexpect.EOF])
|
i = child.expect([r'(?i)password:', r'\(yes\/no\)', r'.*[$#] ', pexpect.EOF])
|
||||||
if i == 0:
|
if i == 0:
|
||||||
child.sendline(ssh_password)
|
child.sendline(ssh_password)
|
||||||
try:
|
try:
|
||||||
child.expect([r'(?i)password:'], timeout=5)
|
child.expect([r'(?i)password:'], timeout=5)
|
||||||
print('Error: incorrect password')
|
print('{}Error: incorrect password'.format(fd.read() + '\n' if debug else ''))
|
||||||
ssh_password = input('Please enter password manually: ')
|
ssh_password = input('Please enter password manually: ')
|
||||||
child.sendline(ssh_password)
|
child.sendline(ssh_password)
|
||||||
child.expect([r'(?i)password:'], timeout=5)
|
child.expect([r'(?i)password:'], timeout=5)
|
||||||
print('Error: incorrect user input password')
|
print('{}Error: incorrect user input password'.format(fd.read() + '\n' if debug else ''))
|
||||||
raise ValueError('Incorrect password')
|
raise ValueError('Incorrect password')
|
||||||
except pexpect.TIMEOUT:
|
except pexpect.TIMEOUT:
|
||||||
pass
|
pass
|
||||||
@ -556,7 +585,7 @@ def start_ssh_tunnel(username, remote_address, ssh_port, ssh_password, local_rem
|
|||||||
ssh_password = input('Please enter password manually: ')
|
ssh_password = input('Please enter password manually: ')
|
||||||
child.sendline(ssh_password)
|
child.sendline(ssh_password)
|
||||||
child.expect([r'(?i)password:'], timeout=5)
|
child.expect([r'(?i)password:'], timeout=5)
|
||||||
print('Error: incorrect user input password')
|
print('{}Error: incorrect user input password'.format(fd.read() + '\n' if debug else ''))
|
||||||
raise ValueError('Incorrect password')
|
raise ValueError('Incorrect password')
|
||||||
except pexpect.TIMEOUT:
|
except pexpect.TIMEOUT:
|
||||||
pass
|
pass
|
||||||
@ -638,7 +667,9 @@ def monitor_ssh_tunnel(state, task):
|
|||||||
ssh_process, ssh_password = start_ssh_tunnel(
|
ssh_process, ssh_password = start_ssh_tunnel(
|
||||||
state.get('username') or 'root',
|
state.get('username') or 'root',
|
||||||
remote_address, ssh_port, ssh_password,
|
remote_address, ssh_port, ssh_password,
|
||||||
local_remote_pair_list=local_remote_pair_list)
|
local_remote_pair_list=local_remote_pair_list,
|
||||||
|
debug=state.get('debug', False),
|
||||||
|
)
|
||||||
|
|
||||||
if ssh_process and ssh_process.isalive():
|
if ssh_process and ssh_process.isalive():
|
||||||
msg = \
|
msg = \
|
||||||
@ -721,14 +752,17 @@ def setup_parser(parser):
|
|||||||
help='Display the clearml-session utility version')
|
help='Display the clearml-session utility version')
|
||||||
parser.add_argument('--attach', default=False, nargs='?',
|
parser.add_argument('--attach', default=False, nargs='?',
|
||||||
help='Attach to running interactive session (default: previous session)')
|
help='Attach to running interactive session (default: previous session)')
|
||||||
parser.add_argument('--debugging', type=str, default=None,
|
parser.add_argument('--debugging-session', type=str, default=None,
|
||||||
help='Pass existing Task id (experiment), create a copy of the experiment on a remote machine, '
|
help='Pass existing Task id (experiment), create a copy of the experiment on a remote machine, '
|
||||||
'and launch jupyter/ssh for interactive access. Example --debugging <task_id>')
|
'and launch jupyter/ssh for interactive access. Example --debugging-session <task_id>')
|
||||||
parser.add_argument('--queue', type=str, default=None,
|
parser.add_argument('--queue', type=str, default=None,
|
||||||
help='Select the queue to launch the interactive session on (default: previously used queue)')
|
help='Select the queue to launch the interactive session on (default: previously used queue)')
|
||||||
parser.add_argument('--docker', type=str, default=None,
|
parser.add_argument('--docker', type=str, default=None,
|
||||||
help='Select the docker image to use in the interactive session on '
|
help='Select the docker image to use in the interactive session on '
|
||||||
'(default: previously used docker image or `{}`)'.format(default_docker_image))
|
'(default: previously used docker image or `{}`)'.format(default_docker_image))
|
||||||
|
parser.add_argument('--docker-args', type=str, default=None,
|
||||||
|
help='Add additional arguments for the docker image to use in the interactive session on '
|
||||||
|
'(default: previously used docker-args)')
|
||||||
parser.add_argument('--public-ip', default=None, nargs='?', const='true', metavar='true/false',
|
parser.add_argument('--public-ip', default=None, nargs='?', const='true', metavar='true/false',
|
||||||
type=lambda x: (str(x).strip().lower() in ('true', 'yes')),
|
type=lambda x: (str(x).strip().lower() in ('true', 'yes')),
|
||||||
help='If True register the public IP of the remote machine. Set if running on the cloud. '
|
help='If True register the public IP of the remote machine. Set if running on the cloud. '
|
||||||
@ -761,7 +795,7 @@ def setup_parser(parser):
|
|||||||
help='Advanced: Change the configuration file used to store the previous state '
|
help='Advanced: Change the configuration file used to store the previous state '
|
||||||
'(default: ~/.clearml_session.json)')
|
'(default: ~/.clearml_session.json)')
|
||||||
parser.add_argument('--remote-gateway', default=None, nargs='?',
|
parser.add_argument('--remote-gateway', default=None, nargs='?',
|
||||||
help='Advanced: Specify gateway ip/address to be passed to interactive session '
|
help='Advanced: Specify gateway ip/address:port to be passed to interactive session '
|
||||||
'(for use with k8s ingestion / ELB)')
|
'(for use with k8s ingestion / ELB)')
|
||||||
parser.add_argument('--base-task-id', type=str, default=None,
|
parser.add_argument('--base-task-id', type=str, default=None,
|
||||||
help='Advanced: Set the base task ID for the interactive session. '
|
help='Advanced: Set the base task ID for the interactive session. '
|
||||||
@ -784,6 +818,8 @@ def setup_parser(parser):
|
|||||||
parser.add_argument('--username', type=str, default=None,
|
parser.add_argument('--username', type=str, default=None,
|
||||||
help='Advanced: Select ssh username for the interactive session '
|
help='Advanced: Select ssh username for the interactive session '
|
||||||
'(default: `root` or previously used one)')
|
'(default: `root` or previously used one)')
|
||||||
|
parser.add_argument('--debug', action='store_true', default=None,
|
||||||
|
help='Advanced: If set, print debugging information')
|
||||||
|
|
||||||
|
|
||||||
def get_version():
|
def get_version():
|
||||||
@ -818,6 +854,9 @@ def cli():
|
|||||||
state_file = os.path.abspath(os.path.expandvars(os.path.expanduser(args.config_file)))
|
state_file = os.path.abspath(os.path.expandvars(os.path.expanduser(args.config_file)))
|
||||||
state = load_state(state_file)
|
state = load_state(state_file)
|
||||||
|
|
||||||
|
if args.debug:
|
||||||
|
state['debug'] = args.debug
|
||||||
|
|
||||||
client = APIClient()
|
client = APIClient()
|
||||||
|
|
||||||
# get previous session, if it is running
|
# get previous session, if it is running
|
||||||
@ -882,7 +921,7 @@ def _check_previous_session(client, args, state):
|
|||||||
task = None
|
task = None
|
||||||
status = task.get_status() if task else None
|
status = task.get_status() if task else None
|
||||||
if status == 'in_progress':
|
if status == 'in_progress':
|
||||||
if not args.debugging or task.parent == args.debugging:
|
if not args.debugging_session or task.parent == args.debugging_session:
|
||||||
# only ask if we were not requested directly
|
# only ask if we were not requested directly
|
||||||
print('Using active session id={}'.format(task_id))
|
print('Using active session id={}'.format(task_id))
|
||||||
return task
|
return task
|
||||||
@ -894,10 +933,10 @@ def _check_previous_session(client, args, state):
|
|||||||
if not running_task_ids_created:
|
if not running_task_ids_created:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if args.debugging:
|
if args.debugging_session:
|
||||||
running_task_ids_created = [t for t in running_task_ids_created if t[2] == args.debugging]
|
running_task_ids_created = [t for t in running_task_ids_created if t[2] == args.debugging_session]
|
||||||
if not running_task_ids_created:
|
if not running_task_ids_created:
|
||||||
print('No active task={} debugging session found'.format(args.debugging))
|
print('No active task={} debugging session found'.format(args.debugging_session))
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# a single running session
|
# a single running session
|
||||||
@ -920,8 +959,8 @@ def _check_previous_session(client, args, state):
|
|||||||
for i, (tid, dt, _) in enumerate(running_task_ids_created))
|
for i, (tid, dt, _) in enumerate(running_task_ids_created))
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
choice = input(session_list+'\nConnect to session [0-{}] or \'N\' to skip'.format(
|
choice = input(session_list+'\nConnect to session [{}] or \'N\' to skip: '.format(
|
||||||
len(running_task_ids_created)-1))
|
'0' if len(running_task_ids_created) <= 1 else '0-{}'.format(len(running_task_ids_created)-1)))
|
||||||
if choice.strip().lower().startswith('n'):
|
if choice.strip().lower().startswith('n'):
|
||||||
choice = None
|
choice = None
|
||||||
elif default_i is not None and not choice.strip():
|
elif default_i is not None and not choice.strip():
|
||||||
|
Loading…
Reference in New Issue
Block a user