From a7873705ec3f30c4bdf9cbe3b535cb36c7490dbe Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Fri, 8 Nov 2019 22:36:24 +0200
Subject: [PATCH] Add --gpus / --cpu-only (equivalent to NVIDIA_VISIBLE_DEVICES)
 Add agent.python_binary specifying full path to python binary to use for
 virtual environment creation
 Fix Windows support

---
 .../backend_api/config/default/agent.conf    |  5 ++
 .../backend_api/session/client/client.py     |  4 +-
 trains_agent/commands/worker.py              | 76 ++++++++++++++-----
 trains_agent/helper/package/pip_api/venv.py  |  2 +-
 trains_agent/interface/worker.py             |  9 +++
 trains_agent/session.py                      |  7 +-
 6 files changed, 81 insertions(+), 22 deletions(-)

diff --git a/trains_agent/backend_api/config/default/agent.conf b/trains_agent/backend_api/config/default/agent.conf
index e25c413..e2b4c92 100644
--- a/trains_agent/backend_api/config/default/agent.conf
+++ b/trains_agent/backend_api/config/default/agent.conf
@@ -13,6 +13,11 @@
     # git_user: ""
     # git_pass: ""
 
+    # Set the python version to use when creating the virtual environment and launching the experiment
+    # Example values: "/usr/bin/python3" or "/usr/local/bin/python3.6"
+    # The default is the python executing the trains_agent
+    python_binary: ""
+
     # select python package manager:
     # currently supported pip and conda
     # poetry is used if pip selected and repository contains poetry.lock file
diff --git a/trains_agent/backend_api/session/client/client.py b/trains_agent/backend_api/session/client/client.py
index 7331f96..89f5281 100644
--- a/trains_agent/backend_api/session/client/client.py
+++ b/trains_agent/backend_api/session/client/client.py
@@ -212,8 +212,8 @@ class TableResponse(Response):
         fields = fields or self.fields
         from trains_agent.helper.base import create_table
         return create_table(
-            (tuple(getter(item, attr) for attr in fields) for item in self),
-            titles=fields, headers=True,
+            (dict((attr, getter(item, attr)) for attr in fields) for item in self),
+            titles=fields, columns=fields, headers=True,
         )
 
     def display(self, fields=None):
diff --git a/trains_agent/commands/worker.py b/trains_agent/commands/worker.py
index 079e649..9172563 100644
--- a/trains_agent/commands/worker.py
+++ b/trains_agent/commands/worker.py
@@ -424,7 +424,7 @@ class Worker(ServiceCommandSection):
                                                          docker_arguments=task_docker_cmd[1:])
             else:
                 self.send_logs(task_id=task_id,
-                               lines=['No docker image specified, running Task {} inside docker: {} {}\n'.format(
+                               lines=['running Task {} inside default docker image: {} {}\n'.format(
                                    task_id, self._docker_image, self._docker_arguments or '')],
                                level="INFO")
                 full_docker_cmd = self.docker_image_func(docker_image=self._docker_image,
@@ -1028,11 +1028,20 @@ class Worker(ServiceCommandSection):
                 sys.stdout.flush()
                 sys.stderr.flush()
                 os.chdir(script_dir)
-                os.execv(command.argv[0].as_posix(), tuple([command.argv[0].as_posix()])+command.argv[1:])
-                # exit_code = command.check_call(cwd=script_dir)
+                if not is_windows_platform():
+                    os.execv(command.argv[0].as_posix(), tuple([command.argv[0].as_posix()])+command.argv[1:])
+                else:
+                    exit_code = command.check_call(cwd=script_dir)
+                    exit(exit_code)
             except subprocess.CalledProcessError as ex:
                 # non zero return code
                 exit_code = ex.returncode
+                if is_windows_platform():
+                    exit(exit_code)
+            except Exception as ex:
+                if is_windows_platform():
+                    exit(-1)
+                raise ex
         else:
             # store stdout/stderr into file, and send to backend
             temp_stdout_fname = log_file or safe_mkstemp(
@@ -1418,12 +1427,28 @@ class Worker(ServiceCommandSection):
             for i in range(len(it) + 1):
                 yield it[:i]
 
-        python_executables = [
-            (version, "python{}".format(version))
-            for version in map(
-                ".".join, reversed(list(suffixes(config_version.split("."))))
-            )
-        ]
+        def rreplace(s, old, new, count):
+            return (s[::-1].replace(old[::-1], new[::-1], count))[::-1]
+
+        if is_windows_platform():
+            python_executables = [
+                (version, config_version if os.path.sep in config_version else 'python{}'.format(version))
+                for version in map(
+                    ".".join, reversed(list(suffixes(
+                        rreplace(
+                            rreplace(config_version.split(os.path.sep)[-1].lower(), 'python', '', 1),
+                            '.exe', '', 1).split("."))))
+                )
+            ]
+        else:
+            python_executables = [
+                (version, config_version if os.path.sep in config_version else 'python{}'.format(version))
+                for version in map(
+                    ".".join, reversed(list(suffixes(
+                        rreplace(config_version.split(os.path.sep)[-1], 'python', '', 1).split("."))))
+                )
+            ]
+
         for version, executable in python_executables:
             self.log.debug("Searching for {}".format(executable))
             if find_executable(executable):
@@ -1435,7 +1460,8 @@ class Worker(ServiceCommandSection):
                     self.log.warning("error getting %s version: %s", executable, ex)
                     continue
                 match = re.search(
-                    r"Python ({}(?:\.\d+)*)".format(config_version or r"\d+"), output
+                    r"Python ({}(?:\.\d+)*)".format(
+                        r"\d+" if not config_version or os.path.sep in config_version else config_version), output
                 )
                 if match:
                     self.log.debug("Found: {}".format(executable))
@@ -1453,13 +1479,17 @@ class Worker(ServiceCommandSection):
         Install a new python virtual environment, removing the old one if exists
         :return: virtualenv directory and requirements manager to use with task
         """
-        requested_python_version = requested_python_version or Text(self._session.config["agent.default_python"])
-        venv_dir = Path(venv_dir) if venv_dir else \
-            Path(self._session.config["agent.venvs_dir"], requested_python_version)
+        requested_python_version = requested_python_version or \
+            Text(self._session.config.get("agent.python_binary", None)) or \
+            Text(self._session.config.get("agent.default_python", None))
         executable_version, executable_version_suffix, executable_name = self.find_python_executable_for_version(
             requested_python_version
         )
+        venv_dir = Path(venv_dir) if venv_dir else \
+            Path(self._session.config["agent.venvs_dir"], executable_version_suffix)
+        self._session.config.put("agent.default_python", executable_version)
+        self._session.config.put("agent.python_binary", executable_name)
         first_time = (
             is_windows_platform()
             or self.is_conda
@@ -1474,7 +1504,7 @@ class Worker(ServiceCommandSection):
             rm_tree(normalize_path(venv_dir, WORKING_REPOSITORY_DIR))
         package_manager_params = dict(
             session=self._session,
-            python=executable_version_suffix,
+            python=executable_version_suffix if self.is_conda else executable_name,
             path=venv_dir,
             requirements_manager=requirements_manager,
         )
@@ -1564,6 +1594,9 @@ class Worker(ServiceCommandSection):
         temp_config.put("agent.vcs_cache.path", mounted_vcs_cache)
         temp_config.put("agent.package_manager.system_site_packages", True)
         temp_config.put("agent.default_python", "")
+        temp_config.put("agent.python_binary", "")
+        temp_config.put("agent.cuda_version", "")
+        temp_config.put("agent.cudnn_version", "")
         temp_config.put("agent.venvs_dir", mounted_venv_dir)
 
         host_apt_cache = Path(os.path.expandvars(self._session.config.get(
@@ -1619,9 +1652,9 @@ class Worker(ServiceCommandSection):
 
         base_cmd = [docker, 'run', '-t']
         gpu_devices = os.environ.get('NVIDIA_VISIBLE_DEVICES', None)
-        if gpu_devices is None:
+        if gpu_devices is None or gpu_devices.lower().strip() == 'all':
             base_cmd += ['--gpus', 'all', ]
-        elif gpu_devices.strip():
+        elif gpu_devices.strip() and gpu_devices.strip() != 'none':
             base_cmd += ['--gpus', 'device='+gpu_devices, ]
         # We are using --gpu, so we should not pass NVIDIA_VISIBLE_DEVICES, I think.
         # base_cmd += ['-e', 'NVIDIA_VISIBLE_DEVICES=' + gpu_devices, ]
@@ -1660,8 +1693,15 @@ class Worker(ServiceCommandSection):
         # ensure singleton
         worker_id = self._session.config["agent.worker_id"]
         worker_name = self._session.config["agent.worker_name"]
-        if not worker_id and os.environ.get('NVIDIA_VISIBLE_DEVICES'):
-            worker_id = '{}:gpu{}'.format(worker_name, os.environ.get('NVIDIA_VISIBLE_DEVICES'))
+        if not worker_id and os.environ.get('NVIDIA_VISIBLE_DEVICES') is not None:
+            nvidia_visible_devices = os.environ.get('NVIDIA_VISIBLE_DEVICES')
+            if nvidia_visible_devices and nvidia_visible_devices.lower() != 'none':
+                worker_id = '{}:gpu{}'.format(worker_name, nvidia_visible_devices)
+            elif nvidia_visible_devices == '':
+                pass
+            else:
+                worker_name = '{}:cpu'.format(worker_name)
+
         self.worker_id, worker_slot = Singleton.register_instance(unique_worker_id=worker_id, worker_name=worker_name)
         if self.worker_id is None:
             error('Instance with the same WORKER_ID [{}] is already running'.format(worker_id))
diff --git a/trains_agent/helper/package/pip_api/venv.py b/trains_agent/helper/package/pip_api/venv.py
index 61b1655..12b2fe4 100644
--- a/trains_agent/helper/package/pip_api/venv.py
+++ b/trains_agent/helper/package/pip_api/venv.py
@@ -30,7 +30,7 @@ class VirtualenvPip(SystemPip, PackageManager):
         self.session = session
         self.path = path
         self.requirements_manager = requirements_manager
-        self.python = "python{}".format(python)
+        self.python = python
 
     def _make_command(self, command):
         return self.session.command(self.bin, "-m", "pip", *command)
diff --git a/trains_agent/interface/worker.py b/trains_agent/interface/worker.py
index 414e6af..0e885e5 100644
--- a/trains_agent/interface/worker.py
+++ b/trains_agent/interface/worker.py
@@ -37,6 +37,15 @@ DAEMON_ARGS = dict({
         'help': 'Pipe full log to stdout/stderr, should not be used if running in background',
         'action': 'store_true',
     },
+    '--gpus': {
+        'help': 'Specify active GPUs for the daemon to use (docker / virtual environment), '
+                'Equivalent to setting NVIDIA_VISIBLE_DEVICES '
+                'Examples: --gpus 0 or --gpus 0,1,2 or --gpus all',
+    },
+    '--cpu-only': {
+        'help': 'Disable GPU access for the daemon, only use CPU in either docker or virtual environment',
+        'action': 'store_true',
+    },
     '--docker': {
         'help': 'Run execution task inside a docker (v19.03 and above). Optional args or '
                 'specify default docker image in agent.default_docker.image / agent.default_docker.arguments'
diff --git a/trains_agent/session.py b/trains_agent/session.py
index c5751b0..f7addcc 100644
--- a/trains_agent/session.py
+++ b/trains_agent/session.py
@@ -72,6 +72,11 @@ class Session(_Session):
             os.environ[LOCAL_CONFIG_FILE_OVERRIDE_VAR] = config_file
             if not Path(config_file).is_file():
                 raise ValueError("Could not open configuration file: {}".format(config_file))
+        cpu_only = kwargs.get('cpu_only')
+        if cpu_only:
+            os.environ['CUDA_VISIBLE_DEVICES'] = os.environ['NVIDIA_VISIBLE_DEVICES'] = 'none'
+        if kwargs.get('gpus'):
+            os.environ['CUDA_VISIBLE_DEVICES'] = os.environ['NVIDIA_VISIBLE_DEVICES'] = kwargs.get('gpus')
         if kwargs.get('only_load_config'):
             from trains_agent.backend_api.config import load
             self.config = load()
@@ -115,7 +120,7 @@ class Session(_Session):
             from trains_agent.helper.package.requirements import RequirementsManager
             agent = self.config['agent']
             agent['cuda_version'], agent['cudnn_version'] = \
-                RequirementsManager.get_cuda_version(self.config)
+                RequirementsManager.get_cuda_version(self.config) if not cpu_only else ('0', '0')
         except Exception:
             pass
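
Note (not part of the patch): a minimal sketch of how the new daemon flags map onto the GPU
environment variables, mirroring the session.py hunk above. The function name
apply_gpu_selection is illustrative only; the patch performs the same assignments inline
from the parsed command-line kwargs.

    import os

    def apply_gpu_selection(gpus=None, cpu_only=False):
        # --cpu-only masks every GPU for both docker and virtual-environment runs
        if cpu_only:
            os.environ['CUDA_VISIBLE_DEVICES'] = os.environ['NVIDIA_VISIBLE_DEVICES'] = 'none'
        # --gpus exposes only the requested devices, e.g. '0' or '0,1,2'
        if gpus:
            os.environ['CUDA_VISIBLE_DEVICES'] = os.environ['NVIDIA_VISIBLE_DEVICES'] = gpus

    # apply_gpu_selection(gpus='0,1') roughly corresponds to `trains-agent daemon --gpus 0,1`,
    # and apply_gpu_selection(cpu_only=True) to `trains-agent daemon --cpu-only`.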
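Note (not part of the patch): a worked example of the python-binary parsing added to
find_python_executable_for_version, reusing the rreplace and suffixes helpers exactly as
defined in the worker.py hunks; the Windows path below is purely illustrative.

    def rreplace(s, old, new, count):
        # replace the last `count` occurrences of `old` in `s`
        return (s[::-1].replace(old[::-1], new[::-1], count))[::-1]

    def suffixes(it):
        # yield every leading slice of `it`, including the empty one
        for i in range(len(it) + 1):
            yield it[:i]

    binary = 'C:\\Python36\\python3.6.exe'    # hypothetical agent.python_binary value
    name = binary.split('\\')[-1].lower()     # 'python3.6.exe'
    version = rreplace(rreplace(name, 'python', '', 1), '.exe', '', 1)   # '3.6'
    candidates = list(reversed(['.'.join(s) for s in suffixes(version.split('.'))]))
    print(candidates)                         # ['3.6', '3', '']

    # The agent then probes each candidate with `python --version`; when the configured
    # value contains a path separator, the configured binary itself is invoked and any
    # reported Python version is accepted.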