Add --gpus / --cpu-only (equivalent to setting NVIDIA_VISIBLE_DEVICES)

Add agent.python_binary for specifying the full path of the python binary to use for virtual environment creation
Fix Windows support
allegroai 2019-11-08 22:36:24 +02:00
parent 4a8f52b5a5
commit a7873705ec
6 changed files with 81 additions and 22 deletions

View File

@ -13,6 +13,11 @@
# git_user: ""
# git_pass: ""
# Set the python version to use when creating the virtual environment and launching the experiment
# Example values: "/usr/bin/python3" or "/usr/local/bin/python3.6"
# The default is the python executing the trains_agent
python_binary: ""
# select python package manager:
# currently supported pip and conda
# poetry is used if pip selected and repository contains poetry.lock file
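
For reference, a minimal sketch of how the new setting is expected to resolve (a plain dict stands in for the parsed agent config section; the real agent reads it through its Session object): an explicit python_binary wins, otherwise default_python is used, and if both are empty the interpreter running the agent itself is the fallback.

import sys

# assumption: a plain dict standing in for the parsed "agent" config section
agent_config = {"python_binary": "", "default_python": "3.6"}

# python_binary takes precedence, then default_python, then the running interpreter
requested_python = (agent_config.get("python_binary")
                    or agent_config.get("default_python")
                    or sys.executable)
print(requested_python)  # -> "3.6" here; "/usr/bin/python3" if python_binary were set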

View File

@ -212,8 +212,8 @@ class TableResponse(Response):
fields = fields or self.fields
from trains_agent.helper.base import create_table
return create_table(
(tuple(getter(item, attr) for attr in fields) for item in self),
titles=fields, headers=True,
(dict((attr, getter(item, attr)) for attr in fields) for item in self),
titles=fields, columns=fields, headers=True,
)
def display(self, fields=None):

View File

@ -424,7 +424,7 @@ class Worker(ServiceCommandSection):
docker_arguments=task_docker_cmd[1:])
else:
self.send_logs(task_id=task_id,
lines=['No docker image specified, running Task {} inside docker: {} {}\n'.format(
lines=['running Task {} inside default docker image: {} {}\n'.format(
task_id, self._docker_image, self._docker_arguments or '')],
level="INFO")
full_docker_cmd = self.docker_image_func(docker_image=self._docker_image,
@ -1028,11 +1028,20 @@ class Worker(ServiceCommandSection):
sys.stdout.flush()
sys.stderr.flush()
os.chdir(script_dir)
os.execv(command.argv[0].as_posix(), tuple([command.argv[0].as_posix()])+command.argv[1:])
# exit_code = command.check_call(cwd=script_dir)
if not is_windows_platform():
os.execv(command.argv[0].as_posix(), tuple([command.argv[0].as_posix()])+command.argv[1:])
else:
exit_code = command.check_call(cwd=script_dir)
exit(exit_code)
except subprocess.CalledProcessError as ex:
# non zero return code
exit_code = ex.returncode
if is_windows_platform():
exit(exit_code)
except Exception as ex:
if is_windows_platform():
exit(-1)
raise ex
else:
# store stdout/stderr into file, and send to backend
temp_stdout_fname = log_file or safe_mkstemp(
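
The platform split above can be shown as a self-contained sketch (the helper name is made up for the example): on POSIX the task command replaces the agent process via os.execv, while on Windows the task runs as a child process and its exit code is propagated.

import os
import subprocess
import sys


def run_task_process(argv, cwd):
    # illustration only, not the agent's actual helper
    if sys.platform != "win32":
        os.chdir(cwd)
        # execv never returns: the current process is replaced by the task
        os.execv(argv[0], tuple(argv))
    # Windows has no equivalent in-place replacement, so run the task as a
    # subprocess and exit with its return code instead
    exit_code = subprocess.call(argv, cwd=cwd)
    sys.exit(exit_code)
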
@ -1418,12 +1427,28 @@ class Worker(ServiceCommandSection):
for i in range(len(it) + 1):
yield it[:i]
python_executables = [
(version, "python{}".format(version))
for version in map(
".".join, reversed(list(suffixes(config_version.split("."))))
)
]
def rreplace(s, old, new, count):
return (s[::-1].replace(old[::-1], new[::-1], count))[::-1]
if is_windows_platform():
python_executables = [
(version, config_version if os.path.sep in config_version else 'python{}'.format(version))
for version in map(
".".join, reversed(list(suffixes(
rreplace(
rreplace(config_version.split(os.path.sep)[-1].lower(), 'python', '', 1),
'.exe', '', 1).split("."))))
)
]
else:
python_executables = [
(version, config_version if os.path.sep in config_version else 'python{}'.format(version))
for version in map(
".".join, reversed(list(suffixes(
rreplace(config_version.split(os.path.sep)[-1], 'python', '', 1).split("."))))
)
]
for version, executable in python_executables:
self.log.debug("Searching for {}".format(executable))
if find_executable(executable):
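
To make the candidate generation above easier to follow, a runnable sketch that reuses the suffixes and rreplace helpers from the hunk (the candidates wrapper and the explicit sep argument are additions for the example):

import os


def suffixes(it):
    # all prefixes of the split version, shortest first
    for i in range(len(it) + 1):
        yield it[:i]


def rreplace(s, old, new, count):
    # replace from the right, e.g. strip a trailing ".exe" once
    return (s[::-1].replace(old[::-1], new[::-1], count))[::-1]


def candidates(config_version, sep=os.path.sep):
    name = config_version.split(sep)[-1].lower()
    version = rreplace(rreplace(name, 'python', '', 1), '.exe', '', 1)
    return [
        config_version if sep in config_version else 'python{}'.format(v)
        for v in map('.'.join, reversed(list(suffixes(version.split('.')))))
    ]


print(candidates('3.6'))
# ['python3.6', 'python3', 'python']
print(candidates('C:\\Python36\\python.exe', sep='\\'))
# the configured path itself is tried for every candidate
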
@ -1435,7 +1460,8 @@ class Worker(ServiceCommandSection):
self.log.warning("error getting %s version: %s", executable, ex)
continue
match = re.search(
r"Python ({}(?:\.\d+)*)".format(config_version or r"\d+"), output
r"Python ({}(?:\.\d+)*)".format(
r"\d+" if not config_version or os.path.sep in config_version else config_version), output
)
if match:
self.log.debug("Found: {}".format(executable))
@ -1453,13 +1479,17 @@ class Worker(ServiceCommandSection):
Install a new python virtual environment, removing the old one if exists
:return: virtualenv directory and requirements manager to use with task
"""
requested_python_version = requested_python_version or Text(self._session.config["agent.default_python"])
venv_dir = Path(venv_dir) if venv_dir else \
Path(self._session.config["agent.venvs_dir"], requested_python_version)
requested_python_version = requested_python_version or \
Text(self._session.config.get("agent.python_binary", None)) or \
Text(self._session.config.get("agent.default_python", None))
executable_version, executable_version_suffix, executable_name = self.find_python_executable_for_version(
requested_python_version
)
venv_dir = Path(venv_dir) if venv_dir else \
Path(self._session.config["agent.venvs_dir"], executable_version_suffix)
self._session.config.put("agent.default_python", executable_version)
self._session.config.put("agent.python_binary", executable_name)
first_time = (
is_windows_platform()
or self.is_conda
@ -1474,7 +1504,7 @@ class Worker(ServiceCommandSection):
rm_tree(normalize_path(venv_dir, WORKING_REPOSITORY_DIR))
package_manager_params = dict(
session=self._session,
python=executable_version_suffix,
python=executable_version_suffix if self.is_conda else executable_name,
path=venv_dir,
requirements_manager=requirements_manager,
)
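
A small illustration of the python value each package manager now receives (the values below are assumptions for the example): conda only needs the version suffix, while pip/virtualenv gets the resolved binary so that agent.python_binary is honored.

is_conda = False                        # assumption for the example
executable_version_suffix = "3.6"       # e.g. parsed from "Python 3.6.9"
executable_name = "/usr/bin/python3.6"  # resolved binary (full path or "python3.6")

python_for_manager = executable_version_suffix if is_conda else executable_name
print(python_for_manager)  # -> "/usr/bin/python3.6"
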
@ -1564,6 +1594,9 @@ class Worker(ServiceCommandSection):
temp_config.put("agent.vcs_cache.path", mounted_vcs_cache)
temp_config.put("agent.package_manager.system_site_packages", True)
temp_config.put("agent.default_python", "")
temp_config.put("agent.python_binary", "")
temp_config.put("agent.cuda_version", "")
temp_config.put("agent.cudnn_version", "")
temp_config.put("agent.venvs_dir", mounted_venv_dir)
host_apt_cache = Path(os.path.expandvars(self._session.config.get(
@ -1619,9 +1652,9 @@ class Worker(ServiceCommandSection):
base_cmd = [docker, 'run', '-t']
gpu_devices = os.environ.get('NVIDIA_VISIBLE_DEVICES', None)
if gpu_devices is None:
if gpu_devices is None or gpu_devices.lower().strip() == 'all':
base_cmd += ['--gpus', 'all', ]
elif gpu_devices.strip():
elif gpu_devices.strip() and gpu_devices.strip() != 'none':
base_cmd += ['--gpus', 'device='+gpu_devices, ]
# We are using --gpu, so we should not pass NVIDIA_VISIBLE_DEVICES, I think.
# base_cmd += ['-e', 'NVIDIA_VISIBLE_DEVICES=' + gpu_devices, ]
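
The resulting mapping from NVIDIA_VISIBLE_DEVICES to the docker command line can be sketched on its own (the function name is made up; the agent assembles this inside its docker command builder):

def gpu_args(nvidia_visible_devices):
    # None / "all" -> expose every GPU; "none" / "" -> CPU only; otherwise pin devices
    if nvidia_visible_devices is None or nvidia_visible_devices.lower().strip() == 'all':
        return ['--gpus', 'all']
    if nvidia_visible_devices.strip() and nvidia_visible_devices.strip() != 'none':
        return ['--gpus', 'device=' + nvidia_visible_devices]
    return []


print(gpu_args(None))    # ['--gpus', 'all']
print(gpu_args('0,1'))   # ['--gpus', 'device=0,1']
print(gpu_args('none'))  # []
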
@ -1660,8 +1693,15 @@ class Worker(ServiceCommandSection):
# ensure singleton
worker_id = self._session.config["agent.worker_id"]
worker_name = self._session.config["agent.worker_name"]
if not worker_id and os.environ.get('NVIDIA_VISIBLE_DEVICES'):
worker_id = '{}:gpu{}'.format(worker_name, os.environ.get('NVIDIA_VISIBLE_DEVICES'))
if not worker_id and os.environ.get('NVIDIA_VISIBLE_DEVICES') is not None:
nvidia_visible_devices = os.environ.get('NVIDIA_VISIBLE_DEVICES')
if nvidia_visible_devices and nvidia_visible_devices.lower() != 'none':
worker_id = '{}:gpu{}'.format(worker_name, nvidia_visible_devices)
elif nvidia_visible_devices == '':
pass
else:
worker_name = '{}:cpu'.format(worker_name)
self.worker_id, worker_slot = Singleton.register_instance(unique_worker_id=worker_id, worker_name=worker_name)
if self.worker_id is None:
error('Instance with the same WORKER_ID [{}] is already running'.format(worker_id))
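
Similarly, a sketch of the worker naming that falls out of the branch above, assuming no explicit worker_id is configured:

def worker_identity(worker_name, nvidia_visible_devices):
    # GPU workers get a ":gpu<devices>" id, an explicit "none" marks the worker
    # as CPU-only, and an empty value leaves the name untouched
    worker_id = None
    if nvidia_visible_devices is not None:
        if nvidia_visible_devices and nvidia_visible_devices.lower() != 'none':
            worker_id = '{}:gpu{}'.format(worker_name, nvidia_visible_devices)
        elif nvidia_visible_devices == '':
            pass
        else:
            worker_name = '{}:cpu'.format(worker_name)
    return worker_id, worker_name


print(worker_identity('office', '0,1'))   # ('office:gpu0,1', 'office')
print(worker_identity('office', 'none'))  # (None, 'office:cpu')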

View File

@ -30,7 +30,7 @@ class VirtualenvPip(SystemPip, PackageManager):
self.session = session
self.path = path
self.requirements_manager = requirements_manager
self.python = "python{}".format(python)
self.python = python
def _make_command(self, command):
return self.session.command(self.bin, "-m", "pip", *command)

View File

@ -37,6 +37,15 @@ DAEMON_ARGS = dict({
'help': 'Pipe full log to stdout/stderr, should not be used if running in background',
'action': 'store_true',
},
'--gpus': {
'help': 'Specify active GPUs for the daemon to use (docker / virtual environment). '
'Equivalent to setting NVIDIA_VISIBLE_DEVICES. '
'Examples: --gpus 0 or --gpus 0,1,2 or --gpus all',
},
'--cpu-only': {
'help': 'Disable GPU access for the daemon, only use CPU in either docker or virtual environment',
'action': 'store_true',
},
'--docker': {
'help': 'Run execution task inside a docker (v19.03 and above). Optional args <image> <arguments> or '
'specify default docker image in agent.default_docker.image / agent.default_docker.arguments'
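
For context, roughly how these two daemon options would look as a plain argparse parser (a standalone sketch; the agent actually builds its parser from the DAEMON_ARGS dict above):

import argparse

parser = argparse.ArgumentParser(prog='trains-agent daemon')
parser.add_argument('--gpus',
                    help='Active GPUs for the daemon (equivalent to NVIDIA_VISIBLE_DEVICES), '
                         'e.g. --gpus 0 or --gpus 0,1,2 or --gpus all')
parser.add_argument('--cpu-only', action='store_true',
                    help='Disable GPU access, use CPU in docker or virtual environment')

args = parser.parse_args(['--gpus', '0,1'])
print(args.gpus, args.cpu_only)  # -> 0,1 False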

View File

@ -72,6 +72,11 @@ class Session(_Session):
os.environ[LOCAL_CONFIG_FILE_OVERRIDE_VAR] = config_file
if not Path(config_file).is_file():
raise ValueError("Could not open configuration file: {}".format(config_file))
cpu_only = kwargs.get('cpu_only')
if cpu_only:
os.environ['CUDA_VISIBLE_DEVICES'] = os.environ['NVIDIA_VISIBLE_DEVICES'] = 'none'
if kwargs.get('gpus'):
os.environ['CUDA_VISIBLE_DEVICES'] = os.environ['NVIDIA_VISIBLE_DEVICES'] = kwargs.get('gpus')
if kwargs.get('only_load_config'):
from trains_agent.backend_api.config import load
self.config = load()
@ -115,7 +120,7 @@ class Session(_Session):
from trains_agent.helper.package.requirements import RequirementsManager
agent = self.config['agent']
agent['cuda_version'], agent['cudnn_version'] = \
RequirementsManager.get_cuda_version(self.config)
RequirementsManager.get_cuda_version(self.config) if not cpu_only else ('0', '0')
except Exception:
pass
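
Finally, the effect of the new kwargs on the environment can be sketched in isolation (the helper name is hypothetical; in the agent this happens during Session initialization):

import os


def apply_gpu_selection(gpus=None, cpu_only=False):
    # --cpu-only hides all devices; --gpus pins the visible ones
    if cpu_only:
        os.environ['CUDA_VISIBLE_DEVICES'] = os.environ['NVIDIA_VISIBLE_DEVICES'] = 'none'
    if gpus:
        os.environ['CUDA_VISIBLE_DEVICES'] = os.environ['NVIDIA_VISIBLE_DEVICES'] = gpus


apply_gpu_selection(gpus='0,2')
print(os.environ['NVIDIA_VISIBLE_DEVICES'])  # -> 0,2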