mirror of
https://github.com/clearml/clearml-agent
synced 2025-04-27 17:31:29 +00:00
Add --gpus / --cpu-only (equivalent to setting NVIDIA_VISIBLE_DEVICES)
Add agent.python_binary, specifying the full path to the python binary to use for virtual environment creation. Fix Windows support.
This commit is contained in:
parent
4a8f52b5a5
commit
a7873705ec
@ -13,6 +13,11 @@
|
|||||||
# git_user: ""
|
# git_user: ""
|
||||||
# git_pass: ""
|
# git_pass: ""
|
||||||
|
|
||||||
|
# Set the python binary (full path) to use when creating the virtual environment and launching the experiment
|
||||||
|
# Example values: "/usr/bin/python3" or "/usr/local/bin/python3.6"
|
||||||
|
# The default is the python interpreter that is running trains_agent
|
||||||
|
python_binary: ""
|
||||||
|
|
||||||
# select python package manager:
|
# select python package manager:
|
||||||
# currently supported pip and conda
|
# currently supported pip and conda
|
||||||
# poetry is used if pip selected and repository contains poetry.lock file
|
# poetry is used if pip selected and repository contains poetry.lock file
|
||||||
|
@ -212,8 +212,8 @@ class TableResponse(Response):
|
|||||||
fields = fields or self.fields
|
fields = fields or self.fields
|
||||||
from trains_agent.helper.base import create_table
|
from trains_agent.helper.base import create_table
|
||||||
return create_table(
|
return create_table(
|
||||||
(tuple(getter(item, attr) for attr in fields) for item in self),
|
(dict((attr, getter(item, attr)) for attr in fields) for item in self),
|
||||||
titles=fields, headers=True,
|
titles=fields, columns=fields, headers=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
def display(self, fields=None):
|
def display(self, fields=None):
|
||||||
|
@ -424,7 +424,7 @@ class Worker(ServiceCommandSection):
|
|||||||
docker_arguments=task_docker_cmd[1:])
|
docker_arguments=task_docker_cmd[1:])
|
||||||
else:
|
else:
|
||||||
self.send_logs(task_id=task_id,
|
self.send_logs(task_id=task_id,
|
||||||
lines=['No docker image specified, running Task {} inside docker: {} {}\n'.format(
|
lines=['running Task {} inside default docker image: {} {}\n'.format(
|
||||||
task_id, self._docker_image, self._docker_arguments or '')],
|
task_id, self._docker_image, self._docker_arguments or '')],
|
||||||
level="INFO")
|
level="INFO")
|
||||||
full_docker_cmd = self.docker_image_func(docker_image=self._docker_image,
|
full_docker_cmd = self.docker_image_func(docker_image=self._docker_image,
|
||||||
@ -1028,11 +1028,20 @@ class Worker(ServiceCommandSection):
|
|||||||
sys.stdout.flush()
|
sys.stdout.flush()
|
||||||
sys.stderr.flush()
|
sys.stderr.flush()
|
||||||
os.chdir(script_dir)
|
os.chdir(script_dir)
|
||||||
|
if not is_windows_platform():
|
||||||
os.execv(command.argv[0].as_posix(), tuple([command.argv[0].as_posix()])+command.argv[1:])
|
os.execv(command.argv[0].as_posix(), tuple([command.argv[0].as_posix()])+command.argv[1:])
|
||||||
# exit_code = command.check_call(cwd=script_dir)
|
else:
|
||||||
|
exit_code = command.check_call(cwd=script_dir)
|
||||||
|
exit(exit_code)
|
||||||
except subprocess.CalledProcessError as ex:
|
except subprocess.CalledProcessError as ex:
|
||||||
# non zero return code
|
# non zero return code
|
||||||
exit_code = ex.returncode
|
exit_code = ex.returncode
|
||||||
|
if is_windows_platform():
|
||||||
|
exit(exit_code)
|
||||||
|
except Exception as ex:
|
||||||
|
if is_windows_platform():
|
||||||
|
exit(-1)
|
||||||
|
raise ex
|
||||||
else:
|
else:
|
||||||
# store stdout/stderr into file, and send to backend
|
# store stdout/stderr into file, and send to backend
|
||||||
temp_stdout_fname = log_file or safe_mkstemp(
|
temp_stdout_fname = log_file or safe_mkstemp(
|
||||||
@ -1418,12 +1427,28 @@ class Worker(ServiceCommandSection):
|
|||||||
for i in range(len(it) + 1):
|
for i in range(len(it) + 1):
|
||||||
yield it[:i]
|
yield it[:i]
|
||||||
|
|
||||||
|
def rreplace(s, old, new, count):
|
||||||
|
return (s[::-1].replace(old[::-1], new[::-1], count))[::-1]
|
||||||
|
|
||||||
|
if is_windows_platform():
|
||||||
python_executables = [
|
python_executables = [
|
||||||
(version, "python{}".format(version))
|
(version, config_version if os.path.sep in config_version else 'python{}'.format(version))
|
||||||
for version in map(
|
for version in map(
|
||||||
".".join, reversed(list(suffixes(config_version.split("."))))
|
".".join, reversed(list(suffixes(
|
||||||
|
rreplace(
|
||||||
|
rreplace(config_version.split(os.path.sep)[-1].lower(), 'python', '', 1),
|
||||||
|
'.exe', '', 1).split("."))))
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
else:
|
||||||
|
python_executables = [
|
||||||
|
(version, config_version if os.path.sep in config_version else 'python{}'.format(version))
|
||||||
|
for version in map(
|
||||||
|
".".join, reversed(list(suffixes(
|
||||||
|
rreplace(config_version.split(os.path.sep)[-1], 'python', '', 1).split("."))))
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
for version, executable in python_executables:
|
for version, executable in python_executables:
|
||||||
self.log.debug("Searching for {}".format(executable))
|
self.log.debug("Searching for {}".format(executable))
|
||||||
if find_executable(executable):
|
if find_executable(executable):
|
||||||
@ -1435,7 +1460,8 @@ class Worker(ServiceCommandSection):
|
|||||||
self.log.warning("error getting %s version: %s", executable, ex)
|
self.log.warning("error getting %s version: %s", executable, ex)
|
||||||
continue
|
continue
|
||||||
match = re.search(
|
match = re.search(
|
||||||
r"Python ({}(?:\.\d+)*)".format(config_version or r"\d+"), output
|
r"Python ({}(?:\.\d+)*)".format(
|
||||||
|
r"\d+" if not config_version or os.path.sep in config_version else config_version), output
|
||||||
)
|
)
|
||||||
if match:
|
if match:
|
||||||
self.log.debug("Found: {}".format(executable))
|
self.log.debug("Found: {}".format(executable))
|
||||||
@ -1453,13 +1479,17 @@ class Worker(ServiceCommandSection):
|
|||||||
Install a new python virtual environment, removing the old one if exists
|
Install a new python virtual environment, removing the old one if exists
|
||||||
:return: virtualenv directory and requirements manager to use with task
|
:return: virtualenv directory and requirements manager to use with task
|
||||||
"""
|
"""
|
||||||
requested_python_version = requested_python_version or Text(self._session.config["agent.default_python"])
|
requested_python_version = requested_python_version or \
|
||||||
venv_dir = Path(venv_dir) if venv_dir else \
|
Text(self._session.config.get("agent.python_binary", None)) or \
|
||||||
Path(self._session.config["agent.venvs_dir"], requested_python_version)
|
Text(self._session.config.get("agent.default_python", None))
|
||||||
executable_version, executable_version_suffix, executable_name = self.find_python_executable_for_version(
|
executable_version, executable_version_suffix, executable_name = self.find_python_executable_for_version(
|
||||||
requested_python_version
|
requested_python_version
|
||||||
)
|
)
|
||||||
|
venv_dir = Path(venv_dir) if venv_dir else \
|
||||||
|
Path(self._session.config["agent.venvs_dir"], executable_version_suffix)
|
||||||
|
|
||||||
self._session.config.put("agent.default_python", executable_version)
|
self._session.config.put("agent.default_python", executable_version)
|
||||||
|
self._session.config.put("agent.python_binary", executable_name)
|
||||||
first_time = (
|
first_time = (
|
||||||
is_windows_platform()
|
is_windows_platform()
|
||||||
or self.is_conda
|
or self.is_conda
|
||||||
@ -1474,7 +1504,7 @@ class Worker(ServiceCommandSection):
|
|||||||
rm_tree(normalize_path(venv_dir, WORKING_REPOSITORY_DIR))
|
rm_tree(normalize_path(venv_dir, WORKING_REPOSITORY_DIR))
|
||||||
package_manager_params = dict(
|
package_manager_params = dict(
|
||||||
session=self._session,
|
session=self._session,
|
||||||
python=executable_version_suffix,
|
python=executable_version_suffix if self.is_conda else executable_name,
|
||||||
path=venv_dir,
|
path=venv_dir,
|
||||||
requirements_manager=requirements_manager,
|
requirements_manager=requirements_manager,
|
||||||
)
|
)
|
||||||
@ -1564,6 +1594,9 @@ class Worker(ServiceCommandSection):
|
|||||||
temp_config.put("agent.vcs_cache.path", mounted_vcs_cache)
|
temp_config.put("agent.vcs_cache.path", mounted_vcs_cache)
|
||||||
temp_config.put("agent.package_manager.system_site_packages", True)
|
temp_config.put("agent.package_manager.system_site_packages", True)
|
||||||
temp_config.put("agent.default_python", "")
|
temp_config.put("agent.default_python", "")
|
||||||
|
temp_config.put("agent.python_binary", "")
|
||||||
|
temp_config.put("agent.cuda_version", "")
|
||||||
|
temp_config.put("agent.cudnn_version", "")
|
||||||
temp_config.put("agent.venvs_dir", mounted_venv_dir)
|
temp_config.put("agent.venvs_dir", mounted_venv_dir)
|
||||||
|
|
||||||
host_apt_cache = Path(os.path.expandvars(self._session.config.get(
|
host_apt_cache = Path(os.path.expandvars(self._session.config.get(
|
||||||
@ -1619,9 +1652,9 @@ class Worker(ServiceCommandSection):
|
|||||||
|
|
||||||
base_cmd = [docker, 'run', '-t']
|
base_cmd = [docker, 'run', '-t']
|
||||||
gpu_devices = os.environ.get('NVIDIA_VISIBLE_DEVICES', None)
|
gpu_devices = os.environ.get('NVIDIA_VISIBLE_DEVICES', None)
|
||||||
if gpu_devices is None:
|
if gpu_devices is None or gpu_devices.lower().strip() == 'all':
|
||||||
base_cmd += ['--gpus', 'all', ]
|
base_cmd += ['--gpus', 'all', ]
|
||||||
elif gpu_devices.strip():
|
elif gpu_devices.strip() and gpu_devices.strip() != 'none':
|
||||||
base_cmd += ['--gpus', 'device='+gpu_devices, ]
|
base_cmd += ['--gpus', 'device='+gpu_devices, ]
|
||||||
# We are using --gpu, so we should not pass NVIDIA_VISIBLE_DEVICES, I think.
|
# We are using --gpu, so we should not pass NVIDIA_VISIBLE_DEVICES, I think.
|
||||||
# base_cmd += ['-e', 'NVIDIA_VISIBLE_DEVICES=' + gpu_devices, ]
|
# base_cmd += ['-e', 'NVIDIA_VISIBLE_DEVICES=' + gpu_devices, ]
|
||||||
@ -1660,8 +1693,15 @@ class Worker(ServiceCommandSection):
|
|||||||
# ensure singleton
|
# ensure singleton
|
||||||
worker_id = self._session.config["agent.worker_id"]
|
worker_id = self._session.config["agent.worker_id"]
|
||||||
worker_name = self._session.config["agent.worker_name"]
|
worker_name = self._session.config["agent.worker_name"]
|
||||||
if not worker_id and os.environ.get('NVIDIA_VISIBLE_DEVICES'):
|
if not worker_id and os.environ.get('NVIDIA_VISIBLE_DEVICES') is not None:
|
||||||
worker_id = '{}:gpu{}'.format(worker_name, os.environ.get('NVIDIA_VISIBLE_DEVICES'))
|
nvidia_visible_devices = os.environ.get('NVIDIA_VISIBLE_DEVICES')
|
||||||
|
if nvidia_visible_devices and nvidia_visible_devices.lower() != 'none':
|
||||||
|
worker_id = '{}:gpu{}'.format(worker_name, nvidia_visible_devices)
|
||||||
|
elif nvidia_visible_devices == '':
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
worker_name = '{}:cpu'.format(worker_name)
|
||||||
|
|
||||||
self.worker_id, worker_slot = Singleton.register_instance(unique_worker_id=worker_id, worker_name=worker_name)
|
self.worker_id, worker_slot = Singleton.register_instance(unique_worker_id=worker_id, worker_name=worker_name)
|
||||||
if self.worker_id is None:
|
if self.worker_id is None:
|
||||||
error('Instance with the same WORKER_ID [{}] is already running'.format(worker_id))
|
error('Instance with the same WORKER_ID [{}] is already running'.format(worker_id))
|
||||||
|
@ -30,7 +30,7 @@ class VirtualenvPip(SystemPip, PackageManager):
|
|||||||
self.session = session
|
self.session = session
|
||||||
self.path = path
|
self.path = path
|
||||||
self.requirements_manager = requirements_manager
|
self.requirements_manager = requirements_manager
|
||||||
self.python = "python{}".format(python)
|
self.python = python
|
||||||
|
|
||||||
def _make_command(self, command):
|
def _make_command(self, command):
|
||||||
return self.session.command(self.bin, "-m", "pip", *command)
|
return self.session.command(self.bin, "-m", "pip", *command)
|
||||||
|
@ -37,6 +37,15 @@ DAEMON_ARGS = dict({
|
|||||||
'help': 'Pipe full log to stdout/stderr, should not be used if running in background',
|
'help': 'Pipe full log to stdout/stderr, should not be used if running in background',
|
||||||
'action': 'store_true',
|
'action': 'store_true',
|
||||||
},
|
},
|
||||||
|
'--gpus': {
|
||||||
|
'help': 'Specify active GPUs for the daemon to use (docker / virtual environment), '
|
||||||
|
'Equivalent to setting NVIDIA_VISIBLE_DEVICES '
|
||||||
|
'Examples: --gpus 0 or --gpu 0,1,2 or --gpus all',
|
||||||
|
},
|
||||||
|
'--cpu-only': {
|
||||||
|
'help': 'Disable GPU access for the daemon, only use CPU in either docker or virtual environment',
|
||||||
|
'action': 'store_true',
|
||||||
|
},
|
||||||
'--docker': {
|
'--docker': {
|
||||||
'help': 'Run execution task inside a docker (v19.03 and above). Optional args <image> <arguments> or '
|
'help': 'Run execution task inside a docker (v19.03 and above). Optional args <image> <arguments> or '
|
||||||
'specify default docker image in agent.default_docker.image / agent.default_docker.arguments'
|
'specify default docker image in agent.default_docker.image / agent.default_docker.arguments'
|
||||||
|
@ -72,6 +72,11 @@ class Session(_Session):
|
|||||||
os.environ[LOCAL_CONFIG_FILE_OVERRIDE_VAR] = config_file
|
os.environ[LOCAL_CONFIG_FILE_OVERRIDE_VAR] = config_file
|
||||||
if not Path(config_file).is_file():
|
if not Path(config_file).is_file():
|
||||||
raise ValueError("Could not open configuration file: {}".format(config_file))
|
raise ValueError("Could not open configuration file: {}".format(config_file))
|
||||||
|
cpu_only = kwargs.get('cpu_only')
|
||||||
|
if cpu_only:
|
||||||
|
os.environ['CUDA_VISIBLE_DEVICES'] = os.environ['NVIDIA_VISIBLE_DEVICES'] = 'none'
|
||||||
|
if kwargs.get('gpus'):
|
||||||
|
os.environ['CUDA_VISIBLE_DEVICES'] = os.environ['NVIDIA_VISIBLE_DEVICES'] = kwargs.get('gpus')
|
||||||
if kwargs.get('only_load_config'):
|
if kwargs.get('only_load_config'):
|
||||||
from trains_agent.backend_api.config import load
|
from trains_agent.backend_api.config import load
|
||||||
self.config = load()
|
self.config = load()
|
||||||
@ -115,7 +120,7 @@ class Session(_Session):
|
|||||||
from trains_agent.helper.package.requirements import RequirementsManager
|
from trains_agent.helper.package.requirements import RequirementsManager
|
||||||
agent = self.config['agent']
|
agent = self.config['agent']
|
||||||
agent['cuda_version'], agent['cudnn_version'] = \
|
agent['cuda_version'], agent['cudnn_version'] = \
|
||||||
RequirementsManager.get_cuda_version(self.config)
|
RequirementsManager.get_cuda_version(self.config) if not cpu_only else ('0', '0')
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user