mirror of
https://github.com/clearml/clearml-agent
synced 2025-04-16 21:41:55 +00:00
Add warning on --gpus without detected CUDA version (see issue #24)
This commit is contained in:
parent
1736d205bb
commit
257dd95401
@ -396,7 +396,7 @@ class Worker(ServiceCommandSection):
|
|||||||
"""
|
"""
|
||||||
if kwargs.get('services_mode'):
|
if kwargs.get('services_mode'):
|
||||||
kwargs['cpu_only'] = True
|
kwargs['cpu_only'] = True
|
||||||
kwargs['docker'] = kwargs.get('docker', [])
|
kwargs['docker'] = kwargs.get('docker') or []
|
||||||
kwargs['gpus'] = None
|
kwargs['gpus'] = None
|
||||||
|
|
||||||
return kwargs
|
return kwargs
|
||||||
@ -678,6 +678,10 @@ class Worker(ServiceCommandSection):
|
|||||||
|
|
||||||
self._standalone_mode = kwargs.get('standalone_mode', False)
|
self._standalone_mode = kwargs.get('standalone_mode', False)
|
||||||
self._services_mode = kwargs.get('services_mode', False)
|
self._services_mode = kwargs.get('services_mode', False)
|
||||||
|
# must have docker in services_mode
|
||||||
|
if self._services_mode:
|
||||||
|
kwargs = self._verify_command_states(kwargs)
|
||||||
|
docker = docker or kwargs.get('docker')
|
||||||
|
|
||||||
# make sure we only have a single instance,
|
# make sure we only have a single instance,
|
||||||
# also make sure we set worker_id properly and cache folders
|
# also make sure we set worker_id properly and cache folders
|
||||||
@ -714,6 +718,22 @@ class Worker(ServiceCommandSection):
|
|||||||
self.set_docker_variables(docker)
|
self.set_docker_variables(docker)
|
||||||
else:
|
else:
|
||||||
self.dump_config()
|
self.dump_config()
|
||||||
|
# only in non-docker mode do we have to make sure CUDA is set up
|
||||||
|
|
||||||
|
# make sure we have CUDA set if we have --gpus
|
||||||
|
if kwargs.get('gpus') and self._session.config.get('agent.cuda_version', None) in (None, 0, '0'):
|
||||||
|
message = 'Running with GPUs but no CUDA version was detected!\n' \
|
||||||
|
'\tSet OS environemnt CUDA_VERSION & CUDNN_VERSION to the correct version\n' \
|
||||||
|
'\tExample: export CUDA_VERSION=10.1 or (Windows: set CUDA_VERSION=10.1)'
|
||||||
|
if is_conda(self._session.config):
|
||||||
|
self._unregister(queues)
|
||||||
|
safe_remove_file(self.temp_config_path)
|
||||||
|
raise ValueError(message)
|
||||||
|
else:
|
||||||
|
warning(message+'\n')
|
||||||
|
|
||||||
|
if self._services_mode:
|
||||||
|
print('Trains-Agent running in services mode')
|
||||||
|
|
||||||
self._daemon_foreground = foreground
|
self._daemon_foreground = foreground
|
||||||
if not foreground:
|
if not foreground:
|
||||||
@ -1713,7 +1733,7 @@ class Worker(ServiceCommandSection):
|
|||||||
if self._session.debug_mode and temp_file:
|
if self._session.debug_mode and temp_file:
|
||||||
rm_file(temp_file.name)
|
rm_file(temp_file.name)
|
||||||
# call post installation callback
|
# call post installation callback
|
||||||
requirements_manager.post_install()
|
requirements_manager.post_install(self._session)
|
||||||
# mark as successful installation
|
# mark as successful installation
|
||||||
repo_requirements_installed = True
|
repo_requirements_installed = True
|
||||||
|
|
||||||
@ -2208,8 +2228,11 @@ class Worker(ServiceCommandSection):
|
|||||||
|
|
||||||
def set_uid(self, user_uid, user_gid):
|
def set_uid(self, user_uid, user_gid):
|
||||||
from pwd import getpwnam
|
from pwd import getpwnam
|
||||||
self.uid = getpwnam(user_uid).pw_uid
|
try:
|
||||||
self.gid = getpwnam(user_gid).pw_gid
|
self.uid = getpwnam(user_uid).pw_uid
|
||||||
|
self.gid = getpwnam(user_gid).pw_gid
|
||||||
|
except Exception:
|
||||||
|
raise ValueError("Could not find requested user uid={} gid={}".format(user_uid, user_gid))
|
||||||
|
|
||||||
def _change_uid(self):
|
def _change_uid(self):
|
||||||
os.setgid(self.gid)
|
os.setgid(self.gid)
|
||||||
|
Loading…
Reference in New Issue
Block a user