diff --git a/trains_agent/commands/worker.py b/trains_agent/commands/worker.py
index 0b79928..35f80fe 100644
--- a/trains_agent/commands/worker.py
+++ b/trains_agent/commands/worker.py
@@ -1882,6 +1882,17 @@ class Worker(ServiceCommandSection):
                 if isinstance(extra_docker_arguments, six.string_types) else extra_docker_arguments
             base_cmd += [str(a) for a in extra_docker_arguments if a]
 
+        # check if running inside a kubernetes
+        if os.environ.get('KUBERNETES_SERVICE_HOST') and os.environ.get('KUBERNETES_PORT'):
+            # map network to sibling docker
+            try:
+                network_mode = get_bash_output(
+                    'docker inspect --format=\'{{.HostConfig.NetworkMode}}\' $(basename $(cat /proc/1/cpuset))')
+                base_cmd += ['--network', network_mode]
+            except:
+                pass
+            base_cmd += ['-e', 'NVIDIA_VISIBLE_DEVICES={}'.format(dockers_nvidia_visible_devices)]
+
         base_cmd += ['-e', 'TRAINS_WORKER_ID='+worker_id, ]
 
         if host_ssh_cache:
diff --git a/trains_agent/session.py b/trains_agent/session.py
index a6d1aba..d605d56 100644
--- a/trains_agent/session.py
+++ b/trains_agent/session.py
@@ -75,7 +75,8 @@ class Session(_Session):
         cpu_only = kwargs.get('cpu_only')
         if cpu_only:
             os.environ['CUDA_VISIBLE_DEVICES'] = os.environ['NVIDIA_VISIBLE_DEVICES'] = 'none'
-        if kwargs.get('gpus'):
+        if kwargs.get('gpus') and not os.environ.get('KUBERNETES_SERVICE_HOST') \
+                and not os.environ.get('KUBERNETES_PORT'):
             os.environ['CUDA_VISIBLE_DEVICES'] = os.environ['NVIDIA_VISIBLE_DEVICES'] = kwargs.get('gpus')
         if kwargs.get('only_load_config'):
             from trains_agent.backend_api.config import load
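
For reviewers, here is a minimal standalone sketch of the detection logic the worker.py hunk relies on: the presence of the standard KUBERNETES_SERVICE_HOST/KUBERNETES_PORT variables is taken to mean "running inside a pod", the agent's own container id is resolved from /proc/1/cpuset, and the Docker daemon is asked for that container's HostConfig.NetworkMode so the sibling container can be started on the same network. The detect_sibling_network_mode name and the direct subprocess call are illustrative only; the patch itself goes through the agent's existing get_bash_output helper and appends ['--network', network_mode] to base_cmd.

    # Illustrative sketch only -- not part of the patch.
    import os
    import subprocess


    def detect_sibling_network_mode():
        """Return the Docker network mode of the container we run in, or None.

        Only attempted when the standard Kubernetes service environment
        variables are present, i.e. the agent runs as a pod and launches
        sibling containers through the host Docker daemon.
        """
        if not (os.environ.get('KUBERNETES_SERVICE_HOST') and os.environ.get('KUBERNETES_PORT')):
            return None
        try:
            # on Docker with cgroup v1, /proc/1/cpuset ends with the container id
            container_id = os.path.basename(open('/proc/1/cpuset').read().strip())
            output = subprocess.check_output(
                ['docker', 'inspect', '--format={{.HostConfig.NetworkMode}}', container_id])
            return output.decode().strip() or None
        except Exception:
            # any failure (no docker CLI, not a container, cgroup v2 layout) -> default network
            return None


    if __name__ == '__main__':
        mode = detect_sibling_network_mode()
        extra_docker_args = ['--network', mode] if mode else []
        print('extra docker args:', extra_docker_args)

The same environment check drives the session.py hunk: when those variables are set, the --gpus argument no longer overrides CUDA_VISIBLE_DEVICES/NVIDIA_VISIBLE_DEVICES, leaving GPU visibility to whatever the pod was scheduled with. Note that the /proc/1/cpuset trick assumes a cgroup v1 layout; on other runtimes the lookup simply fails and the patch's bare except keeps the previous behaviour.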