Patch summary: import and call a new helper, shutdown_docker_process(), which
scans `docker ps` output and stops the first container whose command ends with
the given suffix; it is invoked (only when self.docker_image_func is set) right
after the task's temporary stdout/stderr files are removed. Also: cached
requirements are only installed when they contain 'pip' or 'conda', and the
in-container agent is launched with NVIDIA_VISIBLE_DEVICES=all and an empty
CUDA_VISIBLE_DEVICES.

NOTE(review): line breaks and indentation were reconstructed from a
whitespace-collapsed copy of this patch. The indentation of context lines is
assumed, not verified -- confirm against the target files, or apply with
`git apply --ignore-whitespace`.

diff --git a/trains_agent/commands/worker.py b/trains_agent/commands/worker.py
index 0f18b81..079e649 100644
--- a/trains_agent/commands/worker.py
+++ b/trains_agent/commands/worker.py
@@ -78,7 +78,7 @@ from trains_agent.helper.process import (
     Argv,
     COMMAND_SUCCESS,
     Executable,
-    get_bash_output)
+    get_bash_output, shutdown_docker_process)
 from trains_agent.helper.package.cython_req import CythonRequirement
 from trains_agent.helper.repo import clone_repository_cached, RepoInfo, VCS
 from trains_agent.helper.resource_monitor import ResourceMonitor
@@ -483,6 +483,8 @@ class Worker(ServiceCommandSection):
             # remove temp files after we sent everything to the backend
             safe_remove_file(temp_stdout_name)
             safe_remove_file(temp_stderr_name)
+            if self.docker_image_func:
+                shutdown_docker_process(docker_cmd_ending='--id {}\'\"'.format(task_id))
 
     def run_tasks_loop(self, queues, worker_params):
         """
@@ -1291,7 +1293,7 @@ class Worker(ServiceCommandSection):
             self.package_api.out_of_scope_install_package('Cython')
 
         cached_requirements_failed = False
-        if cached_requirements:
+        if cached_requirements and ('pip' in cached_requirements or 'conda' in cached_requirements):
             self.log("Found cached requirements, trying to install")
             try:
                 self.package_api.load_requirements(cached_requirements)
@@ -1648,7 +1650,7 @@ class Worker(ServiceCommandSection):
                      "apt-get install -y git libsm6 libxext6 libxrender-dev libglib2.0-0 {python_single_digit}-pip ; "
                      "{python} -m pip install -U pip ; "
                      "{python} -m pip install -U trains-agent ; "
-                     "{python} -u -m trains_agent ".format(
+                     "NVIDIA_VISIBLE_DEVICES=all CUDA_VISIBLE_DEVICES= {python} -u -m trains_agent ".format(
                          python_single_digit=python_version.split('.')[0],
                          python=python_version)]
 
diff --git a/trains_agent/helper/process.py b/trains_agent/helper/process.py
index b723926..d5feeeb 100644
--- a/trains_agent/helper/process.py
+++ b/trains_agent/helper/process.py
@@ -59,6 +59,19 @@ def kill_all_child_processes(pid=None):
     parent.kill()
 
 
+def shutdown_docker_process(docker_cmd_ending):
+    try:
+        containers_running = get_bash_output(cmd='docker ps --no-trunc --format \"{{.ID}}: {{.Command}}\"')
+        for docker_line in containers_running.split('\n'):
+            parts = docker_line.split(':')
+            if parts[-1].endswith(docker_cmd_ending):
+                # we found our docker, stop it
+                get_bash_output(cmd='docker stop -t 1 {}'.format(parts[0]))
+                return
+    except Exception:
+        pass
+
+
 def check_if_command_exists(cmd):
     return bool(find_executable(cmd))
 