mirror of
https://github.com/clearml/clearml
synced 2025-03-03 18:52:12 +00:00
Add OpenMPI/Slurm support
This commit is contained in:
parent
0adbd79975
commit
babaf9f1ce
@ -95,6 +95,14 @@ def get_node_id(default=0):
|
||||
return node_id
|
||||
|
||||
|
||||
def get_is_master_node():
|
||||
global __force_master_node
|
||||
if __force_master_node:
|
||||
return True
|
||||
|
||||
return get_node_id(default=0) == 0
|
||||
|
||||
|
||||
def get_log_redirect_level():
|
||||
""" Returns which log level (and up) should be redirected to stderr. None means no redirection. """
|
||||
value = LOG_STDERR_REDIRECT_LEVEL.get()
|
||||
@ -107,3 +115,21 @@ def get_log_redirect_level():
|
||||
|
||||
def dev_worker_name():
|
||||
return DEV_WORKER_NAME.get()
|
||||
|
||||
|
||||
def __set_is_master_node():
|
||||
try:
|
||||
force_master_node = os.environ.pop('TRAINS_FORCE_MASTER_NODE', None)
|
||||
except:
|
||||
force_master_node = None
|
||||
|
||||
if force_master_node is not None:
|
||||
try:
|
||||
force_master_node = bool(int(force_master_node))
|
||||
except:
|
||||
force_master_node = None
|
||||
|
||||
return force_master_node
|
||||
|
||||
|
||||
__force_master_node = __set_is_master_node()
|
||||
|
@ -35,7 +35,7 @@ from .binding.frameworks.tensorflow_bind import TensorflowBinding
|
||||
from .binding.frameworks.xgboost_bind import PatchXGBoostModelIO
|
||||
from .binding.joblib_bind import PatchedJoblib
|
||||
from .binding.matplotlib_bind import PatchedMatplotlib
|
||||
from .config import config, DEV_TASK_NO_REUSE, get_node_id
|
||||
from .config import config, DEV_TASK_NO_REUSE, get_is_master_node
|
||||
from .config import running_remotely, get_remote_task_id
|
||||
from .config.cache import SessionCache
|
||||
from .debugging.log import LoggerRoot
|
||||
@ -240,9 +240,7 @@ class Task(_Task):
|
||||
# we could not find a task ID, revert to old stub behaviour
|
||||
if not is_sub_process_task_id:
|
||||
return _TaskStub()
|
||||
elif running_remotely() and get_node_id(default=0) != 0:
|
||||
print("get_node_id", get_node_id(), get_remote_task_id())
|
||||
|
||||
elif running_remotely() and not get_is_master_node():
|
||||
# make sure we only do it once per process
|
||||
cls.__forked_proc_main_pid = os.getpid()
|
||||
# make sure everyone understands we should act as if we are a subprocess (fake pid 1)
|
||||
|
Loading…
Reference in New Issue
Block a user