Add initial slurm support (multiple nodes sharing the same task id)

This commit is contained in:
allegroai
2020-03-12 18:12:16 +02:00
parent 5b29aa194c
commit afad6a42ea
4 changed files with 64 additions and 10 deletions

View File

@@ -43,8 +43,8 @@ sdk {
subsampling: 0
}
# Support plot-per-graph fully matching Tensorboard behavior (i.e. if this is set to True, each series should have its own graph)
tensorboard_single_series_per_graph: False
# Support plot-per-graph fully matching Tensorboard behavior (i.e. if this is set to true, each series should have its own graph)
tensorboard_single_series_per_graph: false
}
network {
@@ -125,11 +125,11 @@ sdk {
log {
# debugging feature: set this to true to make null log propagate messages to root logger (so they appear in stdout)
null_log_propagate: False
null_log_propagate: false
task_log_buffer_capacity: 66
# disable urllib info and lower levels
disable_urllib3_info: True
disable_urllib3_info: true
}
development {
@@ -139,14 +139,14 @@ sdk {
task_reuse_time_window_in_hours: 72.0
# Run VCS repository detection asynchronously
vcs_repo_detect_async: True
vcs_repo_detect_async: true
# Store uncommitted git/hg source code diff in experiment manifest when training in development mode
# This stores "git diff" or "hg diff" into the experiment's "script.requirements.diff" section
store_uncommitted_code_diff_on_train: True
store_uncommitted_code_diff: true
# Support stopping an experiment in case it was externally stopped, status was changed or task was reset
support_stopping: True
support_stopping: true
# Default Task output_uri. if output_uri is not provided to Task.init, default_output_uri will be used instead.
default_output_uri: ""
@@ -160,7 +160,7 @@ sdk {
ping_period_sec: 30
# Log all stdout & stderr
log_stdout: True
log_stdout: true
}
}
}