mirror of
https://github.com/clearml/clearml-agent
synced 2025-02-07 13:26:08 +00:00
Fix CLEARML_MULTI_NODE_SINGLE_TASK should be read once not every reported line
This commit is contained in:
parent
b2a4bf08ac
commit
2f0553b873
@ -2081,9 +2081,9 @@ class Worker(ServiceCommandSection):
|
|||||||
stderr_line_count, stderr_pos_count, stderr_last_lines = 0, 0, []
|
stderr_line_count, stderr_pos_count, stderr_last_lines = 0, 0, []
|
||||||
lines_buffer = defaultdict(list)
|
lines_buffer = defaultdict(list)
|
||||||
|
|
||||||
def report_lines(lines, source):
|
def report_lines(lines, source, a_multi_node_single_task=None):
|
||||||
# support colored multi-node reporting on the same Task for easier debugging
|
# support colored multi-node reporting on the same Task for easier debugging
|
||||||
if lines and ENV_MULTI_NODE_SINGLE_TASK.get() and ENV_MULTI_NODE_SINGLE_TASK.get() > 0:
|
if lines and a_multi_node_single_task and a_multi_node_single_task > 0:
|
||||||
# noinspection PyBroadException
|
# noinspection PyBroadException
|
||||||
try:
|
try:
|
||||||
rank = int(os.environ.get("RANK", os.environ.get('SLURM_PROCID')) or 0)
|
rank = int(os.environ.get("RANK", os.environ.get('SLURM_PROCID')) or 0)
|
||||||
@ -2111,6 +2111,7 @@ class Worker(ServiceCommandSection):
|
|||||||
status = None
|
status = None
|
||||||
process = None
|
process = None
|
||||||
last_task_ping = 0
|
last_task_ping = 0
|
||||||
|
multi_node_single_task = ENV_MULTI_NODE_SINGLE_TASK.get()
|
||||||
try:
|
try:
|
||||||
_last_machine_update_ts = time()
|
_last_machine_update_ts = time()
|
||||||
stop_reason = None
|
stop_reason = None
|
||||||
@ -2169,11 +2170,11 @@ class Worker(ServiceCommandSection):
|
|||||||
if status is not None:
|
if status is not None:
|
||||||
stop_reason = 'Service started'
|
stop_reason = 'Service started'
|
||||||
|
|
||||||
stdout_line_count += report_lines(printed_lines, "stdout")
|
stdout_line_count += report_lines(printed_lines, "stdout", multi_node_single_task)
|
||||||
|
|
||||||
if stderr_path:
|
if stderr_path:
|
||||||
printed_lines, stderr_pos_count = _print_file(stderr_path, stderr_pos_count)
|
printed_lines, stderr_pos_count = _print_file(stderr_path, stderr_pos_count)
|
||||||
stderr_line_count += report_lines(printed_lines, "stderr")
|
stderr_line_count += report_lines(printed_lines, "stderr", multi_node_single_task)
|
||||||
|
|
||||||
except subprocess.CalledProcessError as ex:
|
except subprocess.CalledProcessError as ex:
|
||||||
# non zero return code
|
# non zero return code
|
||||||
@ -2187,10 +2188,10 @@ class Worker(ServiceCommandSection):
|
|||||||
except Exception:
|
except Exception:
|
||||||
# we should not get here, but better safe than sorry
|
# we should not get here, but better safe than sorry
|
||||||
printed_lines, stdout_pos_count = _print_file(stdout_path, stdout_pos_count)
|
printed_lines, stdout_pos_count = _print_file(stdout_path, stdout_pos_count)
|
||||||
stdout_line_count += report_lines(printed_lines, "stdout")
|
stdout_line_count += report_lines(printed_lines, "stdout", multi_node_single_task)
|
||||||
if stderr_path:
|
if stderr_path:
|
||||||
printed_lines, stderr_pos_count = _print_file(stderr_path, stderr_pos_count)
|
printed_lines, stderr_pos_count = _print_file(stderr_path, stderr_pos_count)
|
||||||
stderr_line_count += report_lines(printed_lines, "stderr")
|
stderr_line_count += report_lines(printed_lines, "stderr", multi_node_single_task)
|
||||||
stop_reason = TaskStopReason.exception
|
stop_reason = TaskStopReason.exception
|
||||||
status = -1
|
status = -1
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user