Fix continuing Tasks with TensorFlow sometimes resulting in a wrong initial iteration (#762)

2025-06-26 18:16:07 +00:00 · 2022-12-13 15:36:05 +02:00 · 2022-12-13 15:36:05 +02:00 · 16df0794be
commit 16df0794be
parent 3da182426f
1 changed files with 10 additions and 0 deletions
--- a/clearml/binding/frameworks/tensorflow_bind.py
+++ b/clearml/binding/frameworks/tensorflow_bind.py
@@ -726,6 +726,16 @@ class EventTrainsWriter(object):
                        'Received event without step, assuming step = {}'.format(step))
            else:
                step = int(step)
+            # unlike other frameworks, tensorflow already accounts for the iteration number
+            # when continuing the training. we subtract the task's initial iteration so that
+            # we don't increment the step number twice
+            step_before = step
+            if EventTrainsWriter._current_task:
+                step -= EventTrainsWriter._current_task.get_initial_iteration()
+            # there can be a few metrics getting reported again, so the step can be negative
+            # for the first few reports
+            if step <= 0:
+                return
            self._max_step = max(self._max_step, step)
            if value_dicts is None:
                LoggerRoot.get_base_logger(TensorflowBinding).debug("Summary arrived without 'value'")