mirror of
				https://github.com/clearml/clearml
				synced 2025-06-26 18:16:07 +00:00 
			
		
		
		
	Improve PyTorch model saving, add support for mmcv
This commit is contained in:
		
							parent
							
								
									3e5d50e15d
								
							
						
					
					
						commit
						63e7cbab30
					
				| @ -44,6 +44,13 @@ def _patched_call(original_fn, patched_fn): | ||||
|     return _inner_patch | ||||
| 
 | ||||
| 
 | ||||
| def _patched_call_no_recursion_guard(original_fn, patched_fn): | ||||
|     def _inner_patch(*args, **kwargs): | ||||
|         return patched_fn(original_fn, *args, **kwargs) | ||||
| 
 | ||||
|     return _inner_patch | ||||
| 
 | ||||
| 
 | ||||
| class _Empty(object): | ||||
|     def __init__(self): | ||||
|         self.trains_in_model = None | ||||
| @ -159,7 +166,7 @@ class WeightsFileHandler(object): | ||||
|         If the callback was already added, return the existing handle. | ||||
| 
 | ||||
|         :param handle: A callback handle returned from :meth:WeightsFileHandler.add_post_callback | ||||
|         :return True if callback removed, False otherwise | ||||
|         :return: True if callback removed, False otherwise | ||||
|         """ | ||||
|         return cls._remove_callback(handle, cls._model_post_callbacks) | ||||
| 
 | ||||
|  | ||||
| @ -1,10 +1,12 @@ | ||||
| import sys | ||||
| 
 | ||||
| import six | ||||
| import threading | ||||
| 
 | ||||
| from pathlib2 import Path | ||||
| 
 | ||||
| from ...binding.frameworks.base_bind import PatchBaseModelIO | ||||
| from ..frameworks import _patched_call, WeightsFileHandler, _Empty | ||||
| from ..frameworks import _patched_call, _patched_call_no_recursion_guard, WeightsFileHandler, _Empty | ||||
| from ..import_bind import PostImportHookPatching | ||||
| from ...config import running_remotely | ||||
| from ...model import Framework | ||||
| @ -12,8 +14,11 @@ from ...model import Framework | ||||
| 
 | ||||
| class PatchPyTorchModelIO(PatchBaseModelIO): | ||||
|     _current_task = None | ||||
|     _checkpoint_filename = {} | ||||
|     __patched = None | ||||
|     __patched_lightning = None | ||||
|     __patched_mmcv = None | ||||
|     __default_checkpoint_filename_counter = {} | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def update_current_task(task, **_): | ||||
| @ -22,6 +27,7 @@ class PatchPyTorchModelIO(PatchBaseModelIO): | ||||
|             return | ||||
|         PatchPyTorchModelIO._patch_model_io() | ||||
|         PatchPyTorchModelIO._patch_lightning_io() | ||||
|         PatchPyTorchModelIO._patch_mmcv() | ||||
|         PostImportHookPatching.add_on_import('torch', PatchPyTorchModelIO._patch_model_io) | ||||
|         PostImportHookPatching.add_on_import('pytorch_lightning', PatchPyTorchModelIO._patch_lightning_io) | ||||
| 
 | ||||
| @ -65,6 +71,41 @@ class PatchPyTorchModelIO(PatchBaseModelIO): | ||||
|         except Exception: | ||||
|             pass  # print('Failed patching pytorch') | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def _patch_mmcv(): | ||||
|         if PatchPyTorchModelIO.__patched_mmcv: | ||||
|             return | ||||
|         if "mmcv" not in sys.modules: | ||||
|             return | ||||
|         PatchPyTorchModelIO.__patched_mmcv = True | ||||
| 
 | ||||
|         # noinspection PyBroadException | ||||
|         try: | ||||
|             from mmcv.runner import epoch_based_runner, iter_based_runner | ||||
| 
 | ||||
|             # we don't want the recursion check here because it guards pytorch's patched save functions | ||||
|             # which we need in order to log the saved model/checkpoint | ||||
|             epoch_based_runner.save_checkpoint = _patched_call_no_recursion_guard( | ||||
|                 epoch_based_runner.save_checkpoint, PatchPyTorchModelIO._mmcv_save_checkpoint | ||||
|             ) | ||||
|             iter_based_runner.save_checkpoint = _patched_call_no_recursion_guard( | ||||
|                 iter_based_runner.save_checkpoint, PatchPyTorchModelIO._mmcv_save_checkpoint | ||||
|             ) | ||||
|         except Exception: | ||||
|             pass | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def _mmcv_save_checkpoint(original_fn, model, filename, *args, **kwargs): | ||||
|         # note that mmcv.runner.save_checkpoint doesn't return anything, hence the need for this | ||||
|         # patch function, but we return from it just in case this changes in the future | ||||
|         if not PatchPyTorchModelIO._current_task: | ||||
|             return original_fn(model, filename, *args, **kwargs) | ||||
|         tid = threading.current_thread().ident | ||||
|         PatchPyTorchModelIO._checkpoint_filename[tid] = filename | ||||
|         ret = original_fn(model, filename, *args, **kwargs) | ||||
|         del PatchPyTorchModelIO._checkpoint_filename[tid] | ||||
|         return ret | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def _patch_lightning_io(): | ||||
|         if PatchPyTorchModelIO.__patched_lightning: | ||||
| @ -144,9 +185,9 @@ class PatchPyTorchModelIO(PatchBaseModelIO): | ||||
| 
 | ||||
|                 filename = f.name | ||||
|             else: | ||||
|                 filename = None | ||||
|                 filename = PatchPyTorchModelIO.__create_default_filename() | ||||
|         except Exception: | ||||
|             filename = None | ||||
|             filename = PatchPyTorchModelIO.__create_default_filename() | ||||
| 
 | ||||
|         # give the model a descriptive name based on the file name | ||||
|         # noinspection PyBroadException | ||||
| @ -241,3 +282,13 @@ class PatchPyTorchModelIO(PatchBaseModelIO): | ||||
|                 pass | ||||
| 
 | ||||
|         return model | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def __create_default_filename(): | ||||
|         tid = threading.current_thread().ident | ||||
|         checkpoint_filename = PatchPyTorchModelIO._checkpoint_filename.get(tid) | ||||
|         if checkpoint_filename: | ||||
|             return checkpoint_filename | ||||
|         counter = PatchPyTorchModelIO.__default_checkpoint_filename_counter.setdefault(tid, 0) | ||||
|         PatchPyTorchModelIO.__default_checkpoint_filename_counter[tid] += 1 | ||||
|         return "default_{}_{}".format(tid, counter) | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user
	 allegroai
						allegroai