Fix code hangs when running with joblib (#1009)

2025-06-26 18:16:07 +00:00 · 2023-05-21 09:42:32 +03:00 · 2023-05-21 09:42:32 +03:00 · 7c09251686
commit 7c09251686
parent e80d1f1ff4
2 changed files with 13 additions and 10 deletions
--- a/clearml/binding/frameworks/pytorch_bind.py
+++ b/clearml/binding/frameworks/pytorch_bind.py
@ -18,7 +18,6 @@ class PatchPyTorchModelIO(PatchBaseModelIO):
    __patched = None
    __patched_lightning = None
    __patched_mmcv = None
    __default_checkpoint_filename_counter = {}
    @staticmethod
    def update_current_task(task, **_):
@ -185,9 +184,9 @@ class PatchPyTorchModelIO(PatchBaseModelIO):
                filename = f.name
            else:
-                filename = PatchPyTorchModelIO.__create_default_filename()
+                filename = PatchPyTorchModelIO.__get_cached_checkpoint_filename()
        except Exception:
-            filename = PatchPyTorchModelIO.__create_default_filename()
+            filename = PatchPyTorchModelIO.__get_cached_checkpoint_filename()
        # give the model a descriptive name based on the file name
        # noinspection PyBroadException
@ -195,7 +194,6 @@ class PatchPyTorchModelIO(PatchBaseModelIO):
            model_name = Path(filename).stem if filename is not None else None
        except Exception:
            model_name = None
        WeightsFileHandler.create_output_model(
            obj, filename, Framework.pytorch, PatchPyTorchModelIO._current_task, singlefile=True, model_name=model_name)
@ -284,11 +282,7 @@ class PatchPyTorchModelIO(PatchBaseModelIO):
        return model
    @staticmethod
-    def __create_default_filename():
+    def __get_cached_checkpoint_filename():
        tid = threading.current_thread().ident
        checkpoint_filename = PatchPyTorchModelIO._checkpoint_filename.get(tid)
-        if checkpoint_filename:
+        return checkpoint_filename or None
            return checkpoint_filename
        counter = PatchPyTorchModelIO.__default_checkpoint_filename_counter.setdefault(tid, 0)
        PatchPyTorchModelIO.__default_checkpoint_filename_counter[tid] += 1
        return "default_{}_{}".format(tid, counter)
--- a/clearml/binding/joblib_bind.py
+++ b/clearml/binding/joblib_bind.py
@ -48,6 +48,10 @@ class PatchedJoblib(object):
                    joblib.numpy_pickle.NumpyPickler.__init__ = _patched_call(
                        joblib.numpy_pickle.NumpyPickler.__init__,
                        PatchedJoblib._numpypickler)
                    joblib.memory.MemorizedFunc._cached_call = _patched_call(
                        joblib.memory.MemorizedFunc._cached_call,
                        PatchedJoblib._cached_call_recursion_guard
                    )
            if not PatchedJoblib._patched_sk_joblib and 'sklearn' in sys.modules:
                PatchedJoblib._patched_sk_joblib = True
@ -194,3 +198,8 @@ class PatchedJoblib(object):
                "Can't get model framework {}, model framework will be: {} ".format(object_orig_module, framework))
        finally:
            return framework
    @staticmethod
    def _cached_call_recursion_guard(original_fn, *args, **kwargs):
        # used just to avoid getting into the `_load` binding in the context of memory caching
        return original_fn(*args, **kwargs)