Not tested: exit the serving process on CUDA OOM so it can be restarted

This commit is contained in:
Eugen Ajechiloae 2024-08-14 19:43:44 +03:00
parent 6ef1f67ad0
commit 6a8e616256
4 changed files with 9 additions and 2 deletions

View File

@ -41,8 +41,7 @@ class Preprocess(object):
"""
pass
def unload(self) -> None: # noqa
def unload(self) -> None:
"""
OPTIONAL: provide unloading method for the model
For example:

View File

@ -69,8 +69,11 @@ while : ; do
$GUNICORN_EXTRA_ARGS
fi
echo "[DEBUG] ~~~~~~~~~~~~ Check if we restart here server ~~~~~~~~~~~~"
if [ -z "$CLEARML_SERVING_RESTART_ON_FAILURE" ]
then
echo "[DEBUG] ~~~~~~~~~~~~ Not restarting ~~~~~~~~~~~~"
break
fi
echo "[DEBUG] ~~~~~~~~~~~~ Restarting server ~~~~~~~~~~~~"
done

View File

@ -1,6 +1,7 @@
import os
import traceback
import gzip
import sys
from fastapi import FastAPI, Request, Response, APIRouter, HTTPException
from fastapi.routing import APIRoute
@ -102,6 +103,9 @@ async def serve_model(model_id: str, version: Optional[str] = None, request: Uni
except ValueError as ex:
session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format(
instance_id, type(ex), ex, request, "".join(traceback.format_exc())))
if "CUDA out of memory. " in str(ex) or "NVML_SUCCESS == r INTERNAL ASSERT FAILED" in str(ex):
                # We cannot always recover from a CUDA OOM / NVML failure;
                # prefer exiting the process so an external supervisor can restart it
sys.exit(1)
raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex))
except Exception as ex:
session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format(

View File

@ -917,6 +917,7 @@ class ModelRequestProcessor(object):
if k not in self._endpoints:
# atomic
self._engine_processor_lookup[k]._model = None
print("clearml-serving --id c1a4ebd2586040ad906cf338d16bcb87 model remove --endpoint test_model_sklearn")
gc.collect()
if hasattr(self._engine_processor_lookup[k]._preprocess, "unload"):
try: