Not tested: exit the serving process on CUDA OOM so it can be restarted

This commit is contained in:
Eugen Ajechiloae 2024-08-14 19:43:44 +03:00
parent 6ef1f67ad0
commit 6a8e616256
4 changed files with 9 additions and 2 deletions

View File

@ -41,8 +41,7 @@ class Preprocess(object):
"""
pass
def unload(self) -> None: # noqa
def unload(self) -> None:
"""
OPTIONAL: provide unloading method for the model
For example:

View File

@ -69,8 +69,11 @@ while : ; do
$GUNICORN_EXTRA_ARGS
fi
echo "[DEBUG] ~~~~~~~~~~~~ Check if we restart here server ~~~~~~~~~~~~"
if [ -z "$CLEARML_SERVING_RESTART_ON_FAILURE" ]
then
echo "[DEBUG] ~~~~~~~~~~~~ Not restarting ~~~~~~~~~~~~"
break
fi
echo "[DEBUG] ~~~~~~~~~~~~ Restarting server ~~~~~~~~~~~~"
done

View File

@ -1,6 +1,7 @@
import os
import traceback
import gzip
import sys
from fastapi import FastAPI, Request, Response, APIRouter, HTTPException
from fastapi.routing import APIRoute
@ -102,6 +103,9 @@ async def serve_model(model_id: str, version: Optional[str] = None, request: Uni
except ValueError as ex:
session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format(
instance_id, type(ex), ex, request, "".join(traceback.format_exc())))
if "CUDA out of memory. " in str(ex) or "NVML_SUCCESS == r INTERNAL ASSERT FAILED" in str(ex):
                # We cannot always recover from a CUDA OOM / NVML failure;
                # prefer exiting the process so an external supervisor can restart it
sys.exit(1)
raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex))
except Exception as ex:
session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format(

View File

@ -917,6 +917,7 @@ class ModelRequestProcessor(object):
if k not in self._endpoints:
# atomic
self._engine_processor_lookup[k]._model = None
print("clearml-serving --id c1a4ebd2586040ad906cf338d16bcb87 model remove --endpoint test_model_sklearn")
gc.collect()
if hasattr(self._engine_processor_lookup[k]._preprocess, "unload"):
try: