Not tested: exit on CUDA OOM

This commit is contained in:
Eugen Ajechiloae 2024-08-14 19:43:44 +03:00
parent 6ef1f67ad0
commit 6a8e616256
4 changed files with 9 additions and 2 deletions

View File

@ -41,8 +41,7 @@ class Preprocess(object):
""" """
pass pass
def unload(self) -> None:
def unload(self) -> None: # noqa
""" """
OPTIONAL: provide unloading method for the model OPTIONAL: provide unloading method for the model
For example: For example:

View File

@ -69,8 +69,11 @@ while : ; do
$GUNICORN_EXTRA_ARGS $GUNICORN_EXTRA_ARGS
fi fi
echo "[DEBUG] ~~~~~~~~~~~~ Check if we restart here server ~~~~~~~~~~~~"
if [ -z "$CLEARML_SERVING_RESTART_ON_FAILURE" ] if [ -z "$CLEARML_SERVING_RESTART_ON_FAILURE" ]
then then
echo "[DEBUG] ~~~~~~~~~~~~ Not restarting ~~~~~~~~~~~~"
break break
fi fi
echo "[DEBUG] ~~~~~~~~~~~~ Restarting server ~~~~~~~~~~~~"
done done

View File

@ -1,6 +1,7 @@
import os import os
import traceback import traceback
import gzip import gzip
import sys
from fastapi import FastAPI, Request, Response, APIRouter, HTTPException from fastapi import FastAPI, Request, Response, APIRouter, HTTPException
from fastapi.routing import APIRoute from fastapi.routing import APIRoute
@ -102,6 +103,9 @@ async def serve_model(model_id: str, version: Optional[str] = None, request: Uni
except ValueError as ex: except ValueError as ex:
session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format(
instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) instance_id, type(ex), ex, request, "".join(traceback.format_exc())))
if "CUDA out of memory. " in str(ex) or "NVML_SUCCESS == r INTERNAL ASSERT FAILED" in str(ex):
# can't always recover from this - prefer to exit the program such that it can be restarted
sys.exit(1)
raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex)) raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex))
except Exception as ex: except Exception as ex:
session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format(

View File

@ -917,6 +917,7 @@ class ModelRequestProcessor(object):
if k not in self._endpoints: if k not in self._endpoints:
# atomic # atomic
self._engine_processor_lookup[k]._model = None self._engine_processor_lookup[k]._model = None
print("clearml-serving --id c1a4ebd2586040ad906cf338d16bcb87 model remove --endpoint test_model_sklearn")
gc.collect() gc.collect()
if hasattr(self._engine_processor_lookup[k]._preprocess, "unload"): if hasattr(self._engine_processor_lookup[k]._preprocess, "unload"):
try: try: