mirror of
https://github.com/clearml/clearml-serving
synced 2025-06-26 18:16:00 +00:00
Exit on CUDA OOM so the server can be restarted (not tested)
This commit is contained in:
parent
6ef1f67ad0
commit
6a8e616256
@ -41,8 +41,7 @@ class Preprocess(object):
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
def unload(self) -> None: # noqa
|
||||
def unload(self) -> None:
|
||||
"""
|
||||
OPTIONAL: provide unloading method for the model
|
||||
For example:
|
||||
|
@ -69,8 +69,11 @@ while : ; do
|
||||
$GUNICORN_EXTRA_ARGS
|
||||
fi
|
||||
|
||||
echo "[DEBUG] ~~~~~~~~~~~~ Check if we restart here server ~~~~~~~~~~~~"
|
||||
if [ -z "$CLEARML_SERVING_RESTART_ON_FAILURE" ]
|
||||
then
|
||||
echo "[DEBUG] ~~~~~~~~~~~~ Not restarting ~~~~~~~~~~~~"
|
||||
break
|
||||
fi
|
||||
echo "[DEBUG] ~~~~~~~~~~~~ Restarting server ~~~~~~~~~~~~"
|
||||
done
|
||||
|
@ -1,6 +1,7 @@
|
||||
import os
|
||||
import traceback
|
||||
import gzip
|
||||
import sys
|
||||
|
||||
from fastapi import FastAPI, Request, Response, APIRouter, HTTPException
|
||||
from fastapi.routing import APIRoute
|
||||
@ -102,6 +103,9 @@ async def serve_model(model_id: str, version: Optional[str] = None, request: Uni
|
||||
except ValueError as ex:
|
||||
session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format(
|
||||
instance_id, type(ex), ex, request, "".join(traceback.format_exc())))
|
||||
if "CUDA out of memory. " in str(ex) or "NVML_SUCCESS == r INTERNAL ASSERT FAILED" in str(ex):
|
||||
# can't always recover from this - prefer to exit the program such that it can be restarted
|
||||
sys.exit(1)
|
||||
raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex))
|
||||
except Exception as ex:
|
||||
session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format(
|
||||
|
@ -917,6 +917,7 @@ class ModelRequestProcessor(object):
|
||||
if k not in self._endpoints:
|
||||
# atomic
|
||||
self._engine_processor_lookup[k]._model = None
|
||||
print("clearml-serving --id c1a4ebd2586040ad906cf338d16bcb87 model remove --endpoint test_model_sklearn")
|
||||
gc.collect()
|
||||
if hasattr(self._engine_processor_lookup[k]._preprocess, "unload"):
|
||||
try:
|
||||
|
Loading…
Reference in New Issue
Block a user