Mirror of https://github.com/clearml/clearml-serving (synced 2025-06-26 18:16:00 +00:00)

Commit 6a8e616256 (parent 6ef1f67ad0): "not tested - exit on cuda oom"

@@ -41,8 +41,7 @@ class Preprocess(object):
         """
         pass
 
-
-    def unload(self) -> None:
+    def unload(self) -> None:  # noqa
         """
         OPTIONAL: provide unloading method for the model
         For example:

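The template's docstring stops at "For example:" inside this hunk; the example body is outside the visible context. A minimal sketch of what an unload() override could look like, assuming a PyTorch model kept in self._model (the class name and the torch-specific cleanup are illustrative assumptions, not taken from the template):

import gc
from typing import Any, Optional


class ExamplePreprocess(object):
    # Illustrative user-side Preprocess implementation (hypothetical name);
    # only the unload() part is relevant to this hunk.
    _model: Optional[Any] = None

    def unload(self) -> None:  # noqa
        """
        OPTIONAL: free the model and any GPU memory it holds so the endpoint
        can be removed without restarting the serving process.
        """
        # drop the strong reference so the model object becomes collectable
        self._model = None
        # force a collection pass so framework-held buffers are released
        gc.collect()
        try:
            import torch  # assumption: the model was a PyTorch model
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        except ImportError:
            # nothing framework-specific to release
            pass
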
@@ -69,8 +69,11 @@ while : ; do
         $GUNICORN_EXTRA_ARGS
     fi
 
+    echo "[DEBUG] ~~~~~~~~~~~~ Check if we restart here server ~~~~~~~~~~~~"
     if [ -z "$CLEARML_SERVING_RESTART_ON_FAILURE" ]
     then
+        echo "[DEBUG] ~~~~~~~~~~~~ Not restarting ~~~~~~~~~~~~"
         break
     fi
+    echo "[DEBUG] ~~~~~~~~~~~~ Restarting server ~~~~~~~~~~~~"
 done

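The restart behavior the entrypoint implements can be restated as a small Python supervisor sketch: relaunch the server after it exits unless CLEARML_SERVING_RESTART_ON_FAILURE is unset or empty. The command below is a placeholder, not the real gunicorn invocation from the script:

import os
import subprocess
import sys


def run_with_restart(cmd):
    # Keep relaunching the server; stop only when
    # CLEARML_SERVING_RESTART_ON_FAILURE is unset or empty, mirroring the
    # `if [ -z ... ]; then ... break` logic in the shell loop above.
    while True:
        print("[DEBUG] starting server")
        ret = subprocess.call(cmd)
        print("[DEBUG] server exited with code {}".format(ret))
        if not os.environ.get("CLEARML_SERVING_RESTART_ON_FAILURE"):
            print("[DEBUG] not restarting")
            return ret
        print("[DEBUG] restarting server")


if __name__ == "__main__":
    # placeholder command for illustration only
    sys.exit(run_with_restart([sys.executable, "-c", "print('serving...')"]))
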
@@ -1,6 +1,7 @@
 import os
 import traceback
 import gzip
+import sys
 
 from fastapi import FastAPI, Request, Response, APIRouter, HTTPException
 from fastapi.routing import APIRoute

@@ -102,6 +103,9 @@ async def serve_model(model_id: str, version: Optional[str] = None, request: Uni
     except ValueError as ex:
         session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format(
             instance_id, type(ex), ex, request, "".join(traceback.format_exc())))
+        if "CUDA out of memory. " in str(ex) or "NVML_SUCCESS == r INTERNAL ASSERT FAILED" in str(ex):
+            # can't always recover from this - prefer to exit the program such that it can be restarted
+            sys.exit(1)
         raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex))
     except Exception as ex:
         session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format(

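The added check keys off two error-message substrings and exits the worker instead of answering with a 422, on the assumption that an external supervisor (such as the restart loop above) will bring the process back. A standalone sketch of just the matching logic, using a hypothetical helper name that is not part of the commit:

def is_unrecoverable_gpu_error(ex):
    # Hypothetical helper: the same substring checks the handler performs
    # before calling sys.exit(1).
    msg = str(ex)
    return (
        "CUDA out of memory. " in msg
        or "NVML_SUCCESS == r INTERNAL ASSERT FAILED" in msg
    )


if __name__ == "__main__":
    # quick self-check of the matching logic
    assert is_unrecoverable_gpu_error(RuntimeError("CUDA out of memory. Tried to allocate 2.00 GiB"))
    assert not is_unrecoverable_gpu_error(ValueError("could not convert string to float"))
    print("matches behave as in the handler; a real hit would trigger sys.exit(1)")
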
@@ -917,6 +917,7 @@ class ModelRequestProcessor(object):
                 if k not in self._endpoints:
                     # atomic
                     self._engine_processor_lookup[k]._model = None
+                    print("clearml-serving --id c1a4ebd2586040ad906cf338d16bcb87 model remove --endpoint test_model_sklearn")
                     gc.collect()
                     if hasattr(self._engine_processor_lookup[k]._preprocess, "unload"):
                         try:

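For context, a condensed sketch of the removal path this hunk sits in: drop the model reference, collect garbage, then call the user's unload() best-effort. The attribute names (_engine_processor_lookup, _endpoints, _model, _preprocess) follow the diff; the class name and the surrounding control flow are paraphrased assumptions, not the actual source:

import gc


class EndpointCleanupSketch(object):
    # Hypothetical condensed version of the ModelRequestProcessor cleanup path
    # shown above; only the steps visible in the hunk are reproduced.
    def __init__(self, engine_processor_lookup, endpoints):
        self._engine_processor_lookup = engine_processor_lookup
        self._endpoints = endpoints

    def cleanup_removed_endpoints(self):
        for k in list(self._engine_processor_lookup.keys()):
            if k not in self._endpoints:
                # atomic: drop the model reference before collecting
                self._engine_processor_lookup[k]._model = None
                gc.collect()
                preprocess = self._engine_processor_lookup[k]._preprocess
                if hasattr(preprocess, "unload"):
                    try:
                        preprocess.unload()
                    except Exception as ex:
                        # unload() is best-effort; never let it break removal
                        print("Exception in unload(): {}".format(ex))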