Mirror of https://github.com/clearml/clearml-serving (synced 2025-06-26 18:16:00 +00:00)

Commit 6ef1f67ad0 (parent 724c99c605): initial commit for oom issues - allow for model unload and restart serving on crash
Preprocess template (the user-extensible `Preprocess` class) - an optional `unload()` hook is added:

@@ -41,6 +41,18 @@ class Preprocess(object):
         """
         pass
 
+    def unload(self) -> None:  # noqa
+        """
+        OPTIONAL: provide unloading method for the model
+        For example:
+        ```py
+        import torch
+        torch.cuda.empty_cache()
+        ```
+        """
+        pass
+
     def preprocess(
         self,
         body: Union[bytes, dict],
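A hedged sketch of how a user's preprocess class might implement the new hook; only the `unload()` signature comes from the template, while the attribute names and torch usage are illustrative assumptions:

```py
from typing import Any, Optional


# Illustrative user-side Preprocess (not part of the commit): free GPU
# memory when clearml-serving drops this endpoint.
class Preprocess(object):
    def __init__(self):
        self._model = None

    def load(self, local_file_name: str) -> Optional[Any]:  # noqa
        import torch

        self._model = torch.load(local_file_name, map_location="cuda")
        return self._model

    def unload(self) -> None:  # noqa
        import torch

        # drop the strong reference, then release cached CUDA blocks
        self._model = None
        torch.cuda.empty_cache()
```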
Serving entrypoint script - the new CLEARML_SERVING_RESTART_ON_FAILURE variable is echoed on startup, and the whole launch sequence is wrapped in a retry loop that relaunches the server whenever the process exits, unless the variable is unset:

@@ -9,6 +9,7 @@ echo CLEARML_EXTRA_PYTHON_PACKAGES="$CLEARML_EXTRA_PYTHON_PACKAGES"
 echo CLEARML_SERVING_NUM_PROCESS="$CLEARML_SERVING_NUM_PROCESS"
 echo CLEARML_SERVING_POLL_FREQ="$CLEARML_SERVING_POLL_FREQ"
 echo CLEARML_DEFAULT_KAFKA_SERVE_URL="$CLEARML_DEFAULT_KAFKA_SERVE_URL"
+echo CLEARML_SERVING_RESTART_ON_FAILURE="$CLEARML_SERVING_RESTART_ON_FAILURE"
 
 SERVING_PORT="${CLEARML_SERVING_PORT:-8080}"
 GUNICORN_NUM_PROCESS="${CLEARML_SERVING_NUM_PROCESS:-4}"

@@ -40,29 +41,36 @@ then
     python3 -m pip install $CLEARML_EXTRA_PYTHON_PACKAGES
 fi
 
-if [ -z "$CLEARML_USE_GUNICORN" ]
-then
-  if [ -z "$CLEARML_SERVING_NUM_PROCESS" ]
-  then
-    echo "Starting Uvicorn server - single worker"
-    PYTHONPATH=$(pwd) python3 -m uvicorn \
-        clearml_serving.serving.main:app --log-level $UVICORN_LOG_LEVEL --host 0.0.0.0 --port $SERVING_PORT --loop $UVICORN_SERVE_LOOP \
-        $UVICORN_EXTRA_ARGS
-  else
-    echo "Starting Uvicorn server - multi worker"
-    PYTHONPATH=$(pwd) python3 clearml_serving/serving/uvicorn_mp_entrypoint.py \
-        clearml_serving.serving.main:app --log-level $UVICORN_LOG_LEVEL --host 0.0.0.0 --port $SERVING_PORT --loop $UVICORN_SERVE_LOOP \
-        --workers $CLEARML_SERVING_NUM_PROCESS $UVICORN_EXTRA_ARGS
-  fi
-else
-  echo "Starting Gunicorn server"
-  # start service
-  PYTHONPATH=$(pwd) python3 -m gunicorn \
-      --preload clearml_serving.serving.main:app \
-      --workers $GUNICORN_NUM_PROCESS \
-      --worker-class uvicorn.workers.UvicornWorker \
-      --max-requests $GUNICORN_MAX_REQUESTS \
-      --timeout $GUNICORN_SERVING_TIMEOUT \
-      --bind 0.0.0.0:$SERVING_PORT \
-      $GUNICORN_EXTRA_ARGS
-fi
+while : ; do
+    if [ -z "$CLEARML_USE_GUNICORN" ]
+    then
+        if [ -z "$CLEARML_SERVING_NUM_PROCESS" ]
+        then
+            echo "Starting Uvicorn server - single worker"
+            PYTHONPATH=$(pwd) python3 -m uvicorn \
+                clearml_serving.serving.main:app --log-level $UVICORN_LOG_LEVEL --host 0.0.0.0 --port $SERVING_PORT --loop $UVICORN_SERVE_LOOP \
+                $UVICORN_EXTRA_ARGS
+        else
+            echo "Starting Uvicorn server - multi worker"
+            PYTHONPATH=$(pwd) python3 clearml_serving/serving/uvicorn_mp_entrypoint.py \
+                clearml_serving.serving.main:app --log-level $UVICORN_LOG_LEVEL --host 0.0.0.0 --port $SERVING_PORT --loop $UVICORN_SERVE_LOOP \
+                --workers $CLEARML_SERVING_NUM_PROCESS $UVICORN_EXTRA_ARGS
+        fi
+    else
+        echo "Starting Gunicorn server"
+        # start service
+        PYTHONPATH=$(pwd) python3 -m gunicorn \
+            --preload clearml_serving.serving.main:app \
+            --workers $GUNICORN_NUM_PROCESS \
+            --worker-class uvicorn.workers.UvicornWorker \
+            --max-requests $GUNICORN_MAX_REQUESTS \
+            --timeout $GUNICORN_SERVING_TIMEOUT \
+            --bind 0.0.0.0:$SERVING_PORT \
+            $GUNICORN_EXTRA_ARGS
+    fi
+
+    if [ -z "$CLEARML_SERVING_RESTART_ON_FAILURE" ]
+    then
+        break
+    fi
+done
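Note that the loop tests emptiness (`-z`), not truthiness, so any non-empty value (even "0") enables restarting. A minimal Python sketch of the same supervisor pattern, for illustration only; the actual implementation is the bash loop above:

```py
import os
import subprocess

# Illustrative command; the real entrypoint builds the uvicorn/gunicorn
# command line from the CLEARML_* environment variables shown above.
CMD = ["python3", "-m", "uvicorn", "clearml_serving.serving.main:app"]

while True:
    subprocess.run(CMD)  # blocks until the server process exits or crashes
    # mirrors `if [ -z "$CLEARML_SERVING_RESTART_ON_FAILURE" ] then break`:
    # keep looping only while the variable holds a non-empty value
    if not os.environ.get("CLEARML_SERVING_RESTART_ON_FAILURE"):
        break
```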
FastAPI serving app (clearml_serving.serving.main) - the in-process CUDA-OOM restart path (a CUDAException raised on CUDA OOM, caught by a handler that stopped the event loop) is removed, since the entrypoint loop above now handles restarts at the process level; OOM-related ValueErrors fall through to the regular HTTP 422 error path:

@@ -1,13 +1,9 @@
 import os
 import traceback
 import gzip
-import asyncio
 
 from fastapi import FastAPI, Request, Response, APIRouter, HTTPException
 from fastapi.routing import APIRoute
-from fastapi.responses import PlainTextResponse
-
-from starlette.background import BackgroundTask
 
 from typing import Optional, Dict, Any, Callable, Union
 

@@ -52,9 +48,6 @@ try:
 except (ValueError, TypeError):
     pass
 
-class CUDAException(Exception):
-    def __init__(self, exception: str):
-        self.exception = exception
-
 # start FastAPI app
 app = FastAPI(title="ClearML Serving Service", version=__version__, description="ClearML Service Service router")

@@ -77,20 +70,6 @@ async def startup_event():
     processor.launch(poll_frequency_sec=model_sync_frequency_secs*60)
 
 
-@app.on_event('shutdown')
-def shutdown_event():
-    print('RESTARTING INFERENCE SERVICE!')
-
-async def exit_app():
-    loop = asyncio.get_running_loop()
-    loop.stop()
-
-@app.exception_handler(CUDAException)
-async def cuda_exception_handler(request, exc):
-    task = BackgroundTask(exit_app)
-    return PlainTextResponse("CUDA out of memory. Restarting service", status_code=500, background=task)
-
 router = APIRouter(
     prefix="/serve",
     tags=["models"],

@@ -123,10 +102,7 @@ async def serve_model(model_id: str, version: Optional[str] = None, request: Uni
     except ValueError as ex:
         session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format(
             instance_id, type(ex), ex, request, "".join(traceback.format_exc())))
-        if "CUDA out of memory. " in str(ex) or "NVML_SUCCESS == r INTERNAL ASSERT FAILED" in str(ex):
-            raise CUDAException(exception=ex)
-        else:
-            raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex))
+        raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex))
     except Exception as ex:
         session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format(
             instance_id, type(ex), ex, request, "".join(traceback.format_exc())))
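For clients, a CUDA OOM during a request no longer returns the dedicated 500 "CUDA out of memory. Restarting service" response; errors surface as HTTP 422, and a crashed worker simply comes back via the entrypoint loop. A hedged client-side sketch under the new behavior (endpoint URL and payload are illustrative, not from the commit):

```py
import time

import requests


def query(url, payload, retries=3):
    for attempt in range(retries):
        try:
            r = requests.post(url, json=payload, timeout=30)
        except requests.ConnectionError:
            time.sleep(2 ** attempt)  # server may be mid-restart; back off
            continue
        if r.status_code == 422:
            # processing error (including OOM-related ValueErrors)
            raise ValueError(r.text)
        if r.ok:
            return r.json()
        time.sleep(2 ** attempt)
    raise RuntimeError("service unavailable after {} retries".format(retries))


# e.g. query("http://localhost:8080/serve/test_model_sklearn", {"x0": 1, "x1": 2})
```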
ModelRequestProcessor (clearml_serving.serving.model_request_processor) - the hard-coded torch dependency is dropped; instead of unconditionally calling torch.cuda.empty_cache(), endpoint removal now invokes the preprocess object's optional unload() hook, guarded so a failing hook cannot break the sync loop:

@@ -1,7 +1,6 @@
 import json
 import os
 import gc
-import torch
 from collections import deque
 from pathlib import Path
 from random import random

@@ -918,11 +917,13 @@ class ModelRequestProcessor(object):
                     if k not in self._endpoints:
                         # atomic
                         self._engine_processor_lookup[k]._model = None
-                        self._engine_processor_lookup[k]._preprocess = None
-                        del self._engine_processor_lookup[k]
-                        self._engine_processor_lookup.pop(k, None)
                         gc.collect()
-                        torch.cuda.empty_cache()
+                        if hasattr(self._engine_processor_lookup[k]._preprocess, "unload"):
+                            try:
+                                self._engine_processor_lookup[k]._preprocess.unload()
+                            except Exception as ex:
+                                print("Exception occurred unloading model: {}".format(ex))
+                        self._engine_processor_lookup.pop(k, None)
                 cleanup = False
                 model_monitor_update = False
             except Exception as ex:
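A hedged, standalone restatement of the duck-typed cleanup pattern this hunk implements (simplified names, not the repository's actual API): call the optional user hook if present, never let it take down the sync loop, then drop the reference and let GC do the rest.

```py
import gc


def remove_endpoint(engine_lookup: dict, key: str) -> None:
    processor = engine_lookup.get(key)
    if processor is None:
        return
    processor._model = None  # release the model reference first
    gc.collect()
    if hasattr(processor._preprocess, "unload"):
        try:
            processor._preprocess.unload()  # user hook, e.g. torch.cuda.empty_cache()
        except Exception as ex:
            print("Exception occurred unloading model: {}".format(ex))
    engine_lookup.pop(key, None)
```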
|
@ -96,6 +96,7 @@ services:
|
|||||||
CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092}
|
CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092}
|
||||||
CLEARML_DEFAULT_TRITON_GRPC_ADDR: ${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-clearml-serving-triton:8001}
|
CLEARML_DEFAULT_TRITON_GRPC_ADDR: ${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-clearml-serving-triton:8001}
|
||||||
CLEARML_USE_GUNICORN: ${CLEARML_USE_GUNICORN:-}
|
CLEARML_USE_GUNICORN: ${CLEARML_USE_GUNICORN:-}
|
||||||
|
CLEARML_SERVING_RESTART_ON_FAILURE: ${CLEARML_SERVING_RESTART_ON_FAILURE:-}
|
||||||
CLEARML_SERVING_NUM_PROCESS: ${CLEARML_SERVING_NUM_PROCESS:-}
|
CLEARML_SERVING_NUM_PROCESS: ${CLEARML_SERVING_NUM_PROCESS:-}
|
||||||
CLEARML_EXTRA_PYTHON_PACKAGES: ${CLEARML_EXTRA_PYTHON_PACKAGES:-}
|
CLEARML_EXTRA_PYTHON_PACKAGES: ${CLEARML_EXTRA_PYTHON_PACKAGES:-}
|
||||||
AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID:-}
|
AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID:-}
|
||||||
|
@ -96,6 +96,7 @@ services:
|
|||||||
CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092}
|
CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092}
|
||||||
CLEARML_DEFAULT_TRITON_GRPC_ADDR: ${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-clearml-serving-triton:8001}
|
CLEARML_DEFAULT_TRITON_GRPC_ADDR: ${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-clearml-serving-triton:8001}
|
||||||
CLEARML_USE_GUNICORN: ${CLEARML_USE_GUNICORN:-}
|
CLEARML_USE_GUNICORN: ${CLEARML_USE_GUNICORN:-}
|
||||||
|
CLEARML_SERVING_RESTART_ON_FAILURE: ${CLEARML_SERVING_RESTART_ON_FAILURE:-}
|
||||||
CLEARML_SERVING_NUM_PROCESS: ${CLEARML_SERVING_NUM_PROCESS:-}
|
CLEARML_SERVING_NUM_PROCESS: ${CLEARML_SERVING_NUM_PROCESS:-}
|
||||||
CLEARML_EXTRA_PYTHON_PACKAGES: ${CLEARML_EXTRA_PYTHON_PACKAGES:-}
|
CLEARML_EXTRA_PYTHON_PACKAGES: ${CLEARML_EXTRA_PYTHON_PACKAGES:-}
|
||||||
AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID:-}
|
AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID:-}
|
||||||
|
@ -96,6 +96,7 @@ services:
|
|||||||
CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092}
|
CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092}
|
||||||
CLEARML_DEFAULT_TRITON_GRPC_ADDR: ${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-}
|
CLEARML_DEFAULT_TRITON_GRPC_ADDR: ${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-}
|
||||||
CLEARML_USE_GUNICORN: ${CLEARML_USE_GUNICORN:-}
|
CLEARML_USE_GUNICORN: ${CLEARML_USE_GUNICORN:-}
|
||||||
|
CLEARML_SERVING_RESTART_ON_FAILURE: ${CLEARML_SERVING_RESTART_ON_FAILURE:-}
|
||||||
CLEARML_SERVING_NUM_PROCESS: ${CLEARML_SERVING_NUM_PROCESS:-}
|
CLEARML_SERVING_NUM_PROCESS: ${CLEARML_SERVING_NUM_PROCESS:-}
|
||||||
CLEARML_EXTRA_PYTHON_PACKAGES: ${CLEARML_EXTRA_PYTHON_PACKAGES:-}
|
CLEARML_EXTRA_PYTHON_PACKAGES: ${CLEARML_EXTRA_PYTHON_PACKAGES:-}
|
||||||
AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID:-}
|
AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID:-}
|
||||||
|
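With any of these files the restart loop is opt-in: leave CLEARML_SERVING_RESTART_ON_FAILURE unset to keep the old run-once behavior, or set it to a non-empty value (for example `CLEARML_SERVING_RESTART_ON_FAILURE=1` in the shell environment or an `.env` file) before bringing the stack up, so the value reaches the serving container.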
Huggingface example readme - documents the new flag alongside the existing performance settings:

@@ -73,6 +73,8 @@ CLEARML_EXTRA_PYTHON_PACKAGES=transformers
 # Change this depending on your machine and performance needs
 CLEARML_USE_GUNICORN=1
 CLEARML_SERVING_NUM_PROCESS=8
+# Restarts if the serving process crashes
+CLEARML_SERVING_RESTART_ON_FAILURE=1
 ```
 
 Huggingface models require Triton engine support, please use `docker-compose-triton.yml` / `docker-compose-triton-gpu.yml` or if running on Kubernetes, the matching helm chart to set things up. Check the repository main readme documentation if you need help.