Initial commit for OOM issues - allow model unload and restart serving on crash

This commit is contained in:
Eugen Ajechiloae 2024-08-14 17:01:29 +03:00
parent 724c99c605
commit 6ef1f67ad0
8 changed files with 55 additions and 53 deletions

View File

@ -41,6 +41,18 @@ class Preprocess(object):
""" """
pass pass
def unload(self) -> None: # noqa
    """
    OPTIONAL: release resources held by the model when it is unloaded.

    The default implementation does nothing. Override it to free whatever
    the loaded model is holding on to, for example cached GPU memory:
    ```py
    import torch
    torch.cuda.empty_cache()
    ```
    """
    # NOTE(review): the serving side appears to call this only when the method
    # exists, and to print (not propagate) any exception it raises - confirm
    # against the model request processor before relying on it.
    pass
def preprocess( def preprocess(
self, self,
body: Union[bytes, dict], body: Union[bytes, dict],

View File

@ -9,6 +9,7 @@ echo CLEARML_EXTRA_PYTHON_PACKAGES="$CLEARML_EXTRA_PYTHON_PACKAGES"
echo CLEARML_SERVING_NUM_PROCESS="$CLEARML_SERVING_NUM_PROCESS" echo CLEARML_SERVING_NUM_PROCESS="$CLEARML_SERVING_NUM_PROCESS"
echo CLEARML_SERVING_POLL_FREQ="$CLEARML_SERVING_POLL_FREQ" echo CLEARML_SERVING_POLL_FREQ="$CLEARML_SERVING_POLL_FREQ"
echo CLEARML_DEFAULT_KAFKA_SERVE_URL="$CLEARML_DEFAULT_KAFKA_SERVE_URL" echo CLEARML_DEFAULT_KAFKA_SERVE_URL="$CLEARML_DEFAULT_KAFKA_SERVE_URL"
echo CLEARML_SERVING_RESTART_ON_FAILURE="$CLEARML_SERVING_RESTART_ON_FAILURE"
SERVING_PORT="${CLEARML_SERVING_PORT:-8080}" SERVING_PORT="${CLEARML_SERVING_PORT:-8080}"
GUNICORN_NUM_PROCESS="${CLEARML_SERVING_NUM_PROCESS:-4}" GUNICORN_NUM_PROCESS="${CLEARML_SERVING_NUM_PROCESS:-4}"
@ -40,29 +41,36 @@ then
python3 -m pip install $CLEARML_EXTRA_PYTHON_PACKAGES python3 -m pip install $CLEARML_EXTRA_PYTHON_PACKAGES
fi fi
if [ -z "$CLEARML_USE_GUNICORN" ] while : ; do
then if [ -z "$CLEARML_USE_GUNICORN" ]
if [ -z "$CLEARML_SERVING_NUM_PROCESS" ]
then then
echo "Starting Uvicorn server - single worker" if [ -z "$CLEARML_SERVING_NUM_PROCESS" ]
PYTHONPATH=$(pwd) python3 -m uvicorn \ then
clearml_serving.serving.main:app --log-level $UVICORN_LOG_LEVEL --host 0.0.0.0 --port $SERVING_PORT --loop $UVICORN_SERVE_LOOP \ echo "Starting Uvicorn server - single worker"
$UVICORN_EXTRA_ARGS PYTHONPATH=$(pwd) python3 -m uvicorn \
clearml_serving.serving.main:app --log-level $UVICORN_LOG_LEVEL --host 0.0.0.0 --port $SERVING_PORT --loop $UVICORN_SERVE_LOOP \
$UVICORN_EXTRA_ARGS
else
echo "Starting Uvicorn server - multi worker"
PYTHONPATH=$(pwd) python3 clearml_serving/serving/uvicorn_mp_entrypoint.py \
clearml_serving.serving.main:app --log-level $UVICORN_LOG_LEVEL --host 0.0.0.0 --port $SERVING_PORT --loop $UVICORN_SERVE_LOOP \
--workers $CLEARML_SERVING_NUM_PROCESS $UVICORN_EXTRA_ARGS
fi
else else
echo "Starting Uvicorn server - multi worker" echo "Starting Gunicorn server"
PYTHONPATH=$(pwd) python3 clearml_serving/serving/uvicorn_mp_entrypoint.py \ # start service
clearml_serving.serving.main:app --log-level $UVICORN_LOG_LEVEL --host 0.0.0.0 --port $SERVING_PORT --loop $UVICORN_SERVE_LOOP \ PYTHONPATH=$(pwd) python3 -m gunicorn \
--workers $CLEARML_SERVING_NUM_PROCESS $UVICORN_EXTRA_ARGS --preload clearml_serving.serving.main:app \
--workers $GUNICORN_NUM_PROCESS \
--worker-class uvicorn.workers.UvicornWorker \
--max-requests $GUNICORN_MAX_REQUESTS \
--timeout $GUNICORN_SERVING_TIMEOUT \
--bind 0.0.0.0:$SERVING_PORT \
$GUNICORN_EXTRA_ARGS
fi fi
else
echo "Starting Gunicorn server" if [ -z "$CLEARML_SERVING_RESTART_ON_FAILURE" ]
# start service then
PYTHONPATH=$(pwd) python3 -m gunicorn \ break
--preload clearml_serving.serving.main:app \ fi
--workers $GUNICORN_NUM_PROCESS \ done
--worker-class uvicorn.workers.UvicornWorker \
--max-requests $GUNICORN_MAX_REQUESTS \
--timeout $GUNICORN_SERVING_TIMEOUT \
--bind 0.0.0.0:$SERVING_PORT \
$GUNICORN_EXTRA_ARGS
fi

View File

@ -1,13 +1,9 @@
import os import os
import traceback import traceback
import gzip import gzip
import asyncio
from fastapi import FastAPI, Request, Response, APIRouter, HTTPException from fastapi import FastAPI, Request, Response, APIRouter, HTTPException
from fastapi.routing import APIRoute from fastapi.routing import APIRoute
from fastapi.responses import PlainTextResponse
from starlette.background import BackgroundTask
from typing import Optional, Dict, Any, Callable, Union from typing import Optional, Dict, Any, Callable, Union
@ -52,9 +48,6 @@ try:
except (ValueError, TypeError): except (ValueError, TypeError):
pass pass
class CUDAException(Exception):
def __init__(self, exception: str):
self.exception = exception
# start FastAPI app # start FastAPI app
app = FastAPI(title="ClearML Serving Service", version=__version__, description="ClearML Service Service router") app = FastAPI(title="ClearML Serving Service", version=__version__, description="ClearML Service Service router")
@ -77,20 +70,6 @@ async def startup_event():
processor.launch(poll_frequency_sec=model_sync_frequency_secs*60) processor.launch(poll_frequency_sec=model_sync_frequency_secs*60)
@app.on_event('shutdown')
def shutdown_event():
print('RESTARTING INFERENCE SERVICE!')
async def exit_app():
loop = asyncio.get_running_loop()
loop.stop()
@app.exception_handler(CUDAException)
async def cuda_exception_handler(request, exc):
task = BackgroundTask(exit_app)
return PlainTextResponse("CUDA out of memory. Restarting service", status_code=500, background=task)
router = APIRouter( router = APIRouter(
prefix="/serve", prefix="/serve",
tags=["models"], tags=["models"],
@ -123,10 +102,7 @@ async def serve_model(model_id: str, version: Optional[str] = None, request: Uni
except ValueError as ex: except ValueError as ex:
session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format(
instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) instance_id, type(ex), ex, request, "".join(traceback.format_exc())))
if "CUDA out of memory. " in str(ex) or "NVML_SUCCESS == r INTERNAL ASSERT FAILED" in str(ex): raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex))
raise CUDAException(exception=ex)
else:
raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex))
except Exception as ex: except Exception as ex:
session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format(
instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) instance_id, type(ex), ex, request, "".join(traceback.format_exc())))

View File

@ -1,7 +1,6 @@
import json import json
import os import os
import gc import gc
import torch
from collections import deque from collections import deque
from pathlib import Path from pathlib import Path
from random import random from random import random
@ -918,11 +917,13 @@ class ModelRequestProcessor(object):
if k not in self._endpoints: if k not in self._endpoints:
# atomic # atomic
self._engine_processor_lookup[k]._model = None self._engine_processor_lookup[k]._model = None
self._engine_processor_lookup[k]._preprocess = None
del self._engine_processor_lookup[k]
self._engine_processor_lookup.pop(k, None)
gc.collect() gc.collect()
torch.cuda.empty_cache() if hasattr(self._engine_processor_lookup[k]._preprocess, "unload"):
try:
self._engine_processor_lookup[k]._preprocess.unload()
except Exception as ex:
print("Exception occurred unloading model: {}".format(ex))
self._engine_processor_lookup.pop(k, None)
cleanup = False cleanup = False
model_monitor_update = False model_monitor_update = False
except Exception as ex: except Exception as ex:

View File

@ -96,6 +96,7 @@ services:
CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092} CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092}
CLEARML_DEFAULT_TRITON_GRPC_ADDR: ${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-clearml-serving-triton:8001} CLEARML_DEFAULT_TRITON_GRPC_ADDR: ${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-clearml-serving-triton:8001}
CLEARML_USE_GUNICORN: ${CLEARML_USE_GUNICORN:-} CLEARML_USE_GUNICORN: ${CLEARML_USE_GUNICORN:-}
CLEARML_SERVING_RESTART_ON_FAILURE: ${CLEARML_SERVING_RESTART_ON_FAILURE:-}
CLEARML_SERVING_NUM_PROCESS: ${CLEARML_SERVING_NUM_PROCESS:-} CLEARML_SERVING_NUM_PROCESS: ${CLEARML_SERVING_NUM_PROCESS:-}
CLEARML_EXTRA_PYTHON_PACKAGES: ${CLEARML_EXTRA_PYTHON_PACKAGES:-} CLEARML_EXTRA_PYTHON_PACKAGES: ${CLEARML_EXTRA_PYTHON_PACKAGES:-}
AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID:-} AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID:-}

View File

@ -96,6 +96,7 @@ services:
CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092} CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092}
CLEARML_DEFAULT_TRITON_GRPC_ADDR: ${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-clearml-serving-triton:8001} CLEARML_DEFAULT_TRITON_GRPC_ADDR: ${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-clearml-serving-triton:8001}
CLEARML_USE_GUNICORN: ${CLEARML_USE_GUNICORN:-} CLEARML_USE_GUNICORN: ${CLEARML_USE_GUNICORN:-}
CLEARML_SERVING_RESTART_ON_FAILURE: ${CLEARML_SERVING_RESTART_ON_FAILURE:-}
CLEARML_SERVING_NUM_PROCESS: ${CLEARML_SERVING_NUM_PROCESS:-} CLEARML_SERVING_NUM_PROCESS: ${CLEARML_SERVING_NUM_PROCESS:-}
CLEARML_EXTRA_PYTHON_PACKAGES: ${CLEARML_EXTRA_PYTHON_PACKAGES:-} CLEARML_EXTRA_PYTHON_PACKAGES: ${CLEARML_EXTRA_PYTHON_PACKAGES:-}
AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID:-} AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID:-}

View File

@ -96,6 +96,7 @@ services:
CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092} CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092}
CLEARML_DEFAULT_TRITON_GRPC_ADDR: ${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-} CLEARML_DEFAULT_TRITON_GRPC_ADDR: ${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-}
CLEARML_USE_GUNICORN: ${CLEARML_USE_GUNICORN:-} CLEARML_USE_GUNICORN: ${CLEARML_USE_GUNICORN:-}
CLEARML_SERVING_RESTART_ON_FAILURE: ${CLEARML_SERVING_RESTART_ON_FAILURE:-}
CLEARML_SERVING_NUM_PROCESS: ${CLEARML_SERVING_NUM_PROCESS:-} CLEARML_SERVING_NUM_PROCESS: ${CLEARML_SERVING_NUM_PROCESS:-}
CLEARML_EXTRA_PYTHON_PACKAGES: ${CLEARML_EXTRA_PYTHON_PACKAGES:-} CLEARML_EXTRA_PYTHON_PACKAGES: ${CLEARML_EXTRA_PYTHON_PACKAGES:-}
AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID:-} AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID:-}

View File

@ -73,6 +73,8 @@ CLEARML_EXTRA_PYTHON_PACKAGES=transformers
# Change this depending on your machine and performance needs # Change this depending on your machine and performance needs
CLEARML_USE_GUNICORN=1 CLEARML_USE_GUNICORN=1
CLEARML_SERVING_NUM_PROCESS=8 CLEARML_SERVING_NUM_PROCESS=8
# Restart the serving process whenever it exits (e.g. after a crash); leave unset to disable
CLEARML_SERVING_RESTART_ON_FAILURE=1
``` ```
Huggingface models require Triton engine support, please use `docker-compose-triton.yml` / `docker-compose-triton-gpu.yml` or if running on Kubernetes, the matching helm chart to set things up. Check the repository main readme documentation if you need help. Huggingface models require Triton engine support, please use `docker-compose-triton.yml` / `docker-compose-triton-gpu.yml` or if running on Kubernetes, the matching helm chart to set things up. Check the repository main readme documentation if you need help.