From 6ef1f67ad0096605144eeb4b495defbe1f9ae16f Mon Sep 17 00:00:00 2001 From: Eugen Ajechiloae Date: Wed, 14 Aug 2024 17:01:29 +0300 Subject: [PATCH 01/14] initial commit for oom issues - allow for model unload and restart serving on crash --- .../preprocess/preprocess_template.py | 12 +++++ clearml_serving/serving/entrypoint.sh | 54 +++++++++++-------- clearml_serving/serving/main.py | 26 +-------- .../serving/model_request_processor.py | 11 ++-- docker/docker-compose-triton-gpu.yml | 1 + docker/docker-compose-triton.yml | 1 + docker/docker-compose.yml | 1 + examples/huggingface/readme.md | 2 + 8 files changed, 55 insertions(+), 53 deletions(-) diff --git a/clearml_serving/preprocess/preprocess_template.py b/clearml_serving/preprocess/preprocess_template.py index 28554a5..bd9b936 100644 --- a/clearml_serving/preprocess/preprocess_template.py +++ b/clearml_serving/preprocess/preprocess_template.py @@ -41,6 +41,18 @@ class Preprocess(object): """ pass + + def unload(self) -> None: # noqa + """ + OPTIONAL: provide unloading method for the model + For example: + ```py + import torch + torch.cuda.empty_cache() + ``` + """ + pass + def preprocess( self, body: Union[bytes, dict], diff --git a/clearml_serving/serving/entrypoint.sh b/clearml_serving/serving/entrypoint.sh index a5efea1..634c281 100755 --- a/clearml_serving/serving/entrypoint.sh +++ b/clearml_serving/serving/entrypoint.sh @@ -9,6 +9,7 @@ echo CLEARML_EXTRA_PYTHON_PACKAGES="$CLEARML_EXTRA_PYTHON_PACKAGES" echo CLEARML_SERVING_NUM_PROCESS="$CLEARML_SERVING_NUM_PROCESS" echo CLEARML_SERVING_POLL_FREQ="$CLEARML_SERVING_POLL_FREQ" echo CLEARML_DEFAULT_KAFKA_SERVE_URL="$CLEARML_DEFAULT_KAFKA_SERVE_URL" +echo CLEARML_SERVING_RESTART_ON_FAILURE="$CLEARML_SERVING_RESTART_ON_FAILURE" SERVING_PORT="${CLEARML_SERVING_PORT:-8080}" GUNICORN_NUM_PROCESS="${CLEARML_SERVING_NUM_PROCESS:-4}" @@ -40,29 +41,36 @@ then python3 -m pip install $CLEARML_EXTRA_PYTHON_PACKAGES fi -if [ -z "$CLEARML_USE_GUNICORN" ] -then - if [ -z 
"$CLEARML_SERVING_NUM_PROCESS" ] +while : ; do + if [ -z "$CLEARML_USE_GUNICORN" ] then - echo "Starting Uvicorn server - single worker" - PYTHONPATH=$(pwd) python3 -m uvicorn \ - clearml_serving.serving.main:app --log-level $UVICORN_LOG_LEVEL --host 0.0.0.0 --port $SERVING_PORT --loop $UVICORN_SERVE_LOOP \ - $UVICORN_EXTRA_ARGS + if [ -z "$CLEARML_SERVING_NUM_PROCESS" ] + then + echo "Starting Uvicorn server - single worker" + PYTHONPATH=$(pwd) python3 -m uvicorn \ + clearml_serving.serving.main:app --log-level $UVICORN_LOG_LEVEL --host 0.0.0.0 --port $SERVING_PORT --loop $UVICORN_SERVE_LOOP \ + $UVICORN_EXTRA_ARGS + else + echo "Starting Uvicorn server - multi worker" + PYTHONPATH=$(pwd) python3 clearml_serving/serving/uvicorn_mp_entrypoint.py \ + clearml_serving.serving.main:app --log-level $UVICORN_LOG_LEVEL --host 0.0.0.0 --port $SERVING_PORT --loop $UVICORN_SERVE_LOOP \ + --workers $CLEARML_SERVING_NUM_PROCESS $UVICORN_EXTRA_ARGS + fi else - echo "Starting Uvicorn server - multi worker" - PYTHONPATH=$(pwd) python3 clearml_serving/serving/uvicorn_mp_entrypoint.py \ - clearml_serving.serving.main:app --log-level $UVICORN_LOG_LEVEL --host 0.0.0.0 --port $SERVING_PORT --loop $UVICORN_SERVE_LOOP \ - --workers $CLEARML_SERVING_NUM_PROCESS $UVICORN_EXTRA_ARGS + echo "Starting Gunicorn server" + # start service + PYTHONPATH=$(pwd) python3 -m gunicorn \ + --preload clearml_serving.serving.main:app \ + --workers $GUNICORN_NUM_PROCESS \ + --worker-class uvicorn.workers.UvicornWorker \ + --max-requests $GUNICORN_MAX_REQUESTS \ + --timeout $GUNICORN_SERVING_TIMEOUT \ + --bind 0.0.0.0:$SERVING_PORT \ + $GUNICORN_EXTRA_ARGS fi -else - echo "Starting Gunicorn server" - # start service - PYTHONPATH=$(pwd) python3 -m gunicorn \ - --preload clearml_serving.serving.main:app \ - --workers $GUNICORN_NUM_PROCESS \ - --worker-class uvicorn.workers.UvicornWorker \ - --max-requests $GUNICORN_MAX_REQUESTS \ - --timeout $GUNICORN_SERVING_TIMEOUT \ - --bind 0.0.0.0:$SERVING_PORT \ - 
$GUNICORN_EXTRA_ARGS -fi + + if [ -z "$CLEARML_SERVING_RESTART_ON_FAILURE" ] + then + break + fi +done diff --git a/clearml_serving/serving/main.py b/clearml_serving/serving/main.py index 10ce9c9..6865c93 100644 --- a/clearml_serving/serving/main.py +++ b/clearml_serving/serving/main.py @@ -1,13 +1,9 @@ import os import traceback import gzip -import asyncio from fastapi import FastAPI, Request, Response, APIRouter, HTTPException from fastapi.routing import APIRoute -from fastapi.responses import PlainTextResponse - -from starlette.background import BackgroundTask from typing import Optional, Dict, Any, Callable, Union @@ -52,9 +48,6 @@ try: except (ValueError, TypeError): pass -class CUDAException(Exception): - def __init__(self, exception: str): - self.exception = exception # start FastAPI app app = FastAPI(title="ClearML Serving Service", version=__version__, description="ClearML Service Service router") @@ -77,20 +70,6 @@ async def startup_event(): processor.launch(poll_frequency_sec=model_sync_frequency_secs*60) -@app.on_event('shutdown') -def shutdown_event(): - print('RESTARTING INFERENCE SERVICE!') - -async def exit_app(): - loop = asyncio.get_running_loop() - loop.stop() - -@app.exception_handler(CUDAException) -async def cuda_exception_handler(request, exc): - task = BackgroundTask(exit_app) - return PlainTextResponse("CUDA out of memory. Restarting service", status_code=500, background=task) - - router = APIRouter( prefix="/serve", tags=["models"], @@ -123,10 +102,7 @@ async def serve_model(model_id: str, version: Optional[str] = None, request: Uni except ValueError as ex: session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) - if "CUDA out of memory. 
" in str(ex) or "NVML_SUCCESS == r INTERNAL ASSERT FAILED" in str(ex): - raise CUDAException(exception=ex) - else: - raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex)) + raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex)) except Exception as ex: session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) diff --git a/clearml_serving/serving/model_request_processor.py b/clearml_serving/serving/model_request_processor.py index 35f5120..c22e242 100644 --- a/clearml_serving/serving/model_request_processor.py +++ b/clearml_serving/serving/model_request_processor.py @@ -1,7 +1,6 @@ import json import os import gc -import torch from collections import deque from pathlib import Path from random import random @@ -918,11 +917,13 @@ class ModelRequestProcessor(object): if k not in self._endpoints: # atomic self._engine_processor_lookup[k]._model = None - self._engine_processor_lookup[k]._preprocess = None - del self._engine_processor_lookup[k] - self._engine_processor_lookup.pop(k, None) gc.collect() - torch.cuda.empty_cache() + if hasattr(self._engine_processor_lookup[k]._preprocess, "unload"): + try: + self._engine_processor_lookup[k]._preprocess.unload() + except Exception as ex: + print("Exception occurred unloading model: {}".format(ex)) + self._engine_processor_lookup.pop(k, None) cleanup = False model_monitor_update = False except Exception as ex: diff --git a/docker/docker-compose-triton-gpu.yml b/docker/docker-compose-triton-gpu.yml index 8e54073..0d7ca32 100644 --- a/docker/docker-compose-triton-gpu.yml +++ b/docker/docker-compose-triton-gpu.yml @@ -96,6 +96,7 @@ services: CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092} CLEARML_DEFAULT_TRITON_GRPC_ADDR: ${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-clearml-serving-triton:8001} 
CLEARML_USE_GUNICORN: ${CLEARML_USE_GUNICORN:-} + CLEARML_SERVING_RESTART_ON_FAILURE: ${CLEARML_SERVING_RESTART_ON_FAILURE:-} CLEARML_SERVING_NUM_PROCESS: ${CLEARML_SERVING_NUM_PROCESS:-} CLEARML_EXTRA_PYTHON_PACKAGES: ${CLEARML_EXTRA_PYTHON_PACKAGES:-} AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID:-} diff --git a/docker/docker-compose-triton.yml b/docker/docker-compose-triton.yml index b815583..f1fd8a5 100644 --- a/docker/docker-compose-triton.yml +++ b/docker/docker-compose-triton.yml @@ -96,6 +96,7 @@ services: CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092} CLEARML_DEFAULT_TRITON_GRPC_ADDR: ${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-clearml-serving-triton:8001} CLEARML_USE_GUNICORN: ${CLEARML_USE_GUNICORN:-} + CLEARML_SERVING_RESTART_ON_FAILURE: ${CLEARML_SERVING_RESTART_ON_FAILURE:-} CLEARML_SERVING_NUM_PROCESS: ${CLEARML_SERVING_NUM_PROCESS:-} CLEARML_EXTRA_PYTHON_PACKAGES: ${CLEARML_EXTRA_PYTHON_PACKAGES:-} AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID:-} diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 24e3b95..50d69d4 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -96,6 +96,7 @@ services: CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092} CLEARML_DEFAULT_TRITON_GRPC_ADDR: ${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-} CLEARML_USE_GUNICORN: ${CLEARML_USE_GUNICORN:-} + CLEARML_SERVING_RESTART_ON_FAILURE: ${CLEARML_SERVING_RESTART_ON_FAILURE:-} CLEARML_SERVING_NUM_PROCESS: ${CLEARML_SERVING_NUM_PROCESS:-} CLEARML_EXTRA_PYTHON_PACKAGES: ${CLEARML_EXTRA_PYTHON_PACKAGES:-} AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID:-} diff --git a/examples/huggingface/readme.md b/examples/huggingface/readme.md index de0f62a..071576d 100644 --- a/examples/huggingface/readme.md +++ b/examples/huggingface/readme.md @@ -73,6 +73,8 @@ CLEARML_EXTRA_PYTHON_PACKAGES=transformers # Change this depending on your machine and performance needs CLEARML_USE_GUNICORN=1 
CLEARML_SERVING_NUM_PROCESS=8 +# Restarts if the serving process crashes +CLEARML_SERVING_RESTART_ON_FAILURE=1 ``` Huggingface models require Triton engine support, please use `docker-compose-triton.yml` / `docker-compose-triton-gpu.yml` or if running on Kubernetes, the matching helm chart to set things up. Check the repository main readme documentation if you need help. From 6a8e61625682bc2101ca0707bddcefa850096610 Mon Sep 17 00:00:00 2001 From: Eugen Ajechiloae Date: Wed, 14 Aug 2024 19:43:44 +0300 Subject: [PATCH 02/14] not tested - exit on cuda oom --- clearml_serving/preprocess/preprocess_template.py | 3 +-- clearml_serving/serving/entrypoint.sh | 3 +++ clearml_serving/serving/main.py | 4 ++++ clearml_serving/serving/model_request_processor.py | 1 + 4 files changed, 9 insertions(+), 2 deletions(-) diff --git a/clearml_serving/preprocess/preprocess_template.py b/clearml_serving/preprocess/preprocess_template.py index bd9b936..373204a 100644 --- a/clearml_serving/preprocess/preprocess_template.py +++ b/clearml_serving/preprocess/preprocess_template.py @@ -41,8 +41,7 @@ class Preprocess(object): """ pass - - def unload(self) -> None: # noqa + def unload(self) -> None: """ OPTIONAL: provide unloading method for the model For example: diff --git a/clearml_serving/serving/entrypoint.sh b/clearml_serving/serving/entrypoint.sh index 634c281..c68fa02 100755 --- a/clearml_serving/serving/entrypoint.sh +++ b/clearml_serving/serving/entrypoint.sh @@ -69,8 +69,11 @@ while : ; do $GUNICORN_EXTRA_ARGS fi + echo "[DEBUG] ~~~~~~~~~~~~ Check if we restart here server ~~~~~~~~~~~~" if [ -z "$CLEARML_SERVING_RESTART_ON_FAILURE" ] then + echo "[DEBUG] ~~~~~~~~~~~~ Not restarting ~~~~~~~~~~~~" break fi + echo "[DEBUG] ~~~~~~~~~~~~ Restarting server ~~~~~~~~~~~~" done diff --git a/clearml_serving/serving/main.py b/clearml_serving/serving/main.py index 6865c93..d75e60b 100644 --- a/clearml_serving/serving/main.py +++ b/clearml_serving/serving/main.py @@ -1,6 +1,7 @@ import os import 
traceback import gzip +import sys from fastapi import FastAPI, Request, Response, APIRouter, HTTPException from fastapi.routing import APIRoute @@ -102,6 +103,9 @@ async def serve_model(model_id: str, version: Optional[str] = None, request: Uni except ValueError as ex: session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) + if "CUDA out of memory. " in str(ex) or "NVML_SUCCESS == r INTERNAL ASSERT FAILED" in str(ex): + # can't always recover from this - prefer to exit the program such that it can be restarted + sys.exit(1) raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex)) except Exception as ex: session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( diff --git a/clearml_serving/serving/model_request_processor.py b/clearml_serving/serving/model_request_processor.py index c22e242..1510034 100644 --- a/clearml_serving/serving/model_request_processor.py +++ b/clearml_serving/serving/model_request_processor.py @@ -917,6 +917,7 @@ class ModelRequestProcessor(object): if k not in self._endpoints: # atomic self._engine_processor_lookup[k]._model = None + print("clearml-serving --id c1a4ebd2586040ad906cf338d16bcb87 model remove --endpoint test_model_sklearn") gc.collect() if hasattr(self._engine_processor_lookup[k]._preprocess, "unload"): try: self._engine_processor_lookup[k]._preprocess.unload() except Exception as ex: From 7d801a11dafbc115c93135ad5f78c579a73749c9 Mon Sep 17 00:00:00 2001 From: Eugen Ajechiloae Date: Wed, 14 Aug 2024 19:47:58 +0300 Subject: [PATCH 03/14] add debug print --- clearml_serving/serving/entrypoint.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/clearml_serving/serving/entrypoint.sh b/clearml_serving/serving/entrypoint.sh index c68fa02..b68cbb9 100755 --- a/clearml_serving/serving/entrypoint.sh +++ b/clearml_serving/serving/entrypoint.sh @@ -42,6 +42,7 @@ then fi while : ; do + echo "[DEBUG] ~~~~~~~~~~~~ Debug changes 
applied ~~~~~~~~~~~~" if [ -z "$CLEARML_USE_GUNICORN" ] then if [ -z "$CLEARML_SERVING_NUM_PROCESS" ] From f95939cffe67991ba1363f6080960e1b0eb6ab8d Mon Sep 17 00:00:00 2001 From: Eugen Ajechiloae Date: Wed, 14 Aug 2024 20:46:45 +0300 Subject: [PATCH 04/14] more debug prints --- clearml_serving/serving/model_request_processor.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/clearml_serving/serving/model_request_processor.py b/clearml_serving/serving/model_request_processor.py index 1510034..934a33a 100644 --- a/clearml_serving/serving/model_request_processor.py +++ b/clearml_serving/serving/model_request_processor.py @@ -909,17 +909,24 @@ class ModelRequestProcessor(object): self._engine_processor_lookup = dict() except Exception as ex: print("Exception occurred in monitoring thread: {}".format(ex)) + print("before sleep") sleep(poll_frequency_sec) + print("after sleep") try: # we assume that by now all old deleted endpoints requests already returned + print("before if") if model_monitor_update and not cleanup: + print("before 1") for k in list(self._engine_processor_lookup.keys()): + print("before 2") if k not in self._endpoints: + print("before 3") # atomic self._engine_processor_lookup[k]._model = None print("clearml-serving --id c1a4ebd2586040ad906cf338d16bcb87 model remove --endpoint test_model_sklearn") gc.collect() if hasattr(self._engine_processor_lookup[k]._preprocess, "unload"): + print("hasattr") try: self._engine_processor_lookup[k]._preprocess.unload() except Exception as ex: From 67325df4d57a83019c86421abe65d24911110627 Mon Sep 17 00:00:00 2001 From: Eugen Ajechiloae Date: Wed, 14 Aug 2024 20:50:57 +0300 Subject: [PATCH 05/14] change dbg print --- clearml_serving/serving/model_request_processor.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/clearml_serving/serving/model_request_processor.py b/clearml_serving/serving/model_request_processor.py index 934a33a..7e31316 100644 --- 
a/clearml_serving/serving/model_request_processor.py +++ b/clearml_serving/serving/model_request_processor.py @@ -872,7 +872,7 @@ class ModelRequestProcessor(object): poll_frequency_sec = float(poll_frequency_sec) # force mark started on the main serving service task self._task.mark_started(force=True) - self._report_text("Launching - configuration sync every {} sec".format(poll_frequency_sec)) + self._report_text("Launching - configuration sync every {} sec -- dbg print".format(poll_frequency_sec)) cleanup = False model_monitor_update = False self._update_serving_plot() @@ -909,24 +909,18 @@ class ModelRequestProcessor(object): self._engine_processor_lookup = dict() except Exception as ex: print("Exception occurred in monitoring thread: {}".format(ex)) - print("before sleep") + self._report_text("before sleep") sleep(poll_frequency_sec) - print("after sleep") + self._report_text("after sleep") try: # we assume that by now all old deleted endpoints requests already returned - print("before if") if model_monitor_update and not cleanup: - print("before 1") for k in list(self._engine_processor_lookup.keys()): - print("before 2") if k not in self._endpoints: - print("before 3") # atomic self._engine_processor_lookup[k]._model = None - print("clearml-serving --id c1a4ebd2586040ad906cf338d16bcb87 model remove --endpoint test_model_sklearn") gc.collect() if hasattr(self._engine_processor_lookup[k]._preprocess, "unload"): - print("hasattr") try: self._engine_processor_lookup[k]._preprocess.unload() except Exception as ex: From 79b987ac0c05f0ccd14adced7604c3f15852a6c4 Mon Sep 17 00:00:00 2001 From: Eugen Ajechiloae Date: Wed, 14 Aug 2024 21:03:25 +0300 Subject: [PATCH 06/14] more dbg prints --- clearml_serving/serving/model_request_processor.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/clearml_serving/serving/model_request_processor.py b/clearml_serving/serving/model_request_processor.py index 7e31316..a5c02fc 100644 --- 
a/clearml_serving/serving/model_request_processor.py +++ b/clearml_serving/serving/model_request_processor.py @@ -914,12 +914,17 @@ class ModelRequestProcessor(object): self._report_text("after sleep") try: # we assume that by now all old deleted endpoints requests already returned + self._report_text("model_monitor_update and not cleanup") if model_monitor_update and not cleanup: + self._report_text("for k in list(self._engine_processor_lookup") for k in list(self._engine_processor_lookup.keys()): + self._report_text(f"if k now in self._endpoints {self._endpoints} {k}") if k not in self._endpoints: # atomic self._engine_processor_lookup[k]._model = None gc.collect() + self._report_text(str(self._engine_processor_lookup[k]._preprocess)) + self._report_text(dir(self._engine_processor_lookup[k]._preprocess)) if hasattr(self._engine_processor_lookup[k]._preprocess, "unload"): try: self._engine_processor_lookup[k]._preprocess.unload() From 5d5188de40df2a19bb9a39fd140d3f0cf3a08657 Mon Sep 17 00:00:00 2001 From: Eugen Ajechiloae Date: Wed, 14 Aug 2024 21:20:13 +0300 Subject: [PATCH 07/14] add debugging comment --- clearml_serving/serving/model_request_processor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clearml_serving/serving/model_request_processor.py b/clearml_serving/serving/model_request_processor.py index a5c02fc..98f8b0d 100644 --- a/clearml_serving/serving/model_request_processor.py +++ b/clearml_serving/serving/model_request_processor.py @@ -796,8 +796,8 @@ class ModelRequestProcessor(object): # update model serving state self._model_monitoring_versions[model.base_serving_url] = versions_model_ids_dict - if not self._model_monitoring_update_request: - return False + # if not self._model_monitoring_update_request: + # return False self._report_text("INFO: Monitored Models updated: {}".format( json.dumps(self._model_monitoring_versions, indent=2)) From 1f4e1599e3d878440b4ebaad39acedbbc827aaf9 Mon Sep 17 00:00:00 2001 From: Eugen 
Ajechiloae Date: Thu, 15 Aug 2024 15:23:29 +0300 Subject: [PATCH 08/14] let gc handle unload --- .../serving/model_request_processor.py | 24 +++++++++---------- clearml_serving/serving/preprocess_service.py | 10 +++++++- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/clearml_serving/serving/model_request_processor.py b/clearml_serving/serving/model_request_processor.py index 98f8b0d..49de499 100644 --- a/clearml_serving/serving/model_request_processor.py +++ b/clearml_serving/serving/model_request_processor.py @@ -168,6 +168,8 @@ class ModelRequestProcessor(object): # retry to process return await self.process_request(base_url=base_url, version=version, request_body=request_body) + processor = None + url = None try: # normalize url and version url = self._normalize_endpoint_url(base_url, version) @@ -189,6 +191,9 @@ class ModelRequestProcessor(object): return_value = await self._process_request(processor=processor, url=url, body=request_body) finally: + if url and processor is not None and processor is not self._engine_processor_lookup.get(url): + self._report_text("calling gc collect in request processing") + gc.collect() self._request_processing_state.dec() return return_value @@ -906,6 +911,8 @@ class ModelRequestProcessor(object): if cleanup or model_monitor_update: self._update_serving_plot() if cleanup: + self._report_text("calling gc collect in cleanup") + gc.collect() self._engine_processor_lookup = dict() except Exception as ex: print("Exception occurred in monitoring thread: {}".format(ex)) @@ -914,23 +921,16 @@ class ModelRequestProcessor(object): self._report_text("after sleep") try: # we assume that by now all old deleted endpoints requests already returned - self._report_text("model_monitor_update and not cleanup") + call_gc_collect = False if model_monitor_update and not cleanup: - self._report_text("for k in list(self._engine_processor_lookup") for k in list(self._engine_processor_lookup.keys()): - self._report_text(f"if k now in 
self._endpoints {self._endpoints} {k}") if k not in self._endpoints: # atomic - self._engine_processor_lookup[k]._model = None - gc.collect() - self._report_text(str(self._engine_processor_lookup[k]._preprocess)) - self._report_text(dir(self._engine_processor_lookup[k]._preprocess)) - if hasattr(self._engine_processor_lookup[k]._preprocess, "unload"): - try: - self._engine_processor_lookup[k]._preprocess.unload() - except Exception as ex: - print("Exception occurred unloading model: {}".format(ex)) self._engine_processor_lookup.pop(k, None) + call_gc_collect = True + if call_gc_collect: + self._report_text("calling gc collect in try") + gc.collect() cleanup = False model_monitor_update = False except Exception as ex: diff --git a/clearml_serving/serving/preprocess_service.py b/clearml_serving/serving/preprocess_service.py index a5c069c..81d4926 100644 --- a/clearml_serving/serving/preprocess_service.py +++ b/clearml_serving/serving/preprocess_service.py @@ -90,7 +90,15 @@ class BasePreprocessRequest(object): sys.modules[spec.name] = _preprocess spec.loader.exec_module(_preprocess) - Preprocess = _preprocess.Preprocess # noqa + class PreprocessDelWrapper(_preprocess.Preprocess): + def __del__(self): + super_ = super(PreprocessDelWrapper, self) + if callable(getattr(super_, "unload", None)): + super_.unload() + if callable(getattr(super_, "__del__", None)): + super_.__del__() + + Preprocess = PreprocessDelWrapper # noqa # override `send_request` method Preprocess.send_request = BasePreprocessRequest._preprocess_send_request # create preprocess class From 4fa5b5aa3603a7af0618dcf1d98c3591bc306855 Mon Sep 17 00:00:00 2001 From: Eugen Ajechiloae Date: Thu, 15 Aug 2024 15:55:34 +0300 Subject: [PATCH 09/14] force exit using os._exit --- clearml_serving/serving/main.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clearml_serving/serving/main.py b/clearml_serving/serving/main.py index d75e60b..1896c88 100644 --- a/clearml_serving/serving/main.py +++ 
b/clearml_serving/serving/main.py @@ -1,7 +1,6 @@ import os import traceback import gzip -import sys from fastapi import FastAPI, Request, Response, APIRouter, HTTPException from fastapi.routing import APIRoute @@ -105,7 +104,7 @@ async def serve_model(model_id: str, version: Optional[str] = None, request: Uni instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) if "CUDA out of memory. " in str(ex) or "NVML_SUCCESS == r INTERNAL ASSERT FAILED" in str(ex): # can't always recover from this - prefer to exit the program such that it can be restarted - sys.exit(1) + os._exit() raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex)) except Exception as ex: session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( From d0131d0b588626dc8e988e4d08bfbe860ddd5f07 Mon Sep 17 00:00:00 2001 From: Eugen Ajechiloae Date: Thu, 15 Aug 2024 16:00:40 +0300 Subject: [PATCH 10/14] set status in exit --- clearml_serving/serving/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clearml_serving/serving/main.py b/clearml_serving/serving/main.py index 1896c88..1e3f8fc 100644 --- a/clearml_serving/serving/main.py +++ b/clearml_serving/serving/main.py @@ -104,7 +104,7 @@ async def serve_model(model_id: str, version: Optional[str] = None, request: Uni instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) if "CUDA out of memory. 
" in str(ex) or "NVML_SUCCESS == r INTERNAL ASSERT FAILED" in str(ex): # can't always recover from this - prefer to exit the program such that it can be restarted - os._exit() + os._exit(1) raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex)) except Exception as ex: session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( From fd8b7604a99a99930a20aaceae9457264ef4030c Mon Sep 17 00:00:00 2001 From: Eugen Ajechiloae Date: Thu, 15 Aug 2024 16:18:41 +0300 Subject: [PATCH 11/14] call gc on remove as well --- clearml_serving/serving/model_request_processor.py | 2 ++ clearml_serving/serving/preprocess_service.py | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/clearml_serving/serving/model_request_processor.py b/clearml_serving/serving/model_request_processor.py index 49de499..45deb29 100644 --- a/clearml_serving/serving/model_request_processor.py +++ b/clearml_serving/serving/model_request_processor.py @@ -388,6 +388,8 @@ class ModelRequestProcessor(object): return False self._endpoints.pop(endpoint_url, None) self._remove_registered_input_model(endpoint_url) + self._engine_processor_lookup.pop(endpoint_url, None) + gc.collect() return True def add_canary_endpoint( diff --git a/clearml_serving/serving/preprocess_service.py b/clearml_serving/serving/preprocess_service.py index 81d4926..efa33a1 100644 --- a/clearml_serving/serving/preprocess_service.py +++ b/clearml_serving/serving/preprocess_service.py @@ -94,7 +94,10 @@ class BasePreprocessRequest(object): def __del__(self): super_ = super(PreprocessDelWrapper, self) if callable(getattr(super_, "unload", None)): - super_.unload() + try: + super_.unload() + except Exception as ex: + print("Failed unloading model: {}".format(ex)) if callable(getattr(super_, "__del__", None)): super_.__del__() From df098f77490296375f2a96c3f98962bd70a96b1e Mon Sep 17 00:00:00 2001 From: Eugen Ajechiloae Date: Thu, 15 Aug 2024 16:25:59 
+0300 Subject: [PATCH 12/14] dont call gc collect on remove --- clearml_serving/serving/model_request_processor.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/clearml_serving/serving/model_request_processor.py b/clearml_serving/serving/model_request_processor.py index 45deb29..49de499 100644 --- a/clearml_serving/serving/model_request_processor.py +++ b/clearml_serving/serving/model_request_processor.py @@ -388,8 +388,6 @@ class ModelRequestProcessor(object): return False self._endpoints.pop(endpoint_url, None) self._remove_registered_input_model(endpoint_url) - self._engine_processor_lookup.pop(endpoint_url, None) - gc.collect() return True def add_canary_endpoint( From 64909cdd655e399d224785659026893f1fabf3ca Mon Sep 17 00:00:00 2001 From: Eugen Ajechiloae Date: Thu, 15 Aug 2024 16:38:27 +0300 Subject: [PATCH 13/14] remove dbg prints --- clearml_serving/serving/entrypoint.sh | 4 ---- clearml_serving/serving/model_request_processor.py | 10 +++------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/clearml_serving/serving/entrypoint.sh b/clearml_serving/serving/entrypoint.sh index b68cbb9..634c281 100755 --- a/clearml_serving/serving/entrypoint.sh +++ b/clearml_serving/serving/entrypoint.sh @@ -42,7 +42,6 @@ then fi while : ; do - echo "[DEBUG] ~~~~~~~~~~~~ Debug changes applied ~~~~~~~~~~~~" if [ -z "$CLEARML_USE_GUNICORN" ] then if [ -z "$CLEARML_SERVING_NUM_PROCESS" ] @@ -70,11 +69,8 @@ while : ; do $GUNICORN_EXTRA_ARGS fi - echo "[DEBUG] ~~~~~~~~~~~~ Check if we restart here server ~~~~~~~~~~~~" if [ -z "$CLEARML_SERVING_RESTART_ON_FAILURE" ] then - echo "[DEBUG] ~~~~~~~~~~~~ Not restarting ~~~~~~~~~~~~" break fi - echo "[DEBUG] ~~~~~~~~~~~~ Restarting server ~~~~~~~~~~~~" done diff --git a/clearml_serving/serving/model_request_processor.py b/clearml_serving/serving/model_request_processor.py index 49de499..5e1eba2 100644 --- a/clearml_serving/serving/model_request_processor.py +++ b/clearml_serving/serving/model_request_processor.py @@ 
-801,8 +801,8 @@ class ModelRequestProcessor(object): # update model serving state self._model_monitoring_versions[model.base_serving_url] = versions_model_ids_dict - # if not self._model_monitoring_update_request: - # return False + if not self._model_monitoring_update_request: + return False self._report_text("INFO: Monitored Models updated: {}".format( json.dumps(self._model_monitoring_versions, indent=2)) @@ -877,7 +877,7 @@ class ModelRequestProcessor(object): poll_frequency_sec = float(poll_frequency_sec) # force mark started on the main serving service task self._task.mark_started(force=True) - self._report_text("Launching - configuration sync every {} sec -- dbg print".format(poll_frequency_sec)) + self._report_text("Launching - configuration sync every {} sec".format(poll_frequency_sec)) cleanup = False model_monitor_update = False self._update_serving_plot() @@ -911,14 +911,11 @@ class ModelRequestProcessor(object): if cleanup or model_monitor_update: self._update_serving_plot() if cleanup: - self._report_text("calling gc collect in cleanup") gc.collect() self._engine_processor_lookup = dict() except Exception as ex: print("Exception occurred in monitoring thread: {}".format(ex)) - self._report_text("before sleep") sleep(poll_frequency_sec) - self._report_text("after sleep") try: # we assume that by now all old deleted endpoints requests already returned call_gc_collect = False @@ -929,7 +926,6 @@ class ModelRequestProcessor(object): self._engine_processor_lookup.pop(k, None) call_gc_collect = True if call_gc_collect: - self._report_text("calling gc collect in try") gc.collect() cleanup = False model_monitor_update = False From 61be8733c8807de0c59e87f861cd31baf7047db1 Mon Sep 17 00:00:00 2001 From: Eugen Ajechiloae Date: Thu, 15 Aug 2024 16:45:36 +0300 Subject: [PATCH 14/14] remove dbg print --- clearml_serving/serving/model_request_processor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/clearml_serving/serving/model_request_processor.py 
b/clearml_serving/serving/model_request_processor.py index 5e1eba2..92f893f 100644 --- a/clearml_serving/serving/model_request_processor.py +++ b/clearml_serving/serving/model_request_processor.py @@ -192,7 +192,6 @@ class ModelRequestProcessor(object): return_value = await self._process_request(processor=processor, url=url, body=request_body) finally: if url and processor is not None and processor is not self._engine_processor_lookup.get(url): - self._report_text("calling gc collect in request processing") gc.collect() self._request_processing_state.dec()