From 666ce26ab22b27e35a5542ca6d3c8646e7561c4b Mon Sep 17 00:00:00 2001 From: stephanbertl Date: Sun, 7 Jul 2024 14:51:23 +0200 Subject: [PATCH 1/5] Add exit-on-error option for tritonserver (#76) This fixes #60 Co-authored-by: = --- clearml_serving/engines/triton/triton_helper.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/clearml_serving/engines/triton/triton_helper.py b/clearml_serving/engines/triton/triton_helper.py index 879174b..19fd241 100644 --- a/clearml_serving/engines/triton/triton_helper.py +++ b/clearml_serving/engines/triton/triton_helper.py @@ -540,6 +540,12 @@ def main(): parser.add_argument( '--t-log-verbose', type=str, help=' Triton server logging verbosity (default disabled)') + parser.add_argument( + '--t-exit-on-error', type=bool, default=True, + help='Exits the inference server if any error occurs during initialization. ' + 'Recommended to set to True to catch any unanticipated errors. ' + 'False prevents a single model from breaking the whole tritonserver.' + ) args = parser.parse_args() From 724c99c605540cdae25e4ef504c09f705cd53503 Mon Sep 17 00:00:00 2001 From: IlyaMescheryakov1402 <58298387+IlyaMescheryakov1402@users.noreply.github.com> Date: Sun, 7 Jul 2024 15:54:08 +0300 Subject: [PATCH 2/5] Add clearml_serving_inference restart on CUDA OOM (#75) * initial commit * add OOM handler for MIG profiles --------- Co-authored-by: Meshcheryakov Ilya --- clearml_serving/serving/entrypoint.sh | 1 + clearml_serving/serving/init.py | 2 ++ clearml_serving/serving/main.py | 26 ++++++++++++++++++- .../serving/model_request_processor.py | 7 +++++ 4 files changed, 35 insertions(+), 1 deletion(-) diff --git a/clearml_serving/serving/entrypoint.sh b/clearml_serving/serving/entrypoint.sh index e1a5bbc..a5efea1 100755 --- a/clearml_serving/serving/entrypoint.sh +++ b/clearml_serving/serving/entrypoint.sh @@ -2,6 +2,7 @@ # print configuration echo CLEARML_SERVING_TASK_ID="$CLEARML_SERVING_TASK_ID" +echo CLEARML_INFERENCE_TASK_ID="$CLEARML_INFERENCE_TASK_ID" echo CLEARML_SERVING_PORT="$CLEARML_SERVING_PORT" echo CLEARML_USE_GUNICORN="$CLEARML_USE_GUNICORN" echo CLEARML_EXTRA_PYTHON_PACKAGES="$CLEARML_EXTRA_PYTHON_PACKAGES" diff --git a/clearml_serving/serving/init.py b/clearml_serving/serving/init.py index 2ae54a8..0c75712 100644 --- a/clearml_serving/serving/init.py +++ b/clearml_serving/serving/init.py @@ -6,6 +6,7 @@ from clearml_serving.serving.preprocess_service import BasePreprocessRequest def setup_task(force_threaded_logging=None): serving_service_task_id = os.environ.get("CLEARML_SERVING_TASK_ID", None) + inference_service_task_id = os.environ.get("CLEARML_INFERENCE_TASK_ID", False) # according Task.init() docs # always use background thread, it requires less memory if force_threaded_logging or os.environ.get("CLEARML_BKG_THREAD_REPORT") in ("1", "Y", "y", "true"): @@ -24,6 +25,7 @@ def setup_task(force_threaded_logging=None): project_name=serving_task.get_project_name(), task_name="{} - serve instance".format(serving_task.name), task_type="inference", # noqa + continue_last_task=inference_service_task_id, ) instance_task.set_system_tags(["service"]) # make sure we start logging thread/process diff --git a/clearml_serving/serving/main.py b/clearml_serving/serving/main.py index 6865c93..10ce9c9 100644 --- a/clearml_serving/serving/main.py +++ b/clearml_serving/serving/main.py @@ -1,9 +1,13 @@ import os import traceback import gzip +import asyncio from fastapi import FastAPI, Request, Response, APIRouter, HTTPException from fastapi.routing import APIRoute +from
fastapi.responses import PlainTextResponse + +from starlette.background import BackgroundTask from typing import Optional, Dict, Any, Callable, Union @@ -48,6 +52,9 @@ try: except (ValueError, TypeError): pass +class CUDAException(Exception): + def __init__(self, exception: str): + self.exception = exception # start FastAPI app app = FastAPI(title="ClearML Serving Service", version=__version__, description="ClearML Service Service router") @@ -70,6 +77,20 @@ async def startup_event(): processor.launch(poll_frequency_sec=model_sync_frequency_secs*60) +@app.on_event('shutdown') +def shutdown_event(): + print('RESTARTING INFERENCE SERVICE!') + +async def exit_app(): + loop = asyncio.get_running_loop() + loop.stop() + +@app.exception_handler(CUDAException) +async def cuda_exception_handler(request, exc): + task = BackgroundTask(exit_app) + return PlainTextResponse("CUDA out of memory. Restarting service", status_code=500, background=task) + + router = APIRouter( prefix="/serve", tags=["models"], @@ -102,7 +123,10 @@ async def serve_model(model_id: str, version: Optional[str] = None, request: Uni except ValueError as ex: session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) - raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex)) + if "CUDA out of memory. " in str(ex) or "NVML_SUCCESS == r INTERNAL ASSERT FAILED" in str(ex): + raise CUDAException(exception=ex) + else: + raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex)) except Exception as ex: session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) diff --git a/clearml_serving/serving/model_request_processor.py b/clearml_serving/serving/model_request_processor.py index ba9242d..35f5120 100644 --- a/clearml_serving/serving/model_request_processor.py +++ b/clearml_serving/serving/model_request_processor.py @@ -1,5 +1,7 @@ import json import os +import gc +import torch from collections import deque from pathlib import Path from random import random @@ -915,7 +917,12 @@ class ModelRequestProcessor(object): for k in list(self._engine_processor_lookup.keys()): if k not in self._endpoints: # atomic + self._engine_processor_lookup[k]._model = None + self._engine_processor_lookup[k]._preprocess = None + del self._engine_processor_lookup[k] self._engine_processor_lookup.pop(k, None) + gc.collect() + torch.cuda.empty_cache() cleanup = False model_monitor_update = False except Exception as ex: From aff27c62b89f9d0f2fe2e55cc75ffdbc272a6f78 Mon Sep 17 00:00:00 2001 From: clearml <> Date: Thu, 12 Dec 2024 23:57:21 +0200 Subject: [PATCH 3/5] Fix gRPC errors print stack traces and full verbose details. 
Add support for controlling error printouts using `CLEARML_SERVING_AIO_RPC_IGNORE_ERRORS` and `CLEARML_SERVING_AIO_RPC_VERBOSE_ERRORS` (pass a whitespace-separated list of error codes or error names) --- clearml_serving/serving/main.py | 87 +++++++++++++++++++++++--------- clearml_serving/serving/utils.py | 17 +++++++ clearml_serving/version.py | 2 +- 3 files changed, 80 insertions(+), 26 deletions(-) create mode 100644 clearml_serving/serving/utils.py diff --git a/clearml_serving/serving/main.py b/clearml_serving/serving/main.py index 10ce9c9..2536760 100644 --- a/clearml_serving/serving/main.py +++ b/clearml_serving/serving/main.py @@ -1,4 +1,5 @@ import os +import shlex import traceback import gzip import asyncio @@ -6,6 +7,7 @@ import asyncio from fastapi import FastAPI, Request, Response, APIRouter, HTTPException from fastapi.routing import APIRoute from fastapi.responses import PlainTextResponse +from grpc.aio import AioRpcError from starlette.background import BackgroundTask @@ -13,8 +15,14 @@ from typing import Optional, Dict, Any, Callable, Union from clearml_serving.version import __version__ from clearml_serving.serving.init import setup_task -from clearml_serving.serving.model_request_processor import ModelRequestProcessor, EndpointNotFoundException, \ - EndpointBackendEngineException, EndpointModelLoadException, ServingInitializationException +from clearml_serving.serving.model_request_processor import ( + ModelRequestProcessor, + EndpointNotFoundException, + EndpointBackendEngineException, + EndpointModelLoadException, + ServingInitializationException, +) +from clearml_serving.serving.utils import parse_grpc_errors class GzipRequest(Request): @@ -52,10 +60,16 @@ try: except (ValueError, TypeError): pass + +grpc_aio_ignore_errors = parse_grpc_errors(shlex.split(os.environ.get("CLEARML_SERVING_AIO_RPC_IGNORE_ERRORS", ""))) +grpc_aio_verbose_errors = parse_grpc_errors(shlex.split(os.environ.get("CLEARML_SERVING_AIO_RPC_VERBOSE_ERRORS", ""))) + + class CUDAException(Exception): def __init__(self, exception: str): self.exception = exception + # start FastAPI app app = FastAPI(title="ClearML Serving Service", version=__version__, description="ClearML Service Service router") @@ -65,26 +79,31 @@ async def startup_event(): global processor if processor: - print("ModelRequestProcessor already initialized [pid={}] [service_id={}]".format( - os.getpid(), serving_service_task_id)) + print( + "ModelRequestProcessor already initialized [pid={}] [service_id={}]".format( + os.getpid(), serving_service_task_id + ) + ) else: - print("Starting up ModelRequestProcessor [pid={}] [service_id={}]".format( - os.getpid(), serving_service_task_id)) + print("Starting up ModelRequestProcessor [pid={}] [service_id={}]".format(os.getpid(), serving_service_task_id)) processor = ModelRequestProcessor( - task_id=serving_service_task_id, update_lock_guard=singleton_sync_lock, + task_id=serving_service_task_id, + update_lock_guard=singleton_sync_lock, ) print("ModelRequestProcessor [id={}] loaded".format(processor.get_id())) - processor.launch(poll_frequency_sec=model_sync_frequency_secs*60) + processor.launch(poll_frequency_sec=model_sync_frequency_secs * 60) -@app.on_event('shutdown') +@app.on_event("shutdown") def shutdown_event(): - print('RESTARTING INFERENCE SERVICE!') - + print("RESTARTING INFERENCE SERVICE!") + + async def exit_app(): loop = asyncio.get_running_loop() loop.stop() - + + @app.exception_handler(CUDAException) async def cuda_exception_handler(request, exc): task = BackgroundTask(exit_app) 
@@ -105,31 +124,49 @@ router = APIRouter( @router.post("/{model_id}") async def serve_model(model_id: str, version: Optional[str] = None, request: Union[bytes, Dict[Any, Any]] = None): try: - return_value = await processor.process_request( - base_url=model_id, - version=version, - request_body=request - ) + return_value = await processor.process_request(base_url=model_id, version=version, request_body=request) except EndpointNotFoundException as ex: raise HTTPException(status_code=404, detail="Error processing request, endpoint was not found: {}".format(ex)) except (EndpointModelLoadException, EndpointBackendEngineException) as ex: - session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( - instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) + session_logger.report_text( + "[{}] Exception [{}] {} while processing request: {}\n{}".format( + instance_id, type(ex), ex, request, "".join(traceback.format_exc()) + ) + ) raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex)) except ServingInitializationException as ex: - session_logger.report_text("[{}] Exception [{}] {} while loading serving inference: {}\n{}".format( - instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) + session_logger.report_text( + "[{}] Exception [{}] {} while loading serving inference: {}\n{}".format( + instance_id, type(ex), ex, request, "".join(traceback.format_exc()) + ) + ) raise HTTPException(status_code=500, detail="Error [{}] processing request: {}".format(type(ex), ex)) except ValueError as ex: - session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( - instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) + session_logger.report_text( + "[{}] Exception [{}] {} while processing request: {}\n{}".format( + instance_id, type(ex), ex, request, "".join(traceback.format_exc()) + ) + ) if "CUDA out of memory. 
" in str(ex) or "NVML_SUCCESS == r INTERNAL ASSERT FAILED" in str(ex): raise CUDAException(exception=ex) else: raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex)) + except AioRpcError as ex: + if grpc_aio_verbose_errors and ex.code() in grpc_aio_verbose_errors: + session_logger.report_text( + "[{}] Exception [AioRpcError] {} while processing request: {}".format(instance_id, ex, request) + ) + elif not grpc_aio_ignore_errors or ex.code() not in grpc_aio_ignore_errors: + session_logger.report_text("[{}] Exception [AioRpcError] status={} ".format(instance_id, ex.code())) + raise HTTPException( + status_code=500, detail="Error [AioRpcError] processing request: status={}".format(ex.code()) + ) except Exception as ex: - session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( - instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) + session_logger.report_text( + "[{}] Exception [{}] {} while processing request: {}\n{}".format( + instance_id, type(ex), ex, request, "".join(traceback.format_exc()) + ) + ) raise HTTPException(status_code=500, detail="Error [{}] processing request: {}".format(type(ex), ex)) return return_value diff --git a/clearml_serving/serving/utils.py b/clearml_serving/serving/utils.py new file mode 100644 index 0000000..da18e45 --- /dev/null +++ b/clearml_serving/serving/utils.py @@ -0,0 +1,17 @@ +from typing import List, Set + +import grpc + + +def parse_grpc_errors(errors: List[str]) -> Set[grpc.StatusCode]: + try: + typed_errors = { + int(e) if e.isdigit() else e.lower().replace("-", " ").replace("_", " ") + for e in errors + } + if len(typed_errors) == 1 and next(iter(typed_errors)) in ("true", "false"): + return set(grpc.StatusCode if next(iter(typed_errors)) == "true" else []) + return {e for e in grpc.StatusCode if typed_errors.intersection(e.value)} + except (ValueError, TypeError): + pass + return set() diff --git a/clearml_serving/version.py b/clearml_serving/version.py index 19b4f1d..72837bd 100644 --- a/clearml_serving/version.py +++ b/clearml_serving/version.py @@ -1 +1 @@ -__version__ = '1.3.0' +__version__ = '1.3.1' From 9f51a9334fac7c6a4a760c42f787dcb1cefbae49 Mon Sep 17 00:00:00 2001 From: clearml <> Date: Mon, 16 Dec 2024 18:51:58 +0200 Subject: [PATCH 4/5] Fix torch import --- clearml_serving/serving/model_request_processor.py | 9 +++++++-- clearml_serving/version.py | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/clearml_serving/serving/model_request_processor.py b/clearml_serving/serving/model_request_processor.py index 35f5120..2fcdcab 100644 --- a/clearml_serving/serving/model_request_processor.py +++ b/clearml_serving/serving/model_request_processor.py @@ -1,7 +1,6 @@ import json import os import gc -import torch from collections import deque from pathlib import Path from random import random @@ -19,6 +18,11 @@ from clearml.storage.util import hash_dict from .preprocess_service import BasePreprocessRequest from .endpoints import ModelEndpoint, ModelMonitoring, CanaryEP, EndpointMetricLogging +try: + import torch +except ImportError: + torch = None + class ModelRequestProcessorException(Exception): def __init__(self, message): @@ -922,7 +926,8 @@ class ModelRequestProcessor(object): del self._engine_processor_lookup[k] self._engine_processor_lookup.pop(k, None) gc.collect() - torch.cuda.empty_cache() + if torch: + torch.cuda.empty_cache() cleanup = False model_monitor_update = False except Exception as ex: diff --git 
a/clearml_serving/version.py b/clearml_serving/version.py index 72837bd..e398332 100644 --- a/clearml_serving/version.py +++ b/clearml_serving/version.py @@ -1 +1 @@ -__version__ = '1.3.1' +__version__ = '1.3.2' From 1def0a6901617767687f2b747aaffdc060f96046 Mon Sep 17 00:00:00 2001 From: clearml <> Date: Mon, 13 Jan 2025 18:40:02 +0200 Subject: [PATCH 5/5] Update github repo link --- LICENSE | 2 +- README.md | 16 ++++++++-------- setup.py | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/LICENSE b/LICENSE index 261eeb9..9700080 100644 --- a/LICENSE +++ b/LICENSE @@ -186,7 +186,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright [yyyy] [name of copyright owner] + Copyright 2025 ClearML Inc Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/README.md b/README.md index f4dde1e..7147f40 100644 --- a/README.md +++ b/README.md @@ -1,17 +1,17 @@
- + **ClearML Serving - Model deployment made easy** ## **`clearml-serving v1.3.1` :sparkles: Model Serving (ML/DL) Made Easy :tada:**
:fire: NEW version 1.3 :rocket: 20% faster ! -[![GitHub license](https://img.shields.io/github/license/allegroai/clearml-serving.svg)](https://img.shields.io/github/license/allegroai/clearml-serving.svg) +[![GitHub license](https://img.shields.io/github/license/clearml/clearml-serving.svg)](https://img.shields.io/github/license/clearml/clearml-serving.svg) [![PyPI pyversions](https://img.shields.io/pypi/pyversions/clearml-serving.svg)](https://img.shields.io/pypi/pyversions/clearml-serving.svg) [![PyPI version shields.io](https://img.shields.io/pypi/v/clearml-serving.svg)](https://img.shields.io/pypi/v/clearml-serving.svg) -[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/allegroai)](https://artifacthub.io/packages/helm/allegroai/clearml-serving) +[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/clearml)](https://artifacthub.io/packages/helm/clearml/clearml-serving) [![Slack Channel](https://img.shields.io/badge/slack-%23clearml--community-blueviolet?logo=slack)](https://joinslack.clear.ml) `🌟 ClearML is open-source - Leave a star to support the project! 🌟` @@ -25,7 +25,7 @@ It enables model deployment including serving and preprocessing code to a Kubern ### :fire: NEW :confetti_ball: Take it for a spin with a simple `docker-compose` [command](#nail_care-initial-setup) :magic_wand: :sparkles: - + Features: * Easy to deploy & configure @@ -73,7 +73,7 @@ Features: ### :nail_care: Initial Setup -1. Setup your [**ClearML Server**](https://github.com/allegroai/clearml-server) or use the [Free tier Hosting](https://app.clear.ml) +1. Setup your [**ClearML Server**](https://github.com/clearml/clearml-server) or use the [Free tier Hosting](https://app.clear.ml) 2. Setup local access (if you haven't already), see instructions [here](https://clear.ml/docs/latest/docs/getting_started/ds/ds_first_steps#install-clearml) 3. Install clearml-serving CLI: ```bash @@ -85,7 +85,7 @@ pip3 install clearml-serving 5. Write down the Serving Service UID 6. Clone clearml-serving repository ```bash -git clone https://github.com/allegroai/clearml-serving.git +git clone https://github.com/clearml/clearml-serving.git ``` 7. Edit the environment variables file (`docker/example.env`) with your clearml-server credentials and Serving Service UID. For example, you should have something like ```bash @@ -170,7 +170,7 @@ AZURE_STORAGE_KEY **Notice** On the first few requests the inference container needs to download the model file and preprocessing python code, this means the request might take a little longer, once everything is cached, it will return almost immediately. **Notes:** -> Review the model repository in the ClearML web UI, under the "serving examples" Project on your ClearML account/server ([free hosted](https://app.clear.ml) or [self-deployed](https://github.com/allegroai/clearml-server)). +> Review the model repository in the ClearML web UI, under the "serving examples" Project on your ClearML account/server ([free hosted](https://app.clear.ml) or [self-deployed](https://github.com/clearml/clearml-server)). > Inference services status, console outputs and machine metrics are available in the ClearML UI in the Serving Service project (default: "DevOps" project) @@ -330,6 +330,6 @@ Grafana model performance example: ## Contributing -**PRs are always welcomed** :heart: See more details in the ClearML [Guidelines for Contributing](https://github.com/allegroai/clearml/blob/master/docs/contributing.md). 
+**PRs are always welcomed** :heart: See more details in the ClearML [Guidelines for Contributing](https://github.com/clearml/clearml/blob/master/docs/contributing.md). diff --git a/setup.py b/setup.py index bdfda40..e80794f 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ """ `clearml-serving` - Model-Serving Orchestration and Repository Solution -https://github.com/allegroai/clearml-serving +https://github.com/clearml/clearml-serving """ import os.path @@ -38,7 +38,7 @@ setup( long_description=long_description, long_description_content_type='text/markdown', # The project's main homepage. - url='https://github.com/allegroai/clearml-serving.git', + url='https://github.com/clearml/clearml-serving.git', author='ClearML', author_email='support@clear.ml', license='Apache License 2.0',
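Illustrative usage note (an editorial sketch, not part of the patches above): PATCH 3/5 reads the new `CLEARML_SERVING_AIO_RPC_IGNORE_ERRORS` and `CLEARML_SERVING_AIO_RPC_VERBOSE_ERRORS` variables with `shlex.split()` and passes them to the new `parse_grpc_errors()` helper in `clearml_serving/serving/utils.py`, which maps each whitespace-separated token (a numeric gRPC status code, a status name written with spaces, dashes, or underscores, or a bare `true`/`false`) to a set of `grpc.StatusCode` members. The Python sketch below shows the intended mapping; the environment-variable values are arbitrary examples.

```python
# Editorial sketch of the behaviour introduced by PATCH 3/5 (example values are arbitrary).
import os
import shlex

import grpc

from clearml_serving.serving.utils import parse_grpc_errors

# Status names are case-insensitive and may be written with spaces, dashes, or underscores;
# bare integers are matched against the numeric gRPC status codes.
os.environ["CLEARML_SERVING_AIO_RPC_IGNORE_ERRORS"] = "unavailable deadline-exceeded 8"
ignored = parse_grpc_errors(shlex.split(os.environ.get("CLEARML_SERVING_AIO_RPC_IGNORE_ERRORS", "")))
assert ignored == {
    grpc.StatusCode.UNAVAILABLE,         # matched by name
    grpc.StatusCode.DEADLINE_EXCEEDED,   # dash normalized to a space
    grpc.StatusCode.RESOURCE_EXHAUSTED,  # matched by numeric code 8
}

# A lone "true" selects every status code; "false" or an empty/unset value selects none.
assert parse_grpc_errors(["true"]) == set(grpc.StatusCode)
assert parse_grpc_errors(["false"]) == set()
assert parse_grpc_errors([]) == set()
```

In `serve_model()`, an `AioRpcError` whose status is in the verbose set is logged with the full exception and request, one in the ignore set is not logged at all, and any other status produces a short status-only line; in every case the request still fails with an HTTP 500 carrying the gRPC status code.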