mirror of https://github.com/clearml/clearml-serving
synced 2025-06-26 18:16:00 +00:00

Merge branch 'main' into feature/multimodel

This commit is contained in commit 2685d2a0e5
LICENSE (2 changed lines)
@@ -186,7 +186,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright [yyyy] [name of copyright owner]
+   Copyright 2025 ClearML Inc
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
README.md (16 changed lines)
@@ -1,17 +1,17 @@
 
 <div align="center">
 
-<a href="https://app.clear.ml"><img src="https://github.com/allegroai/clearml/blob/master/docs/clearml-logo.svg?raw=true" width="250px"></a>
+<a href="https://app.clear.ml"><img src="https://github.com/clearml/clearml/blob/master/docs/clearml-logo.svg?raw=true" width="250px"></a>
 
 **ClearML Serving - Model deployment made easy**
 
 ## **`clearml-serving v1.3.1` </br> :sparkles: Model Serving (ML/DL) Made Easy :tada:** <br> :fire: NEW version 1.3 :rocket: 20% faster !
 
 
-[](https://img.shields.io/github/license/allegroai/clearml-serving.svg)
+[](https://img.shields.io/github/license/clearml/clearml-serving.svg)
 [](https://img.shields.io/pypi/pyversions/clearml-serving.svg)
 [](https://img.shields.io/pypi/v/clearml-serving.svg)
-[](https://artifacthub.io/packages/helm/allegroai/clearml-serving)
+[](https://artifacthub.io/packages/helm/clearml/clearml-serving)
 [](https://joinslack.clear.ml)
 
 `🌟 ClearML is open-source - Leave a star to support the project! 🌟`
@@ -25,7 +25,7 @@ It enables model deployment including serving and preprocessing code to a Kubern
 ### :fire: NEW :confetti_ball: Take it for a spin with a simple `docker-compose` [command](#nail_care-initial-setup) :magic_wand: :sparkles:
 
 
-<a><img src="https://github.com/allegroai/clearml-serving/blob/main/docs/design_diagram.png?raw=true" width="100%"></a>
+<a><img src="https://github.com/clearml/clearml-serving/blob/main/docs/design_diagram.png?raw=true" width="100%"></a>
 
 Features:
 * Easy to deploy & configure
@@ -73,7 +73,7 @@ Features:
 
 ### :nail_care: Initial Setup
 
-1. Setup your [**ClearML Server**](https://github.com/allegroai/clearml-server) or use the [Free tier Hosting](https://app.clear.ml)
+1. Setup your [**ClearML Server**](https://github.com/clearml/clearml-server) or use the [Free tier Hosting](https://app.clear.ml)
 2. Setup local access (if you haven't already), see instructions [here](https://clear.ml/docs/latest/docs/getting_started/ds/ds_first_steps#install-clearml)
 3. Install clearml-serving CLI:
 ```bash
@@ -85,7 +85,7 @@ pip3 install clearml-serving
 5. Write down the Serving Service UID
 6. Clone clearml-serving repository
 ```bash
-git clone https://github.com/allegroai/clearml-serving.git
+git clone https://github.com/clearml/clearml-serving.git
 ```
 7. Edit the environment variables file (`docker/example.env`) with your clearml-server credentials and Serving Service UID. For example, you should have something like
 ```bash
@@ -170,7 +170,7 @@ AZURE_STORAGE_KEY
 **Notice** On the first few requests the inference container needs to download the model file and preprocessing python code, this means the request might take a little longer, once everything is cached, it will return almost immediately.
 
 **Notes:**
-> Review the model repository in the ClearML web UI, under the "serving examples" Project on your ClearML account/server ([free hosted](https://app.clear.ml) or [self-deployed](https://github.com/allegroai/clearml-server)).
+> Review the model repository in the ClearML web UI, under the "serving examples" Project on your ClearML account/server ([free hosted](https://app.clear.ml) or [self-deployed](https://github.com/clearml/clearml-server)).
 
 > Inference services status, console outputs and machine metrics are available in the ClearML UI in the Serving Service project (default: "DevOps" project)
 
@@ -330,6 +330,6 @@ Grafana model performance example:
 
 ## Contributing
 
-**PRs are always welcomed** :heart: See more details in the ClearML [Guidelines for Contributing](https://github.com/allegroai/clearml/blob/master/docs/contributing.md).
+**PRs are always welcomed** :heart: See more details in the ClearML [Guidelines for Contributing](https://github.com/clearml/clearml/blob/master/docs/contributing.md).
 
 
@@ -540,6 +540,12 @@ def main():
     parser.add_argument(
         '--t-log-verbose', type=str,
         help='<integer> Triton server logging verbosity (default disabled)')
+    parser.add_argument(
+        '--t-exit-on-error', type=bool, default=True,
+        help='Exits the inference server if any error occurs during initialization.'
+             'Recommended to set to True to catch any unanticipated errors.'
+             'False prevents single models breaking the whole tritonserver.'
+    )
 
     args = parser.parse_args()
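A side note on the new flag (editor's sketch, not part of the commit): argparse's `type=bool` simply calls `bool()` on the raw argument string, so any non-empty value, including the literal `False`, parses as `True`. A minimal standalone demonstration:

```python
import argparse

# Same pattern as the new --t-exit-on-error flag above.
parser = argparse.ArgumentParser()
parser.add_argument('--t-exit-on-error', type=bool, default=True)

# bool("False") is truthy; only the empty string maps to False.
print(parser.parse_args(['--t-exit-on-error', 'False']).t_exit_on_error)  # True
print(parser.parse_args(['--t-exit-on-error', '']).t_exit_on_error)       # False
```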
@@ -1,4 +1,5 @@
 import os
+import shlex
 import traceback
 import gzip
 import asyncio
@@ -6,6 +7,7 @@ import asyncio
 from fastapi import FastAPI, Request, Response, APIRouter, HTTPException
 from fastapi.routing import APIRoute
 from fastapi.responses import PlainTextResponse
+from grpc.aio import AioRpcError
 
 from starlette.background import BackgroundTask
 
@@ -13,8 +15,14 @@ from typing import Optional, Dict, Any, Callable, Union
 
 from clearml_serving.version import __version__
 from clearml_serving.serving.init import setup_task
-from clearml_serving.serving.model_request_processor import ModelRequestProcessor, EndpointNotFoundException, \
-    EndpointBackendEngineException, EndpointModelLoadException, ServingInitializationException
+from clearml_serving.serving.model_request_processor import (
+    ModelRequestProcessor,
+    EndpointNotFoundException,
+    EndpointBackendEngineException,
+    EndpointModelLoadException,
+    ServingInitializationException,
+)
+from clearml_serving.serving.utils import parse_grpc_errors
 
 
 class GzipRequest(Request):
@@ -56,6 +64,15 @@ class CUDAException(Exception):
     def __init__(self, exception: str):
         self.exception = exception
 
+grpc_aio_ignore_errors = parse_grpc_errors(shlex.split(os.environ.get("CLEARML_SERVING_AIO_RPC_IGNORE_ERRORS", "")))
+grpc_aio_verbose_errors = parse_grpc_errors(shlex.split(os.environ.get("CLEARML_SERVING_AIO_RPC_VERBOSE_ERRORS", "")))
+
+
+class CUDAException(Exception):
+    def __init__(self, exception: str):
+        self.exception = exception
+
+
 # start FastAPI app
 app = FastAPI(title="ClearML Serving Service", version=__version__, description="ClearML Service Service router")
 
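For readers tracing the two new globals: each environment variable holds a whitespace-separated list of gRPC status codes, split with `shlex.split` and mapped to `grpc.StatusCode` members by the `parse_grpc_errors` helper added later in this diff. A hypothetical configuration sketch (values are illustrative only):

```python
import os

# Status codes may be written as names ("unavailable", "deadline-exceeded")
# or as integer codes ("14"); a lone "true"/"false" selects all or none.
os.environ["CLEARML_SERVING_AIO_RPC_IGNORE_ERRORS"] = "unavailable cancelled"
os.environ["CLEARML_SERVING_AIO_RPC_VERBOSE_ERRORS"] = "deadline-exceeded"
```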
@@ -65,16 +82,35 @@ async def startup_event():
     global processor
 
     if processor:
-        print("ModelRequestProcessor already initialized [pid={}] [service_id={}]".format(
-            os.getpid(), serving_service_task_id))
+        print(
+            "ModelRequestProcessor already initialized [pid={}] [service_id={}]".format(
+                os.getpid(), serving_service_task_id
+            )
+        )
     else:
-        print("Starting up ModelRequestProcessor [pid={}] [service_id={}]".format(
-            os.getpid(), serving_service_task_id))
+        print("Starting up ModelRequestProcessor [pid={}] [service_id={}]".format(os.getpid(), serving_service_task_id))
         processor = ModelRequestProcessor(
-            task_id=serving_service_task_id, update_lock_guard=singleton_sync_lock,
+            task_id=serving_service_task_id,
+            update_lock_guard=singleton_sync_lock,
         )
         print("ModelRequestProcessor [id={}] loaded".format(processor.get_id()))
-        processor.launch(poll_frequency_sec=model_sync_frequency_secs*60)
+        processor.launch(poll_frequency_sec=model_sync_frequency_secs * 60)
 
 
 @app.on_event("shutdown")
 def shutdown_event():
     print("RESTARTING INFERENCE SERVICE!")
 
 
 async def exit_app():
     loop = asyncio.get_running_loop()
     loop.stop()
 
 
 @app.exception_handler(CUDAException)
 async def cuda_exception_handler(request, exc):
     task = BackgroundTask(exit_app)
     return PlainTextResponse("CUDA out of memory. Restarting service", status_code=500, background=task)
 
 
 @app.on_event('shutdown')
@@ -111,23 +147,45 @@ async def llm_serve_model(endpoint_type: str, request: Union[bytes, Dict[Any, An
     except EndpointNotFoundException as ex:
         raise HTTPException(status_code=404, detail="Error processing request, endpoint was not found: {}".format(ex))
     except (EndpointModelLoadException, EndpointBackendEngineException) as ex:
-        session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format(
-            instance_id, type(ex), ex, request, "".join(traceback.format_exc())))
+        session_logger.report_text(
+            "[{}] Exception [{}] {} while processing request: {}\n{}".format(
+                instance_id, type(ex), ex, request, "".join(traceback.format_exc())
+            )
+        )
         raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex))
     except ServingInitializationException as ex:
-        session_logger.report_text("[{}] Exception [{}] {} while loading serving inference: {}\n{}".format(
-            instance_id, type(ex), ex, request, "".join(traceback.format_exc())))
+        session_logger.report_text(
+            "[{}] Exception [{}] {} while loading serving inference: {}\n{}".format(
+                instance_id, type(ex), ex, request, "".join(traceback.format_exc())
+            )
+        )
         raise HTTPException(status_code=500, detail="Error [{}] processing request: {}".format(type(ex), ex))
     except ValueError as ex:
-        session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format(
-            instance_id, type(ex), ex, request, "".join(traceback.format_exc())))
+        session_logger.report_text(
+            "[{}] Exception [{}] {} while processing request: {}\n{}".format(
+                instance_id, type(ex), ex, request, "".join(traceback.format_exc())
+            )
+        )
         if "CUDA out of memory. " in str(ex) or "NVML_SUCCESS == r INTERNAL ASSERT FAILED" in str(ex):
             raise CUDAException(exception=ex)
         else:
             raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex))
+    except AioRpcError as ex:
+        if grpc_aio_verbose_errors and ex.code() in grpc_aio_verbose_errors:
+            session_logger.report_text(
+                "[{}] Exception [AioRpcError] {} while processing request: {}".format(instance_id, ex, request)
+            )
+        elif not grpc_aio_ignore_errors or ex.code() not in grpc_aio_ignore_errors:
+            session_logger.report_text("[{}] Exception [AioRpcError] status={} ".format(instance_id, ex.code()))
+        raise HTTPException(
+            status_code=500, detail="Error [AioRpcError] processing request: status={}".format(ex.code())
+        )
     except Exception as ex:
-        session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format(
-            instance_id, type(ex), ex, request, "".join(traceback.format_exc())))
+        session_logger.report_text(
+            "[{}] Exception [{}] {} while processing request: {}\n{}".format(
+                instance_id, type(ex), ex, request, "".join(traceback.format_exc())
+            )
+        )
         raise HTTPException(status_code=500, detail="Error [{}] processing request: {}".format(type(ex), ex))
     return return_value
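The new `AioRpcError` branch gives verbose logging precedence over suppression, and raises HTTP 500 in every case. A condensed sketch (hypothetical helper, written for illustration) of the decision logic it implements:

```python
def aio_rpc_log_mode(code, verbose_set, ignore_set):
    # Mirrors the handler above: verbose wins, ignore silences, otherwise
    # a short status-only line is logged; the HTTP 500 is raised regardless.
    if verbose_set and code in verbose_set:
        return "full report (exception + request)"
    if ignore_set and code in ignore_set:
        return "silent"
    return "short report (status code only)"
```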
@@ -1,7 +1,6 @@
 import json
 import os
 import gc
-import torch
 from collections import deque
 from pathlib import Path
 from random import random
@@ -19,6 +18,11 @@ from clearml.storage.util import hash_dict
 from .preprocess_service import BasePreprocessRequest
 from .endpoints import ModelEndpoint, ModelMonitoring, CanaryEP, EndpointMetricLogging
 
+try:
+    import torch
+except ImportError:
+    torch = None
+
 
 class ModelRequestProcessorException(Exception):
     def __init__(self, message):
@@ -922,7 +926,8 @@ class ModelRequestProcessor(object):
-                        del self._engine_processor_lookup[k]
+                        self._engine_processor_lookup.pop(k, None)
                         gc.collect()
-                        torch.cuda.empty_cache()
+                        if torch:
+                            torch.cuda.empty_cache()
                 cleanup = False
                 model_monitor_update = False
             except Exception as ex:
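The guarded import and the `if torch:` check together form the usual optional-dependency pattern, letting CPU-only deployments run without torch installed. A generic sketch of the same idea (hypothetical function, not from this file):

```python
try:
    import torch  # optional; only needed for GPU memory cleanup
except ImportError:
    torch = None

def free_gpu_memory():
    # Safe no-op when torch is absent or no GPU is visible.
    if torch is not None and torch.cuda.is_available():
        torch.cuda.empty_cache()
```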
clearml_serving/serving/utils.py (new file, 17 lines)
@@ -0,0 +1,17 @@
+from typing import List, Set
+
+import grpc
+
+
+def parse_grpc_errors(errors: List[str]) -> Set[grpc.StatusCode]:
+    try:
+        typed_errors = {
+            int(e) if e.isdigit() else e.lower().replace("-", " ").replace("_", " ")
+            for e in errors
+        }
+        if len(typed_errors) == 1 and next(iter(typed_errors)) in ("true", "false"):
+            return set(grpc.StatusCode if next(iter(typed_errors)) == "true" else [])
+        return {e for e in grpc.StatusCode if typed_errors.intersection(e.value)}
+    except (ValueError, TypeError):
+        pass
+    return set()
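A usage sketch for the new helper (inputs are illustrative): names are lower-cased with dashes and underscores normalized to spaces, digits are matched against numeric status codes, and a lone "true"/"false" selects every code or none:

```python
import grpc
from clearml_serving.serving.utils import parse_grpc_errors

assert parse_grpc_errors(["unavailable"]) == {grpc.StatusCode.UNAVAILABLE}
assert parse_grpc_errors(["deadline-exceeded"]) == {grpc.StatusCode.DEADLINE_EXCEEDED}
assert parse_grpc_errors(["14"]) == {grpc.StatusCode.UNAVAILABLE}  # 14 == UNAVAILABLE
assert parse_grpc_errors(["true"]) == set(grpc.StatusCode)
assert parse_grpc_errors([]) == set()
```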
@@ -1 +1 @@
-__version__ = '1.3.0'
+__version__ = '1.3.2'
setup.py (4 changed lines)
@@ -1,6 +1,6 @@
 """
 `clearml-serving` - Model-Serving Orchestration and Repository Solution
-https://github.com/allegroai/clearml-serving
+https://github.com/clearml/clearml-serving
 """
 
 import os.path
@@ -38,7 +38,7 @@ setup(
     long_description=long_description,
     long_description_content_type='text/markdown',
     # The project's main homepage.
-    url='https://github.com/allegroai/clearml-serving.git',
+    url='https://github.com/clearml/clearml-serving.git',
     author='ClearML',
     author_email='support@clear.ml',
     license='Apache License 2.0',