From 666ce26ab22b27e35a5542ca6d3c8646e7561c4b Mon Sep 17 00:00:00 2001 From: stephanbertl Date: Sun, 7 Jul 2024 14:51:23 +0200 Subject: [PATCH 1/5] Add exit-on-error option for tritonserver (#76) This fixes #60 Co-authored-by: = --- clearml_serving/engines/triton/triton_helper.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/clearml_serving/engines/triton/triton_helper.py b/clearml_serving/engines/triton/triton_helper.py index 879174b..19fd241 100644 --- a/clearml_serving/engines/triton/triton_helper.py +++ b/clearml_serving/engines/triton/triton_helper.py @@ -540,6 +540,12 @@ def main(): parser.add_argument( '--t-log-verbose', type=str, help=' Triton server logging verbosity (default disabled)') + parser.add_argument( + '--t-exit-on-error', type=bool, default=True, + help='Exits the inference server if any error occurs during initialization. ' + 'Recommended to set to True to catch any unanticipated errors. ' + 'False prevents a single model from breaking the whole tritonserver.' + ) args = parser.parse_args() From 724c99c605540cdae25e4ef504c09f705cd53503 Mon Sep 17 00:00:00 2001 From: IlyaMescheryakov1402 <58298387+IlyaMescheryakov1402@users.noreply.github.com> Date: Sun, 7 Jul 2024 15:54:08 +0300 Subject: [PATCH 2/5] Add clearml_serving_inference restart on CUDA OOM (#75) * initial commit * add OOM handler for MIG profiles --------- Co-authored-by: Meshcheryakov Ilya --- clearml_serving/serving/entrypoint.sh | 1 + clearml_serving/serving/init.py | 2 ++ clearml_serving/serving/main.py | 26 ++++++++++++++++++- .../serving/model_request_processor.py | 7 +++++ 4 files changed, 35 insertions(+), 1 deletion(-) diff --git a/clearml_serving/serving/entrypoint.sh b/clearml_serving/serving/entrypoint.sh index e1a5bbc..a5efea1 100755 --- a/clearml_serving/serving/entrypoint.sh +++ b/clearml_serving/serving/entrypoint.sh @@ -2,6 +2,7 @@ # print configuration echo CLEARML_SERVING_TASK_ID="$CLEARML_SERVING_TASK_ID" +echo CLEARML_INFERENCE_TASK_ID="$CLEARML_INFERENCE_TASK_ID" echo CLEARML_SERVING_PORT="$CLEARML_SERVING_PORT" echo CLEARML_USE_GUNICORN="$CLEARML_USE_GUNICORN" echo CLEARML_EXTRA_PYTHON_PACKAGES="$CLEARML_EXTRA_PYTHON_PACKAGES" diff --git a/clearml_serving/serving/init.py b/clearml_serving/serving/init.py index 2ae54a8..0c75712 100644 --- a/clearml_serving/serving/init.py +++ b/clearml_serving/serving/init.py @@ -6,6 +6,7 @@ from clearml_serving.serving.preprocess_service import BasePreprocessRequest def setup_task(force_threaded_logging=None): serving_service_task_id = os.environ.get("CLEARML_SERVING_TASK_ID", None) + inference_service_task_id = os.environ.get("CLEARML_INFERENCE_TASK_ID", False) # according Task.init() docs # always use background thread, it requires less memory if force_threaded_logging or os.environ.get("CLEARML_BKG_THREAD_REPORT") in ("1", "Y", "y", "true"): @@ -24,6 +25,7 @@ def setup_task(force_threaded_logging=None): project_name=serving_task.get_project_name(), task_name="{} - serve instance".format(serving_task.name), task_type="inference", # noqa + continue_last_task=inference_service_task_id, ) instance_task.set_system_tags(["service"]) # make sure we start logging thread/process diff --git a/clearml_serving/serving/main.py b/clearml_serving/serving/main.py index 6865c93..10ce9c9 100644 --- a/clearml_serving/serving/main.py +++ b/clearml_serving/serving/main.py @@ -1,9 +1,13 @@ import os import traceback import gzip +import asyncio from fastapi import FastAPI, Request, Response, APIRouter, HTTPException from fastapi.routing import APIRoute +from
fastapi.responses import PlainTextResponse + +from starlette.background import BackgroundTask from typing import Optional, Dict, Any, Callable, Union @@ -48,6 +52,9 @@ try: except (ValueError, TypeError): pass +class CUDAException(Exception): + def __init__(self, exception: str): + self.exception = exception # start FastAPI app app = FastAPI(title="ClearML Serving Service", version=__version__, description="ClearML Service Service router") @@ -70,6 +77,20 @@ async def startup_event(): processor.launch(poll_frequency_sec=model_sync_frequency_secs*60) +@app.on_event('shutdown') +def shutdown_event(): + print('RESTARTING INFERENCE SERVICE!') + +async def exit_app(): + loop = asyncio.get_running_loop() + loop.stop() + +@app.exception_handler(CUDAException) +async def cuda_exception_handler(request, exc): + task = BackgroundTask(exit_app) + return PlainTextResponse("CUDA out of memory. Restarting service", status_code=500, background=task) + + router = APIRouter( prefix="/serve", tags=["models"], @@ -102,7 +123,10 @@ async def serve_model(model_id: str, version: Optional[str] = None, request: Uni except ValueError as ex: session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) - raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex)) + if "CUDA out of memory. " in str(ex) or "NVML_SUCCESS == r INTERNAL ASSERT FAILED" in str(ex): + raise CUDAException(exception=ex) + else: + raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex)) except Exception as ex: session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) diff --git a/clearml_serving/serving/model_request_processor.py b/clearml_serving/serving/model_request_processor.py index ba9242d..35f5120 100644 --- a/clearml_serving/serving/model_request_processor.py +++ b/clearml_serving/serving/model_request_processor.py @@ -1,5 +1,7 @@ import json import os +import gc +import torch from collections import deque from pathlib import Path from random import random @@ -915,7 +917,12 @@ class ModelRequestProcessor(object): for k in list(self._engine_processor_lookup.keys()): if k not in self._endpoints: # atomic + self._engine_processor_lookup[k]._model = None + self._engine_processor_lookup[k]._preprocess = None + del self._engine_processor_lookup[k] self._engine_processor_lookup.pop(k, None) + gc.collect() + torch.cuda.empty_cache() cleanup = False model_monitor_update = False except Exception as ex: From aff27c62b89f9d0f2fe2e55cc75ffdbc272a6f78 Mon Sep 17 00:00:00 2001 From: clearml <> Date: Thu, 12 Dec 2024 23:57:21 +0200 Subject: [PATCH 3/5] Fix gRPC errors print stack traces and full verbose details. 
Add support for controlling error printouts using `CLEARML_SERVING_AIO_RPC_IGNORE_ERRORS` and `CLEARML_SERVING_AIO_RPC_VERBOSE_ERRORS` (pass a whitespace-separated list of error codes or error names) --- clearml_serving/serving/main.py | 87 +++++++++++++++++++++++--------- clearml_serving/serving/utils.py | 17 +++++++ clearml_serving/version.py | 2 +- 3 files changed, 80 insertions(+), 26 deletions(-) create mode 100644 clearml_serving/serving/utils.py diff --git a/clearml_serving/serving/main.py b/clearml_serving/serving/main.py index 10ce9c9..2536760 100644 --- a/clearml_serving/serving/main.py +++ b/clearml_serving/serving/main.py @@ -1,4 +1,5 @@ import os +import shlex import traceback import gzip import asyncio @@ -6,6 +7,7 @@ import asyncio from fastapi import FastAPI, Request, Response, APIRouter, HTTPException from fastapi.routing import APIRoute from fastapi.responses import PlainTextResponse +from grpc.aio import AioRpcError from starlette.background import BackgroundTask @@ -13,8 +15,14 @@ from typing import Optional, Dict, Any, Callable, Union from clearml_serving.version import __version__ from clearml_serving.serving.init import setup_task -from clearml_serving.serving.model_request_processor import ModelRequestProcessor, EndpointNotFoundException, \ - EndpointBackendEngineException, EndpointModelLoadException, ServingInitializationException +from clearml_serving.serving.model_request_processor import ( + ModelRequestProcessor, + EndpointNotFoundException, + EndpointBackendEngineException, + EndpointModelLoadException, + ServingInitializationException, +) +from clearml_serving.serving.utils import parse_grpc_errors class GzipRequest(Request): @@ -52,10 +60,16 @@ try: except (ValueError, TypeError): pass + +grpc_aio_ignore_errors = parse_grpc_errors(shlex.split(os.environ.get("CLEARML_SERVING_AIO_RPC_IGNORE_ERRORS", ""))) +grpc_aio_verbose_errors = parse_grpc_errors(shlex.split(os.environ.get("CLEARML_SERVING_AIO_RPC_VERBOSE_ERRORS", ""))) + + class CUDAException(Exception): def __init__(self, exception: str): self.exception = exception + # start FastAPI app app = FastAPI(title="ClearML Serving Service", version=__version__, description="ClearML Service Service router") @@ -65,26 +79,31 @@ async def startup_event(): global processor if processor: - print("ModelRequestProcessor already initialized [pid={}] [service_id={}]".format( - os.getpid(), serving_service_task_id)) + print( + "ModelRequestProcessor already initialized [pid={}] [service_id={}]".format( + os.getpid(), serving_service_task_id + ) + ) else: - print("Starting up ModelRequestProcessor [pid={}] [service_id={}]".format( - os.getpid(), serving_service_task_id)) + print("Starting up ModelRequestProcessor [pid={}] [service_id={}]".format(os.getpid(), serving_service_task_id)) processor = ModelRequestProcessor( - task_id=serving_service_task_id, update_lock_guard=singleton_sync_lock, + task_id=serving_service_task_id, + update_lock_guard=singleton_sync_lock, ) print("ModelRequestProcessor [id={}] loaded".format(processor.get_id())) - processor.launch(poll_frequency_sec=model_sync_frequency_secs*60) + processor.launch(poll_frequency_sec=model_sync_frequency_secs * 60) -@app.on_event('shutdown') +@app.on_event("shutdown") def shutdown_event(): - print('RESTARTING INFERENCE SERVICE!') - + print("RESTARTING INFERENCE SERVICE!") + + async def exit_app(): loop = asyncio.get_running_loop() loop.stop() - + + @app.exception_handler(CUDAException) async def cuda_exception_handler(request, exc): task = BackgroundTask(exit_app) 
@@ -105,31 +124,49 @@ router = APIRouter( @router.post("/{model_id}") async def serve_model(model_id: str, version: Optional[str] = None, request: Union[bytes, Dict[Any, Any]] = None): try: - return_value = await processor.process_request( - base_url=model_id, - version=version, - request_body=request - ) + return_value = await processor.process_request(base_url=model_id, version=version, request_body=request) except EndpointNotFoundException as ex: raise HTTPException(status_code=404, detail="Error processing request, endpoint was not found: {}".format(ex)) except (EndpointModelLoadException, EndpointBackendEngineException) as ex: - session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( - instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) + session_logger.report_text( + "[{}] Exception [{}] {} while processing request: {}\n{}".format( + instance_id, type(ex), ex, request, "".join(traceback.format_exc()) + ) + ) raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex)) except ServingInitializationException as ex: - session_logger.report_text("[{}] Exception [{}] {} while loading serving inference: {}\n{}".format( - instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) + session_logger.report_text( + "[{}] Exception [{}] {} while loading serving inference: {}\n{}".format( + instance_id, type(ex), ex, request, "".join(traceback.format_exc()) + ) + ) raise HTTPException(status_code=500, detail="Error [{}] processing request: {}".format(type(ex), ex)) except ValueError as ex: - session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( - instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) + session_logger.report_text( + "[{}] Exception [{}] {} while processing request: {}\n{}".format( + instance_id, type(ex), ex, request, "".join(traceback.format_exc()) + ) + ) if "CUDA out of memory. 
" in str(ex) or "NVML_SUCCESS == r INTERNAL ASSERT FAILED" in str(ex): raise CUDAException(exception=ex) else: raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex)) + except AioRpcError as ex: + if grpc_aio_verbose_errors and ex.code() in grpc_aio_verbose_errors: + session_logger.report_text( + "[{}] Exception [AioRpcError] {} while processing request: {}".format(instance_id, ex, request) + ) + elif not grpc_aio_ignore_errors or ex.code() not in grpc_aio_ignore_errors: + session_logger.report_text("[{}] Exception [AioRpcError] status={} ".format(instance_id, ex.code())) + raise HTTPException( + status_code=500, detail="Error [AioRpcError] processing request: status={}".format(ex.code()) + ) except Exception as ex: - session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format( - instance_id, type(ex), ex, request, "".join(traceback.format_exc()))) + session_logger.report_text( + "[{}] Exception [{}] {} while processing request: {}\n{}".format( + instance_id, type(ex), ex, request, "".join(traceback.format_exc()) + ) + ) raise HTTPException(status_code=500, detail="Error [{}] processing request: {}".format(type(ex), ex)) return return_value diff --git a/clearml_serving/serving/utils.py b/clearml_serving/serving/utils.py new file mode 100644 index 0000000..da18e45 --- /dev/null +++ b/clearml_serving/serving/utils.py @@ -0,0 +1,17 @@ +from typing import List, Set + +import grpc + + +def parse_grpc_errors(errors: List[str]) -> Set[grpc.StatusCode]: + try: + typed_errors = { + int(e) if e.isdigit() else e.lower().replace("-", " ").replace("_", " ") + for e in errors + } + if len(typed_errors) == 1 and next(iter(typed_errors)) in ("true", "false"): + return set(grpc.StatusCode if next(iter(typed_errors)) == "true" else []) + return {e for e in grpc.StatusCode if typed_errors.intersection(e.value)} + except (ValueError, TypeError): + pass + return set() diff --git a/clearml_serving/version.py b/clearml_serving/version.py index 19b4f1d..72837bd 100644 --- a/clearml_serving/version.py +++ b/clearml_serving/version.py @@ -1 +1 @@ -__version__ = '1.3.0' +__version__ = '1.3.1' From 9f51a9334fac7c6a4a760c42f787dcb1cefbae49 Mon Sep 17 00:00:00 2001 From: clearml <> Date: Mon, 16 Dec 2024 18:51:58 +0200 Subject: [PATCH 4/5] Fix torch import --- clearml_serving/serving/model_request_processor.py | 9 +++++++-- clearml_serving/version.py | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/clearml_serving/serving/model_request_processor.py b/clearml_serving/serving/model_request_processor.py index 35f5120..2fcdcab 100644 --- a/clearml_serving/serving/model_request_processor.py +++ b/clearml_serving/serving/model_request_processor.py @@ -1,7 +1,6 @@ import json import os import gc -import torch from collections import deque from pathlib import Path from random import random @@ -19,6 +18,11 @@ from clearml.storage.util import hash_dict from .preprocess_service import BasePreprocessRequest from .endpoints import ModelEndpoint, ModelMonitoring, CanaryEP, EndpointMetricLogging +try: + import torch +except ImportError: + torch = None + class ModelRequestProcessorException(Exception): def __init__(self, message): @@ -922,7 +926,8 @@ class ModelRequestProcessor(object): del self._engine_processor_lookup[k] self._engine_processor_lookup.pop(k, None) gc.collect() - torch.cuda.empty_cache() + if torch: + torch.cuda.empty_cache() cleanup = False model_monitor_update = False except Exception as ex: diff --git 
a/clearml_serving/version.py b/clearml_serving/version.py index 72837bd..e398332 100644 --- a/clearml_serving/version.py +++ b/clearml_serving/version.py @@ -1 +1 @@ -__version__ = '1.3.1' +__version__ = '1.3.2' From 1def0a6901617767687f2b747aaffdc060f96046 Mon Sep 17 00:00:00 2001 From: clearml <> Date: Mon, 13 Jan 2025 18:40:02 +0200 Subject: [PATCH 5/5] Update github repo link --- LICENSE | 2 +- README.md | 16 ++++++++-------- setup.py | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/LICENSE b/LICENSE index 261eeb9..9700080 100644 --- a/LICENSE +++ b/LICENSE @@ -186,7 +186,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright [yyyy] [name of copyright owner] + Copyright 2025 ClearML Inc Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/README.md b/README.md index f4dde1e..7147f40 100644 --- a/README.md +++ b/README.md @@ -1,17 +1,17 @@
- + **ClearML Serving - Model deployment made easy** ## **`clearml-serving v1.3.1` :sparkles: Model Serving (ML/DL) Made Easy :tada:**
:fire: NEW version 1.3 :rocket: 20% faster ! -[![GitHub license](https://img.shields.io/github/license/allegroai/clearml-serving.svg)](https://img.shields.io/github/license/allegroai/clearml-serving.svg) +[![GitHub license](https://img.shields.io/github/license/clearml/clearml-serving.svg)](https://img.shields.io/github/license/clearml/clearml-serving.svg) [![PyPI pyversions](https://img.shields.io/pypi/pyversions/clearml-serving.svg)](https://img.shields.io/pypi/pyversions/clearml-serving.svg) [![PyPI version shields.io](https://img.shields.io/pypi/v/clearml-serving.svg)](https://img.shields.io/pypi/v/clearml-serving.svg) -[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/allegroai)](https://artifacthub.io/packages/helm/allegroai/clearml-serving) +[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/clearml)](https://artifacthub.io/packages/helm/clearml/clearml-serving) [![Slack Channel](https://img.shields.io/badge/slack-%23clearml--community-blueviolet?logo=slack)](https://joinslack.clear.ml) `🌟 ClearML is open-source - Leave a star to support the project! 🌟` @@ -25,7 +25,7 @@ It enables model deployment including serving and preprocessing code to a Kubern ### :fire: NEW :confetti_ball: Take it for a spin with a simple `docker-compose` [command](#nail_care-initial-setup) :magic_wand: :sparkles: - + Features: * Easy to deploy & configure @@ -73,7 +73,7 @@ Features: ### :nail_care: Initial Setup -1. Setup your [**ClearML Server**](https://github.com/allegroai/clearml-server) or use the [Free tier Hosting](https://app.clear.ml) +1. Setup your [**ClearML Server**](https://github.com/clearml/clearml-server) or use the [Free tier Hosting](https://app.clear.ml) 2. Setup local access (if you haven't already), see instructions [here](https://clear.ml/docs/latest/docs/getting_started/ds/ds_first_steps#install-clearml) 3. Install clearml-serving CLI: ```bash @@ -85,7 +85,7 @@ pip3 install clearml-serving 5. Write down the Serving Service UID 6. Clone clearml-serving repository ```bash -git clone https://github.com/allegroai/clearml-serving.git +git clone https://github.com/clearml/clearml-serving.git ``` 7. Edit the environment variables file (`docker/example.env`) with your clearml-server credentials and Serving Service UID. For example, you should have something like ```bash @@ -170,7 +170,7 @@ AZURE_STORAGE_KEY **Notice** On the first few requests the inference container needs to download the model file and preprocessing python code, this means the request might take a little longer, once everything is cached, it will return almost immediately. **Notes:** -> Review the model repository in the ClearML web UI, under the "serving examples" Project on your ClearML account/server ([free hosted](https://app.clear.ml) or [self-deployed](https://github.com/allegroai/clearml-server)). +> Review the model repository in the ClearML web UI, under the "serving examples" Project on your ClearML account/server ([free hosted](https://app.clear.ml) or [self-deployed](https://github.com/clearml/clearml-server)). > Inference services status, console outputs and machine metrics are available in the ClearML UI in the Serving Service project (default: "DevOps" project) @@ -330,6 +330,6 @@ Grafana model performance example: ## Contributing -**PRs are always welcomed** :heart: See more details in the ClearML [Guidelines for Contributing](https://github.com/allegroai/clearml/blob/master/docs/contributing.md). 
+**PRs are always welcomed** :heart: See more details in the ClearML [Guidelines for Contributing](https://github.com/clearml/clearml/blob/master/docs/contributing.md). diff --git a/setup.py b/setup.py index bdfda40..e80794f 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ """ `clearml-serving` - Model-Serving Orchestration and Repository Solution -https://github.com/allegroai/clearml-serving +https://github.com/clearml/clearml-serving """ import os.path @@ -38,7 +38,7 @@ setup( long_description=long_description, long_description_content_type='text/markdown', # The project's main homepage. - url='https://github.com/allegroai/clearml-serving.git', + url='https://github.com/clearml/clearml-serving.git', author='ClearML', author_email='support@clear.ml', license='Apache License 2.0',
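Illustrative usage note (an editorial sketch, not part of the patches above): PATCH 3/5 reads the new `CLEARML_SERVING_AIO_RPC_IGNORE_ERRORS` and `CLEARML_SERVING_AIO_RPC_VERBOSE_ERRORS` variables with `shlex.split()` and passes them to the new `parse_grpc_errors()` helper in `clearml_serving/serving/utils.py`, which maps each whitespace-separated token (a numeric gRPC status code, a status name written with spaces, dashes, or underscores, or a bare `true`/`false`) to a set of `grpc.StatusCode` members. The Python sketch below shows the intended mapping; the environment-variable values are arbitrary examples.

```python
# Editorial sketch of the behaviour introduced by PATCH 3/5 (example values are arbitrary).
import os
import shlex

import grpc

from clearml_serving.serving.utils import parse_grpc_errors

# Status names are case-insensitive and may be written with spaces, dashes, or underscores;
# bare integers are matched against the numeric gRPC status codes.
os.environ["CLEARML_SERVING_AIO_RPC_IGNORE_ERRORS"] = "unavailable deadline-exceeded 8"
ignored = parse_grpc_errors(shlex.split(os.environ.get("CLEARML_SERVING_AIO_RPC_IGNORE_ERRORS", "")))
assert ignored == {
    grpc.StatusCode.UNAVAILABLE,         # matched by name
    grpc.StatusCode.DEADLINE_EXCEEDED,   # dash normalized to a space
    grpc.StatusCode.RESOURCE_EXHAUSTED,  # matched by numeric code 8
}

# A lone "true" selects every status code; "false" or an empty/unset value selects none.
assert parse_grpc_errors(["true"]) == set(grpc.StatusCode)
assert parse_grpc_errors(["false"]) == set()
assert parse_grpc_errors([]) == set()
```

In `serve_model()`, an `AioRpcError` whose status is in the verbose set is logged with the full exception and request, one in the ignore set is not logged at all, and any other status produces a short status-only line; in every case the request still fails with an HTTP 500 carrying the gRPC status code.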