Add exception prints to the serving session Task and inference Task for better debugging capabilities

Report the instance ID when reporting back to the main serving session Task
allegroai 2024-03-01 13:12:56 +02:00
parent c658780d97
commit 71c104c9df
3 changed files with 73 additions and 21 deletions

clearml_serving/serving/init.py

@@ -26,8 +26,12 @@ def setup_task(force_threaded_logging=None):
         task_type="inference",  # noqa
     )
     instance_task.set_system_tags(["service"])
+    # make sure we start logging thread/process
+    instance_logger = instance_task.get_logger()  # noqa
+    # this will use the main thread/process
+    session_logger = serving_task.get_logger()
     # preload modules into memory before forking
     BasePreprocessRequest.load_modules()
-    return serving_service_task_id
+    return serving_service_task_id, session_logger, instance_task.id
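A minimal usage sketch (not part of the commit): how a caller consumes the extended setup_task() return value. session_logger writes to the shared serving-session Task, so prefixing reports with instance_id distinguishes the instances; report_text() is the standard clearml Logger call, and the message text is illustrative.

from clearml_serving.serving.init import setup_task

serving_service_task_id, session_logger, instance_id = setup_task()
# illustrative report: every line this instance sends is tagged with its own Task ID
session_logger.report_text("[{}] inference instance started".format(instance_id))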

clearml_serving/serving/main.py

@@ -1,5 +1,5 @@
 import os
-from multiprocessing import Lock
+import traceback
 import gzip
 from fastapi import FastAPI, Request, Response, APIRouter, HTTPException
@@ -9,7 +9,8 @@ from typing import Optional, Dict, Any, Callable, Union
 from clearml_serving.version import __version__
 from clearml_serving.serving.init import setup_task
-from clearml_serving.serving.model_request_processor import ModelRequestProcessor
+from clearml_serving.serving.model_request_processor import ModelRequestProcessor, EndpointNotFoundException, \
+    EndpointBackendEngineException, EndpointModelLoadException, ServingInitializationException


 class GzipRequest(Request):
@@ -39,7 +40,7 @@ singleton_sync_lock = None  # Lock()
 processor = None  # type: Optional[ModelRequestProcessor]

 # create clearml Task and load models
-serving_service_task_id = setup_task()
+serving_service_task_id, session_logger, instance_id = setup_task()
 # polling frequency
 model_sync_frequency_secs = 5
 try:
@@ -88,10 +89,24 @@ async def serve_model(model_id: str, version: Optional[str] = None, request: Uni
             version=version,
             request_body=request
         )
+    except EndpointNotFoundException as ex:
+        raise HTTPException(status_code=404, detail="Error processing request, endpoint was not found: {}".format(ex))
+    except (EndpointModelLoadException, EndpointBackendEngineException) as ex:
+        session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format(
+            instance_id, type(ex), ex, request, "".join(traceback.format_exc())))
+        raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex))
+    except ServingInitializationException as ex:
+        session_logger.report_text("[{}] Exception [{}] {} while loading serving inference: {}\n{}".format(
+            instance_id, type(ex), ex, request, "".join(traceback.format_exc())))
+        raise HTTPException(status_code=500, detail="Error [{}] processing request: {}".format(type(ex), ex))
     except ValueError as ex:
-        raise HTTPException(status_code=422, detail="Error processing request: {}".format(ex))
+        session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format(
+            instance_id, type(ex), ex, request, "".join(traceback.format_exc())))
+        raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex))
     except Exception as ex:
-        raise HTTPException(status_code=500, detail="Error processing request: {}".format(ex))
+        session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format(
+            instance_id, type(ex), ex, request, "".join(traceback.format_exc())))
+        raise HTTPException(status_code=500, detail="Error [{}] processing request: {}".format(type(ex), ex))
     return return_value
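For context, a sketch of what a client now receives per failure class, assuming a local serving instance; the host, endpoint name, and payload below are placeholders. FastAPI serializes HTTPException into a JSON body with a single detail field.

import requests

# placeholder URL/payload; an unknown endpoint now maps to 404 instead of a generic 500
resp = requests.post("http://127.0.0.1:8080/serve/unknown_endpoint", json={"x": 1})
print(resp.status_code)  # 404 (EndpointNotFoundException)
print(resp.json())       # {"detail": "Error processing request, endpoint was not found: ..."}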

clearml_serving/serving/model_request_processor.py

@@ -18,6 +18,36 @@ from .preprocess_service import BasePreprocessRequest
 from .endpoints import ModelEndpoint, ModelMonitoring, CanaryEP, EndpointMetricLogging


+class ModelRequestProcessorException(Exception):
+    def __init__(self, message):
+        super().__init__(message)
+
+
+class EndpointNotFoundException(ModelRequestProcessorException):
+    def __init__(self, message):
+        super().__init__(message)
+
+
+class EndpointModelLoadException(ModelRequestProcessorException):
+    def __init__(self, message):
+        super().__init__(message)
+
+
+class EndpointBackendEngineException(ModelRequestProcessorException):
+    def __init__(self, message):
+        super().__init__(message)
+
+
+class ServingInitializationException(Exception):
+    def __init__(self, message):
+        super().__init__(message)
+
+
+class MetricLoggingException(Exception):
+    def __init__(self, message):
+        super().__init__(message)
+
+
 class FastWriteCounter(object):
     def __init__(self):
         self._counter_inc = itertools.count()
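Since EndpointNotFoundException, EndpointModelLoadException, and EndpointBackendEngineException all derive from ModelRequestProcessorException, a caller can handle the whole family with one except clause. A minimal sketch; run_inference() is a hypothetical stand-in for any code path that raises these:

from clearml_serving.serving.model_request_processor import (
    ModelRequestProcessorException, EndpointNotFoundException, EndpointModelLoadException)

def run_inference():
    # hypothetical: simulate a model that could not be resolved for an endpoint
    raise EndpointModelLoadException("could not find any Model to serve")

try:
    run_inference()
except EndpointNotFoundException as ex:
    print("404:", ex)  # unknown endpoint, handled separately in serve_model() above
except ModelRequestProcessorException as ex:
    print("422:", ex)  # base class catches the remaining load/backend errors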
@@ -148,7 +178,7 @@ class ModelRequestProcessor(object):
         ep = self._endpoints.get(url, None) or self._model_monitoring_endpoints.get(url, None)
         if not ep:
-            raise ValueError("Model inference endpoint '{}' not found".format(url))
+            raise EndpointNotFoundException("Model inference endpoint '{}' not found".format(url))

         processor = self._engine_processor_lookup.get(url)
         if not processor:
@@ -270,7 +300,7 @@ class ModelRequestProcessor(object):
         )
         models = Model.query_models(max_results=2, **model_query)
         if not models:
-            raise ValueError("Could not find any Model to serve {}".format(model_query))
+            raise EndpointModelLoadException("Could not find any Model to serve {}".format(model_query))
         if len(models) > 1:
             print("Warning: Found multiple Models for \'{}\', selecting id={}".format(model_query, models[0].id))
         endpoint.model_id = models[0].id
@@ -281,7 +311,7 @@ class ModelRequestProcessor(object):
         # upload as new artifact
         if preprocess_code:
             if not Path(preprocess_code).exists():
-                raise ValueError("Preprocessing code \'{}\' could not be found".format(preprocess_code))
+                raise EndpointModelLoadException("Preprocessing code \'{}\' could not be found".format(preprocess_code))
             preprocess_artifact_name = "py_code_{}".format(url.replace("/", "_"))
             self._task.upload_artifact(
                 name=preprocess_artifact_name, artifact_object=Path(preprocess_code), wait_on_upload=True)
@@ -311,8 +341,8 @@ class ModelRequestProcessor(object):
         # make sure we actually have something to monitor
         if not any([monitoring.monitor_project, monitoring.monitor_name, monitoring.monitor_tags]):
-            raise ValueError("Model monitoring requires at least a "
-                             "project / name / tag to monitor, none were provided.")
+            raise EndpointModelLoadException(
+                "Model monitoring requires at least a project / name / tag to monitor, none were provided.")

         # make sure we have everything configured
         self._validate_model(monitoring)
@@ -324,7 +354,8 @@ class ModelRequestProcessor(object):
         # upload as new artifact
         if preprocess_code:
             if not Path(preprocess_code).exists():
-                raise ValueError("Preprocessing code \'{}\' could not be found".format(preprocess_code))
+                raise EndpointModelLoadException(
+                    "Preprocessing code \'{}\' could not be found".format(preprocess_code))
             preprocess_artifact_name = "py_code_{}".format(name.replace("/", "_"))
             self._task.upload_artifact(
                 name=preprocess_artifact_name, artifact_object=Path(preprocess_code), wait_on_upload=True)
@@ -367,7 +398,7 @@ class ModelRequestProcessor(object):
         if not isinstance(canary, CanaryEP):
             canary = CanaryEP(**canary)
         if canary.load_endpoints and canary.load_endpoint_prefix:
-            raise ValueError(
+            raise EndpointModelLoadException(
                 "Could not add canary endpoint with both "
                 "prefix ({}) and fixed set of endpoints ({})".format(
                     canary.load_endpoints, canary.load_endpoint_prefix))
@@ -406,7 +437,7 @@ class ModelRequestProcessor(object):
             metric.endpoint = name

         if name not in self._endpoints and not name.endswith('*'):
-            raise ValueError("Metric logging \'{}\' references a nonexistent endpoint".format(name))
+            raise MetricLoggingException("Metric logging \'{}\' references a nonexistent endpoint".format(name))

         if name in self._metric_logging:
             print("Warning: Metric logging \'{}\' {}".format(name, "updated" if update else "overwritten"))
@@ -1250,11 +1281,11 @@ class ModelRequestProcessor(object):
         if task_id:
             task = Task.get_task(task_id=task_id)
             if not task:
-                raise ValueError("Could not find Control Task ID={}".format(task_id))
+                raise ServingInitializationException("Could not find Control Task ID={}".format(task_id))
             task_status = task.status
             if task_status not in ("created", "in_progress",):
                 if disable_change_state:
-                    raise ValueError(
+                    raise ServingInitializationException(
                         "Control Task ID={} status [{}] "
                         "is not valid (only 'draft', 'running' are supported)".format(task_id, task_status))
                 else:
@@ -1271,7 +1302,7 @@ class ModelRequestProcessor(object):
                 'system_tags': [cls._system_tag]}
         )
         if not tasks:
-            raise ValueError("Could not find any valid Control Tasks")
+            raise ServingInitializationException("Could not find any valid Control Tasks")

         if len(tasks) > 1:
             print("Warning: more than one valid Controller Task found, using Task ID={}".format(tasks[0]))
@@ -1352,15 +1383,16 @@ class ModelRequestProcessor(object):
             except Exception:
                 suggested_cli = "?"
-            raise ValueError(
+            raise EndpointBackendEngineException(
                 "Triton engine requires *manual* input/output specification, "
                 "you have input/output in your pbtxt, please remove them and specify manually.\n"
                 "{}".format(suggested_cli)
             )

         if aux_config_dict.get("default_model_filename", None):
-            raise ValueError("ERROR: You have `default_model_filename` in your config pbtxt, "
-                             "please remove it. It will be added automatically by the system.")
+            raise EndpointBackendEngineException(
+                "ERROR: You have `default_model_filename` in your config pbtxt, "
+                "please remove it. It will be added automatically by the system.")

         # verify we have all the info we need
         d = endpoint.as_dict()
@@ -1372,7 +1404,8 @@ class ModelRequestProcessor(object):
         ]
         if missing:
-            raise ValueError("Triton engine requires input description - missing values in {}".format(missing))
+            raise EndpointBackendEngineException(
+                "Triton engine requires input description - missing values in {}".format(missing))
         return True

     def _add_registered_input_model(self, endpoint_url: str, model_id: str) -> bool: