small changes for pr

2025-06-26 18:16:00 +00:00 · 2025-03-12 14:24:00 +03:00 · 2025-03-12 14:24:00 +03:00 · a2817e38da
commit a2817e38da
parent 10f887d449
4 changed files with 12 additions and 10 deletions
--- a/clearml_serving/serving/Dockerfile
+++ b/clearml_serving/serving/Dockerfile
@@ -4,7 +4,7 @@ FROM python:3.11-bullseye
 ENV LC_ALL=C.UTF-8

 # install base package
-# RUN pip3 install --no-cache-dir clearml-serving
+RUN pip3 install --no-cache-dir clearml-serving

 # get latest execution code from the git repository
 # RUN cd $HOME && git clone https://github.com/allegroai/clearml-serving.git
--- a/clearml_serving/serving/main.py
+++ b/clearml_serving/serving/main.py
@@ -9,6 +9,8 @@ from fastapi.routing import APIRoute
 from fastapi.responses import PlainTextResponse
 from grpc.aio import AioRpcError

+from http import HTTPStatus
+
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest

 from starlette.background import BackgroundTask
@@ -115,14 +117,14 @@ async def cuda_exception_handler(request, exc):
 async def process_with_exceptions(
    base_url: str,
    version: Optional[str],
-    request_body: Union[bytes, Dict[Any, Any]],
+    request: Union[bytes, Dict[Any, Any]],
    serve_type: str
 ):
    try:
        return_value = await processor.process_request(
            base_url=base_url,
            version=version,
-            request_body=request_body,
+            request_body=request,
            serve_type=serve_type
        )
    except EndpointNotFoundException as ex:
@@ -130,21 +132,21 @@ async def process_with_exceptions(
    except (EndpointModelLoadException, EndpointBackendEngineException) as ex:
        session_logger.report_text(
            "[{}] Exception [{}] {} while processing request: {}\n{}".format(
-                instance_id, type(ex), ex, request_body, "".join(traceback.format_exc())
+                instance_id, type(ex), ex, request, "".join(traceback.format_exc())
            )
        )
        raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex))
    except ServingInitializationException as ex:
        session_logger.report_text(
            "[{}] Exception [{}] {} while loading serving inference: {}\n{}".format(
-                instance_id, type(ex), ex, request_body, "".join(traceback.format_exc())
+                instance_id, type(ex), ex, request, "".join(traceback.format_exc())
            )
        )
        raise HTTPException(status_code=500, detail="Error [{}] processing request: {}".format(type(ex), ex))
    except ValueError as ex:
        session_logger.report_text(
            "[{}] Exception [{}] {} while processing request: {}\n{}".format(
-                instance_id, type(ex), ex, request_body, "".join(traceback.format_exc())
+                instance_id, type(ex), ex, request, "".join(traceback.format_exc())
            )
        )
        if "CUDA out of memory. " in str(ex) or "NVML_SUCCESS == r INTERNAL ASSERT FAILED" in str(ex):
@@ -154,7 +156,7 @@ async def process_with_exceptions(
    except AioRpcError as ex:
        if grpc_aio_verbose_errors and ex.code() in grpc_aio_verbose_errors:
            session_logger.report_text(
-                "[{}] Exception [AioRpcError] {} while processing request: {}".format(instance_id, ex, request_body)
+                "[{}] Exception [AioRpcError] {} while processing request: {}".format(instance_id, ex, request)
            )
        elif not grpc_aio_ignore_errors or ex.code() not in grpc_aio_ignore_errors:
            session_logger.report_text("[{}] Exception [AioRpcError] status={} ".format(instance_id, ex.code()))
@@ -164,7 +166,7 @@ async def process_with_exceptions(
    except Exception as ex:
        session_logger.report_text(
            "[{}] Exception [{}] {} while processing request: {}\n{}".format(
-                instance_id, type(ex), ex, request_body, "".join(traceback.format_exc())
+                instance_id, type(ex), ex, request, "".join(traceback.format_exc())
            )
        )
        raise HTTPException(status_code=500, detail="Error  [{}] processing request: {}".format(type(ex), ex))
--- a/clearml_serving/statistics/Dockerfile
+++ b/clearml_serving/statistics/Dockerfile
@@ -4,7 +4,7 @@ FROM python:3.11-bullseye
 ENV LC_ALL=C.UTF-8

 # install base package
-# RUN pip3 install --no-cache-dir clearml-serving
+RUN pip3 install --no-cache-dir clearml-serving

 # get latest execution code from the git repository
 # RUN cd $HOME && git clone https://github.com/allegroai/clearml-serving.git
--- a/docker/prometheus.yml
+++ b/docker/prometheus.yml
@@ -26,4 +26,4 @@ scrape_configs:
    scrape_interval: 5s

    static_configs:
-      - targets: ['clearml-serving-inference:8000']
+      - targets: ['clearml-serving-inference:8000']