Add clearml_serving_inference restart on CUDA OOM (#75)

* initial commit

* add OOM handler for MIG profiles

---------

Co-authored-by: Meshcheryakov Ilya <i.meshcheryakov@mts.ai>
IlyaMescheryakov1402 2024-07-07 15:54:08 +03:00 committed by GitHub
parent 666ce26ab2
commit 724c99c605
4 changed files with 35 additions and 1 deletion


@@ -2,6 +2,7 @@
# print configuration
echo CLEARML_SERVING_TASK_ID="$CLEARML_SERVING_TASK_ID"
echo CLEARML_INFERENCE_TASK_ID="$CLEARML_INFERENCE_TASK_ID"
echo CLEARML_SERVING_PORT="$CLEARML_SERVING_PORT"
echo CLEARML_USE_GUNICORN="$CLEARML_USE_GUNICORN"
echo CLEARML_EXTRA_PYTHON_PACKAGES="$CLEARML_EXTRA_PYTHON_PACKAGES"


@@ -6,6 +6,7 @@ from clearml_serving.serving.preprocess_service import BasePreprocessRequest
def setup_task(force_threaded_logging=None):
    serving_service_task_id = os.environ.get("CLEARML_SERVING_TASK_ID", None)
    inference_service_task_id = os.environ.get("CLEARML_INFERENCE_TASK_ID", False)  # according to Task.init() docs
    # always use background thread, it requires less memory
    if force_threaded_logging or os.environ.get("CLEARML_BKG_THREAD_REPORT") in ("1", "Y", "y", "true"):
@@ -24,6 +25,7 @@ def setup_task(force_threaded_logging=None):
        project_name=serving_task.get_project_name(),
        task_name="{} - serve instance".format(serving_task.name),
        task_type="inference",  # noqa
        continue_last_task=inference_service_task_id,
    )
    instance_task.set_system_tags(["service"])
    # make sure we start logging thread/process
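
Together with the new `CLEARML_INFERENCE_TASK_ID` variable echoed above, `continue_last_task` is what lets a restarted inference container re-attach to the task it was reporting to before the OOM instead of registering a brand new one. A minimal sketch of that resume flow as a standalone script (the project and task names below are placeholders, not from the commit):

```python
import os

from clearml import Task

# After an OOM-triggered restart, CLEARML_INFERENCE_TASK_ID carries the previous
# inference task ID; passing it to continue_last_task makes Task.init() continue
# that task instead of creating a new one (False means "start a new task").
inference_service_task_id = os.environ.get("CLEARML_INFERENCE_TASK_ID", False)

instance_task = Task.init(
    project_name="serving examples",        # placeholder project name
    task_name="endpoint - serve instance",  # placeholder task name
    task_type="inference",
    continue_last_task=inference_service_task_id,
)
instance_task.set_system_tags(["service"])
```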


@@ -1,9 +1,13 @@
import os
import traceback
import gzip
import asyncio
from fastapi import FastAPI, Request, Response, APIRouter, HTTPException
from fastapi.routing import APIRoute
from fastapi.responses import PlainTextResponse
from starlette.background import BackgroundTask
from typing import Optional, Dict, Any, Callable, Union
@@ -48,6 +52,9 @@ try:
except (ValueError, TypeError):
    pass


class CUDAException(Exception):
    def __init__(self, exception: str):
        self.exception = exception


# start FastAPI app
app = FastAPI(title="ClearML Serving Service", version=__version__, description="ClearML Serving Service router")
@@ -70,6 +77,20 @@ async def startup_event():
    processor.launch(poll_frequency_sec=model_sync_frequency_secs*60)


@app.on_event('shutdown')
def shutdown_event():
    print('RESTARTING INFERENCE SERVICE!')


async def exit_app():
    loop = asyncio.get_running_loop()
    loop.stop()


@app.exception_handler(CUDAException)
async def cuda_exception_handler(request, exc):
    task = BackgroundTask(exit_app)
    return PlainTextResponse("CUDA out of memory. Restarting service", status_code=500, background=task)


router = APIRouter(
    prefix="/serve",
    tags=["models"],
@@ -102,7 +123,10 @@ async def serve_model(model_id: str, version: Optional[str] = None, request: Uni
    except ValueError as ex:
        session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format(
            instance_id, type(ex), ex, request, "".join(traceback.format_exc())))
        raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex))
        if "CUDA out of memory. " in str(ex) or "NVML_SUCCESS == r INTERNAL ASSERT FAILED" in str(ex):
            raise CUDAException(exception=ex)
        else:
            raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex))
    except Exception as ex:
        session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format(
            instance_id, type(ex), ex, request, "".join(traceback.format_exc())))
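
Taken together, these changes implement a deliberate crash-and-restart path: the client gets a 500 immediately, and a background task stops the event loop so the worker process exits and the container's restart policy brings up a clean process with an empty CUDA context. A self-contained sketch of the same pattern, assuming a plain FastAPI app under a restart policy such as docker-compose's `restart: unless-stopped`; the `/infer` route and the simulated error are illustrative only:

```python
import asyncio

from fastapi import FastAPI
from fastapi.responses import PlainTextResponse
from starlette.background import BackgroundTask

app = FastAPI()


class CUDAException(Exception):
    def __init__(self, exception: str):
        self.exception = exception


async def exit_app():
    # Stopping the running loop terminates the server process; the container
    # orchestrator is expected to restart it with a fresh CUDA context.
    loop = asyncio.get_running_loop()
    loop.stop()


@app.exception_handler(CUDAException)
async def cuda_exception_handler(request, exc):
    # Answer the caller first, then tear the service down in the background.
    task = BackgroundTask(exit_app)
    return PlainTextResponse("CUDA out of memory. Restarting service", status_code=500, background=task)


@app.post("/infer")
async def infer():
    try:
        # Illustrative stand-in for a model forward pass that runs out of GPU memory.
        raise RuntimeError("CUDA out of memory. (simulated)")
    except RuntimeError as ex:
        if "CUDA out of memory. " in str(ex):
            raise CUDAException(exception=str(ex))
        raise
```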


@@ -1,5 +1,7 @@
import json
import os
import gc
import torch
from collections import deque
from pathlib import Path
from random import random
@@ -915,7 +917,12 @@ class ModelRequestProcessor(object):
                for k in list(self._engine_processor_lookup.keys()):
                    if k not in self._endpoints:
                        # atomic
                        self._engine_processor_lookup[k]._model = None
                        self._engine_processor_lookup[k]._preprocess = None
                        del self._engine_processor_lookup[k]
                        self._engine_processor_lookup.pop(k, None)
                        gc.collect()
                        torch.cuda.empty_cache()
                cleanup = False
                model_monitor_update = False
            except Exception as ex:
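
The second half of the fix reduces how often the OOM path is hit in the first place: when an endpoint is removed, the processor drops its model and preprocess references, then forces garbage collection and empties PyTorch's CUDA cache so the freed memory actually returns to the device, which matters on small MIG slices. A condensed sketch of that cleanup idea, using a hypothetical helper name and a plain dict standing in for the engine lookup:

```python
import gc

import torch


def release_stale_processors(engine_lookup: dict, active_endpoints: set) -> None:
    """Hypothetical helper illustrating the cleanup pattern from the commit."""
    for key in list(engine_lookup.keys()):
        if key not in active_endpoints:
            processor = engine_lookup.pop(key)
            # Drop the heavyweight references explicitly so nothing keeps the
            # model weights alive after the entry leaves the lookup table.
            processor._model = None
            processor._preprocess = None
            del processor
    gc.collect()  # collect any lingering reference cycles that hold tensors
    if torch.cuda.is_available():
        # Return cached, now-unused CUDA blocks to the driver; otherwise the
        # caching allocator keeps them reserved and they still count as used memory.
        torch.cuda.empty_cache()
```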