add OOM handler for MIG profiles

2025-06-26 18:16:00 +00:00 · 2024-05-30 00:58:17 +03:00 · 2024-05-30 00:58:17 +03:00 · a0bfdf2c92
commit a0bfdf2c92
parent 6859920848
1 changed file with 1 addition and 1 deletion
--- a/clearml_serving/serving/main.py
+++ b/clearml_serving/serving/main.py
@@ -123,7 +123,7 @@ async def serve_model(model_id: str, version: Optional[str] = None, request: Uni
    except ValueError as ex:
        session_logger.report_text("[{}] Exception [{}] {} while processing request: {}\n{}".format(
            instance_id, type(ex), ex, request, "".join(traceback.format_exc())))
-        if "CUDA out of memory. " in str(ex):
+        if "CUDA out of memory. " in str(ex) or "NVML_SUCCESS == r INTERNAL ASSERT FAILED" in str(ex):
            raise CUDAException(exception=ex)
        else:
            raise HTTPException(status_code=422, detail="Error [{}] processing request: {}".format(type(ex), ex))