diff --git a/clearml_serving/serving/main.py b/clearml_serving/serving/main.py
index e7a94d8..4683838 100644
--- a/clearml_serving/serving/main.py
+++ b/clearml_serving/serving/main.py
@@ -214,7 +214,7 @@ async def openai_serve_model(
 ):
     combined_request = {"request": request, "raw_request": raw_request}
     return_value = await process_with_exceptions(
-        base_url=request.get("model", None),
+        base_url=request.model,
         version=None,
         request_body=combined_request,
         serve_type=endpoint_type
diff --git a/clearml_serving/serving/preprocess_service.py b/clearml_serving/serving/preprocess_service.py
index bbf8a1f..594bb07 100644
--- a/clearml_serving/serving/preprocess_service.py
+++ b/clearml_serving/serving/preprocess_service.py
@@ -637,12 +637,14 @@ class VllmPreprocessRequest(BasePreprocessRequest):
             ChatCompletionRequest,
             ChatCompletionResponse,
             CompletionRequest,
+            CompletionResponse,
             ErrorResponse
         )
         self._vllm = {}
         self._vllm["chat_completion_request"] = ChatCompletionRequest
         self._vllm["chat_completion_response"] = ChatCompletionResponse
         self._vllm["completion_request"] = CompletionRequest
+        self._vllm["completion_response"] = CompletionResponse
         self._vllm["error_response"] = ErrorResponse

         if self._fastapi is None:
@@ -741,7 +743,7 @@ class VllmPreprocessRequest(BasePreprocessRequest):
         generator = await handler.create_completion(request=request, raw_request=raw_request)
         if isinstance(generator, self._vllm["error_response"]):
             return self._fastapi["json_response"](content=generator.model_dump(), status_code=generator.code)
-        elif isinstance(generator, self._vllm["chat_completion_response"]):
+        elif isinstance(generator, self._vllm["completion_response"]):
             return self._fastapi["json_response"](content=generator.model_dump())
         return self._fastapi["streaming_response"](content=generator, media_type="text/event-stream")
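
For context, the sketch below illustrates the response-type dispatch the second half of this patch corrects: non-streaming completion results should be matched against `CompletionResponse`, not `ChatCompletionResponse`. It is illustrative only, not the repository code; it assumes vLLM's OpenAI protocol types and calls the FastAPI response classes directly instead of going through the lazily populated `self._vllm` / `self._fastapi` lookups used in `VllmPreprocessRequest`.

```python
# Illustrative sketch, not the repository code: shows the dispatch pattern
# the patched completion path relies on, with FastAPI response classes
# used directly instead of the class's lazily loaded lookup dicts.
from fastapi.responses import JSONResponse, StreamingResponse
from vllm.entrypoints.openai.protocol import CompletionResponse, ErrorResponse


async def dispatch_completion(handler, request, raw_request):
    # vLLM's create_completion returns either an error, a full
    # (non-streaming) completion, or an async generator of SSE chunks
    # when the request asked for streaming.
    generator = await handler.create_completion(request=request, raw_request=raw_request)

    if isinstance(generator, ErrorResponse):
        # Propagate vLLM's error payload and status code as JSON.
        return JSONResponse(content=generator.model_dump(), status_code=generator.code)

    if isinstance(generator, CompletionResponse):
        # Before this fix the check used ChatCompletionResponse, so plain
        # completions fell through to the streaming branch even when the
        # client did not request streaming.
        return JSONResponse(content=generator.model_dump())

    # Anything else is treated as a stream of server-sent events.
    return StreamingResponse(content=generator, media_type="text/event-stream")
```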