diff --git a/clearml_serving/serving/preprocess_service.py b/clearml_serving/serving/preprocess_service.py
index 09e5ded..2d584c9 100644
--- a/clearml_serving/serving/preprocess_service.py
+++ b/clearml_serving/serving/preprocess_service.py
@@ -615,11 +615,38 @@ class VllmEngine(Singleton):
 
         # load vLLM Modules
         if self._vllm is None:
-            from vllm import entrypoints, engine, usage
-            self._vllm = {}
-            self._vllm["entrypoints"] = entrypoints
-            self._vllm["engine"] = engine
-            self._vllm["usage"] = usage
+            # from vllm import entrypoints, engine, usage
+            from vllm.engine.arg_utils import AsyncEngineArgs
+            from vllm.engine.async_llm_engine import AsyncLLMEngine
+            from vllm.entrypoints.logger import RequestLogger
+            from vllm.entrypoints.openai.serving_engine import OpenAIServing
+            from vllm.entrypoints.openai.serving_models import OpenAIServingModels, LoRAModulePath, PromptAdapterPath, BaseModelPath
+            from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+            from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
+            from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
+            from vllm.entrypoints.openai.serving_tokenization import OpenAIServingTokenization
+            from vllm.entrypoints.openai.protocol import ChatCompletionResponse, CompletionResponse, ErrorResponse
+            from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
+            from vllm.usage.usage_lib import UsageContext
+            self._vllm = {
+                "AsyncEngineArgs": AsyncEngineArgs,
+                "AsyncLLMEngine": AsyncLLMEngine,
+                "RequestLogger": RequestLogger,
+                "OpenAIServing": OpenAIServing,
+                "OpenAIServingModels": OpenAIServingModels,
+                "LoRAModulePath": LoRAModulePath,
+                "PromptAdapterPath": PromptAdapterPath,
+                "BaseModelPath": BaseModelPath,
+                "OpenAIServingChat": OpenAIServingChat,
+                "OpenAIServingCompletion": OpenAIServingCompletion,
+                "OpenAIServingEmbedding": OpenAIServingEmbedding,
+                "OpenAIServingTokenization": OpenAIServingTokenization,
+                "ChatCompletionResponse": ChatCompletionResponse,
+                "CompletionResponse": CompletionResponse,
+                "ErrorResponse": ErrorResponse,
+                "ChatTemplateContentFormatOption": ChatTemplateContentFormatOption,
+                "UsageContext": UsageContext
+            }
 
         if self._fastapi is None:
             from fastapi.responses import JSONResponse, StreamingResponse
@@ -647,85 +674,75 @@ class VllmEngine(Singleton):
             self.add_models(name=name, model_path=model_path)
             return None
 
-        vllm_engine_config = json.loads(os.environ.get("VLLM_ENGINE_ARGS"))
-        engine_args = self._vllm["engine"].arg_utils.AsyncEngineArgs(**vllm_engine_config)
-        async_engine_client = self._vllm["engine"].async_llm_engine.AsyncLLMEngine.from_engine_args(
+        vllm_engine_config = json.loads(os.environ.get("VLLM_ENGINE_ARGS").replace("'", ""))
+        vllm_engine_config["model"] = model_path
+        vllm_engine_config["served_model_name"] = name
+        engine_args = self._vllm["AsyncEngineArgs"](**vllm_engine_config)
+        async_engine_client = self._vllm["AsyncLLMEngine"].from_engine_args(
             engine_args,
-            usage_context=self._vllm["usage"].usage_lib.UsageContext.OPENAI_API_SERVER
+            usage_context=self._vllm["UsageContext"].OPENAI_API_SERVER
         )
         model_config = async_engine_client.engine.get_model_config()
-        request_logger = self._vllm["entrypoints"].logger.RequestLogger(
+        request_logger = self._vllm["RequestLogger"](
             max_log_len=vllm_model_config["max_log_len"]
         )
-        self._model["openai_serving_models"] = self._vllm[
-            "entrypoints"
-        ].openai.serving_models.OpenAIServingModels(
-            async_engine_client,
-            model_config,
-            [
-                self._vllm["entrypoints"].openai.serving_models.BaseModelPath(
-                    name=name,
-                    model_path=model_path
-                )
-            ],
-            lora_modules=svllm_model_config["lora_modules"],
-            prompt_adapters=vllm_model_config["prompt_adapters"],
+        self._model["openai_serving_models"] = self._vllm["OpenAIServingModels"](
+            async_engine_client,
+            model_config,
+            [
+                self._vllm["BaseModelPath"](
+                    name=name,
+                    model_path=model_path
+                )
+            ],
+            lora_modules=vllm_model_config["lora_modules"],
+            prompt_adapters=vllm_model_config["prompt_adapters"],
         )
-        await self._model["openai_serving_models"].init_static_loras()
-        self._model["openai_serving"] = self._vllm[
-            "entrypoints"
-        ].openai.serving_engine.OpenAIServing(
-            async_engine_client,
-            model_config,
-            self._model["openai_serving_models"],
-            request_logger=request_logger,
-            return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"]
+        # await self._model["openai_serving_models"].init_static_loras()
+        self._model["openai_serving"] = self._vllm["OpenAIServing"](
+            async_engine_client,
+            model_config,
+            self._model["openai_serving_models"],
+            request_logger=request_logger,
+            return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"]
         )
-        self._model["openai_serving_chat"] = self._vllm[
-            "entrypoints"
-        ].openai.serving_chat.OpenAIServingChat(
-            async_engine_client,
-            model_config,
-            self._model["openai_serving_models"],
-            response_role=vllm_model_config["response_role"],
-            request_logger=request_logger,
-            chat_template=vllm_model_config["chat_template"],
-            chat_template_content_format=chat_settings["chat_template_content_format"],
-            return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"],
-            enable_reasoning=chat_settings["enable_reasoning"],
-            reasoning_parser=chat_settings["reasoning_parser"],
-            enable_auto_tools=chat_settings["enable_auto_tools"],
-            tool_parser=chat_settings["tool_parser"],
-            enable_prompt_tokens_details=chat_settings["enable_prompt_tokens_details"]
+        self._model["openai_serving_chat"] = self._vllm["OpenAIServingChat"](
+            async_engine_client,
+            model_config,
+            self._model["openai_serving_models"],
+            response_role=vllm_model_config["response_role"],
+            request_logger=request_logger,
+            chat_template=vllm_model_config["chat_template"],
+            chat_template_content_format=chat_settings["chat_template_content_format"],
+            return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"],
+            enable_reasoning=chat_settings["enable_reasoning"],
+            reasoning_parser=chat_settings["reasoning_parser"],
+            enable_auto_tools=chat_settings["enable_auto_tools"],
+            tool_parser=chat_settings["tool_parser"],
+            enable_prompt_tokens_details=chat_settings["enable_prompt_tokens_details"]
         ) if model_config.runner_type == "generate" else None
-        self._model["openai_serving_completion"] = self._vllm[
-            "entrypoints"
-        ].openai.serving_completion.OpenAIServingCompletion(
-            async_engine_client,
-            model_config,
-            self._model["openai_serving_models"],
-            request_logger=request_logger,
-            return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"]
+        self._model["openai_serving_completion"] = self._vllm["OpenAIServingCompletion"](
+            async_engine_client,
+            model_config,
+            self._model["openai_serving_models"],
+            request_logger=request_logger,
+            return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"]
         ) if model_config.runner_type == "generate" else None
-        self._model["openai_serving_embedding"] = self._vllm[
-            "entrypoints"
-        ].openai.serving_embedding.OpenAIServingEmbedding(
-            async_engine_client,
-            model_config,
-            self._model["openai_serving_models"],
-            request_logger=request_logger,
-            chat_template=vllm_model_config["chat_template"],
-            chat_template_content_format=chat_settings["chat_template_content_format"]
+        self._model["openai_serving_embedding"] = self._vllm["OpenAIServingEmbedding"](
+            async_engine_client,
+            model_config,
+            self._model["openai_serving_models"],
+            request_logger=request_logger,
+            chat_template=vllm_model_config["chat_template"],
+            chat_template_content_format=chat_settings["chat_template_content_format"]
        ) if model_config.task == "embed" else None
-        self._model["openai_serving_tokenization"] = self._vllm[
-            "entrypoints"
-        ].openai.serving_tokenization.OpenAIServingTokenization(
-            async_engine_client,
-            model_config,
-            self._model["openai_serving_models"],
-            request_logger=request_logger,
-            chat_template=vllm_model_config["chat_template"],
-            chat_template_content_format=chat_settings["chat_template_content_format"]
+        self._model["openai_serving_tokenization"] = self._vllm["OpenAIServingTokenization"](
+            async_engine_client,
+            model_config,
+            self._model["openai_serving_models"],
+            request_logger=request_logger,
+            chat_template=vllm_model_config["chat_template"],
+            chat_template_content_format=chat_settings["chat_template_content_format"]
         )
         self.logger.info("vLLM Engine was successfully initialized")
         self.is_already_loaded = True
@@ -733,7 +750,7 @@ class VllmEngine(Singleton):
 
     def add_models(self, name: str, model_path: str):
         self._model["openai_serving_models"].base_model_paths.append(
-            self._vllm["entrypoints"].openai.serving_models.BaseModelPath(
+            self._vllm["BaseModelPath"](
                 name=name,
                 model_path=model_path
             )
@@ -759,13 +776,12 @@ class VllmEngine(Singleton):
                 message="The model does not support Completions API"
             )
         generator = await handler.create_completion(request=request, raw_request=raw_request)
-        if isinstance(generator, self._vllm["entrypoints"].openai.protocol.ErrorResponse):
+        if isinstance(generator, self._vllm["ErrorResponse"]):
             return self._fastapi["json_response"](content=generator.model_dump(), status_code=generator.code)
-        elif isinstance(generator, self._vllm["entrypoints"].openai.protocol.CompletionResponse):
+        elif isinstance(generator, self._vllm["CompletionResponse"]):
             return self._fastapi["json_response"](content=generator.model_dump())
         return self._fastapi["streaming_response"](content=generator, media_type="text/event-stream")
 
-
     async def chat_completions(
         self,
         data: Any,
@@ -784,12 +800,20 @@ class VllmEngine(Singleton):
                 message="The model does not support Chat Completions API"
             )
         generator = await handler.create_chat_completion(request=request, raw_request=raw_request)
-        if isinstance(generator, self._vllm["entrypoints"].openai.protocol.ErrorResponse):
+        if isinstance(generator, self._vllm["ErrorResponse"]):
             return self._fastapi["json_response"](content=generator.model_dump(), status_code=generator.code)
-        elif isinstance(generator, self._vllm["entrypoints"].openai.protocol.ChatCompletionResponse):
+        elif isinstance(generator, self._vllm["ChatCompletionResponse"]):
             return self._fastapi["json_response"](content=generator.model_dump())
         return self._fastapi["streaming_response"](content=generator, media_type="text/event-stream")
 
+    async def models(
+        self,
+        data: Any,
+        state: dict,
+        collect_custom_statistics_fn: Callable[[dict], None] = None
+    ) -> Any:
+        pass
+
 
 @BasePreprocessRequest.register_engine("vllm", modules=["vllm", "fastapi"])
 class VllmPreprocessRequest(BasePreprocessRequest):
@@ -881,7 +905,7 @@ class VllmPreprocessRequest(BasePreprocessRequest):
         The actual processing function.
         We run the process in this context
         """
-        return self._vllm_engine.completions(data=data, state=state, collect_custom_statistics_fn=collect_custom_statistics_fn)
+        return await self._vllm_engine.completions(data=data, state=state, collect_custom_statistics_fn=collect_custom_statistics_fn)
 
     async def chat_completions(self, data: Any, state: dict, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any:
@@ -889,9 +913,16 @@ class VllmPreprocessRequest(BasePreprocessRequest):
         The actual processing function.
         We run the process in this context
         """
-        return self._vllm_engine.chat_completions(data=data, state=state, collect_custom_statistics_fn=collect_custom_statistics_fn)
+        return await self._vllm_engine.chat_completions(data=data, state=state, collect_custom_statistics_fn=collect_custom_statistics_fn)
 
+    async def models(self, data: Any, state: dict, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any:
+        """
+        The actual processing function.
+        We run the process in this context
+        """
+        return await self._vllm_engine.models(data=data, state=state, collect_custom_statistics_fn=collect_custom_statistics_fn)
+
 
     @staticmethod
     async def _preprocess_send_request(_, endpoint: str, version: str = None, data: dict = None) -> Optional[dict]:
         endpoint = "/openai/v1/{}".format(endpoint.strip("/"))
diff --git a/docker/docker-compose-gpu.yml b/docker/docker-compose-gpu.yml
index 221f6d0..dbb063b 100644
--- a/docker/docker-compose-gpu.yml
+++ b/docker/docker-compose-gpu.yml
@@ -105,7 +105,7 @@ services:
       GOOGLE_APPLICATION_CREDENTIALS: ${GOOGLE_APPLICATION_CREDENTIALS:-}
       AZURE_STORAGE_ACCOUNT: ${AZURE_STORAGE_ACCOUNT:-}
       AZURE_STORAGE_KEY: ${AZURE_STORAGE_KEY:-}
-      VLLM_ENGINE_ARGS: ${VLLM_ENGINE_ARGS:-'{"disable_log_requests":true,"disable_log_stats":false,"gpu_memory_utilization":0.95,"quantization":null,"enforce_eager":true}'}
+      VLLM_ENGINE_ARGS: ${VLLM_ENGINE_ARGS:-'{"disable_log_requests":true,"disable_log_stats":false,"gpu_memory_utilization":0.95,"enforce_eager":true}'}
     depends_on:
       - kafka
     networks:
diff --git a/examples/vllm/preprocess.py b/examples/vllm/preprocess.py
index aa0f13a..001dd58 100644
--- a/examples/vllm/preprocess.py
+++ b/examples/vllm/preprocess.py
@@ -11,18 +11,6 @@ class Preprocess:
 
     def load(self, local_file_name: str) -> Optional[Any]:  # noqa
-        # vllm_engine_config = {
-        #     "model": local_file_name,
-        #     "tokenizer": local_file_name,
-        #     "disable_log_requests": True,
-        #     "disable_log_stats": False,
-        #     "gpu_memory_utilization": 0.9,
-        #     "quantization": None,
-        #     "enforce_eager": True,
-        #     "served_model_name": "test_vllm",
-        #     "dtype": "float16",
-        #     "max_model_len": 8192
-        # }
         vllm_model_config = {
             "lora_modules": None,  # [LoRAModulePath(name=a, path=b)]
             "prompt_adapters": None,  # [PromptAdapterPath(name=a, path=b)]
@@ -39,66 +27,7 @@ class Preprocess:
             "enable_prompt_tokens_details": False,
             "chat_template_content_format": "auto"
         }
-        # self._model = {}
-        # engine_args = AsyncEngineArgs(**self.vllm_engine_config)
-        # async_engine_client = AsyncLLMEngine.from_engine_args(self.engine_args, usage_context=UsageContext.OPENAI_API_SERVER)
-        # model_config = async_engine_client.engine.get_model_config()
-        # request_logger = RequestLogger(max_log_len=self.vllm_model_config["max_log_len"])
-        # self._model["openai_serving_models"] = OpenAIServingModels(
-        #     async_engine_client,
-        #     self.model_config,
-        #     [BaseModelPath(name=self.vllm_engine_config["served_model_name"], model_path=self.vllm_engine_config["model"])],
-        #     lora_modules=self.vllm_model_config["lora_modules"],
-        #     prompt_adapters=self.vllm_model_config["prompt_adapters"],
-        # )
-        # self._model["openai_serving"] = OpenAIServing(
-        #     async_engine_client,
-        #     self.model_config,
-        #     self._model["openai_serving_models"],
-        #     request_logger=request_logger,
-        #     return_tokens_as_token_ids=self.vllm_model_config["return_tokens_as_token_ids"]
-        # )
-        # self._model["openai_serving_chat"] = OpenAIServingChat(
-        #     async_engine_client,
-        #     self.model_config,
-        #     self._model["openai_serving_models"],
-        #     response_role=self.vllm_model_config["response_role"],
-        #     request_logger=request_logger,
-        #     chat_template=self.vllm_model_config["chat_template"],
-        #     chat_template_content_format=self.chat_settings["chat_template_content_format"],
-        #     return_tokens_as_token_ids=self.vllm_model_config["return_tokens_as_token_ids"],
-        #     enable_reasoning=self.chat_settings["enable_reasoning"],
-        #     reasoning_parser=self.chat_settings["reasoning_parser"],
-        #     enable_auto_tools=self.chat_settings["enable_auto_tools"],
-        #     tool_parser=self.chat_settings["tool_parser"],
-        #     enable_prompt_tokens_details=self.chat_settings["enable_prompt_tokens_details"]
-        # ) if self.model_config.runner_type == "generate" else None
-        # self._model["openai_serving_completion"] = OpenAIServingCompletion(
-        #     async_engine_client,
-        #     self.model_config,
-        #     self._model["openai_serving_models"],
-        #     request_logger=request_logger,
-        #     return_tokens_as_token_ids=self.vllm_model_config["return_tokens_as_token_ids"]
-        # ) if self.model_config.runner_type == "generate" else None
-        # self._model["openai_serving_embedding"] = OpenAIServingEmbedding(
-        #     async_engine_client,
-        #     self.model_config,
-        #     self._model["openai_serving_models"],
-        #     request_logger=request_logger,
-        #     chat_template=self.vllm_model_config["chat_template"],
-        #     chat_template_content_format=self.chat_settings["chat_template_content_format"]
-        # ) if self.model_config.task == "embed" else None
-        # self._model["openai_serving_tokenization"] = OpenAIServingTokenization(
-        #     async_engine_client,
-        #     self.model_config,
-        #     self._model["openai_serving_models"],
-        #     request_logger=request_logger,
-        #     chat_template=self.vllm_model_config["chat_template"],
-        #     chat_template_content_format=self.chat_settings["chat_template_content_format"]
-        # )
-        # return self._model
         return {
-            # "vllm_engine_config": vllm_engine_config,
             "vllm_model_config": vllm_model_config,
             "chat_settings": chat_settings
         }
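
For reference, a minimal sketch (not part of the patch) of how the reworked `VllmEngine.load_engine` consumes `VLLM_ENGINE_ARGS`. The environment value below is the default from `docker-compose-gpu.yml`; the model path and served model name are placeholders, since in the real flow they are injected per deployed ClearML endpoint.

```python
import json
import os

# Example only: mirrors the docker-compose-gpu.yml default. Any field accepted by
# AsyncEngineArgs in the installed vLLM version could be added to this JSON object.
os.environ.setdefault(
    "VLLM_ENGINE_ARGS",
    '{"disable_log_requests":true,"disable_log_stats":false,"gpu_memory_utilization":0.95,"enforce_eager":true}'
)

# Same parsing step used in load_engine: strip stray single quotes that can leak in
# from the compose-file quoting, then decode the JSON.
vllm_engine_config = json.loads(os.environ.get("VLLM_ENGINE_ARGS").replace("'", ""))

# load_engine then overrides these two keys for each endpoint; placeholder values here
# (hypothetical local path, example model name from examples/vllm).
vllm_engine_config["model"] = "/models/test_vllm"
vllm_engine_config["served_model_name"] = "test_vllm"

print(vllm_engine_config)
```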