clearml-serving/examples/vllm/preprocess.py

"""Hugginface preprocessing module for ClearML Serving."""
from typing import Any, Callable, List, Optional, Union
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
from vllm.entrypoints.openai.serving_tokenization import OpenAIServingTokenization
from vllm.usage.usage_lib import UsageContext
from vllm.entrypoints.openai.serving_engine import LoRAModulePath, PromptAdapterPath


class Preprocess:
    """Processing class that will be run by the ClearML inference services before and after each request."""

    def __init__(self):
        """Set internal state, this will be called only once (i.e. not per request)."""
        self.model_endpoint = None

    def load(self, local_file_name: str) -> Optional[Any]:  # noqa
        # Engine-level arguments forwarded to vLLM's AsyncEngineArgs.
        vllm_engine_config = {
            "model": local_file_name,
            "tokenizer": local_file_name,
            "disable_log_requests": True,
            "disable_log_stats": False,
            "gpu_memory_utilization": 0.9,
            "quantization": None,
            "enforce_eager": True,
            "served_model_name": "test_vllm",
            "dtype": "float16"
        }
        # Options for the OpenAI-compatible serving wrappers built below.
        vllm_model_config = {
            "lora_modules": None,  # [LoRAModulePath(name=a, path=b)]
            "prompt_adapters": None,  # [PromptAdapterPath(name=a, path=b)]
            "response_role": "assistant",
            "chat_template": None,
            "return_tokens_as_token_ids": False,
            "max_log_len": None
        }
        self._model = {}
        engine_args = AsyncEngineArgs(**vllm_engine_config)
        async_engine_client = AsyncLLMEngine.from_engine_args(engine_args, usage_context=UsageContext.OPENAI_API_SERVER)
        model_config = async_engine_client.engine.get_model_config()
        request_logger = RequestLogger(max_log_len=vllm_model_config["max_log_len"])
self._model["openai_serving_chat"] = OpenAIServingChat(
async_engine_client,
model_config,
served_model_names=[vllm_engine_config["served_model_name"]],
response_role=vllm_model_config["response_role"],
lora_modules=vllm_model_config["lora_modules"],
prompt_adapters=vllm_model_config["prompt_adapters"],
request_logger=request_logger,
chat_template=vllm_model_config["chat_template"],
return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"]
)
self._model["openai_serving_completion"] = OpenAIServingCompletion(
async_engine_client,
model_config,
served_model_names=[vllm_engine_config["served_model_name"]],
lora_modules=vllm_model_config["lora_modules"],
prompt_adapters=vllm_model_config["prompt_adapters"],
request_logger=request_logger,
return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"]
)
self._model["openai_serving_embedding"] = OpenAIServingEmbedding(
async_engine_client,
model_config,
served_model_names=[vllm_engine_config["served_model_name"]],
request_logger=request_logger
)
self._model["openai_serving_tokenization"] = OpenAIServingTokenization(
async_engine_client,
model_config,
served_model_names=[vllm_engine_config["served_model_name"]],
lora_modules=vllm_model_config["lora_modules"],
request_logger=request_logger,
chat_template=vllm_model_config["chat_template"]
)
return self._model

    def remove_extra_system_prompts(self, messages: List) -> List:
        """If the request starts with more than one system message, keep only the last of them."""
        system_messages_indices = []
        for i, msg in enumerate(messages):
            if msg["role"] == "system":
                system_messages_indices.append(i)
            else:
                break
        if len(system_messages_indices) > 1:
            last_system_index = system_messages_indices[-1]
            messages = [msg for i, msg in enumerate(messages) if msg["role"] != "system" or i == last_system_index]
        return messages

    def preprocess(
        self,
        body: Union[bytes, dict],
        state: dict,
        collect_custom_statistics_fn: Optional[Callable[[dict], None]],
    ) -> Any:  # noqa
        # Collapse duplicated leading system prompts before the request is handed to the model.
        if isinstance(body, dict) and "messages" in body:
            body["messages"] = self.remove_extra_system_prompts(body["messages"])
        return body
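

# The block below is a minimal local sketch, not part of the served example: it only illustrates
# how `remove_extra_system_prompts` collapses repeated leading system prompts. The message list
# is hypothetical and the engine is never loaded, so this runs without a GPU or model weights.
if __name__ == "__main__":
    sample_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "system", "content": "Answer concisely."},
        {"role": "user", "content": "Hello!"},
    ]
    cleaned = Preprocess().remove_extra_system_prompts(sample_messages)
    # Only the last leading system prompt ("Answer concisely.") is kept.
    print(cleaned)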