mirror of https://github.com/clearml/clearml-serving
synced 2025-06-26 18:16:00 +00:00

commit 32d72bcd1c (parent f51bf2e081): add vllm example
@@ -1241,7 +1241,7 @@ class ModelRequestProcessor(object):
                 if processor.is_process_async \
                 else processor.chat_completion(preprocessed, state, stats_collect_fn)
         else:
-            raise ValueError(f"wrong url_type: expected 'completions' and 'chat/completions', got {serve_type}")
+            raise ValueError(f"wrong url_type: expected 'process', 'completions' or 'chat/completions', got {serve_type}")
         # noinspection PyUnresolvedReferences
         return_value = await processor.postprocess(processed, state, stats_collect_fn) \
             if processor.is_postprocess_async \
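For context, a sketch of the dispatch this message guards: `serve_type` selects between the raw `process` path and the two OpenAI-compatible paths, and the updated error now lists all three accepted values. Handler names other than `chat_completion` are assumptions, not taken from the source:

```python
# Hypothetical dispatcher illustrating the serve_type values the error message now lists.
async def dispatch(processor, serve_type, preprocessed, state, stats_collect_fn):
    if serve_type == "process":
        handler = processor.process          # assumed handler name
    elif serve_type == "completions":
        handler = processor.completion       # assumed handler name
    elif serve_type == "chat/completions":
        handler = processor.chat_completion  # name taken from the hunk above
    else:
        raise ValueError(
            f"wrong url_type: expected 'process', 'completions' or 'chat/completions', got {serve_type}"
        )
    # mirror the original pattern: await only when the preprocess class is async
    return await handler(preprocessed, state, stats_collect_fn) \
        if processor.is_process_async \
        else handler(preprocessed, state, stats_collect_fn)
```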
@@ -675,68 +675,6 @@ class VllmPreprocessRequest(BasePreprocessRequest):
             return s.connect_ex(('localhost', port)) == 0

         if not is_port_in_use(8000):
             prometheus_client.start_http_server(8000)
-
-        vllm_engine_config = {
-            "model": f"{local_file_name}/model",
-            "tokenizer": f"{local_file_name}/tokenizer",
-            "disable_log_requests": True,
-            "disable_log_stats": False,
-            "gpu_memory_utilization": 0.9,
-            "quantization": None,
-            "enforce_eager": True,
-            "served_model_name": "ai_operator_hyp22v4"
-        }
-        vllm_model_config = {
-            "lora_modules": None,  # [LoRAModulePath(name=a, path=b)]
-            "prompt_adapters": None,  # [PromptAdapterPath(name=a, path=b)]
-            "response_role": "assistant",
-            "chat_template": None,
-            "return_tokens_as_token_ids": False,
-            "max_log_len": None
-        }
-
-        self.engine_args = AsyncEngineArgs(**vllm_engine_config)
-        self.async_engine_client = AsyncLLMEngine.from_engine_args(self.engine_args, usage_context=UsageContext.OPENAI_API_SERVER)
-
-        model_config = self.async_engine_client.engine.get_model_config()
-
-        request_logger = RequestLogger(max_log_len=vllm_model_config["max_log_len"])
-
-        self.openai_serving_chat = OpenAIServingChat(
-            self.async_engine_client,
-            model_config,
-            served_model_names=[vllm_engine_config["served_model_name"]],
-            response_role=vllm_model_config["response_role"],
-            lora_modules=vllm_model_config["lora_modules"],
-            prompt_adapters=vllm_model_config["prompt_adapters"],
-            request_logger=request_logger,
-            chat_template=vllm_model_config["chat_template"],
-            return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"]
-        )
-        self.openai_serving_completion = OpenAIServingCompletion(
-            self.async_engine_client,
-            model_config,
-            served_model_names=[vllm_engine_config["served_model_name"]],
-            lora_modules=vllm_model_config["lora_modules"],
-            prompt_adapters=vllm_model_config["prompt_adapters"],
-            request_logger=request_logger,
-            return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"]
-        )
-        self.openai_serving_embedding = OpenAIServingEmbedding(
-            self.async_engine_client,
-            model_config,
-            served_model_names=[vllm_engine_config["served_model_name"]],
-            request_logger=request_logger
-        )
-        self.openai_serving_tokenization = OpenAIServingTokenization(
-            self.async_engine_client,
-            model_config,
-            served_model_names=[vllm_engine_config["served_model_name"]],
-            lora_modules=vllm_model_config["lora_modules"],
-            request_logger=request_logger,
-            chat_template=vllm_model_config["chat_template"]
-        )
         # override `send_request` method with the async version
         self._preprocess.__class__.send_request = VllmPreprocessRequest._preprocess_send_request
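The surviving context lines show the Prometheus bootstrap that stays in `VllmPreprocessRequest` after the removal: start the metrics HTTP server only if port 8000 is free. A standalone sketch of that guard (module-level here purely for illustration):

```python
import socket

import prometheus_client


def is_port_in_use(port: int) -> bool:
    # connect_ex returns 0 when something is already listening on the port
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        return s.connect_ex(("localhost", port)) == 0


# only expose the Prometheus metrics endpoint once per process
if not is_port_in_use(8000):
    prometheus_client.start_http_server(8000)
```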
@@ -818,7 +756,7 @@ class VllmPreprocessRequest(BasePreprocessRequest):
        )
        request = CompletionRequest(**body)
        logger.info(f"Received chat completion request: {request}")
-        generator = await self.openai_serving_completion.create_completion(
+        generator = await self._model["openai_serving_completion"].create_completion(
            request=request,
            raw_request=raw_request
        )
@@ -835,16 +773,13 @@ class VllmPreprocessRequest(BasePreprocessRequest):
        The actual processing function.
        We run the process in this context
        """
-        # if self._preprocess is not None and hasattr(self._preprocess, 'chat_completion'):
-        #     return await self._preprocess.chat_completion(data, state, collect_custom_statistics_fn)
-        # return None
        if REMOVE_WEB_ADDITIONAL_PROMPTS:
            if "messages" in body:
                body["messages"] = remove_extra_system_prompts(body["messages"])

        request = ChatCompletionRequest(**body)
        logger.info(f"Received chat completion request: {request}")
-        generator = await self.openai_serving_chat.create_chat_completion(
+        generator = await self._model["self.openai_serving_chat"].create_chat_completion(
            request=request, raw_request=None
        )
        if isinstance(generator, ErrorResponse):
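Taken together, these two hunks replace direct attributes (`self.openai_serving_completion`, `self.openai_serving_chat`) with lookups into `self._model`, the dict now produced by the example's `Preprocess.load()`. A minimal sketch of the resulting pattern, with key names copied from the diff (class and method names here are illustrative only):

```python
# Illustrative sketch only: request-time code pulls the vLLM serving objects out of
# the dict built by the user's Preprocess.load(), instead of constructing them itself.
class VllmRequestHandlerSketch:
    def __init__(self, model_dict: dict):
        # e.g. {"openai_serving_completion": ..., "self.openai_serving_chat": ...}
        self._model = model_dict

    async def completion(self, request, raw_request):
        return await self._model["openai_serving_completion"].create_completion(
            request=request, raw_request=raw_request
        )

    async def chat_completion(self, request):
        return await self._model["self.openai_serving_chat"].create_chat_completion(
            request=request, raw_request=None
        )
```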
examples/vllm/preprocess.py (new file, 74 lines)
@@ -0,0 +1,74 @@
"""Hugginface preprocessing module for ClearML Serving."""
|
||||
from typing import Any
|
||||
|
||||
|
||||
# Notice Preprocess class Must be named "Preprocess"
|
||||
class Preprocess:
|
||||
"""Processing class will be run by the ClearML inference services before and after each request."""
|
||||
|
||||
def __init__(self):
|
||||
"""Set internal state, this will be called only once. (i.e. not per request)."""
|
||||
self.model_endpoint = None
|
||||
|
||||
def load(self, local_file_name: str) -> Optional[Any]: # noqa
|
||||
vllm_engine_config = {
|
||||
"model":f"{local_file_name}/model",
|
||||
"tokenizer":f"{local_file_name}/tokenizer",
|
||||
"disable_log_requests": True,
|
||||
"disable_log_stats": False,
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"quantization": None,
|
||||
"enforce_eager": True,
|
||||
"served_model_name": "ai_operator_hyp22v4"
|
||||
}
|
||||
vllm_model_config = {
|
||||
"lora_modules": None, # [LoRAModulePath(name=a, path=b)]
|
||||
"prompt_adapters": None, # [PromptAdapterPath(name=a, path=b)]
|
||||
"response_role": "assistant",
|
||||
"chat_template": None,
|
||||
"return_tokens_as_token_ids": False,
|
||||
"max_log_len": None
|
||||
}
|
||||
self._model = {}
|
||||
self._model["engine_args"] = AsyncEngineArgs(**vllm_engine_config)
|
||||
self._model["async_engine_client"] = AsyncLLMEngine.from_engine_args(self.engine_args, usage_context=UsageContext.OPENAI_API_SERVER)
|
||||
|
||||
|
||||
self._model["model_config"] = self.async_engine_client.engine.get_model_config()
|
||||
|
||||
self._model["request_logger"] = RequestLogger(max_log_len=vllm_model_config["max_log_len"])
|
||||
|
||||
self._model["self.openai_serving_chat"] = OpenAIServingChat(
|
||||
self.async_engine_client,
|
||||
model_config,
|
||||
served_model_names=[vllm_engine_config["served_model_name"]],
|
||||
response_role=vllm_model_config["response_role"],
|
||||
lora_modules=vllm_model_config["lora_modules"],
|
||||
prompt_adapters=vllm_model_config["prompt_adapters"],
|
||||
request_logger=request_logger,
|
||||
chat_template=vllm_model_config["chat_template"],
|
||||
return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"]
|
||||
)
|
||||
self._model["openai_serving_completion"] = OpenAIServingCompletion(
|
||||
self.async_engine_client,
|
||||
model_config,
|
||||
served_model_names=[vllm_engine_config["served_model_name"]],
|
||||
lora_modules=vllm_model_config["lora_modules"],
|
||||
prompt_adapters=vllm_model_config["prompt_adapters"],
|
||||
request_logger=request_logger,
|
||||
return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"]
|
||||
)
|
||||
self._model["self.openai_serving_embedding"] = OpenAIServingEmbedding(
|
||||
self.async_engine_client,
|
||||
model_config,
|
||||
served_model_names=[vllm_engine_config["served_model_name"]],
|
||||
request_logger=request_logger
|
||||
)
|
||||
self._model["self.openai_serving_tokenization"] = OpenAIServingTokenization(
|
||||
self.async_engine_client,
|
||||
model_config,
|
||||
served_model_names=[vllm_engine_config["served_model_name"]],
|
||||
lora_modules=vllm_model_config["lora_modules"],
|
||||
request_logger=request_logger,
|
||||
chat_template=vllm_model_config["chat_template"]
|
||||
)
|
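The engine config above points vLLM at `{local_file_name}/model` and `{local_file_name}/tokenizer`, i.e. the registered ClearML model artifact is expected to be a folder with `model/` and `tokenizer/` subdirectories. A hedged sketch of preparing such a folder from a Hugging Face checkpoint (the model name is only an example, not part of the commit):

```python
# Hypothetical helper: export a Hugging Face checkpoint into the folder layout
# that examples/vllm/preprocess.py expects (<dir>/model and <dir>/tokenizer).
from transformers import AutoModelForCausalLM, AutoTokenizer


def export_for_serving(hf_model_name: str, target_dir: str) -> None:
    AutoModelForCausalLM.from_pretrained(hf_model_name).save_pretrained(f"{target_dir}/model")
    AutoTokenizer.from_pretrained(hf_model_name).save_pretrained(f"{target_dir}/tokenizer")


if __name__ == "__main__":
    # example model name; swap in the checkpoint you actually serve
    export_for_serving("facebook/opt-125m", "./vllm_model_folder")
```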
examples/vllm/readme.md (new file, 51 lines)
@@ -0,0 +1,51 @@
# Deploy vLLM model

## setting up the serving service

1. Create the serving service: `clearml-serving create --name "serving example"` (write down the service ID)
2. Make sure to add any required additional packages (for your custom model) to the [docker-compose.yml](https://github.com/allegroai/clearml-serving/blob/826f503cf4a9b069b89eb053696d218d1ce26f47/docker/docker-compose.yml#L97) (or as an environment variable of the `clearml-serving-inference` container), for example: `CLEARML_EXTRA_PYTHON_PACKAGES="vllm==0.5.4"`
3. Create the model endpoint:

   `clearml-serving --id <service_id> model add --engine vllm --endpoint "test_vllm" --preprocess "examples/vllm/preprocess.py" --name "test vllm" --project "serving examples"`

   Or auto-update:

   `clearml-serving --id <service_id> model auto-update --engine vllm --endpoint "test_vllm" --preprocess "examples/vllm/preprocess.py" --name "test vllm" --project "serving examples" --max-versions 2`

   Or add a canary endpoint:

   `clearml-serving --id <service_id> model canary --endpoint "test_vllm" --weights 0.1 0.9 --input-endpoint-prefix test_vllm`

4. If you already have the `clearml-serving` docker-compose running, it might take it a minute or two to sync with the new endpoint.

   Or you can run the clearml-serving container independently: `docker run -v ~/clearml.conf:/root/clearml.conf -p 8080:8080 -e CLEARML_SERVING_TASK_ID=<service_id> clearml-serving:latest`

5. Test the new endpoint (note that the first call triggers the model download, so it may take longer; from then on everything is served from memory):

```python
import openai

openai.api_key = "dummy"
openai.api_base = "http://serving.apps.okd.mts.ai/clearml/v1"

vllm_endpoint = "test_vllm"  # the endpoint name created in step 3

# the calls below use `await`, so run them from an async context (e.g. a Jupyter notebook)
r0 = await openai.ChatCompletion.acreate(
    model=vllm_endpoint,
    messages=[{"role": "system", "content": ""}, {"role": "user", "content": "Hi there, goodman!"}],
    temperature=1.0,
    max_tokens=1024,
    top_p=1.0,
    request_timeout=10000,
)

print(f"ChatCompletion: {r0['choices'][0]['message']}")

r1 = await openai.Completion.acreate(
    model=vllm_endpoint,
    prompt="Hi there, goodman!",
    temperature=1.0,
    max_tokens=256,
)

print(f"Completion: \n {r1['choices'][0]['text']}")
```
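The snippet above targets the pre-1.0 `openai` Python client (`api_base`, `ChatCompletion.acreate`). If your environment ships `openai>=1.0`, a roughly equivalent call would look like the sketch below; the `base_url` and endpoint name are simply the values assumed in the example above.

```python
# Rough equivalent using the openai>=1.0 client (a sketch, not part of the original example)
import asyncio

from openai import AsyncOpenAI

client = AsyncOpenAI(api_key="dummy", base_url="http://serving.apps.okd.mts.ai/clearml/v1")


async def main():
    r0 = await client.chat.completions.create(
        model="test_vllm",
        messages=[{"role": "system", "content": ""}, {"role": "user", "content": "Hi there, goodman!"}],
        temperature=1.0,
        max_tokens=1024,
    )
    print(f"ChatCompletion: {r0.choices[0].message.content}")

    r1 = await client.completions.create(
        model="test_vllm",
        prompt="Hi there, goodman!",
        temperature=1.0,
        max_tokens=256,
    )
    print(f"Completion:\n{r1.choices[0].text}")


asyncio.run(main())
```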