diff --git a/clearml_serving/serving/model_request_processor.py b/clearml_serving/serving/model_request_processor.py
index 0f6bfa8..c952ddc 100644
--- a/clearml_serving/serving/model_request_processor.py
+++ b/clearml_serving/serving/model_request_processor.py
@@ -1241,7 +1241,7 @@ class ModelRequestProcessor(object):
                     if processor.is_process_async \
                     else processor.chat_completion(preprocessed, state, stats_collect_fn)
             else:
-                raise ValueError(f"wrong url_type: expected 'completions' and 'chat/completions', got {serve_type}")
+                raise ValueError(f"wrong url_type: expected 'process', 'completions' or 'chat/completions', got {serve_type}")
             # noinspection PyUnresolvedReferences
             return_value = await processor.postprocess(processed, state, stats_collect_fn) \
                 if processor.is_postprocess_async \
diff --git a/clearml_serving/serving/preprocess_service.py b/clearml_serving/serving/preprocess_service.py
index d29f5f8..b9b6cda 100644
--- a/clearml_serving/serving/preprocess_service.py
+++ b/clearml_serving/serving/preprocess_service.py
@@ -675,68 +675,6 @@ class VllmPreprocessRequest(BasePreprocessRequest):
                 return s.connect_ex(('localhost', port)) == 0
         if not is_port_in_use(8000):
             prometheus_client.start_http_server(8000)
-
-        vllm_engine_config = {
-            "model":f"{local_file_name}/model",
-            "tokenizer":f"{local_file_name}/tokenizer",
-            "disable_log_requests": True,
-            "disable_log_stats": False,
-            "gpu_memory_utilization": 0.9,
-            "quantization": None,
-            "enforce_eager": True,
-            "served_model_name": "ai_operator_hyp22v4"
-        }
-        vllm_model_config = {
-            "lora_modules": None, # [LoRAModulePath(name=a, path=b)]
-            "prompt_adapters": None, # [PromptAdapterPath(name=a, path=b)]
-            "response_role": "assistant",
-            "chat_template": None,
-            "return_tokens_as_token_ids": False,
-            "max_log_len": None
-        }
-
-        self.engine_args = AsyncEngineArgs(**vllm_engine_config)
-        self.async_engine_client = AsyncLLMEngine.from_engine_args(self.engine_args, usage_context=UsageContext.OPENAI_API_SERVER)
-
-
-        model_config = self.async_engine_client.engine.get_model_config()
-
-        request_logger = RequestLogger(max_log_len=vllm_model_config["max_log_len"])
-
-        self.openai_serving_chat = OpenAIServingChat(
-            self.async_engine_client,
-            model_config,
-            served_model_names=[vllm_engine_config["served_model_name"]],
-            response_role=vllm_model_config["response_role"],
-            lora_modules=vllm_model_config["lora_modules"],
-            prompt_adapters=vllm_model_config["prompt_adapters"],
-            request_logger=request_logger,
-            chat_template=vllm_model_config["chat_template"],
-            return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"]
-        )
-        self.openai_serving_completion = OpenAIServingCompletion(
-            self.async_engine_client,
-            model_config,
-            served_model_names=[vllm_engine_config["served_model_name"]],
-            lora_modules=vllm_model_config["lora_modules"],
-            prompt_adapters=vllm_model_config["prompt_adapters"],
-            request_logger=request_logger,
-            return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"]
-        )
-        self.openai_serving_embedding = OpenAIServingEmbedding(
-            self.async_engine_client,
-            model_config,
-            served_model_names=[vllm_engine_config["served_model_name"]],
-            request_logger=request_logger
-        )
-        self.openai_serving_tokenization = OpenAIServingTokenization(
-            self.async_engine_client,
-            model_config,
-            served_model_names=[vllm_engine_config["served_model_name"]],
-            lora_modules=vllm_model_config["lora_modules"],
-            request_logger=request_logger,
-            chat_template=vllm_model_config["chat_template"]
-        )
        # override `send_request` method with the async version
        self._preprocess.__class__.send_request = VllmPreprocessRequest._preprocess_send_request
@@ -818,7 +756,7 @@ class VllmPreprocessRequest(BasePreprocessRequest):
         )
         request = CompletionRequest(**body)
         logger.info(f"Received chat completion request: {request}")
-        generator = await self.openai_serving_completion.create_completion(
+        generator = await self._model["openai_serving_completion"].create_completion(
             request=request, raw_request=raw_request
         )
@@ -835,16 +773,13 @@ class VllmPreprocessRequest(BasePreprocessRequest):
         The actual processing function.
         We run the process in this context
         """
-        # if self._preprocess is not None and hasattr(self._preprocess, 'chat_completion'):
-        #     return await self._preprocess.chat_completion(data, state, collect_custom_statistics_fn)
-        # return None
         if REMOVE_WEB_ADDITIONAL_PROMPTS:
             if "messages" in body:
                 body["messages"] = remove_extra_system_prompts(body["messages"])
         request = ChatCompletionRequest(**body)
         logger.info(f"Received chat completion request: {request}")
-        generator = await self.openai_serving_chat.create_chat_completion(
+        generator = await self._model["self.openai_serving_chat"].create_chat_completion(
             request=request, raw_request=None
         )
         if isinstance(generator, ErrorResponse):
diff --git a/examples/vllm/preprocess.py b/examples/vllm/preprocess.py
new file mode 100644
index 0000000..f54b390
--- /dev/null
+++ b/examples/vllm/preprocess.py
@@ -0,0 +1,74 @@
+"""Huggingface preprocessing module for ClearML Serving."""
+from typing import Any, Optional
+
+# NOTE: vllm import paths below assume vllm==0.5.4 (the version pinned in the example readme)
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.entrypoints.logger import RequestLogger
+from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
+from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
+from vllm.entrypoints.openai.serving_tokenization import OpenAIServingTokenization
+from vllm.usage.usage_lib import UsageContext
+
+
+# Notice Preprocess class Must be named "Preprocess"
+class Preprocess:
+    """Processing class will be run by the ClearML inference services before and after each request."""
+
+    def __init__(self):
+        """Set internal state, this will be called only once. (i.e. not per request)."""
+        self.model_endpoint = None
+
+    def load(self, local_file_name: str) -> Optional[Any]:  # noqa
+        vllm_engine_config = {
+            "model": f"{local_file_name}/model",
+            "tokenizer": f"{local_file_name}/tokenizer",
+            "disable_log_requests": True,
+            "disable_log_stats": False,
+            "gpu_memory_utilization": 0.9,
+            "quantization": None,
+            "enforce_eager": True,
+            "served_model_name": "ai_operator_hyp22v4"
+        }
+        vllm_model_config = {
+            "lora_modules": None,  # [LoRAModulePath(name=a, path=b)]
+            "prompt_adapters": None,  # [PromptAdapterPath(name=a, path=b)]
+            "response_role": "assistant",
+            "chat_template": None,
+            "return_tokens_as_token_ids": False,
+            "max_log_len": None
+        }
+        # the keys below are looked up by VllmPreprocessRequest.completion / .chat_completion
+        # in clearml_serving/serving/preprocess_service.py
+        self._model = {}
+        self._model["engine_args"] = AsyncEngineArgs(**vllm_engine_config)
+        self._model["async_engine_client"] = AsyncLLMEngine.from_engine_args(self._model["engine_args"], usage_context=UsageContext.OPENAI_API_SERVER)
+
+        self._model["model_config"] = self._model["async_engine_client"].engine.get_model_config()
+
+        self._model["request_logger"] = RequestLogger(max_log_len=vllm_model_config["max_log_len"])
+
+        self._model["self.openai_serving_chat"] = OpenAIServingChat(
+            self._model["async_engine_client"],
+            self._model["model_config"],
+            served_model_names=[vllm_engine_config["served_model_name"]],
+            response_role=vllm_model_config["response_role"],
+            lora_modules=vllm_model_config["lora_modules"],
+            prompt_adapters=vllm_model_config["prompt_adapters"],
+            request_logger=self._model["request_logger"],
+            chat_template=vllm_model_config["chat_template"],
+            return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"]
+        )
+        self._model["openai_serving_completion"] = OpenAIServingCompletion(
+            self._model["async_engine_client"],
+            self._model["model_config"],
+            served_model_names=[vllm_engine_config["served_model_name"]],
+            lora_modules=vllm_model_config["lora_modules"],
+            prompt_adapters=vllm_model_config["prompt_adapters"],
+            request_logger=self._model["request_logger"],
+            return_tokens_as_token_ids=vllm_model_config["return_tokens_as_token_ids"]
+        )
+        self._model["self.openai_serving_embedding"] = OpenAIServingEmbedding(
+            self._model["async_engine_client"],
+            self._model["model_config"],
+            served_model_names=[vllm_engine_config["served_model_name"]],
+            request_logger=self._model["request_logger"]
+        )
+        self._model["self.openai_serving_tokenization"] = OpenAIServingTokenization(
+            self._model["async_engine_client"],
+            self._model["model_config"],
+            served_model_names=[vllm_engine_config["served_model_name"]],
+            lora_modules=vllm_model_config["lora_modules"],
+            request_logger=self._model["request_logger"],
+            chat_template=vllm_model_config["chat_template"]
+        )
+        # return the assembled objects so the serving service can store them as this endpoint's model
+        return self._model
diff --git a/examples/vllm/readme.md b/examples/vllm/readme.md
new file mode 100644
index 0000000..fd03eb9
--- /dev/null
+++ b/examples/vllm/readme.md
@@ -0,0 +1,51 @@
+# Deploy vLLM model
+
+## setting up the serving service
+
+1. Create serving Service: `clearml-serving create --name "serving example"` (write down the service ID)
+2. Make sure to add any required additional packages (for your custom model) to the [docker-compose.yml](https://github.com/allegroai/clearml-serving/blob/826f503cf4a9b069b89eb053696d218d1ce26f47/docker/docker-compose.yml#L97) (or as an environment variable to the `clearml-serving-inference` container), for example: `CLEARML_EXTRA_PYTHON_PACKAGES="vllm==0.5.4"`
+3. Create model endpoint:
+
+`clearml-serving --id <service_id> model add --engine vllm --endpoint "test_vllm" --preprocess "examples/vllm/preprocess.py" --name "test vllm" --project "serving examples"`
+
+Or auto update
+
+`clearml-serving --id <service_id> model auto-update --engine vllm --endpoint "test_vllm" --preprocess "examples/vllm/preprocess.py" --name "test vllm" --project "serving examples" --max-versions 2`
+
+Or add Canary endpoint
+
+`clearml-serving --id <service_id> model canary --endpoint "test_vllm" --weights 0.1 0.9 --input-endpoint-prefix test_vllm`
+
+4. If you already have the `clearml-serving` docker-compose running, it might take a minute or two for it to sync with the new endpoint.
+
+Or you can run the clearml-serving container independently: `docker run -v ~/clearml.conf:/root/clearml.conf -p 8080:8080 -e CLEARML_SERVING_TASK_ID=<service_id> clearml-serving:latest`
+
+5. Test the new endpoint (notice that the first call triggers the model pulling, so it might take longer; from here on, everything is served from memory). The snippet below uses the legacy `openai<1.0` client and `await`, so run it from an async context (e.g. a notebook):
+
+```python
+import openai
+
+openai.api_key = "dummy"
+openai.api_base = "http://serving.apps.okd.mts.ai/clearml/v1"
+
+vllm_endpoint = "test_vllm"  # the endpoint name registered above
+
+r0 = await openai.ChatCompletion.acreate(
+    model=vllm_endpoint,
+    messages=[{"role": "system", "content": ""}, {"role": "user", "content": "Hi there, goodman!"}],
+    temperature=1.0,
+    max_tokens=1024,
+    top_p=1.0,
+    request_timeout=10000,
+)
+
+print(f"ChatCompletion: {r0['choices'][0]['message']}")
+
+r1 = await openai.Completion.acreate(
+    model=vllm_endpoint,
+    prompt="Hi there, goodman!",
+    temperature=1.0,
+    max_tokens=256,
+)
+
+print(f"Completion: \n {r1['choices'][0]['text']}")
+```
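+
+If you have `openai>=1.0` installed, the legacy `ChatCompletion`/`Completion` calls above are not available; a rough equivalent with the 1.x client (a sketch only, reusing the same base URL and the `test_vllm` endpoint name from the steps above) would be:
+
+```python
+from openai import OpenAI
+
+# point the client at the same OpenAI-compatible base URL used above
+client = OpenAI(base_url="http://serving.apps.okd.mts.ai/clearml/v1", api_key="dummy")
+
+chat = client.chat.completions.create(
+    model="test_vllm",  # the endpoint name registered with `clearml-serving model add`
+    messages=[{"role": "system", "content": ""}, {"role": "user", "content": "Hi there, goodman!"}],
+    temperature=1.0,
+    max_tokens=1024,
+)
+print(f"ChatCompletion: {chat.choices[0].message.content}")
+
+completion = client.completions.create(
+    model="test_vllm",
+    prompt="Hi there, goodman!",
+    temperature=1.0,
+    max_tokens=256,
+)
+print(f"Completion:\n{completion.choices[0].text}")
+```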