From 8ecb51f1db10175fd03aea0daf64d012b2d13ac9 Mon Sep 17 00:00:00 2001
From: IlyaMescheryakov1402
Date: Wed, 12 Mar 2025 01:09:50 +0300
Subject: [PATCH] add models endpoint

---
 clearml_serving/serving/main.py               |  2 +-
 .../serving/model_request_processor.py        | 17 -----------------
 clearml_serving/serving/preprocess_service.py |  8 ++++++++
 examples/vllm/readme.md                       | 10 +++++++++-
 examples/vllm/test_openai_api.py              | 12 +++++++-----
 5 files changed, 25 insertions(+), 24 deletions(-)

diff --git a/clearml_serving/serving/main.py b/clearml_serving/serving/main.py
index 4683838..b2540fe 100644
--- a/clearml_serving/serving/main.py
+++ b/clearml_serving/serving/main.py
@@ -206,7 +206,7 @@ async def validate_json_request(raw_request: Request):
         )
 
 @router.post("/openai/v1/{endpoint_type:path}", dependencies=[Depends(validate_json_request)])
-@router.post("/openai/v1/{endpoint_type:path}/", dependencies=[Depends(validate_json_request)])
+@router.get("/openai/v1/{endpoint_type:path}", dependencies=[Depends(validate_json_request)])
 async def openai_serve_model(
     endpoint_type: str,
     request: Union[CompletionRequest, ChatCompletionRequest],
diff --git a/clearml_serving/serving/model_request_processor.py b/clearml_serving/serving/model_request_processor.py
index 0cf9084..11b6fd4 100644
--- a/clearml_serving/serving/model_request_processor.py
+++ b/clearml_serving/serving/model_request_processor.py
@@ -1230,23 +1230,6 @@ class ModelRequestProcessor(object):
             processed = await processed_func(preprocessed, state, stats_collect_fn) \
                 if processor.is_process_async \
                 else processed_func(preprocessed, state, stats_collect_fn)
-            # if serve_type == "process":
-            #     # noinspection PyUnresolvedReferences
-            #     processed = await processor.process(preprocessed, state, stats_collect_fn) \
-            #         if processor.is_process_async \
-            #         else processor.process(preprocessed, state, stats_collect_fn)
-            # elif serve_type == "completions":
-            #     # noinspection PyUnresolvedReferences
-            #     processed = await processor.completions(preprocessed, state, stats_collect_fn) \
-            #         if processor.is_process_async \
-            #         else processor.completions(preprocessed, state, stats_collect_fn)
-            # elif serve_type == "chat/completions":
-            #     # noinspection PyUnresolvedReferences
-            #     processed = await processor.chat_completions(preprocessed, state, stats_collect_fn) \
-            #         if processor.is_process_async \
-            #         else processor.chat_completions(preprocessed, state, stats_collect_fn)
-            # else:
-            #     raise ValueError(f"wrong url_type: expected 'process', 'completions' or 'chat/completions', got {serve_type}")
             # noinspection PyUnresolvedReferences
             return_value = await processor.postprocess(processed, state, stats_collect_fn) \
                 if processor.is_postprocess_async \
diff --git a/clearml_serving/serving/preprocess_service.py b/clearml_serving/serving/preprocess_service.py
index b236833..c0271d7 100644
--- a/clearml_serving/serving/preprocess_service.py
+++ b/clearml_serving/serving/preprocess_service.py
@@ -763,6 +763,14 @@ class VllmEngine(Singleton):
         )
         self.logger.info("Model {} was added to vllm engine".format(name))
         return None
+
+    def remove_model(self, name: str) -> None:
+        self.openai_serving_models.base_model_paths = [
+            model for model in self.openai_serving_models.base_model_paths
+            if model.name != name
+        ]
+        self.logger.info("Model {} was removed from vllm engine".format(name))
+        return None
 
     async def completions(
         self,
diff --git a/examples/vllm/readme.md b/examples/vllm/readme.md
index 33668cf..f85c645 100644
--- a/examples/vllm/readme.md
+++ b/examples/vllm/readme.md
@@ -32,9 +32,17 @@
 5. Test new endpoint (do notice the first call will trigger the model pulling, so it might take longer, from here on, it's all in memory):
 
     ```bash
-    python examples/vllm/test_openai_app.py
+    python examples/vllm/test_openai_api.py
     ```
 
+    **Available routes**:
+
+    + /v1/completions
+    + /v1/chat/completions
+    + /v1/models
+
+    see [test_openai_api.py](test_openai_api.py) for more information.
+
 NOTE! If you want to use send_request method, keep in mind that you have to pass "completions" or "chat/completions" in entrypoint (and pass model as a part of "data" parameter) and use it for non-streaming models:
diff --git a/examples/vllm/test_openai_api.py b/examples/vllm/test_openai_api.py
index 55d7b27..a6b908b 100644
--- a/examples/vllm/test_openai_api.py
+++ b/examples/vllm/test_openai_api.py
@@ -1,6 +1,6 @@
 from openai import OpenAI
 
-def main(model_name: str = "test_vllm"):
+def main(model_name: str):
     client = OpenAI(api_key="-")
     client.base_url = "http://127.0.0.1:8080/serve/openai/v1"
 
@@ -14,8 +14,7 @@ def main(model_name: str = "test_vllm"):
         max_tokens=1024,
         top_p=1.0
     )
-
-    print(f"ChatCompletion: {chat_response.choices[0].message.content}")
+    print(f"ChatCompletion: \n\n {chat_response.choices[0].message.content}")
 
     comp_response = client.completions.create(
         model=model_name,
@@ -23,10 +22,13 @@ def main(model_name: str = "test_vllm"):
         temperature=1.0,
         max_tokens=256
     )
+    print(f"\n\n Completion: \n\n {comp_response.choices[0].text}")
 
-    print(f"Completion: \n {comp_response.choices[0].text}")
+    fake_body = {"stream": False, "model": model_name, "prompt": "test"}
+    print(f"Models:\n")
+    print('\n\n'.join(map(str, client.models.list(extra_body=fake_body).data)))
     return None
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main(model_name="test_vllm")
\ No newline at end of file
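
Usage note (illustrative, not part of the patch): a minimal sketch of exercising the new GET /openai/v1/models route, assuming the vllm example endpoint is deployed as "test_vllm" and the serving container listens on http://127.0.0.1:8080/serve as in examples/vllm/readme.md. It mirrors the fake_body workaround from test_openai_api.py above, since the GET route reuses openai_serve_model, which validates a completion-style JSON body:

    from openai import OpenAI

    # Same client setup as examples/vllm/test_openai_api.py
    client = OpenAI(api_key="-")
    client.base_url = "http://127.0.0.1:8080/serve/openai/v1"

    # extra_body mirrors the test script: the shared handler expects
    # completion-style JSON fields even when listing models
    fake_body = {"stream": False, "model": "test_vllm", "prompt": "test"}
    for model in client.models.list(extra_body=fake_body).data:
        print(model)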