Mirror of https://github.com/clearml/clearml-serving (synced 2025-06-26 18:16:00 +00:00)
add models endpoint
This commit is contained in:
parent 25e2940596
commit 8ecb51f1db
@@ -206,7 +206,7 @@ async def validate_json_request(raw_request: Request):
     )

 @router.post("/openai/v1/{endpoint_type:path}", dependencies=[Depends(validate_json_request)])
-@router.post("/openai/v1/{endpoint_type:path}/", dependencies=[Depends(validate_json_request)])
+@router.get("/openai/v1/{endpoint_type:path}", dependencies=[Depends(validate_json_request)])
 async def openai_serve_model(
     endpoint_type: str,
     request: Union[CompletionRequest, ChatCompletionRequest],
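For reference, a minimal sketch of exercising the new GET route (not part of the commit): the host/port come from the example test script in this change, and the JSON body mirrors the `fake_body` that script sends, since the route still depends on validate_json_request.

```python
import requests

# Hedged sketch: list the served models through the new GET route.
# Address taken from examples/vllm/test_openai_api.py; the body fields mirror
# that script's "fake_body", as the route validates a JSON request.
resp = requests.get(
    "http://127.0.0.1:8080/serve/openai/v1/models",
    json={"stream": False, "model": "test_vllm", "prompt": "test"},
)
print(resp.status_code)
print(resp.json())
```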
@@ -1230,23 +1230,6 @@ class ModelRequestProcessor(object):
         processed = await processed_func(preprocessed, state, stats_collect_fn) \
             if processor.is_process_async \
             else processed_func(preprocessed, state, stats_collect_fn)
-        # if serve_type == "process":
-        #     # noinspection PyUnresolvedReferences
-        #     processed = await processor.process(preprocessed, state, stats_collect_fn) \
-        #         if processor.is_process_async \
-        #         else processor.process(preprocessed, state, stats_collect_fn)
-        # elif serve_type == "completions":
-        #     # noinspection PyUnresolvedReferences
-        #     processed = await processor.completions(preprocessed, state, stats_collect_fn) \
-        #         if processor.is_process_async \
-        #         else processor.completions(preprocessed, state, stats_collect_fn)
-        # elif serve_type == "chat/completions":
-        #     # noinspection PyUnresolvedReferences
-        #     processed = await processor.chat_completions(preprocessed, state, stats_collect_fn) \
-        #         if processor.is_process_async \
-        #         else processor.chat_completions(preprocessed, state, stats_collect_fn)
-        # else:
-        #     raise ValueError(f"wrong url_type: expected 'process', 'completions' or 'chat/completions', got {serve_type}")
         # noinspection PyUnresolvedReferences
         return_value = await processor.postprocess(processed, state, stats_collect_fn) \
             if processor.is_postprocess_async \
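The deleted comment block shows the dispatch this unified call replaces. A sketch of how `processed_func` could be resolved from `serve_type` before the call above (an assumption for illustration, not code from this commit):

```python
# Hedged sketch: pick the processor method once from serve_type, then call it
# uniformly (awaited or not, as the surrounding code already does).
_method_by_serve_type = {
    "process": "process",
    "completions": "completions",
    "chat/completions": "chat_completions",
}
if serve_type not in _method_by_serve_type:
    raise ValueError(
        f"wrong url_type: expected 'process', 'completions' or 'chat/completions', got {serve_type}"
    )
# noinspection PyUnresolvedReferences
processed_func = getattr(processor, _method_by_serve_type[serve_type])
```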
@@ -763,6 +763,14 @@ class VllmEngine(Singleton):
         )
         self.logger.info("Model {} was added to vllm engine".format(name))
         return None

+    def remove_model(self, name: str) -> None:
+        self.openai_serving_models.base_model_paths = [
+            model for model in self.openai_serving_models.base_model_paths
+            if model.name != name
+        ]
+        self.logger.info("Model {} was removed from vllm engine".format(name))
+        return None
+
     async def completions(
         self,
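A brief usage sketch of the new method (the `engine` name is assumed; only `remove_model` itself comes from this commit):

```python
# Hedged sketch: deregister a served model so it no longer appears in the
# OpenAI-style model list. Internally this only filters it out of
# openai_serving_models.base_model_paths.
engine.remove_model("test_vllm")
```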
@@ -32,9 +32,17 @@
 5. Test new endpoint (do notice the first call will trigger the model pulling, so it might take longer, from here on, it's all in memory):

 ```bash
-python examples/vllm/test_openai_app.py
+python examples/vllm/test_openai_api.py
 ```

+**Available routes**:
+
++ /v1/completions
++ /v1/chat/completions
++ /v1/models
+
+see [test_openai_api.py](test_openai_api.py) for more information.
+
 NOTE!

 If you want to use send_request method, keep in mind that you have to pass "completions" or "chat/completions" in entrypoint (and pass model as a part of "data" parameter) and use it for non-streaming models:
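For a non-streaming call without the OpenAI client, a minimal sketch of such a request (the address and model name are taken from the example in this commit; the exact payload your deployment expects may differ):

```python
import requests

# Hedged sketch: POST to the chat/completions entrypoint, passing the model name
# as part of the request data, as the note above describes.
response = requests.post(
    "http://127.0.0.1:8080/serve/openai/v1/chat/completions",
    json={
        "model": "test_vllm",
        "messages": [{"role": "user", "content": "Hello!"}],
        "stream": False,
    },
)
print(response.json())
```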
@@ -1,6 +1,6 @@
 from openai import OpenAI

-def main(model_name: str = "test_vllm"):
+def main(model_name: str):
     client = OpenAI(api_key="-")
     client.base_url = "http://127.0.0.1:8080/serve/openai/v1"

@@ -14,8 +14,7 @@ def main(model_name: str = "test_vllm"):
         max_tokens=1024,
         top_p=1.0
     )
-
-    print(f"ChatCompletion: {chat_response.choices[0].message.content}")
+    print(f"ChatCompletion: \n\n {chat_response.choices[0].message.content}")

     comp_response = client.completions.create(
         model=model_name,
@@ -23,10 +22,13 @@ def main(model_name: str = "test_vllm"):
         temperature=1.0,
         max_tokens=256
     )
+    print(f"\n\n Completion: \n\n {comp_response.choices[0].text}")

-    print(f"Completion: \n {comp_response.choices[0].text}")
+    fake_body = {"stream": False, "model": model_name, "prompt": "test"}
+    print(f"Models:\n")
+    print('\n\n'.join(map(str, client.models.list(extra_body=fake_body).data)))

     return None

 if __name__ == '__main__':
-    main()
+    main(model_name="test_vllm")