Track token usage for LLM streaming responses

Phil Szalay 2025-03-24 12:25:09 +01:00
parent c5446db233
commit c56dbe19cf


@@ -690,6 +690,10 @@ async def generate_chat_completion(
     if "max_tokens" in payload and "max_completion_tokens" in payload:
         del payload["max_tokens"]
+    # Add stream_options to include usage information in streaming responses
+    if "stream" in payload and payload["stream"]:
+        payload["stream_options"] = {"include_usage": True}
     # Convert the modified body back to JSON
     payload = json.dumps(payload)
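
With stream_options set to {"include_usage": True}, an OpenAI-compatible API appends a final streamed chunk whose choices list is empty and whose usage field reports the prompt, completion, and total token counts; ordinary streamed chunks carry usage as None. A minimal client-side sketch of reading that final chunk, assuming the official openai Python SDK (the model name "gpt-4o-mini" is a placeholder, not part of this commit):

from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

stream = client.chat.completions.create(
    model="gpt-4o-mini",  # placeholder model name
    messages=[{"role": "user", "content": "Hello"}],
    stream=True,
    stream_options={"include_usage": True},
)

for chunk in stream:
    # Normal chunks carry content deltas; usage is None until the end.
    if chunk.choices:
        print(chunk.choices[0].delta.content or "", end="")
    # The final chunk has an empty choices list and the aggregate usage.
    if chunk.usage is not None:
        print(
            f"\nprompt={chunk.usage.prompt_tokens} "
            f"completion={chunk.usage.completion_tokens} "
            f"total={chunk.usage.total_tokens}"
        )

Without include_usage, a streaming completion never reports token counts, which is why the payload is patched above whenever "stream" is true.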