clearml-serving/clearml_serving/triton_helper.py

import re
import subprocess
from argparse import ArgumentParser
from time import time
from typing import Optional

from pathlib2 import Path

from clearml import Task, Logger
from clearml.backend_api.utils import get_http_session_with_retry
from clearml_serving.serving_service import ServingService


class TritonHelper(object):
    _metric_line_parsing = r"(\w+){(gpu_uuid=\"[\w\W]*\",)?model=\"(\w+)\",\s*version=\"(\d+)\"}\s*([0-9.]*)"
    _default_metrics_port = 8002
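    # A line from Triton's Prometheus /metrics endpoint typically looks like the example below
    # (values are illustrative only); the pattern above captures the metric name, an optional
    # gpu_uuid, the model name, the model version and the reported value:
    #   nv_inference_request_success{model="my_model",version="1"} 142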

    def __init__(
            self,
            args,  # type: Any
            task,  # type: Task
            serving_id,  # type: str
            metric_host=None,  # type: Optional[str]
            metric_port=None,  # type: Optional[int]
    ):
        # type: (...) -> None
        self._http_session = get_http_session_with_retry()
        self.args = dict(**args.__dict__) if args else {}
        self.task = task
        self.serving_id = serving_id
        self.metric_host = metric_host or '0.0.0.0'
        self.metric_port = metric_port or self._default_metrics_port
        self._parse_metric = re.compile(self._metric_line_parsing)
        self._timestamp = time()
        print('Starting Triton Helper service\n{}\n'.format(self.args))

    def report_metrics(self, remote_logger):
        # type: (Optional[Logger]) -> bool
        # iterations are seconds from the start of the helper service
        iteration = int(time() - self._timestamp)

        report_msg = "reporting metrics: relative time {} sec".format(iteration)
        self.task.get_logger().report_text(report_msg)
        if remote_logger:
            remote_logger.report_text(report_msg)

        # query the Triton server for its Prometheus-format metrics
        # noinspection PyBroadException
        try:
            request = self._http_session.get('http://{}:{}/metrics'.format(
                self.metric_host, self.metric_port))
            if not request.ok:
                return False
            content = request.content.decode().split('\n')
        except Exception:
            return False

        for line in content:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            # noinspection PyBroadException
            try:
                metric, gpu_uuid, variant, version, value = self._parse_metric.match(line).groups()
                value = float(value)
            except Exception:
                continue
            self.task.get_logger().report_scalar(
                title=metric,
                series='{}.v{}'.format(variant, version),
                iteration=iteration,
                value=value
            )
            # on the remote logger we add our own Task ID (unique ID),
            # to support multiple servers reporting to the same service controller
            if remote_logger:
                remote_logger.report_scalar(
                    title=metric,
                    series='{}.v{}.{}'.format(variant, version, self.task.id),
                    iteration=iteration,
                    value=value
                )
        return True

    def maintenance_daemon(
            self,
            local_model_repo='/models',  # type: str
            update_frequency_sec=60.0,  # type: float
            metric_frequency_sec=60.0  # type: float
    ):
        # type: (...) -> None
        Path(local_model_repo).mkdir(parents=True, exist_ok=True)

        a_service = ServingService(task_id=self.serving_id)
        a_service.triton_model_service_update_step(model_repository_folder=local_model_repo)

        # noinspection PyProtectedMember
        remote_logger = a_service._task.get_logger()

        # todo: log triton server outputs when running locally
        # we assume we can run the triton server
        cmd = [
            'tritonserver',
            '--model-control-mode=poll',
            '--model-repository={}'.format(local_model_repo),
            '--repository-poll-secs={}'.format(update_frequency_sec),
            '--metrics-port={}'.format(self._default_metrics_port),
            '--allow-metrics=true',
            '--allow-gpu-metrics=true',
        ]
        # forward any of the optional "--t-*" command line overrides to the triton server.
        # argparse stores "--t-http-port" as "t_http_port", so drop the "t_" prefix and turn
        # underscores back into dashes, e.g. "--http-port=8000"
        for k, v in self.args.items():
            if not v or not str(k).startswith('t_'):
                continue
            cmd.append('--{}={}'.format(k[2:].replace('_', '-'), v))
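        # Example of an assembled command (paths and values illustrative only):
        #   ['tritonserver', '--model-control-mode=poll', '--model-repository=/models',
        #    '--repository-poll-secs=600.0', '--metrics-port=8002', '--allow-metrics=true',
        #    '--allow-gpu-metrics=true', '--http-port=8000']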

        print('Starting server: {}'.format(cmd))
        proc = subprocess.Popen(cmd)

        base_freq = min(update_frequency_sec, metric_frequency_sec)
        metric_tic = update_tic = time()
        while True:
            try:
                error_code = proc.wait(timeout=base_freq)
                if error_code == 0:
                    print("triton-server process ended with error code {}".format(error_code))
                    return
                raise ValueError("triton-server process ended with error code {}".format(error_code))
            except subprocess.TimeoutExpired:
                pass

            # update models
            if time() - update_tic > update_frequency_sec:
                a_service.triton_model_service_update_step(model_repository_folder=local_model_repo)
                update_tic = time()

            # update stats
            if time() - metric_tic > metric_frequency_sec:
                metric_tic = time()
                self.report_metrics(remote_logger)


def main():
    title = 'clearml-serving - Nvidia Triton Engine Helper'
    print(title)

    parser = ArgumentParser(prog='clearml-serving', description=title)
    parser.add_argument(
        '--serving-id', default=None, type=str, required=True,
        help='Specify main serving service Task ID')
    parser.add_argument(
        '--project', default='serving', type=str,
        help='Optional: specify project for the serving engine Task')
    parser.add_argument(
        '--name', default='nvidia-triton', type=str,
        help='Optional: specify task name for the serving engine Task')
    parser.add_argument(
        '--update-frequency', default=10, type=float,
        help='Model update frequency in minutes')
    parser.add_argument(
        '--metric-frequency', default=1, type=float,
        help='Metric reporting update frequency in minutes')
    parser.add_argument(
        '--t-http-port', type=str, help='<integer> The port for the server to listen on for HTTP requests')
    parser.add_argument(
        '--t-http-thread-count', type=str, help='<integer> Number of threads handling HTTP requests')
    parser.add_argument(
        '--t-allow-grpc', type=str, help='<integer> Allow the server to listen for GRPC requests')
    parser.add_argument(
        '--t-grpc-port', type=str, help='<integer> The port for the server to listen on for GRPC requests')
    parser.add_argument(
        '--t-grpc-infer-allocation-pool-size', type=str,
        help='<integer> The maximum number of inference request/response objects that remain '
             'allocated for reuse. As long as the number of in-flight requests doesn\'t exceed '
             'this value there will be no allocation/deallocation of request/response objects')
    parser.add_argument(
        '--t-pinned-memory-pool-byte-size', type=str,
        help='<integer> The total byte size that can be allocated as pinned system '
             'memory. If GPU support is enabled, the server will allocate pinned '
             'system memory to accelerate data transfer between host and devices '
             'until it exceeds the specified byte size. This option will not affect '
             'the allocation conducted by the backend frameworks. Default is 256 MB')
    parser.add_argument(
        '--t-cuda-memory-pool-byte-size', type=str,
        help='<<integer>:<integer>> The total byte size that can be allocated as CUDA memory for '
             'the GPU device. If GPU support is enabled, the server will allocate '
             'CUDA memory to minimize data transfer between host and devices '
             'until it exceeds the specified byte size. This option will not affect '
             'the allocation conducted by the backend frameworks. The argument '
             'should be 2 integers separated by colons in the format <GPU device '
             'ID>:<pool byte size>. This option can be used multiple times, but only '
             'once per GPU device. Subsequent uses will overwrite previous uses for '
             'the same GPU device. Default is 64 MB')
    parser.add_argument(
        '--t-min-supported-compute-capability', type=str,
        help='<float> The minimum supported CUDA compute capability. GPUs that '
             'don\'t support this compute capability will not be used by the server')
    parser.add_argument(
        '--t-buffer-manager-thread-count', type=str,
        help='<integer> The number of threads used to accelerate copies and other '
             'operations required to manage input and output tensor contents. '
             'Default is 0')
    args = parser.parse_args()

    task = Task.init(project_name=args.project, task_name=args.name, task_type=Task.TaskTypes.inference)
    helper = TritonHelper(args, task, serving_id=args.serving_id)

    # this function will never end
    helper.maintenance_daemon(
        local_model_repo='/models',
        update_frequency_sec=args.update_frequency * 60.0,
        metric_frequency_sec=args.metric_frequency * 60.0,
    )


if __name__ == '__main__':
    main()
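# Example invocation (the serving Task ID below is a placeholder for an existing serving service Task):
#   python triton_helper.py --serving-id <serving_task_id> \
#       --project serving --name nvidia-triton --update-frequency 10 --metric-frequency 1 --t-http-port 8000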