clearml-serving/clearml_serving/triton_helper.py
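"""Nvidia Triton Engine helper for clearml-serving.

Launches a local ``tritonserver`` process in model-repository poll mode, keeps the
repository in sync with the serving service controller Task, and periodically reports
the server's Prometheus metrics back to ClearML.
"""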

import re
import subprocess
from argparse import ArgumentParser
from time import time
from typing import Optional
from pathlib2 import Path
from clearml import Task, Logger
from clearml.backend_api.utils import get_http_session_with_retry
from clearml_serving.serving_service import ServingService


class TritonHelper(object):
    _metric_line_parsing = r"(\w+){(gpu_uuid=\"[\w\W]*\",)?model=\"(\w+)\",\s*version=\"(\d+)\"}\s*([0-9.]*)"
    _default_metrics_port = 8002
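    # Illustrative only (model names and values below are hypothetical): Triton exposes
    # Prometheus-style metric lines on its metrics port, which the regex above is meant
    # to match, e.g.
    #   nv_inference_request_success{model="my_model",version="1"} 42
    #   nv_inference_count{gpu_uuid="GPU-1234",model="my_model",version="1"} 7
    # Only lines carrying both model= and version= labels are parsed; everything else
    # is skipped in report_metrics().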

    def __init__(
            self,
            args,  # Any
            task,  # type: Task
            serving_id,  # type: str
            metric_host=None,  # type: Optional[str]
            metric_port=None,  # type: Optional[int]
    ):
        # type: (...) -> None
        self._http_session = get_http_session_with_retry()
        self.args = dict(**args.__dict__) if args else {}
        self.task = task
        self.serving_id = serving_id
        self.metric_host = metric_host or '0.0.0.0'
        self.metric_port = metric_port or self._default_metrics_port
        self._parse_metric = re.compile(self._metric_line_parsing)
        self._timestamp = time()
        print('Starting Triton Helper service\n{}\n'.format(self.args))

    def report_metrics(self, remote_logger):
        # type: (Optional[Logger]) -> bool
        # iterations are seconds from start
        iteration = int(time() - self._timestamp)
        report_msg = "reporting metrics: relative time {} sec".format(iteration)
        self.task.get_logger().report_text(report_msg)
        if remote_logger:
            remote_logger.report_text(report_msg)

        # noinspection PyBroadException
        try:
            request = self._http_session.get('http://{}:{}/metrics'.format(
                self.metric_host, self.metric_port))
            if not request.ok:
                return False
            content = request.content.decode().split('\n')
        except Exception:
            return False

        for line in content:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            # noinspection PyBroadException
            try:
                metric, gpu_uuid, variant, version, value = self._parse_metric.match(line).groups()
                value = float(value)
            except Exception:
                continue
            self.task.get_logger().report_scalar(
                title=metric,
                series='{}.v{}'.format(variant, version),
                iteration=iteration,
                value=value
            )
            # on the remote logger we add our own Task ID (unique ID),
            # to support multiple servers reporting to the same service controller
            if remote_logger:
                remote_logger.report_scalar(
                    title=metric,
                    series='{}.v{}.{}'.format(variant, version, self.task.id),
                    iteration=iteration,
                    value=value
                )

        # metrics were fetched and reported successfully
        return True

    def maintenance_daemon(
            self,
            local_model_repo='/models',  # type: str
            update_frequency_sec=60.0,  # type: float
            metric_frequency_sec=60.0  # type: float
    ):
        # type: (...) -> None
        Path(local_model_repo).mkdir(parents=True, exist_ok=True)

        a_service = ServingService(task_id=self.serving_id)
        a_service.triton_model_service_update_step(model_repository_folder=local_model_repo)

        # noinspection PyProtectedMember
        remote_logger = a_service._task.get_logger()

        # todo: log triton server outputs when running locally

        # we assume we can run the triton server
        cmd = [
            'tritonserver',
            '--model-control-mode=poll',
            '--model-repository={}'.format(local_model_repo),
            '--repository-poll-secs={}'.format(update_frequency_sec),
            '--metrics-port={}'.format(self._default_metrics_port),
            '--allow-metrics=true',
            '--allow-gpu-metrics=true',
        ]
        # pass through CLI arguments prefixed with `t_` as native tritonserver flags:
        # strip the `t_` prefix and convert underscores back to dashes
        for k, v in self.args.items():
            if not v or not str(k).startswith('t_'):
                continue
            cmd.append('--{}={}'.format(k[2:].replace('_', '-'), v))
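        # Illustrative only (hypothetical CLI values `--t-http-port 8000 --t-grpc-port 8001`,
        # default 10-minute update frequency): the assembled command would look roughly like
        #   tritonserver --model-control-mode=poll --model-repository=/models
        #       --repository-poll-secs=600.0 --metrics-port=8002 --allow-metrics=true
        #       --allow-gpu-metrics=true --http-port=8000 --grpc-port=8001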

        print('Starting server: {}'.format(cmd))
        proc = subprocess.Popen(cmd)
        base_freq = min(update_frequency_sec, metric_frequency_sec)
        metric_tic = update_tic = time()
        while True:
            try:
                error_code = proc.wait(timeout=base_freq)
                if error_code == 0:
                    print("triton-server process ended with error code {}".format(error_code))
                    return
                raise ValueError("triton-server process ended with error code {}".format(error_code))
            except subprocess.TimeoutExpired:
                pass

            # update models
            if time() - update_tic > update_frequency_sec:
                a_service.triton_model_service_update_step(model_repository_folder=local_model_repo)
                update_tic = time()

            # update stats
            if time() - metric_tic > metric_frequency_sec:
                metric_tic = time()
                self.report_metrics(remote_logger)


def main():
    title = 'clearml-serving - Nvidia Triton Engine Helper'
    print(title)

    parser = ArgumentParser(prog='clearml-serving', description=title)
    parser.add_argument(
        '--serving-id', default=None, type=str, required=True,
        help='Specify main serving service Task ID')
    parser.add_argument(
        '--project', default='serving', type=str,
        help='Optional: specify project for the serving engine Task')
    parser.add_argument(
        '--name', default='nvidia-triton', type=str,
        help='Optional: specify task name for the serving engine Task')
    parser.add_argument(
        '--update-frequency', default=10, type=float,
        help='Model update frequency in minutes')
    parser.add_argument(
        '--metric-frequency', default=1, type=float,
        help='Metric reporting update frequency in minutes')
    parser.add_argument(
        '--t-http-port', type=str, help='<integer> The port for the server to listen on for HTTP requests')
    parser.add_argument(
        '--t-http-thread-count', type=str, help='<integer> Number of threads handling HTTP requests')
    parser.add_argument(
        '--t-allow-grpc', type=str, help='<integer> Allow the server to listen for GRPC requests')
    parser.add_argument(
        '--t-grpc-port', type=str, help='<integer> The port for the server to listen on for GRPC requests')
    parser.add_argument(
        '--t-grpc-infer-allocation-pool-size', type=str,
        help='<integer> The maximum number of inference request/response objects that remain '
             'allocated for reuse. As long as the number of in-flight requests doesn\'t exceed '
             'this value there will be no allocation/deallocation of request/response objects')
    parser.add_argument(
        '--t-pinned-memory-pool-byte-size', type=str,
        help='<integer> The total byte size that can be allocated as pinned system '
             'memory. If GPU support is enabled, the server will allocate pinned '
             'system memory to accelerate data transfer between host and devices '
             'until it exceeds the specified byte size. This option will not affect '
             'the allocation conducted by the backend frameworks. Default is 256 MB')
    parser.add_argument(
        '--t-cuda-memory-pool-byte-size', type=str,
        help='<<integer>:<integer>> The total byte size that can be allocated as CUDA memory for '
             'the GPU device. If GPU support is enabled, the server will allocate '
             'CUDA memory to minimize data transfer between host and devices '
             'until it exceeds the specified byte size. This option will not affect '
             'the allocation conducted by the backend frameworks. The argument '
             'should be 2 integers separated by colons in the format <GPU device '
             'ID>:<pool byte size>. This option can be used multiple times, but only '
             'once per GPU device. Subsequent uses will overwrite previous uses for '
             'the same GPU device. Default is 64 MB')
    parser.add_argument(
        '--t-min-supported-compute-capability', type=str,
        help='<float> The minimum supported CUDA compute capability. GPUs that '
             'don\'t support this compute capability will not be used by the server')
    parser.add_argument(
        '--t-buffer-manager-thread-count', type=str,
        help='<integer> The number of threads used to accelerate copies and other '
             'operations required to manage input and output tensor contents. '
             'Default is 0')
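    # Illustrative invocation (values are hypothetical, not from the source):
    #   python -m clearml_serving.triton_helper --serving-id <serving-task-id> \
    #       --project serving --name nvidia-triton \
    #       --update-frequency 10 --metric-frequency 1 \
    #       --t-http-port 8000 --t-grpc-port 8001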

    args = parser.parse_args()

    task = Task.init(project_name=args.project, task_name=args.name, task_type=Task.TaskTypes.inference)

    helper = TritonHelper(args, task, serving_id=args.serving_id)

    # this function will never end
    helper.maintenance_daemon(
        local_model_repo='/models',
        update_frequency_sec=args.update_frequency * 60.0,
        metric_frequency_sec=args.metric_frequency * 60.0,
    )


if __name__ == '__main__':
    main()