clearml-serving/clearml_serving/triton_helper.py
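"""Nvidia Triton Engine helper for clearml-serving.

Launches a local ``tritonserver`` process in model-repository poll mode, keeps the
repository in sync with the serving service controller Task, and periodically reports
the server's Prometheus metrics back to ClearML.
"""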

import re
import subprocess
from argparse import ArgumentParser
from time import time
from typing import Optional
from pathlib2 import Path
from clearml import Task, Logger
from clearml.backend_api.utils import get_http_session_with_retry
from clearml_serving.serving_service import ServingService


class TritonHelper(object):
    _metric_line_parsing = r"(\w+){(gpu_uuid=\"[\w\W]*\",)?model=\"(\w+)\",\s*version=\"(\d+)\"}\s*([0-9.]*)"
    _default_metrics_port = 8002
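    # Illustrative only (model names and values below are hypothetical): Triton exposes
    # Prometheus-style metric lines on its metrics port, which the regex above is meant
    # to match, e.g.
    #   nv_inference_request_success{model="my_model",version="1"} 42
    #   nv_inference_count{gpu_uuid="GPU-1234",model="my_model",version="1"} 7
    # Only lines carrying both model= and version= labels are parsed; everything else
    # is skipped in report_metrics().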

    def __init__(
            self,
            args,  # Any
            task,  # type: Task
            serving_id,  # type: str
            metric_host=None,  # type: Optional[str]
            metric_port=None,  # type: Optional[int]
    ):
        # type: (...) -> None
        self._http_session = get_http_session_with_retry()
        self.args = dict(**args.__dict__) if args else {}
        self.task = task
        self.serving_id = serving_id
        self.metric_host = metric_host or '0.0.0.0'
        self.metric_port = metric_port or self._default_metrics_port
        self._parse_metric = re.compile(self._metric_line_parsing)
        self._timestamp = time()
        print('Starting Triton Helper service\n{}\n'.format(self.args))

    def report_metrics(self, remote_logger):
        # type: (Optional[Logger]) -> bool
        # iterations are seconds from start
        iteration = int(time() - self._timestamp)
        report_msg = "reporting metrics: relative time {} sec".format(iteration)
        self.task.get_logger().report_text(report_msg)
        if remote_logger:
            remote_logger.report_text(report_msg)

        # noinspection PyBroadException
        try:
            request = self._http_session.get('http://{}:{}/metrics'.format(
                self.metric_host, self.metric_port))
            if not request.ok:
                return False
            content = request.content.decode().split('\n')
        except Exception:
            return False

        for line in content:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            # noinspection PyBroadException
            try:
                metric, gpu_uuid, variant, version, value = self._parse_metric.match(line).groups()
                value = float(value)
            except Exception:
                continue
            self.task.get_logger().report_scalar(
                title=metric,
                series='{}.v{}'.format(variant, version),
                iteration=iteration,
                value=value
            )
            # on the remote logger we add our own Task ID (unique ID),
            # to support multiple servers reporting to the same service controller
            if remote_logger:
                remote_logger.report_scalar(
                    title=metric,
                    series='{}.v{}.{}'.format(variant, version, self.task.id),
                    iteration=iteration,
                    value=value
                )

        # metrics were fetched and reported successfully
        return True

    def maintenance_daemon(
            self,
            local_model_repo='/models',  # type: str
            update_frequency_sec=60.0,  # type: float
            metric_frequency_sec=60.0  # type: float
    ):
        # type: (...) -> None
        Path(local_model_repo).mkdir(parents=True, exist_ok=True)

        a_service = ServingService(task_id=self.serving_id)
        a_service.triton_model_service_update_step(model_repository_folder=local_model_repo)

        # noinspection PyProtectedMember
        remote_logger = a_service._task.get_logger()

        # todo: log triton server outputs when running locally

        # we assume we can run the triton server
        cmd = [
            'tritonserver',
            '--model-control-mode=poll',
            '--model-repository={}'.format(local_model_repo),
            '--repository-poll-secs={}'.format(update_frequency_sec),
            '--metrics-port={}'.format(self._default_metrics_port),
            '--allow-metrics=true',
            '--allow-gpu-metrics=true',
        ]
        # pass through CLI arguments prefixed with `t_` as native tritonserver flags:
        # strip the `t_` prefix and convert underscores back to dashes
        for k, v in self.args.items():
            if not v or not str(k).startswith('t_'):
                continue
            cmd.append('--{}={}'.format(k[2:].replace('_', '-'), v))
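        # Illustrative only (hypothetical CLI values `--t-http-port 8000 --t-grpc-port 8001`,
        # default 10-minute update frequency): the assembled command would look roughly like
        #   tritonserver --model-control-mode=poll --model-repository=/models
        #       --repository-poll-secs=600.0 --metrics-port=8002 --allow-metrics=true
        #       --allow-gpu-metrics=true --http-port=8000 --grpc-port=8001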

        print('Starting server: {}'.format(cmd))
        proc = subprocess.Popen(cmd)
        base_freq = min(update_frequency_sec, metric_frequency_sec)
        metric_tic = update_tic = time()
        while True:
            try:
                error_code = proc.wait(timeout=base_freq)
                if error_code == 0:
                    print("triton-server process ended with error code {}".format(error_code))
                    return
                raise ValueError("triton-server process ended with error code {}".format(error_code))
            except subprocess.TimeoutExpired:
                pass

            # update models
            if time() - update_tic > update_frequency_sec:
                a_service.triton_model_service_update_step(model_repository_folder=local_model_repo)
                update_tic = time()

            # update stats
            if time() - metric_tic > metric_frequency_sec:
                metric_tic = time()
                self.report_metrics(remote_logger)


def main():
    title = 'clearml-serving - Nvidia Triton Engine Helper'
    print(title)

    parser = ArgumentParser(prog='clearml-serving', description=title)
    parser.add_argument(
        '--serving-id', default=None, type=str, required=True,
        help='Specify main serving service Task ID')
    parser.add_argument(
        '--project', default='serving', type=str,
        help='Optional: specify project for the serving engine Task')
    parser.add_argument(
        '--name', default='nvidia-triton', type=str,
        help='Optional: specify task name for the serving engine Task')
    parser.add_argument(
        '--update-frequency', default=10, type=float,
        help='Model update frequency in minutes')
    parser.add_argument(
        '--metric-frequency', default=1, type=float,
        help='Metric reporting update frequency in minutes')
    parser.add_argument(
        '--t-http-port', type=str, help='<integer> The port for the server to listen on for HTTP requests')
    parser.add_argument(
        '--t-http-thread-count', type=str, help='<integer> Number of threads handling HTTP requests')
    parser.add_argument(
        '--t-allow-grpc', type=str, help='<integer> Allow the server to listen for GRPC requests')
    parser.add_argument(
        '--t-grpc-port', type=str, help='<integer> The port for the server to listen on for GRPC requests')
    parser.add_argument(
        '--t-grpc-infer-allocation-pool-size', type=str,
        help='<integer> The maximum number of inference request/response objects that remain '
             'allocated for reuse. As long as the number of in-flight requests doesn\'t exceed '
             'this value there will be no allocation/deallocation of request/response objects')
    parser.add_argument(
        '--t-pinned-memory-pool-byte-size', type=str,
        help='<integer> The total byte size that can be allocated as pinned system '
             'memory. If GPU support is enabled, the server will allocate pinned '
             'system memory to accelerate data transfer between host and devices '
             'until it exceeds the specified byte size. This option will not affect '
             'the allocation conducted by the backend frameworks. Default is 256 MB')
    parser.add_argument(
        '--t-cuda-memory-pool-byte-size', type=str,
        help='<<integer>:<integer>> The total byte size that can be allocated as CUDA memory for '
             'the GPU device. If GPU support is enabled, the server will allocate '
             'CUDA memory to minimize data transfer between host and devices '
             'until it exceeds the specified byte size. This option will not affect '
             'the allocation conducted by the backend frameworks. The argument '
             'should be 2 integers separated by colons in the format <GPU device '
             'ID>:<pool byte size>. This option can be used multiple times, but only '
             'once per GPU device. Subsequent uses will overwrite previous uses for '
             'the same GPU device. Default is 64 MB')
    parser.add_argument(
        '--t-min-supported-compute-capability', type=str,
        help='<float> The minimum supported CUDA compute capability. GPUs that '
             'don\'t support this compute capability will not be used by the server')
    parser.add_argument(
        '--t-buffer-manager-thread-count', type=str,
        help='<integer> The number of threads used to accelerate copies and other '
             'operations required to manage input and output tensor contents. '
             'Default is 0')
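    # Illustrative invocation (values are hypothetical, not from the source):
    #   python -m clearml_serving.triton_helper --serving-id <serving-task-id> \
    #       --project serving --name nvidia-triton \
    #       --update-frequency 10 --metric-frequency 1 \
    #       --t-http-port 8000 --t-grpc-port 8001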

    args = parser.parse_args()

    task = Task.init(project_name=args.project, task_name=args.name, task_type=Task.TaskTypes.inference)

    helper = TritonHelper(args, task, serving_id=args.serving_id)

    # this function will never end
    helper.maintenance_daemon(
        local_model_repo='/models',
        update_frequency_sec=args.update_frequency * 60.0,
        metric_frequency_sec=args.metric_frequency * 60.0,
    )


if __name__ == '__main__':
    main()