diff --git a/examples/clearml_serving_simple_http_inference_request/68747470733a2f2f646174616d61646e6573732e6769746875622e696f2f6173736574732f696d616765732f74665f66696c655f666565642f4d4e4953545f64696769742e706e67.png b/examples/clearml_serving_simple_http_inference_request/68747470733a2f2f646174616d61646e6573732e6769746875622e696f2f6173736574732f696d616765732f74665f66696c655f666565642f4d4e4953545f64696769742e706e67.png
new file mode 100644
index 0000000..0aafd0b
Binary files /dev/null and b/examples/clearml_serving_simple_http_inference_request/68747470733a2f2f646174616d61646e6573732e6769746875622e696f2f6173736574732f696d616765732f74665f66696c655f666565642f4d4e4953545f64696769742e706e67.png differ
diff --git a/examples/clearml_serving_simple_http_inference_request/client.py b/examples/clearml_serving_simple_http_inference_request/client.py
new file mode 100644
index 0000000..462145f
--- /dev/null
+++ b/examples/clearml_serving_simple_http_inference_request/client.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import argparse
+from PIL import Image
+import numpy as np
+
+from http_triton import InferenceServerClient, InferInput
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-v',
+                        '--verbose',
+                        action="store_true",
+                        required=False,
+                        default=False,
+                        help='Enable verbose output')
+    parser.add_argument('-u',
+                        '--url',
+                        type=str,
+                        required=False,
+                        default='localhost:8000',
+                        help='Inference server URL. Default localhost:8000')
+
+    FLAGS = parser.parse_args()
+
+    model_name = "keras_mnist"
+    model_version = "1"
+
+    input_name = "dense_input"
+    shape = (1, 784)
+    datatype = 'FP32'
+
+    output_name = 'activation_2'
+
+    # Path of an image
+    image_path = '68747470733a2f2f646174616d61646e6573732e6769746875622e696f2f6173736574732f696d616765732f74665f66696c655f666565642f4d4e4953545f64696769742e706e67.png'
+
+    # The image is opened using Pillow, then converted to grayscale since the deployed model was trained on grayscale images
+    image = Image.open(image_path).convert('L')
+
+    # The image is resized to the 28x28 pixels the model expects (LANCZOS is the current name for ANTIALIAS)
+    image = image.resize((28, 28), Image.LANCZOS)
+
+    # The image is converted to a numpy array and cast to float32, the data type the model was trained on
+    np_image = np.array(image).astype(np.float32)
+
+    # The image is flattened to a (1, 784) array to fit the model input
+    np_image = np_image.reshape(-1, 784)
+
+    # Create an InferInput object with the input name, its data type and its shape
+    inferInput = InferInput(name=input_name, datatype=datatype, shape=shape)
+
+    # Set the data inside the InferInput object to the image in numpy format
+    inferInput.set_data_from_numpy(np_image)
+
+    # Create an InferenceServerClient and pass it the url of the server
+    client = InferenceServerClient(url=FLAGS.url, verbose=FLAGS.verbose)
+
+    # Call client.infer() with the model name, version and the InferInput object inside a list, since there can be multiple inputs
+    inferResult = client.infer(model_name=model_name, inputs=[inferInput], model_version=model_version)
+
+    # Print the output of the model in numpy format, using the name of the output layer
+    print(inferResult.as_numpy(output_name))
\ No newline at end of file
diff --git a/examples/clearml_serving_simple_http_inference_request/http_triton.py b/examples/clearml_serving_simple_http_inference_request/http_triton.py
new file mode 100644
index 0000000..de0e103
--- /dev/null
+++ b/examples/clearml_serving_simple_http_inference_request/http_triton.py
@@ -0,0 +1,1970 @@
+# Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +try: + from geventhttpclient import HTTPClient + from geventhttpclient.url import URL + import gevent + import gevent.pool + from urllib.parse import quote, quote_plus + import rapidjson as json + import numpy as np + import struct + import gzip, zlib +except ModuleNotFoundError as error: + raise RuntimeError( + 'The installation does not include http support. Specify \'http\' or \'all\' while installing the tritonclient package to include the support' + ) from error + +from tritonclient.utils import * + + +def _get_error(response): + """ + Returns the InferenceServerException object if response + indicates the error. If no error then return None + """ + if response.status_code != 200: + error_response = json.loads(response.read()) + return InferenceServerException(msg=error_response["error"]) + else: + return None + + +def _raise_if_error(response): + """ + Raise InferenceServerException if received non-Success + response from the server + """ + error = _get_error(response) + if error is not None: + raise error + + +def _get_query_string(query_params): + params = [] + for key, value in query_params.items(): + if isinstance(value, list): + for item in value: + params.append("%s=%s" % + (quote_plus(key), quote_plus(str(item)))) + else: + params.append("%s=%s" % (quote_plus(key), quote_plus(str(value)))) + if params: + return "&".join(params) + return '' + + +def _get_inference_request(inputs, request_id, outputs, sequence_id, + sequence_start, sequence_end, priority, timeout): + infer_request = {} + parameters = {} + if request_id != "": + infer_request['id'] = request_id + if sequence_id != 0 and sequence_id != "": + parameters['sequence_id'] = sequence_id + parameters['sequence_start'] = sequence_start + parameters['sequence_end'] = sequence_end + if priority != 0: + parameters['priority'] = priority + if timeout is not None: + parameters['timeout'] = timeout + + infer_request['inputs'] = [ + this_input._get_tensor() for this_input in inputs + ] + if outputs: + infer_request['outputs'] = [ + this_output._get_tensor() for this_output in outputs + ] + else: + # no outputs specified so set 'binary_data_output' True in the + # request so that all outputs are returned in binary format + parameters['binary_data_output'] = True + + if parameters: + infer_request['parameters'] = parameters + + request_body = json.dumps(infer_request) + json_size = len(request_body) + binary_data = None + for input_tensor in inputs: + raw_data = input_tensor._get_binary_data() + if raw_data is not None: + if binary_data is not None: + binary_data += raw_data + else: + binary_data = raw_data + + if binary_data is not None: + request_body = struct.pack( + '{}s{}s'.format(len(request_body), len(binary_data)), + request_body.encode(), binary_data) + return request_body, json_size + + return request_body, None + + +class InferenceServerClient: + """An InferenceServerClient object is used to perform any kind of + communication with the InferenceServer using http protocol. 
None + of the methods are thread safe. The object is intended to be used + by a single thread and simultaneously calling different methods + with different threads is not supported and will cause undefined + behavior. + + Parameters + ---------- + url : str + The inference server name, port and optional base path + in the following format: host:port/, e.g. + 'localhost:8000'. + + verbose : bool + If True generate verbose output. Default value is False. + concurrency : int + The number of connections to create for this client. + Default value is 1. + connection_timeout : float + The timeout value for the connection. Default value + is 60.0 sec. + network_timeout : float + The timeout value for the network. Default value is + 60.0 sec + max_greenlets : int + Determines the maximum allowed number of worker greenlets + for handling asynchronous inference requests. Default value + is None, which means there will be no restriction on the + number of greenlets created. + ssl : bool + If True, channels the requests to encrypted https scheme. + Some improper settings may cause connection to prematurely + terminate with an unsuccessful handshake. See + `ssl_context_factory` option for using secure default + settings. Default value for this option is False. + ssl_options : dict + Any options supported by `ssl.wrap_socket` specified as + dictionary. The argument is ignored if 'ssl' is specified + False. + ssl_context_factory : SSLContext callable + It must be a callbable that returns a SSLContext. Set to + `gevent.ssl.create_default_context` to use contexts with + secure default settings. This should most likely resolve + connection issues in a secure way. The default value for + this option is None which directly wraps the socket with + the options provided via `ssl_options`. The argument is + ignored if 'ssl' is specified False. + insecure : bool + If True, then does not match the host name with the certificate. + Default value is False. The argument is ignored if 'ssl' is + specified False. + + Raises + ------ + Exception + If unable to create a client. + + """ + + def __init__(self, + url, + verbose=False, + concurrency=1, + connection_timeout=60.0, + network_timeout=60.0, + max_greenlets=None, + ssl=False, + ssl_options=None, + ssl_context_factory=None, + insecure=False): + if url.startswith("http://") or url.startswith("https://"): + raise_error("url should not include the scheme") + scheme = "https://" if ssl else "http://" + self._parsed_url = URL(scheme + url) + self._base_uri = self._parsed_url.request_uri.rstrip('/') + self._client_stub = HTTPClient.from_url( + self._parsed_url, + concurrency=concurrency, + connection_timeout=connection_timeout, + network_timeout=network_timeout, + ssl_options=ssl_options, + ssl_context_factory=ssl_context_factory, + insecure=insecure) + self._pool = gevent.pool.Pool(max_greenlets) + self._verbose = verbose + + def __enter__(self): + return self + + def __exit__(self, type, value, traceback): + self.close() + + def __del__(self): + self.close() + + def close(self): + """Close the client. Any future calls to server + will result in an Error. + + """ + self._pool.join() + self._client_stub.close() + + def _get(self, request_uri, headers, query_params): + """Issues the GET request to the server + + Parameters + ---------- + request_uri: str + The request URI to be used in GET request. + headers: dict + Additional HTTP headers to include in the request. + query_params: dict + Optional url query parameters to use in network + transaction. 
+ + Returns + ------- + geventhttpclient.response.HTTPSocketPoolResponse + The response from server. + """ + if self._base_uri is not None: + request_uri = self._base_uri + "/" + request_uri + + if query_params is not None: + request_uri = request_uri + "?" + _get_query_string(query_params) + + if self._verbose: + print("GET {}, headers {}".format(request_uri, headers)) + + if headers is not None: + response = self._client_stub.get(request_uri, headers=headers) + else: + response = self._client_stub.get(request_uri) + + if self._verbose: + print(response) + + return response + + def _post(self, request_uri, request_body, headers, query_params): + """Issues the POST request to the server + + Parameters + ---------- + request_uri: str + The request URI to be used in POST request. + request_body: str + The body of the request + headers: dict + Additional HTTP headers to include in the request. + query_params: dict + Optional url query parameters to use in network + transaction. + + Returns + ------- + geventhttpclient.response.HTTPSocketPoolResponse + The response from server. + """ + if self._base_uri is not None: + request_uri = self._base_uri + "/" + request_uri + + if query_params is not None: + request_uri = request_uri + "?" + _get_query_string(query_params) + + if self._verbose: + print("POST {}, headers {}\n{}".format(request_uri, headers, + request_body)) + + if headers is not None: + response = self._client_stub.post(request_uri=request_uri, + body=request_body, + headers=headers) + else: + response = self._client_stub.post(request_uri=request_uri, + body=request_body) + + if self._verbose: + print(response) + + return response + + def is_server_live(self, headers=None, query_params=None): + """Contact the inference server and get liveness. + + Parameters + ---------- + headers: dict + Optional dictionary specifying additional HTTP + headers to include in the request. + query_params: dict + Optional url query parameters to use in network + transaction. + + Returns + ------- + bool + True if server is live, False if server is not live. + + Raises + ------ + Exception + If unable to get liveness. + + """ + + request_uri = "v2/health/live" + response = self._get(request_uri=request_uri, + headers=headers, + query_params=query_params) + + return response.status_code == 200 + + def is_server_ready(self, headers=None, query_params=None): + """Contact the inference server and get readiness. + + Parameters + ---------- + headers: dict + Optional dictionary specifying additional HTTP + headers to include in the request. + query_params: dict + Optional url query parameters to use in network + transaction. + + Returns + ------- + bool + True if server is ready, False if server is not ready. + + Raises + ------ + Exception + If unable to get readiness. + + """ + request_uri = "v2/health/ready" + response = self._get(request_uri=request_uri, + headers=headers, + query_params=query_params) + + return response.status_code == 200 + + def is_model_ready(self, + model_name, + model_version="", + headers=None, + query_params=None): + """Contact the inference server and get the readiness of specified model. + + Parameters + ---------- + model_name: str + The name of the model to check for readiness. + model_version: str + The version of the model to check for readiness. The default value + is an empty string which means then the server will choose a version + based on the model and internal policy. + headers: dict + Optional dictionary specifying additional HTTP + headers to include in the request. 
+ query_params: dict + Optional url query parameters to use in network + transaction. + + Returns + ------- + bool + True if the model is ready, False if not ready. + + Raises + ------ + Exception + If unable to get model readiness. + + """ + if type(model_version) != str: + raise_error("model version must be a string") + if model_version != "": + request_uri = "v2/models/{}/versions/{}/ready".format( + quote(model_name), model_version) + else: + request_uri = "v2/models/{}/ready".format(quote(model_name)) + + response = self._get(request_uri=request_uri, + headers=headers, + query_params=query_params) + + return response.status_code == 200 + + def get_server_metadata(self, headers=None, query_params=None): + """Contact the inference server and get its metadata. + + Parameters + ---------- + headers: dict + Optional dictionary specifying additional HTTP + headers to include in the request. + query_params: dict + Optional url query parameters to use in network + transaction. + + Returns + ------- + dict + The JSON dict holding the metadata. + + Raises + ------ + InferenceServerException + If unable to get server metadata. + + """ + request_uri = "v2" + response = self._get(request_uri=request_uri, + headers=headers, + query_params=query_params) + _raise_if_error(response) + + content = response.read() + if self._verbose: + print(content) + + return json.loads(content) + + def get_model_metadata(self, + model_name, + model_version="", + headers=None, + query_params=None): + """Contact the inference server and get the metadata for specified model. + + Parameters + ---------- + model_name: str + The name of the model + model_version: str + The version of the model to get metadata. The default value + is an empty string which means then the server will choose + a version based on the model and internal policy. + headers: dict + Optional dictionary specifying additional + HTTP headers to include in the request + query_params: dict + Optional url query parameters to use in network + transaction + + Returns + ------- + dict + The JSON dict holding the metadata. + + Raises + ------ + InferenceServerException + If unable to get model metadata. + + """ + if type(model_version) != str: + raise_error("model version must be a string") + if model_version != "": + request_uri = "v2/models/{}/versions/{}".format( + quote(model_name), model_version) + else: + request_uri = "v2/models/{}".format(quote(model_name)) + + response = self._get(request_uri=request_uri, + headers=headers, + query_params=query_params) + _raise_if_error(response) + + content = response.read() + if self._verbose: + print(content) + + return json.loads(content) + + def get_model_config(self, + model_name, + model_version="", + headers=None, + query_params=None): + """Contact the inference server and get the configuration for specified model. + + Parameters + ---------- + model_name: str + The name of the model + model_version: str + The version of the model to get configuration. The default value + is an empty string which means then the server will choose + a version based on the model and internal policy. + headers: dict + Optional dictionary specifying additional + HTTP headers to include in the request + query_params: dict + Optional url query parameters to use in network + transaction + + Returns + ------- + dict + The JSON dict holding the model config. + + Raises + ------ + InferenceServerException + If unable to get model configuration. 
+ + """ + if model_version != "": + request_uri = "v2/models/{}/versions/{}/config".format( + quote(model_name), model_version) + else: + request_uri = "v2/models/{}/config".format(quote(model_name)) + + response = self._get(request_uri=request_uri, + headers=headers, + query_params=query_params) + _raise_if_error(response) + + content = response.read() + if self._verbose: + print(content) + + return json.loads(content) + + def get_model_repository_index(self, headers=None, query_params=None): + """Get the index of model repository contents + + Parameters + ---------- + headers: dict + Optional dictionary specifying additional + HTTP headers to include in the request + query_params: dict + Optional url query parameters to use in network + transaction + + Returns + ------- + dict + The JSON dict holding the model repository index. + + Raises + ------ + InferenceServerException + If unable to get the repository index. + + """ + request_uri = "v2/repository/index" + response = self._post(request_uri=request_uri, + request_body="", + headers=headers, + query_params=query_params) + _raise_if_error(response) + + content = response.read() + if self._verbose: + print(content) + + return json.loads(content) + + def load_model(self, model_name, headers=None, query_params=None): + """Request the inference server to load or reload specified model. + + Parameters + ---------- + model_name : str + The name of the model to be loaded. + headers: dict + Optional dictionary specifying additional + HTTP headers to include in the request + query_params: dict + Optional url query parameters to use in network + transaction + + Raises + ------ + InferenceServerException + If unable to load the model. + + """ + request_uri = "v2/repository/models/{}/load".format(quote(model_name)) + response = self._post(request_uri=request_uri, + request_body="", + headers=headers, + query_params=query_params) + _raise_if_error(response) + if self._verbose: + print("Loaded model '{}'".format(model_name)) + + def unload_model(self, + model_name, + headers=None, + query_params=None, + unload_dependents=False): + """Request the inference server to unload specified model. + + Parameters + ---------- + model_name : str + The name of the model to be unloaded. + headers: dict + Optional dictionary specifying additional + HTTP headers to include in the request + query_params: dict + Optional url query parameters to use in network + transaction + unload_dependents : bool + Whether the dependents of the model should also be unloaded. + + Raises + ------ + InferenceServerException + If unable to unload the model. + + """ + request_uri = "v2/repository/models/{}/unload".format(quote(model_name)) + unload_request = { + "parameters": { + "unload_dependents": unload_dependents + } + } + response = self._post(request_uri=request_uri, + request_body=json.dumps(unload_request), + headers=headers, + query_params=query_params) + _raise_if_error(response) + if self._verbose: + print("Loaded model '{}'".format(model_name)) + + def get_inference_statistics(self, + model_name="", + model_version="", + headers=None, + query_params=None): + """Get the inference statistics for the specified model name and + version. + + Parameters + ---------- + model_name : str + The name of the model to get statistics. The default value is + an empty string, which means statistics of all models will + be returned. + model_version: str + The version of the model to get inference statistics. 
The + default value is an empty string which means then the server + will return the statistics of all available model versions. + headers: dict + Optional dictionary specifying additional HTTP + headers to include in the request. + query_params: dict + Optional url query parameters to use in network + transaction + + Returns + ------- + dict + The JSON dict holding the model inference statistics. + + Raises + ------ + InferenceServerException + If unable to get the model inference statistics. + + """ + + if model_name != "": + if type(model_version) != str: + raise_error("model version must be a string") + if model_version != "": + request_uri = "v2/models/{}/versions/{}/stats".format( + quote(model_name), model_version) + else: + request_uri = "v2/models/{}/stats".format(quote(model_name)) + else: + request_uri = "v2/models/stats" + + response = self._get(request_uri=request_uri, + headers=headers, + query_params=query_params) + _raise_if_error(response) + + content = response.read() + if self._verbose: + print(content) + + return json.loads(content) + + def get_system_shared_memory_status(self, + region_name="", + headers=None, + query_params=None): + """Request system shared memory status from the server. + + Parameters + ---------- + region_name : str + The name of the region to query status. The default + value is an empty string, which means that the status + of all active system shared memory will be returned. + headers: dict + Optional dictionary specifying additional HTTP + headers to include in the request + query_params: dict + Optional url query parameters to use in network + transaction + + Returns + ------- + dict + The JSON dict holding system shared memory status. + + Raises + ------ + InferenceServerException + If unable to get the status of specified shared memory. + + """ + if region_name != "": + request_uri = "v2/systemsharedmemory/region/{}/status".format( + quote(region_name)) + else: + request_uri = "v2/systemsharedmemory/status" + + response = self._get(request_uri=request_uri, + headers=headers, + query_params=query_params) + _raise_if_error(response) + + content = response.read() + if self._verbose: + print(content) + + return json.loads(content) + + def register_system_shared_memory(self, + name, + key, + byte_size, + offset=0, + headers=None, + query_params=None): + """Request the server to register a system shared memory with the + following specification. + + Parameters + ---------- + name : str + The name of the region to register. + key : str + The key of the underlying memory object that contains the + system shared memory region. + byte_size : int + The size of the system shared memory region, in bytes. + offset : int + Offset, in bytes, within the underlying memory object to + the start of the system shared memory region. The default + value is zero. + headers: dict + Optional dictionary specifying additional + HTTP headers to include in the request + query_params: dict + Optional url query parameters to use in network + transaction + + Raises + ------ + InferenceServerException + If unable to register the specified system shared memory. 
+ + """ + request_uri = "v2/systemsharedmemory/region/{}/register".format( + quote(name)) + + register_request = { + 'key': key, + 'offset': offset, + 'byte_size': byte_size + } + request_body = json.dumps(register_request) + + response = self._post(request_uri=request_uri, + request_body=request_body, + headers=headers, + query_params=query_params) + _raise_if_error(response) + if self._verbose: + print("Registered system shared memory with name '{}'".format(name)) + + def unregister_system_shared_memory(self, + name="", + headers=None, + query_params=None): + """Request the server to unregister a system shared memory with the + specified name. + + Parameters + ---------- + name : str + The name of the region to unregister. The default value is empty + string which means all the system shared memory regions will be + unregistered. + headers: dict + Optional dictionary specifying additional + HTTP headers to include in the request + query_params: dict + Optional url query parameters to use in network + transaction + + Raises + ------ + InferenceServerException + If unable to unregister the specified system shared memory region. + + """ + if name != "": + request_uri = "v2/systemsharedmemory/region/{}/unregister".format( + quote(name)) + else: + request_uri = "v2/systemsharedmemory/unregister" + + response = self._post(request_uri=request_uri, + request_body="", + headers=headers, + query_params=query_params) + _raise_if_error(response) + if self._verbose: + if name != "": + print("Unregistered system shared memory with name '{}'".format( + name)) + else: + print("Unregistered all system shared memory regions") + + def get_cuda_shared_memory_status(self, + region_name="", + headers=None, + query_params=None): + """Request cuda shared memory status from the server. + + Parameters + ---------- + region_name : str + The name of the region to query status. The default + value is an empty string, which means that the status + of all active cuda shared memory will be returned. + headers: dict + Optional dictionary specifying additional + HTTP headers to include in the request + query_params: dict + Optional url query parameters to use in network + transaction + + Returns + ------- + dict + The JSON dict holding cuda shared memory status. + + Raises + ------ + InferenceServerException + If unable to get the status of specified shared memory. + + """ + if region_name != "": + request_uri = "v2/cudasharedmemory/region/{}/status".format( + quote(region_name)) + else: + request_uri = "v2/cudasharedmemory/status" + + response = self._get(request_uri=request_uri, + headers=headers, + query_params=query_params) + _raise_if_error(response) + + content = response.read() + if self._verbose: + print(content) + + return json.loads(content) + + def register_cuda_shared_memory(self, + name, + raw_handle, + device_id, + byte_size, + headers=None, + query_params=None): + """Request the server to register a system shared memory with the + following specification. + + Parameters + ---------- + name : str + The name of the region to register. + raw_handle : bytes + The raw serialized cudaIPC handle in base64 encoding. + device_id : int + The GPU device ID on which the cudaIPC handle was created. + byte_size : int + The size of the cuda shared memory region, in bytes. 
+ headers: dict + Optional dictionary specifying additional + HTTP headers to include in the request + query_params: dict + Optional url query parameters to use in network + transaction + + Raises + ------ + InferenceServerException + If unable to register the specified cuda shared memory. + + """ + request_uri = "v2/cudasharedmemory/region/{}/register".format( + quote(name)) + + register_request = { + 'raw_handle': { + 'b64': raw_handle + }, + 'device_id': device_id, + 'byte_size': byte_size + } + request_body = json.dumps(register_request) + + response = self._post(request_uri=request_uri, + request_body=request_body, + headers=headers, + query_params=query_params) + _raise_if_error(response) + if self._verbose: + print("Registered cuda shared memory with name '{}'".format(name)) + + def unregister_cuda_shared_memory(self, + name="", + headers=None, + query_params=None): + """Request the server to unregister a cuda shared memory with the + specified name. + + Parameters + ---------- + name : str + The name of the region to unregister. The default value is empty + string which means all the cuda shared memory regions will be + unregistered. + headers: dict + Optional dictionary specifying additional + HTTP headers to include in the request + query_params: dict + Optional url query parameters to use in network + transaction + + Raises + ------ + InferenceServerException + If unable to unregister the specified cuda shared memory region. + + """ + if name != "": + request_uri = "v2/cudasharedmemory/region/{}/unregister".format( + quote(name)) + else: + request_uri = "v2/cudasharedmemory/unregister" + + response = self._post(request_uri=request_uri, + request_body="", + headers=headers, + query_params=query_params) + _raise_if_error(response) + if self._verbose: + if name != "": + print("Unregistered cuda shared memory with name '{}'".format( + name)) + else: + print("Unregistered all cuda shared memory regions") + + @staticmethod + def generate_request_body(inputs, + outputs=None, + request_id="", + sequence_id=0, + sequence_start=False, + sequence_end=False, + priority=0, + timeout=None): + """Generate a request body for inference using the supplied 'inputs' + requesting the outputs specified by 'outputs'. + + Parameters + ---------- + inputs : list + A list of InferInput objects, each describing data for a input + tensor required by the model. + outputs : list + A list of InferRequestedOutput objects, each describing how the output + data must be returned. If not specified all outputs produced + by the model will be returned using default settings. + request_id: str + Optional identifier for the request. If specified will be returned + in the response. Default value is an empty string which means no + request_id will be used. + sequence_id : int or str + The unique identifier for the sequence being represented by the + object. A value of 0 or "" means that the request does not + belong to a sequence. Default is 0. + sequence_start: bool + Indicates whether the request being added marks the start of the + sequence. Default value is False. This argument is ignored if + 'sequence_id' is 0. + sequence_end: bool + Indicates whether the request being added marks the end of the + sequence. Default value is False. This argument is ignored if + 'sequence_id' is 0. + priority : int + Indicates the priority of the request. Priority value zero + indicates that the default priority level should be used + (i.e. same behavior as not specifying the priority parameter). 
+ Lower value priorities indicate higher priority levels. Thus + the highest priority level is indicated by setting the parameter + to 1, the next highest is 2, etc. If not provided, the server + will handle the request using default setting for the model. + timeout : int + The timeout value for the request, in microseconds. If the request + cannot be completed within the time the server can take a + model-specific action such as terminating the request. If not + provided, the server will handle the request using default setting + for the model. + + Returns + ------- + Bytes + The request body of the inference. + Int + The byte size of the inference request header in the request body. + Returns None if the whole request body constitutes the request header. + + + Raises + ------ + InferenceServerException + If server fails to perform inference. + """ + return _get_inference_request(inputs=inputs, + request_id=request_id, + outputs=outputs, + sequence_id=sequence_id, + sequence_start=sequence_start, + sequence_end=sequence_end, + priority=priority, + timeout=timeout) + + @staticmethod + def parse_response_body(response_body, + verbose=False, + header_length=None, + content_encoding=None): + """Generate a InferResult object from the given 'response_body' + + Parameters + ---------- + response_body : bytes + The inference response from the server + verbose : bool + If True generate verbose output. Default value is False. + header_length : int + The length of the inference header if the header does not occupy + the whole response body. Default value is None. + content_encoding : string + The encoding of the response body if it is compressed. + Default value is None. + + Returns + ------- + InferResult + The InferResult object generated from the response body + """ + return InferResult.from_response_body(response_body, verbose, + header_length, content_encoding) + + def infer(self, + model_name, + inputs, + model_version="", + outputs=None, + request_id="", + sequence_id=0, + sequence_start=False, + sequence_end=False, + priority=0, + timeout=None, + headers=None, + query_params=None, + request_compression_algorithm=None, + response_compression_algorithm=None): + """Run synchronous inference using the supplied 'inputs' requesting + the outputs specified by 'outputs'. + + Parameters + ---------- + model_name: str + The name of the model to run inference. + inputs : list + A list of InferInput objects, each describing data for a input + tensor required by the model. + model_version: str + The version of the model to run inference. The default value + is an empty string which means then the server will choose + a version based on the model and internal policy. + outputs : list + A list of InferRequestedOutput objects, each describing how the output + data must be returned. If not specified all outputs produced + by the model will be returned using default settings. + request_id: str + Optional identifier for the request. If specified will be returned + in the response. Default value is an empty string which means no + request_id will be used. + sequence_id : int + The unique identifier for the sequence being represented by the + object. Default value is 0 which means that the request does not + belong to a sequence. + sequence_start: bool + Indicates whether the request being added marks the start of the + sequence. Default value is False. This argument is ignored if + 'sequence_id' is 0. + sequence_end: bool + Indicates whether the request being added marks the end of the + sequence. 
Default value is False. This argument is ignored if + 'sequence_id' is 0. + priority : int + Indicates the priority of the request. Priority value zero + indicates that the default priority level should be used + (i.e. same behavior as not specifying the priority parameter). + Lower value priorities indicate higher priority levels. Thus + the highest priority level is indicated by setting the parameter + to 1, the next highest is 2, etc. If not provided, the server + will handle the request using default setting for the model. + timeout : int + The timeout value for the request, in microseconds. If the request + cannot be completed within the time the server can take a + model-specific action such as terminating the request. If not + provided, the server will handle the request using default setting + for the model. + headers: dict + Optional dictionary specifying additional HTTP + headers to include in the request. + query_params: dict + Optional url query parameters to use in network + transaction. + request_compression_algorithm : str + Optional HTTP compression algorithm to use for the request body on client side. + Currently supports "deflate", "gzip" and None. By default, no + compression is used. + response_compression_algorithm : str + Optional HTTP compression algorithm to request for the response body. + Note that the response may not be compressed if the server does not + support the specified algorithm. Currently supports "deflate", + "gzip" and None. By default, no compression is requested. + + Returns + ------- + InferResult + The object holding the result of the inference. + + Raises + ------ + InferenceServerException + If server fails to perform inference. + """ + + request_body, json_size = _get_inference_request( + inputs=inputs, + request_id=request_id, + outputs=outputs, + sequence_id=sequence_id, + sequence_start=sequence_start, + sequence_end=sequence_end, + priority=priority, + timeout=timeout) + + if request_compression_algorithm == "gzip": + if headers is None: + headers = {} + headers["Content-Encoding"] = "gzip" + request_body = gzip.compress(request_body) + elif request_compression_algorithm == 'deflate': + if headers is None: + headers = {} + headers["Content-Encoding"] = "deflate" + # "Content-Encoding: deflate" actually means compressing in zlib structure + # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Encoding + request_body = zlib.compress(request_body) + + if response_compression_algorithm == "gzip": + if headers is None: + headers = {} + headers["Accept-Encoding"] = "gzip" + elif response_compression_algorithm == 'deflate': + if headers is None: + headers = {} + headers["Accept-Encoding"] = "deflate" + + if json_size is not None: + if headers is None: + headers = {} + headers["Inference-Header-Content-Length"] = json_size + + if type(model_version) != str: + raise_error("model version must be a string") + if model_version != "": + request_uri = "v2/models/{}/versions/{}/infer".format( + quote(model_name), model_version) + else: + request_uri = "v2/models/{}/infer".format(quote(model_name)) + + response = self._post(request_uri=request_uri, + request_body=request_body, + headers=headers, + query_params=query_params) + _raise_if_error(response) + + return InferResult(response, self._verbose) + + def async_infer(self, + model_name, + inputs, + model_version="", + outputs=None, + request_id="", + sequence_id=0, + sequence_start=False, + sequence_end=False, + priority=0, + timeout=None, + headers=None, + query_params=None, + 
request_compression_algorithm=None, + response_compression_algorithm=None): + """Run asynchronous inference using the supplied 'inputs' requesting + the outputs specified by 'outputs'. Even though this call is + non-blocking, however, the actual number of concurrent requests to + the server will be limited by the 'concurrency' parameter specified + while creating this client. In other words, if the inflight + async_infer exceeds the specified 'concurrency', the delivery of + the exceeding request(s) to server will be blocked till the slot is + made available by retrieving the results of previously issued requests. + + Parameters + ---------- + model_name: str + The name of the model to run inference. + inputs : list + A list of InferInput objects, each describing data for a input + tensor required by the model. + model_version: str + The version of the model to run inference. The default value + is an empty string which means then the server will choose + a version based on the model and internal policy. + outputs : list + A list of InferRequestedOutput objects, each describing how the output + data must be returned. If not specified all outputs produced + by the model will be returned using default settings. + request_id: str + Optional identifier for the request. If specified will be returned + in the response. Default value is 'None' which means no request_id + will be used. + sequence_id : int + The unique identifier for the sequence being represented by the + object. Default value is 0 which means that the request does not + belong to a sequence. + sequence_start: bool + Indicates whether the request being added marks the start of the + sequence. Default value is False. This argument is ignored if + 'sequence_id' is 0. + sequence_end: bool + Indicates whether the request being added marks the end of the + sequence. Default value is False. This argument is ignored if + 'sequence_id' is 0. + priority : int + Indicates the priority of the request. Priority value zero + indicates that the default priority level should be used + (i.e. same behavior as not specifying the priority parameter). + Lower value priorities indicate higher priority levels. Thus + the highest priority level is indicated by setting the parameter + to 1, the next highest is 2, etc. If not provided, the server + will handle the request using default setting for the model. + timeout : int + The timeout value for the request, in microseconds. If the request + cannot be completed within the time the server can take a + model-specific action such as terminating the request. If not + provided, the server will handle the request using default setting + for the model. + headers: dict + Optional dictionary specifying additional HTTP + headers to include in the request + query_params: dict + Optional url query parameters to use in network + transaction. + request_compression_algorithm : str + Optional HTTP compression algorithm to use for the request body on client side. + Currently supports "deflate", "gzip" and None. By default, no + compression is used. + response_compression_algorithm : str + Optional HTTP compression algorithm to request for the response body. + Note that the response may not be compressed if the server does not + support the specified algorithm. Currently supports "deflate", + "gzip" and None. By default, no compression is requested. + + Returns + ------- + InferAsyncRequest object + The handle to the asynchronous inference request. 
+ + Raises + ------ + InferenceServerException + If server fails to issue inference. + """ + + def wrapped_post(request_uri, request_body, headers, query_params): + return self._post(request_uri, request_body, headers, query_params) + + request_body, json_size = _get_inference_request( + inputs=inputs, + request_id=request_id, + outputs=outputs, + sequence_id=sequence_id, + sequence_start=sequence_start, + sequence_end=sequence_end, + priority=priority, + timeout=timeout) + + if request_compression_algorithm == "gzip": + if headers is None: + headers = {} + headers["Content-Encoding"] = "gzip" + request_body = gzip.compress(request_body) + elif request_compression_algorithm == 'deflate': + if headers is None: + headers = {} + headers["Content-Encoding"] = "deflate" + # "Content-Encoding: deflate" actually means compressing in zlib structure + # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Encoding + request_body = zlib.compress(request_body) + + if response_compression_algorithm == "gzip": + if headers is None: + headers = {} + headers["Accept-Encoding"] = "gzip" + elif response_compression_algorithm == 'deflate': + if headers is None: + headers = {} + headers["Accept-Encoding"] = "deflate" + + if json_size is not None: + if headers is None: + headers = {} + headers["Inference-Header-Content-Length"] = json_size + + if type(model_version) != str: + raise_error("model version must be a string") + if model_version != "": + request_uri = "v2/models/{}/versions/{}/infer".format( + quote(model_name), model_version) + else: + request_uri = "v2/models/{}/infer".format(quote(model_name)) + + g = self._pool.apply_async( + wrapped_post, (request_uri, request_body, headers, query_params)) + + # Schedule the greenlet to run in this loop iteration + g.start() + + # Relinquish control to greenlet loop. Using non-zero + # value to ensure the control is transferred to the + # event loop. + gevent.sleep(0.01) + + if self._verbose: + verbose_message = "Sent request" + if request_id != "": + verbose_message = verbose_message + " '{}'".format(request_id) + print(verbose_message) + + return InferAsyncRequest(g, self._verbose) + + +class InferAsyncRequest: + """An object of InferAsyncRequest class is used to describe + a handle to an ongoing asynchronous inference request. + + Parameters + ---------- + greenlet : gevent.Greenlet + The greenlet object which will provide the results. + For further details about greenlets refer + http://www.gevent.org/api/gevent.greenlet.html. + + verbose : bool + If True generate verbose output. Default value is False. + """ + + def __init__(self, greenlet, verbose=False): + self._greenlet = greenlet + self._verbose = verbose + + def get_result(self, block=True, timeout=None): + """Get the results of the associated asynchronous inference. + Parameters + ---------- + block : bool + If block is True, the function will wait till the + corresponding response is received from the server. + Default value is True. + timeout : int + The maximum wait time for the function. This setting is + ignored if the block is set False. Default is None, + which means the function will block indefinitely till + the corresponding response is received. + + Returns + ------- + InferResult + The object holding the result of the async inference. + + Raises + ------ + InferenceServerException + If server fails to perform inference or failed to respond + within specified timeout. 
+ """ + + try: + response = self._greenlet.get(block=block, timeout=timeout) + except gevent.Timeout as e: + raise_error("failed to obtain inference response") + + _raise_if_error(response) + return InferResult(response, self._verbose) + + +class InferInput: + """An object of InferInput class is used to describe + input tensor for an inference request. + + Parameters + ---------- + name : str + The name of input whose data will be described by this object + shape : list + The shape of the associated input. + datatype : str + The datatype of the associated input. + """ + + def __init__(self, name, shape, datatype): + self._name = name + self._shape = shape + self._datatype = datatype + self._parameters = {} + self._data = None + self._raw_data = None + + def name(self): + """Get the name of input associated with this object. + + Returns + ------- + str + The name of input + """ + return self._name + + def datatype(self): + """Get the datatype of input associated with this object. + + Returns + ------- + str + The datatype of input + """ + return self._datatype + + def shape(self): + """Get the shape of input associated with this object. + + Returns + ------- + list + The shape of input + """ + return self._shape + + def set_shape(self, shape): + """Set the shape of input. + + Parameters + ---------- + shape : list + The shape of the associated input. + """ + self._shape = shape + + def set_data_from_numpy(self, input_tensor, binary_data=True): + """Set the tensor data from the specified numpy array for + input associated with this object. + + Parameters + ---------- + input_tensor : numpy array + The tensor data in numpy array format + binary_data : bool + Indicates whether to set data for the input in binary format + or explicit tensor within JSON. The default value is True, + which means the data will be delivered as binary data in the + HTTP body after the JSON object. + + Raises + ------ + InferenceServerException + If failed to set data for the tensor. + """ + if not isinstance(input_tensor, (np.ndarray,)): + raise_error("input_tensor must be a numpy array") + dtype = np_to_triton_dtype(input_tensor.dtype) + if self._datatype != dtype: + raise_error( + "got unexpected datatype {} from numpy array, expected {}". + format(dtype, self._datatype)) + valid_shape = True + if len(self._shape) != len(input_tensor.shape): + valid_shape = False + else: + for i in range(len(self._shape)): + if self._shape[i] != input_tensor.shape[i]: + valid_shape = False + if not valid_shape: + raise_error( + "got unexpected numpy array shape [{}], expected [{}]".format( + str(input_tensor.shape)[1:-1], + str(self._shape)[1:-1])) + + self._parameters.pop('shared_memory_region', None) + self._parameters.pop('shared_memory_byte_size', None) + self._parameters.pop('shared_memory_offset', None) + + if not binary_data: + self._parameters.pop('binary_data_size', None) + self._raw_data = None + if self._datatype == "BYTES": + self._data = [] + try: + if input_tensor.size > 0: + for obj in np.nditer(input_tensor, + flags=["refs_ok"], + order='C'): + # We need to convert the object to string using utf-8, + # if we want to use the binary_data=False. JSON requires + # the input to be a UTF-8 string. 
+ if input_tensor.dtype == np.object_: + if type(obj.item()) == bytes: + self._data.append( + str(obj.item(), encoding='utf-8')) + else: + self._data.append(str(obj.item())) + else: + self._data.append( + str(obj.item(), encoding='utf-8')) + except UnicodeDecodeError: + raise_error( + f'Failed to encode "{obj.item()}" using UTF-8. Please use binary_data=True, if' + ' you want to pass a byte array.') + else: + self._data = [val.item() for val in input_tensor.flatten()] + else: + self._data = None + if self._datatype == "BYTES": + serialized_output = serialize_byte_tensor(input_tensor) + if serialized_output.size > 0: + self._raw_data = serialized_output.item() + else: + self._raw_data = b'' + else: + self._raw_data = input_tensor.tobytes() + self._parameters['binary_data_size'] = len(self._raw_data) + + def set_shared_memory(self, region_name, byte_size, offset=0): + """Set the tensor data from the specified shared memory region. + + Parameters + ---------- + region_name : str + The name of the shared memory region holding tensor data. + byte_size : int + The size of the shared memory region holding tensor data. + offset : int + The offset, in bytes, into the region where the data for + the tensor starts. The default value is 0. + + """ + self._data = None + self._raw_data = None + self._parameters.pop('binary_data_size', None) + + self._parameters['shared_memory_region'] = region_name + self._parameters['shared_memory_byte_size'] = byte_size + if offset != 0: + self._parameters['shared_memory_offset'].int64_param = offset + + def _get_binary_data(self): + """Returns the raw binary data if available + + Returns + ------- + bytes + The raw data for the input tensor + """ + return self._raw_data + + def _get_tensor(self): + """Retrieve the underlying input as json dict. + + Returns + ------- + dict + The underlying tensor specification as dict + """ + tensor = { + 'name': self._name, + 'shape': self._shape, + 'datatype': self._datatype + } + if self._parameters: + tensor['parameters'] = self._parameters + + if self._parameters.get('shared_memory_region') is None and \ + self._raw_data is None: + if self._data is not None: + tensor['data'] = self._data + return tensor + + +class InferRequestedOutput: + """An object of InferRequestedOutput class is used to describe a + requested output tensor for an inference request. + + Parameters + ---------- + name : str + The name of output tensor to associate with this object. + binary_data : bool + Indicates whether to return result data for the output in + binary format or explicit tensor within JSON. The default + value is True, which means the data will be delivered as + binary data in the HTTP body after JSON object. This field + will be unset if shared memory is set for the output. + class_count : int + The number of classifications to be requested. The default + value is 0 which means the classification results are not + requested. + """ + + def __init__(self, name, binary_data=True, class_count=0): + self._name = name + self._parameters = {} + if class_count != 0: + self._parameters['classification'] = class_count + self._binary = binary_data + self._parameters['binary_data'] = binary_data + + def name(self): + """Get the name of output associated with this object. + + Returns + ------- + str + The name of output + """ + return self._name + + def set_shared_memory(self, region_name, byte_size, offset=0): + """Marks the output to return the inference result in + specified shared memory region. 
+ + Parameters + ---------- + region_name : str + The name of the shared memory region to hold tensor data. + byte_size : int + The size of the shared memory region to hold tensor data. + offset : int + The offset, in bytes, into the region where the data for + the tensor starts. The default value is 0. + + """ + if 'classification' in self._parameters: + raise_error("shared memory can't be set on classification output") + if self._binary: + self._parameters['binary_data'] = False + + self._parameters['shared_memory_region'] = region_name + self._parameters['shared_memory_byte_size'] = byte_size + if offset != 0: + self._parameters['shared_memory_offset'] = offset + + def unset_shared_memory(self): + """Clears the shared memory option set by the last call to + InferRequestedOutput.set_shared_memory(). After call to this + function requested output will no longer be returned in a + shared memory region. + """ + + self._parameters['binary_data'] = self._binary + self._parameters.pop('shared_memory_region', None) + self._parameters.pop('shared_memory_byte_size', None) + self._parameters.pop('shared_memory_offset', None) + + def _get_tensor(self): + """Retrieve the underlying input as json dict. + + Returns + ------- + dict + The underlying tensor as a dict + """ + tensor = {'name': self._name} + if self._parameters: + tensor['parameters'] = self._parameters + return tensor + + +class InferResult: + """An object of InferResult class holds the response of + an inference request and provide methods to retrieve + inference results. + + Parameters + ---------- + response : geventhttpclient.response.HTTPSocketPoolResponse + The inference response from the server + verbose : bool + If True generate verbose output. Default value is False. + """ + + def __init__(self, response, verbose): + header_length = response.get('Inference-Header-Content-Length') + + # Internal class that simulate the interface of 'response' + class DecompressedResponse: + + def __init__(self, decompressed_data): + self.decompressed_data_ = decompressed_data + self.offset_ = 0 + + def read(self, length=-1): + if length == -1: + return self.decompressed_data_[self.offset_:] + else: + prev_offset = self.offset_ + self.offset_ += length + return self.decompressed_data_[prev_offset:self.offset_] + + content_encoding = response.get('Content-Encoding') + if content_encoding is not None: + if content_encoding == "gzip": + response = DecompressedResponse(gzip.decompress( + response.read())) + elif content_encoding == 'deflate': + response = DecompressedResponse(zlib.decompress( + response.read())) + if header_length is None: + content = response.read() + if verbose: + print(content) + try: + self._result = json.loads(content) + except UnicodeDecodeError as e: + raise_error( + f'Failed to encode using UTF-8. Please use binary_data=True, if' + f' you want to pass a byte array. UnicodeError: {e}') + else: + header_length = int(header_length) + content = response.read(length=header_length) + if verbose: + print(content) + self._result = json.loads(content) + + # Maps the output name to the index in buffer for quick retrieval + self._output_name_to_buffer_map = {} + # Read the remaining data off the response body. 
+ self._buffer = response.read() + buffer_index = 0 + for output in self._result['outputs']: + parameters = output.get("parameters") + if parameters is not None: + this_data_size = parameters.get("binary_data_size") + if this_data_size is not None: + self._output_name_to_buffer_map[ + output['name']] = buffer_index + buffer_index = buffer_index + this_data_size + + @classmethod + def from_response_body(cls, + response_body, + verbose=False, + header_length=None, + content_encoding=None): + """A class method to construct InferResult object + from a given 'response_body'. + + Parameters + ---------- + response_body : bytes + The inference response from the server + verbose : bool + If True generate verbose output. Default value is False. + header_length : int + The length of the inference header if the header does not occupy + the whole response body. Default value is None. + content_encoding : string + The encoding of the response body if it is compressed. + Default value is None. + + Returns + ------- + InferResult + The InferResult object generated from the response body + """ + + # Internal class that simulate the interface of 'response' + class Response: + + def __init__(self, response_body, header_length, content_encoding): + self.response_body_ = response_body + self.offset_ = 0 + self.parameters_ = { + 'Inference-Header-Content-Length': header_length, + 'Content-Encoding': content_encoding + } + + def get(self, key): + return self.parameters_.get(key) + + def read(self, length=-1): + if length == -1: + return self.response_body_[self.offset_:] + else: + prev_offset = self.offset_ + self.offset_ += length + return self.response_body_[prev_offset:self.offset_] + + return cls(Response(response_body, header_length, content_encoding), + verbose) + + def as_numpy(self, name): + """Get the tensor data for output associated with this object + in numpy format + + Parameters + ---------- + name : str + The name of the output tensor whose result is to be retrieved. + + Returns + ------- + numpy array + The numpy array containing the response data for the tensor or + None if the data for specified tensor name is not found. + """ + if self._result.get('outputs') is not None: + for output in self._result['outputs']: + if output['name'] == name: + datatype = output['datatype'] + has_binary_data = False + parameters = output.get("parameters") + if parameters is not None: + this_data_size = parameters.get("binary_data_size") + if this_data_size is not None: + has_binary_data = True + if this_data_size != 0: + start_index = self._output_name_to_buffer_map[ + name] + end_index = start_index + this_data_size + if datatype == 'BYTES': + # String results contain a 4-byte string length + # followed by the actual string characters. Hence, + # need to decode the raw bytes to convert into + # array elements. + np_array = deserialize_bytes_tensor( + self._buffer[start_index:end_index]) + else: + np_array = np.frombuffer( + self._buffer[start_index:end_index], + dtype=triton_to_np_dtype(datatype)) + else: + np_array = np.empty(0) + if not has_binary_data: + np_array = np.array(output['data'], + dtype=triton_to_np_dtype(datatype)) + np_array = np_array.reshape(output['shape']) + return np_array + return None + + def get_output(self, name): + """Retrieves the output tensor corresponding to the named ouput. + + Parameters + ---------- + name : str + The name of the tensor for which Output is to be + retrieved. 
+
+        Returns
+        -------
+        Dict
+            If an output tensor with specified name is present in
+            the infer response then returns it as a JSON dict,
+            otherwise returns None.
+        """
+        for output in self._result['outputs']:
+            if output['name'] == name:
+                return output
+
+        return None
+
+    def get_response(self):
+        """Retrieves the complete response
+
+        Returns
+        -------
+        dict
+            The underlying response dict.
+        """
+        return self._result
\ No newline at end of file
diff --git a/examples/clearml_serving_simple_http_inference_request/sample_image.webp b/examples/clearml_serving_simple_http_inference_request/sample_image.webp
new file mode 100644
index 0000000..9258c91
Binary files /dev/null and b/examples/clearml_serving_simple_http_inference_request/sample_image.webp differ
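
For quick verification without the vendored client, the same request can be sent as a plain KServe/Triton v2 REST call, which is all that http_triton.py wraps. The sketch below is a minimal, JSON-only variant using `requests`; it assumes the endpoint and names from client.py above (localhost:8000, model `keras_mnist` version `1`, input `dense_input` of datatype FP32 and shape [1, 784], output `activation_2`) and that this small MNIST payload does not need the binary-data extension.

# Minimal sketch (assumes the same endpoint, model and tensor names as client.py):
# send the MNIST digit as a plain KServe/Triton v2 JSON payload with `requests`
# instead of the vendored geventhttpclient-based client.
import numpy as np
import requests
from PIL import Image

IMAGE_PATH = '68747470733a2f2f646174616d61646e6573732e6769746875622e696f2f6173736574732f696d616765732f74665f66696c655f666565642f4d4e4953545f64696769742e706e67.png'

# Same preprocessing as client.py: grayscale, 28x28, float32, flattened to 784 values
image = Image.open(IMAGE_PATH).convert('L').resize((28, 28), Image.LANCZOS)
pixels = np.asarray(image, dtype=np.float32).reshape(-1).tolist()

payload = {
    'inputs': [{
        'name': 'dense_input',
        'shape': [1, 784],
        'datatype': 'FP32',
        'data': pixels,
    }],
    # Requesting the output by name returns it as a JSON tensor (no binary extension)
    'outputs': [{'name': 'activation_2'}],
}

response = requests.post(
    'http://localhost:8000/v2/models/keras_mnist/versions/1/infer',
    json=payload,
    timeout=60,
)
response.raise_for_status()

output = response.json()['outputs'][0]
scores = np.array(output['data']).reshape(output['shape'])
print(scores)

Since http_triton.py already imports from `tritonclient.utils`, the `tritonclient` package has to be installed anyway; `from tritonclient.http import InferenceServerClient, InferInput` should therefore work as a drop-in replacement for the vendored module if keeping a 1970-line copy in the example is undesirable.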