diff --git a/.gitignore b/.gitignore
index 1e12a66..f3038c1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@
 dist/
 build/
 *.egg-info/
+.tmp/
 
 
 # Compiled Python bytecode
@@ -23,6 +24,12 @@ Thumbs.db
 *.app
 *.exe
 *.war
+*.pkl
+*.pt
+*.pb
+data/
+runs/
+variables/
 
 # Large media files
 *.mp4
diff --git a/README.md b/README.md
index e1a165d..3ebe7f3 100644
--- a/README.md
+++ b/README.md
@@ -3,9 +3,9 @@
 
 <a href="https://app.community.clear.ml"><img src="https://github.com/allegroai/clearml/blob/master/docs/clearml-logo.svg?raw=true" width="250px"></a>
 
-**ClearML Serving - ML-Ops made easy**
+**ClearML Serving - Model deployment made easy**
 
-## **`clearml-serving` </br> Model-Serving Orchestration and Repository Solution**
+## **`clearml-serving v2.0` </br> :sparkles: Model Serving (ML/DL) Made Easy :tada:**
 
 
 ## :dizzy: New! version 2.0 in beta [now!](https://github.com/allegroai/clearml-serving/tree/dev) :confetti_ball:
@@ -20,128 +20,313 @@
 
 </div>
 
-<a href="https://app.community.clear.ml"><img src="https://github.com/allegroai/clearml-serving/blob/main/docs/webapp_screenshots.gif?raw=true" width="100%"></a>
 
-
-**`clearml-serving`** is a command line utility for the flexible orchestration of your model deployment.  
-**`clearml-serving`** can make use of a variety of serving engines (**Nvidia Triton, OpenVino Model Serving, KFServing**)
-setting them up for serving wherever you designate a ClearML Agent or on your ClearML Kubernetes cluster
+**`clearml-serving`** is a command line utility for model deployment and orchestration.  
+It enables model deployment including serving and preprocessing code to a Kubernetes cluster or custom container based solution.
 
 Features:
-* Spin serving engines on your Kubernetes cluster or ClearML Agent machine from CLI
-* Full usage & performance metrics integrated with ClearML UI
-* Multi-model support in a single serving engine container
-* Automatically deploy new model versions
-* Support Canary model releases
-* Integrates to ClearML Model Repository
-* Deploy & upgrade endpoints directly from ClearML UI
-* Programmatic interface for endpoint/versions/metric control
+* Easy to deploy & configure
+  * Support Machine Learning Models (Scikit Learn, XGBoost, LightGBM)
+  * Support Deep Learning Models (Tensorflow, PyTorch, ONNX)
+  * Customizable RestAPI for serving (i.e. allow per model pre/post-processing for easy integration)
+* Flexible  
+  * On-line model deployment 
+  * On-line endpoint model/version deployment (i.e. no need to take the service down)
+  * Per model standalone preprocessing and postprocessing python code 
+* Scalable
+  * Multi model per container
+  * Multi models per serving service
+  * Multi-service support (fully separated multiple serving services running independently)
+  * Multi cluster support
+  * Out-of-the-box node auto-scaling based on load/usage
+* Efficient
+  * multi-container resource utilization
+  * Support for CPU & GPU nodes
+  * Auto-batching for DL models
+* Automatic deployment
+  * Automatic model upgrades w/ canary support 
+  * Programmable API for model deployment
+* Canary A/B deployment
+  * Online Canary updates
+* Model Monitoring
+  * Usage Metric reporting
+  * Metric Dashboard
+  * Model performance metric
+  * Model performance Dashboard
 
+## ClearML Serving Design 
 
-## Installing ClearML Serving
+### ClearML Serving Design Principles 
 
-1. Setup your [**ClearML Server**](https://github.com/allegroai/clearml-server) or use the [Free tier Hosting](https://app.community.clear.ml)
-2. Connect your ClearML Worker(s) to your **ClearML Server** (see [**ClearML Agent**](https://github.com/allegroai/clearml-agent) / [Kubernetes integration](https://github.com/allegroai/clearml-agent#kubernetes-integration-optional))
-3. Install `clearml-serving` (Note: `clearml-serving` is merely a control utility, it does not require any resources for actual serving)
+**Modular** , **Scalable** , **Flexible** , **Customizable** , **Open Source**
+
+<a href="https://excalidraw.com/#json=v0ip945hun2SnO4HVLe0h,QKHfB04TFQLds3_4aqeBjQ"><img src="https://github.com/allegroai/clearml-serving/blob/dev/docs/design_diagram.png?raw=true" width="100%"></a>
+
+## Installation
+
+### prerequisites
+
+* ClearML-Server : Model repository, Service Health, Control plane
+* Kubernetes / Single-instance Machine : Deploying containers 
+* CLI : Configuration & model deployment interface
+
+### :nail_care: Initial Setup
+
+1. Setup your [**ClearML Server**](https://github.com/allegroai/clearml-server) or use the [Free tier Hosting](https://app.clear.ml)
+2. Setup local access (if you haven't already), see instructions [here](https://clear.ml/docs/latest/docs/getting_started/ds/ds_first_steps#install-clearml)
+3. Install clearml-serving CLI: 
 ```bash
-pip install clearml-serving
+pip3 install clearml-serving
+```
+4. Create the Serving Service Controller
+  - `clearml-serving create --name "serving example"`
+  - The new serving service UID should be printed: `New Serving Service created: id=aa11bb22aa11bb22`
+5. Write down the Serving Service UID
+6. Clone clearml-serving repository
+```bash
+git clone https://github.com/allegroai/clearml-serving.git
+```
+7. Edit the environment variables file (`docker/example.env`) with your clearml-server credentials and Serving Service UID. For example, you should have something like
+```bash
+cat docker/example.env
+```
+```bash
+  CLEARML_WEB_HOST="https://app.clear.ml"
+  CLEARML_API_HOST="https://api.clear.ml"
+  CLEARML_FILES_HOST="https://files.clear.ml"
+  CLEARML_API_ACCESS_KEY="<access_key_here>"
+  CLEARML_API_SECRET_KEY="<secret_key_here>"
+  CLEARML_SERVING_TASK_ID="<serving_service_id_here>"
+```
+8. Spin the clearml-serving containers with docker-compose (or if running on Kubernetes use the helm chart)
+```bash
+cd docker && docker-compose --env-file example.env -f docker-compose.yml up 
+```
+If you need Triton support (keras/pytorch/onnx etc.), use the triton docker-compose file
+```bash
+cd docker && docker-compose --env-file example.env -f docker-compose-triton.yml up 
+```
+:muscle: If running on a GPU instance w/ Triton support (keras/pytorch/onnx etc.), use the triton gpu docker-compose file
+```bash
+cd docker && docker-compose --env-file example.env -f docker-compose-triton-gpu.yml up 
 ```
 
-## Using ClearML Serving
-
-Clearml-Serving will automatically serve *published* models from your ClearML model repository, so the first step is getting a model into your ClearML model repository.  
-Background: When using `clearml` in your training code, any model stored by your python code is automatically registered (and, optionally, uploaded) to the model repository. This auto-magic logging is key for continuous model deployment.  
-To learn more on training models and the ClearML model repository, see the [ClearML documentation](https://clear.ml/docs/latest/docs/)
-
-### Training a toy model with Keras (about 2 minutes on a laptop)
-
-The main goal of `clearml-serving` is to seamlessly integrate with the development process and the model repository.
-This is achieved by combining ClearML's auto-magic logging which creates and uploads models directly from 
-the python training code, with accessing these models as they are automatically added into the model repository using the ClearML Server's REST API and its pythonic interface.  
-Let's demonstrate this seamless integration by training a toy Keras model to classify images based on the MNIST dataset. 
-Once we have a trained model in the model repository we will serve it using `clearml-serving`.
-
-We'll also see how we can retrain another version of the model, and have the model serving engine automatically upgrade to the new model version. 
-
-#### Keras mnist toy train example (single epoch mock training):
-
-1. install `tensorflow` (and of course `cleamrl`)
-   ```bash
-   pip install "tensorflow>2" clearml
-   ```
-
-2. Execute the training code
-   ```bash
-   cd examples/keras
-   python keras_mnist.py
-   ```
-   **Notice:** The only required integration code with `clearml` are the following two lines:
-   ```python
-   from clearml import Task
-   task = Task.init(project_name="examples", task_name="Keras MNIST serve example", output_uri=True)
-   ```
-   This call will make sure all outputs are automatically logged to the ClearML Server, this includes: console, Tensorboard, cmdline arguments, git repo etc.  
-   It also means any model stored by the code will be automatically uploaded and logged in the ClearML model repository.  
+> **Notice**: Any model that registers with "Triton" engine, will run the pre/post processing code on the Inference service container, and the model inference itself will be executed on the Triton Engine container.
 
 
-3. Review the models in the ClearML web UI:  
-   Go to the "Projects" section of your ClearML server ([free hosted](https://app.community.clear.ml) or [self-deployed](https://github.com/allegroai/clearml-server)).  
-   in the "examples" project, go to the Models tab (model repository).  
-   We should have a model named "Keras MNIST serve example - serving_model".  
-   Once a model-serving service is available, Right-clicking on the model and selecting "Publish" will trigger upgrading the model on the serving engine container.
-   
-Next we will spin the Serving Service and the serving-engine
+### :ocean: Optional: advanced setup - S3/GS/Azure access
 
-### Serving your models
+To add access credentials and allow the inference containers to download models from your S3/GS/Azure object-storage,
+add the respective environment variables to your env files (`example.env`).
+See further details on configuring the storage access [here](https://clear.ml/docs/latest/docs/integrations/storage#configuring-storage)
 
-In order to serve your models, `clearml-serving` will spawn a serving service which stores multiple endpoints and their configuration, 
-collects metric reports, and updates models when new versions are published in the model repository.  
-In addition, a serving engine is launched, which is the container actually running the inference engine.  
-(Currently supported engines are Nvidia-Triton, coming soon are Intel OpenVIno serving-engine and KFServing)
-
-Now that we have a published model in the ClearML model repository, we can spin a serving service and a serving engine.
-
-Starting a Serving Service:  
-
-1. Create a new serving instance.  
-   This is the control plane Task, we will see all its configuration logs and metrics in the "serving" project. We can have multiple serving services running in the same system.  
-   In this example we will make use of Nvidia-Triton engines.   
 ```bash
-clearml-serving triton --project "serving" --name "serving example"
-```
-2. Add models to the serving engine with specific endpoints.  
-Reminder: to view your model repository, login to your ClearML account, 
-   go to "examples" project and review the "Models" Tab
-```bash
-clearml-serving triton --endpoint "keras_mnist"  --model-project "examples" --model-name "Keras MNIST serve example - serving_model"
+AWS_ACCESS_KEY_ID
+AWS_SECRET_ACCESS_KEY
+AWS_DEFAULT_REGION
+
+GOOGLE_APPLICATION_CREDENTIALS
+
+AZURE_STORAGE_ACCOUNT
+AZURE_STORAGE_KEY
 ```
 
-3. Launch the serving service.  
-   The service will be launched on your "services" queue, which by default runs services on the ClearML server machine.  
-   (Read more on services queue [here](https://clear.ml/docs/latest/docs/clearml_agent#services-mode))  
-   We set our serving-engine to launch on the "default" queue, 
+### :information_desk_person: Concepts
+
+**CLI** - Secure configuration interface for on-line model upgrade/deployment on running Serving Services
+
+**Serving Service Task** - Control plane object storing configuration on all the endpoints. Supports multiple separate instances, deployed on multiple clusters.
+
+**Inference Services** - Inference containers, performing model serving pre/post processing. Also support CPU model inferencing.
+
+**Serving Engine Services** - Inference engine containers (e.g. Nvidia Triton, TorchServe etc.) used by the Inference Services for heavier model inference.
+
+**Statistics Service** - Single instance per Serving Service  collecting and broadcasting model serving & performance statistics
+
+**Time-series DB** - Statistics collection service used by the Statistics Service, e.g. Prometheus
+
+**Dashboards** - Customizable dashboard-ing solution on top of the collected statistics, e.g. Grafana
+
+### :point_right: Toy model (scikit learn) deployment example 
+
+1. Train toy scikit-learn model
+  - create new python virtual environment
+  - `pip3 install -r examples/sklearn/requirements.txt`
+  - `python3 examples/sklearn/train_model.py`
+  - Model was automatically registered and uploaded into the model repository. For Manual model registration see [here](#registering--deploying-new-models-manually) 
+2. Register the new Model on the Serving Service
+  - `clearml-serving --id <service_id> model add --engine sklearn --endpoint "test_model_sklearn" --preprocess "examples/sklearn/preprocess.py" --name "train sklearn model" --project "serving examples"`
+  - **Notice** the preprocessing python code is packaged and uploaded to the "Serving Service", to be used by any inference container, and downloaded in realtime when updated
+3. Spin the Inference Container
+  - Customize container [Dockerfile](clearml_serving/serving/Dockerfile) if needed
+  - Build container `docker build --tag clearml-serving-inference:latest -f clearml_serving/serving/Dockerfile .`
+  - Spin the inference container: `docker run -v ~/clearml.conf:/root/clearml.conf -p 8080:8080 -e CLEARML_SERVING_TASK_ID=<service_id> -e CLEARML_SERVING_POLL_FREQ=5 clearml-serving-inference:latest` 
+4. Test new model inference endpoint
+  - `curl -X POST "http://127.0.0.1:8080/serve/test_model_sklearn" -H "accept: application/json" -H "Content-Type: application/json" -d '{"x0": 1, "x1": 2}'`
+  
+**Notice**, now that we have an inference container running, we can add new model inference endpoints directly with the CLI. The inference container will automatically sync once every 5 minutes.
+
+**Notice** On the first few requests the inference container needs to download the model file and preprocessing python code, this means the request might take a little longer, once everything is cached, it will return almost immediately.
+
+**Notes:**
+> To review the model repository in the ClearML web UI, under the "serving examples" Project on your ClearML account/server ([free hosted](https://app.clear.ml) or [self-deployed](https://github.com/allegroai/clearml-server)).
+
+> Inference services status, console outputs and machine metrics are available in the ClearML UI in the Serving Service project (default: "DevOps" project)
+
+> To learn more on training models and the ClearML model repository, see the [ClearML documentation](https://clear.ml/docs)
+
+### :turtle: Registering & Deploying new models manually 
+
+Uploading an existing model file into the model repository can be done via the `clearml` RestAPI, the python interface, or with the `clearml-serving` CLI 
+
+> To learn more on training models and the ClearML model repository, see the [ClearML documentation](https://clear.ml/docs)
+
+- local model file on our laptop: 'examples/sklearn/sklearn-model.pkl'
+- Upload the model file to the `clearml-server` file storage and register it
+`clearml-serving --id <service_id> model upload --name "manual sklearn model" --project "serving examples" --framework "scikit-learn" --path examples/sklearn/sklearn-model.pkl`
+- We now have a new Model in the "serving examples" project, by the name of "manual sklearn model". The CLI output prints the UID of the newly created model, we will use it to register a new endpoint 
+- In the `clearml` web UI we can see the new model listed under the `Models` tab in the associated project. we can also download the model file itself directly from the web UI 
+- Register a new endpoint with the new model
+`clearml-serving --id <service_id> model add --engine sklearn --endpoint "test_model_sklearn" --preprocess "examples/sklearn/preprocess.py" --model-id <newly_created_model_id_here>`
+
+**Notice** we can also provide a different storage destination for the model, such as S3/GS/Azure, by passing
+`--destination="s3://bucket/folder"`, `gs://bucket/folder`, `azure://bucket/folder`. There is no need to provide a unique path to the destination argument, the location of the model will be a unique path based on the serving service ID and the model name
+
+
+### :rabbit: Automatic model deployment
+
+The clearml Serving Service supports automatic model deployment and upgrades, directly connected with the model repository and API. When the model auto-deploy is configured, new model versions will be automatically deployed when you "publish" or "tag" a new model in the `clearml` model repository. This automation interface allows for a simpler CI/CD model deployment process, as a single API automatically deploys (or removes) a model from the Serving Service.
+
+#### Automatic model deployment example
+
+1. Configure the model auto-update on the Serving Service
+- `clearml-serving --id <service_id> model auto-update --engine sklearn --endpoint "test_model_sklearn_auto" --preprocess "preprocess.py" --name "train sklearn model" --project "serving examples" --max-versions 2`
+2. Deploy the Inference container (if not already deployed)
+3. Publish a new model in the model repository
+- Go to the "serving examples" project in the ClearML web UI, click on the Models Tab, search for "train sklearn model" right click and select "Publish"
+- Use the RestAPI [details](https://clear.ml/docs/latest/docs/references/api/models#post-modelspublish_many)
+- Use Python interface: 
+```python
+from clearml import Model
+Model(model_id="unique_model_id_here").publish()
+```
+4. The new model is available on a new endpoint version (1), test with: 
+`curl -X POST "http://127.0.0.1:8080/serve/test_model_sklearn_auto/1" -H "accept: application/json" -H "Content-Type: application/json" -d '{"x0": 1, "x1": 2}'`
+
+### :bird: Canary endpoint setup
+
+Canary endpoint deployment adds a new endpoint where the actual request is sent to a preconfigured set of endpoints with a pre-provided distribution. For example, let's create a new endpoint "test_model_sklearn_canary", we can provide a list of endpoints and probabilities (weights).
+
 ```bash
-clearml-serving launch --queue default
+clearml-serving --id <service_id> model canary --endpoint "test_model_sklearn_canary" --weights 0.1 0.9 --input-endpoints test_model_sklearn/2 test_model_sklearn/1
+```
+This means that any request coming to `/test_model_sklearn_canary/` will be routed with probability of 90% to
+`/test_model_sklearn/1/` and with probability of 10% to `/test_model_sklearn/2/` 
+
+**Note:**
+> As with any other Serving Service configuration, we can configure the Canary endpoint while the Inference containers are already running and deployed, they will get updated in their next update cycle (default: once every 5 minutes)
+
+We can also prepare a "fixed" canary endpoint, always splitting the load between the last two deployed models:
+```bash
+clearml-serving --id <service_id> model canary --endpoint "test_model_sklearn_canary" --weights 0.1 0.9 --input-endpoints-prefix test_model_sklearn/
 ```
 
-4. Optional: If you do not have a machine connected to your ClearML cluster, either read more on our Kubernetes integration, or spin a bare-metal worker and connect it with your ClearML Server.  
-   `clearml-serving` is leveraging the orchestration capabilities of `ClearML` to launch the serving engine on the cluster.  
-   Read more on the [ClearML Agent](https://github.com/allegroai/clearml-agent) orchestration module [here](https://clear.ml/docs/latest/docs/clearml_agent)  
-   If you have not yet setup a ClearML worker connected to your `clearml` account, you can do this now using:
-   ```bash
-   pip install clearml-agent
-   clearml-agent daemon --docker --queue default --detached
-   ```
+This means that if we have two model inference endpoints: `/test_model_sklearn/1/`, `/test_model_sklearn/2/`  
+the 10% probability (weight 0.1) will match the last (order by version number) endpoint, i.e. `/test_model_sklearn/2/` and the 90% will match `/test_model_sklearn/1/`.
+When we add a new model endpoint version, e.g. `/test_model_sklearn/3/`, the canary distribution will automatically match the 90% probability to `/test_model_sklearn/2/` and the 10% to the new endpoint `/test_model_sklearn/3/`  
+
+Example:
+1. Add two endpoints:
+  - `clearml-serving --id <service_id> model add --engine sklearn --endpoint "test_model_sklearn" --preprocess "examples/sklearn/preprocess.py" --name "train sklearn model" --version 1 --project "serving examples"`
+  -  `clearml-serving --id <service_id> model add --engine sklearn --endpoint "test_model_sklearn" --preprocess "examples/sklearn/preprocess.py" --name "train sklearn model" --version 2 --project "serving examples"`
+2. Add Canary endpoint:
+  - `clearml-serving --id <service_id> model canary --endpoint "test_model_sklearn_canary" --weights 0.1 0.9 --input-endpoints test_model_sklearn/2 test_model_sklearn/1`
+3. Test Canary endpoint:
+  - `curl -X POST "http://127.0.0.1:8080/serve/test_model_sklearn_canary" -H "accept: application/json" -H "Content-Type: application/json" -d '{"x0": 1, "x1": 2}'` 
 
 
-**We are done!** 
-To test the new served model, you can `curl` to the new endpoint:
+### Model monitoring and performance metrics
+
+![Grafana Screenshot](docs/grafana_screenshot.png)
+
+ClearML serving instances send serving statistics (count/latency) automatically to Prometheus and Grafana can be used 
+to visualize and create live dashboards. 
+
+The default docker-compose installation is preconfigured with Prometheus and Grafana. Note that by default the data/state of both containers is *not* persistent. To add persistence, we recommend adding a volume mount.
+
+You can also add many custom metrics on the input/predictions of your models.
+Once a model endpoint is registered, adding custom metric can be done using the CLI.
+For example, assume we have our mock scikit-learn model deployed on endpoint `test_model_sklearn`, 
+we can log the requests inputs and outputs (see examples/sklearn/preprocess.py example):
 ```bash
-curl <serving-engine-ip>:8000/v2/models/keras_mnist/versions/1
+clearml-serving --id <serving_service_id_here> metrics add --endpoint test_model_sklearn --variable-scalar \
+x0=0,0.1,0.5,1,10 x1=0,0.1,0.5,1,10 y=0,0.1,0.5,0.75,1
 ```
 
-**Notice**: If we re-run our keras training example and publish a new model in the repository, the engine will automatically update to the new model.
+This will create a distribution histogram (buckets specified via a list of less-equal values after `=` sign),
+that we will be able to visualize on Grafana.
+Notice we can also log time-series values with `--variable-value x2` or discrete results (e.g. classifications strings) with `--variable-enum animal=cat,dog,sheep`.
+Additional custom variables can be reported in the preprocess and postprocess code with a call to `collect_custom_statistics_fn({'new_var': 1.337})`, see clearml_serving/preprocess/preprocess_template.py
 
-Further reading on advanced topics [here](coming-soon)
+With the new metrics logged we can create a visualization dashboard over the latency of the calls, and the output distribution. 
+
+Grafana model performance example:
+
+- browse to http://localhost:3000
+- login with: admin/admin
+- create a new dashboard
+- select Prometheus as data source
+- Add a query: `100 * increase(test_model_sklearn:_latency_bucket[1m]) / increase(test_model_sklearn:_latency_sum[1m])`
+- Change type to heatmap, and on the right-hand side under "Data Format" select "Time series buckets"
+- You now have the latency distribution, over time.
+- Repeat the same process for x0, the query would be `100 * increase(test_model_sklearn:x0_bucket[1m]) / increase(test_model_sklearn:x0_sum[1m])`
+
+> **Notice**: If not specified, all serving requests will be logged. To change the default, configure "CLEARML_DEFAULT_METRIC_LOG_FREQ", for example CLEARML_DEFAULT_METRIC_LOG_FREQ=0.2 means only 20% of all requests will be logged. You can also specify per endpoint log frequency with the `clearml-serving` CLI. Check the CLI documentation with `clearml-serving metrics --help`
+
+### :fire: Model Serving Examples
+
+- Scikit-Learn [example](examples/sklearn/readme.md) - random data 
+- Scikit-Learn Model Ensemble [example](examples/ensemble/readme.md) - random data 
+- XGBoost [example](examples/xgboost/readme.md) - iris dataset
+- LightGBM [example](examples/lightgbm/readme.md) - iris dataset
+- PyTorch [example](examples/pytorch/readme.md) - mnist dataset
+- TensorFlow/Keras [example](examples/keras/readme.md) - mnist dataset
+- Model Pipeline [example](examples/pipeline/readme.md) - random data
+
+### :pray: Status
+
+  - [x] FastAPI integration for inference service
+  - [x] multi-process Gunicorn for inference service
+  - [x] Dynamic preprocess python code loading (no need for container/process restart)
+  - [x] Model files download/caching (http/s3/gs/azure)
+  - [x] Scikit-learn, XGBoost, LightGBM integration
+  - [x] Custom inference, including dynamic code loading
+  - [x] Manual model upload/registration to model repository (http/s3/gs/azure)
+  - [x] Canary load balancing
+  - [x] Auto model endpoint deployment based on model repository state
+  - [x] Machine/Node health metrics
+  - [x] Dynamic online configuration
+  - [x] CLI configuration tool
+  - [x] Nvidia Triton integration
+  - [x] GZip request compression
+  - [x] TorchServe engine integration
+  - [x] Prebuilt Docker containers (dockerhub)
+  - [x] Docker-compose deployment (CPU/GPU)
+  - [x] Scikit-Learn example
+  - [x] XGBoost example
+  - [x] LightGBM example
+  - [x] PyTorch example
+  - [x] TensorFlow/Keras example
+  - [x] Model ensemble example
+  - [x] Model pipeline example
+  - [x] Statistics Service
+  - [x] Kafka install instructions
+  - [x] Prometheus install instructions
+  - [x] Grafana install instructions
+  - [ ] Kubernetes Helm Chart
+
+## Contributing
+
+**PRs are always welcomed** :heart: See more details in the ClearML [Guidelines for Contributing](https://github.com/allegroai/clearml/blob/master/docs/contributing.md).
 
 
diff --git a/clearml_serving/__main__.py b/clearml_serving/__main__.py
index 7dc608c..b1d97ac 100644
--- a/clearml_serving/__main__.py
+++ b/clearml_serving/__main__.py
@@ -1,179 +1,563 @@
 import json
-import os
-from argparse import ArgumentParser, FileType
+import os.path
+from argparse import ArgumentParser
+from pathlib import Path
 
-from .serving_service import ServingService
+from clearml_serving.serving.model_request_processor import ModelRequestProcessor, CanaryEP
+from clearml_serving.serving.endpoints import ModelMonitoring, ModelEndpoint, EndpointMetricLogging
+
+verbosity = False
 
 
-def restore_state(args):
-    session_state_file = os.path.expanduser('~/.clearml_serving.json')
-    # noinspection PyBroadException
-    try:
-        with open(session_state_file, 'rt') as f:
-            state = json.load(f)
-    except Exception:
-        state = {}
-    # store command line passed ID
-    args.cmd_id = getattr(args, 'id', None)
-    # restore ID from state
-    args.id = getattr(args, 'id', None) or state.get('id')
-    return args
+def func_metric_ls(args):
+    request_processor = ModelRequestProcessor(task_id=args.id)
+    print("List endpoint metrics, control task id={}".format(request_processor.get_id()))
+    request_processor.deserialize(skip_sync=True)
+    print("Logged Metrics:\n{}".format(
+        json.dumps({k: v.as_dict() for k, v in request_processor.list_metric_logging().items()}, indent=2)))
 
 
-def store_state(args, clear=False):
-    session_state_file = os.path.expanduser('~/.clearml_serving.json')
-    if clear:
-        state = {}
+def func_metric_rm(args):  # CLI handler for `metrics remove`: drop logged variables from an endpoint
+    request_processor = ModelRequestProcessor(task_id=args.id)
+    print("Serving service Task {}, Removing metrics from endpoint={}".format(
+        request_processor.get_id(), args.endpoint))
+    request_processor.deserialize(skip_sync=True)
+    for v in (args.variable or []):  # --variable is optional; nothing is removed when it is omitted
+        if request_processor.remove_metric_logging(endpoint=args.endpoint, variable_name=v):
+            print("Removing metric variable {} from endpoint: {}".format(v, args.endpoint))
+        else:
+            raise ValueError("Could not remove {} from endpoint {}".format(v, args.endpoint))
+    print("Updating serving service")
+    request_processor.serialize()
+
+
+def func_metric_add(args):  # CLI handler for `metrics add`: build an EndpointMetricLogging and register it
+    request_processor = ModelRequestProcessor(task_id=args.id)
+    print("Serving service Task {}, Adding metric logging endpoint \'/{}/\'".format(
+        request_processor.get_id(), args.endpoint))
+    request_processor.deserialize(skip_sync=True)
+    metric = EndpointMetricLogging(endpoint=args.endpoint)
+    if args.log_freq is not None:
+        metric.log_frequency = float(args.log_freq)
+    for v in (args.variable_scalar or []):
+        if '=' not in v:
+            raise ValueError("Variable '{}' should be in the form of <name>=<buckets> "
+                             "example: x1=0,1,2,3,4,5".format(v))
+        name, buckets = v.split('=', 1)
+        if name in metric.metrics:
+            print("Warning: {} defined twice".format(name))
+        if '/' in buckets:  # "<min>/<max>/<num_buckets>": range() rejects floats, so build the edges by hand
+            b_min, b_max, b_num = [float(b.strip()) for b in buckets.split('/', 2)]
+            buckets = [b_min + (b_max - b_min) * i / int(b_num) for i in range(int(b_num) + 1)]
+        else:
+            buckets = [float(b.strip()) for b in buckets.split(',')]
+        metric.metrics[name] = dict(type="scalar", buckets=buckets)
+
+    for v in (args.variable_enum or []):
+        if '=' not in v:
+            raise ValueError("Variable '{}' should be in the form of <name>=<buckets> "
+                             "example: x1=cat,dog,sheep".format(v))
+        name, buckets = v.split('=', 1)
+        if name in metric.metrics:
+            print("Warning: {} defined twice".format(name))
+        buckets = [str(b.strip()) for b in buckets.split(',')]
+        metric.metrics[name] = dict(type="enum", buckets=buckets)
+
+    for v in (args.variable_value or []):  # plain values carry no buckets
+        name = v.strip()
+        if name in metric.metrics:
+            print("Warning: {} defined twice".format(name))
+        metric.metrics[name] = dict(type="variable", buckets=None)
+
+    if not request_processor.add_metric_logging(metric=metric):
+        raise ValueError("Could not add metric logging endpoint {}".format(args.endpoint))
+
+    print("Updating serving service")
+    request_processor.serialize()
+
+
+def func_model_upload(args):
+    if not args.path and not args.url:
+        raise ValueError("Either --path or --url must be specified")
+    if args.path and args.url:
+        raise ValueError("Either --path or --url but not both")
+    if args.path and not os.path.exists(args.path):
+        raise ValueError("--path='{}' could not be found".format(args.path))
+    if not args.id:
+        raise ValueError("Serving Service ID must be provided, use --id <serving_id>")
+    from clearml import Task, OutputModel
+    from clearml.backend_interface.util import get_or_create_project
+    # todo: make it look nice
+    t = Task.get_task(task_id=args.id)
+    print("Creating new Model name='{}' project='{}' tags={}".format(args.name, args.project, args.tags or ""))
+    model = OutputModel(task=t, name=args.name, tags=args.tags or None, framework=args.framework)
+    destination = args.destination or t.get_output_destination() or t.get_logger().get_default_upload_destination()
+    model.set_upload_destination(uri=destination)
+    if args.path:
+        print("Uploading model file \'{}\' to {}".format(args.path, destination))
     else:
-        state = {str(k): str(v) if v is not None else None
-                 for k, v in args.__dict__.items() if not str(k).startswith('_') and k not in ('command', )}
-    # noinspection PyBroadException
-    try:
-        with open(session_state_file, 'wt') as f:
-            json.dump(state, f, sort_keys=True)
-    except Exception:
+        print("Registering model file \'{}\'".format(args.url))
+    model.update_weights(weights_filename=args.path, register_uri=args.url, auto_delete_file=False)
+    if args.project:
+        # noinspection PyProtectedMember
+        model._base_model.update(
+            project_id=get_or_create_project(session=t.session, project_name=args.project)
+        )
+    print("Model created and registered, new Model ID={}".format(model.id))
+    if args.publish:
+        model.publish()
+        print("Published Model ID={}".format(model.id))
+
+
+def func_model_ls(args):
+    request_processor = ModelRequestProcessor(task_id=args.id)
+    print("List model serving and endpoints, control task id={}".format(request_processor.get_id()))
+    request_processor.deserialize(skip_sync=True)
+    print("Endpoints:\n{}".format(
+        json.dumps({k: v.as_dict() for k, v in request_processor.get_endpoints().items()}, indent=2)))
+    print("Model Monitoring:\n{}".format(
+        json.dumps({k: v.as_dict() for k, v in request_processor.get_model_monitoring().items()}, indent=2)))
+    print("Canary:\n{}".format(
+        json.dumps({k: v.as_dict() for k, v in request_processor.get_canary_endpoints().items()}, indent=2)))
+
+
+def func_create_service(args):
+    request_processor = ModelRequestProcessor(
+        force_create=True, name=args.name, project=args.project, tags=args.tags or None)
+    print("New Serving Service created: id={}".format(request_processor.get_id()))
+
+
+def func_config_service(args):  # CLI handler for `config`: set external service addresses
+    request_processor = ModelRequestProcessor(task_id=args.id)
+    print("Configure serving service id={}".format(request_processor.get_id()))
+    request_processor.deserialize(skip_sync=True)  # NOTE(review): no serialize() follows; confirm configure() persists
+    if args.base_serving_url:
+        print("Configuring serving service [id={}] base_serving_url={}".format(
+            request_processor.get_id(), args.base_serving_url))
+        request_processor.configure(external_serving_base_url=args.base_serving_url)
+    if args.triton_grpc_server:
+        print("Configuring serving service [id={}] triton_grpc_server={}".format(
+            request_processor.get_id(), args.triton_grpc_server))
+        request_processor.configure(external_triton_grpc_server=args.triton_grpc_server)
+    if args.kafka_metric_server:
+        request_processor.configure(external_kafka_service_server=args.kafka_metric_server)
+    if args.metric_log_freq is not None:  # NOTE(review): --metric-log-freq is parsed but is currently a no-op
         pass
 
 
-def cmd_triton(args):
-    if not args.id and not args.name:
-        raise ValueError("Serving service must have a name, use --name <service_name>")
-
-    if args.cmd_id or (args.id and not args.name):
-        a_serving = ServingService(task_id=args.cmd_id or args.id)
+def func_list_services(_):
+    running_services = ModelRequestProcessor.list_control_plane_tasks()
+    print("Currently running Serving Services:\n")
+    if not running_services:
+        print("No running services found")
     else:
-        a_serving = ServingService(task_project=args.project, task_name=args.name, engine_type='triton')
-        args.id = a_serving.get_id()
+        for s in running_services:
+            print(s)
 
-    if args.endpoint:
-        print("Nvidia Triton Engine ID: {} - Adding serving endpoint: \n".format(args.id) +
-              ("model-project: '{}', model-name: '{}', model-tags: '{}', config-file: '{}'".format(
-                  args.model_project or '',
-                  args.model_name or '',
-                  args.model_tags or '',
-                  args.config or '') if not args.model_id else
-               "model-id: '{}', config-file: '{}'".format(args.model_id or '', args.config or '')))
 
-    if not args.endpoint and (args.model_project or args.model_tags or args.model_id or args.model_name):
-        raise ValueError("Serving endpoint must be provided, add --endpoint <endpoint_name>")
+def func_model_remove(args):
+    request_processor = ModelRequestProcessor(task_id=args.id)
+    print("Serving service Task {}, Removing Model endpoint={}".format(request_processor.get_id(), args.endpoint))
+    request_processor.deserialize(skip_sync=True)
+    if request_processor.remove_endpoint(endpoint_url=args.endpoint):
+        print("Removing static endpoint: {}".format(args.endpoint))
+    elif request_processor.remove_model_monitoring(model_base_url=args.endpoint):
+        print("Removing model monitoring endpoint: {}".format(args.endpoint))
+    elif request_processor.remove_canary_endpoint(endpoint_url=args.endpoint):
+        print("Removing model canary endpoint: {}".format(args.endpoint))
+    else:
+        raise ValueError("Could not find base endpoint URL: {}".format(args.endpoint))
 
-    if args.endpoint:
-        a_serving.add_model_serving(
+    print("Updating serving service")
+    request_processor.serialize()
+
+
+def func_canary_add(args):
+    request_processor = ModelRequestProcessor(task_id=args.id)
+    print("Serving service Task {}, Adding canary endpoint \'/{}/\'".format(
+        request_processor.get_id(), args.endpoint))
+    request_processor.deserialize(skip_sync=True)
+    if not request_processor.add_canary_endpoint(
+            canary=CanaryEP(
+                endpoint=args.endpoint,
+                weights=args.weights,
+                load_endpoints=args.input_endpoints,
+                load_endpoint_prefix=args.input_endpoint_prefix,
+            )
+    ):
+        raise ValueError("Could not add canary endpoint URL: {}".format(args.endpoint))
+
+    print("Updating serving service")
+    request_processor.serialize()
+
+
+def func_model_auto_update_add(args):
+    request_processor = ModelRequestProcessor(task_id=args.id)
+    print("Serving service Task {}, Adding Model monitoring endpoint: \'/{}/\'".format(
+        request_processor.get_id(), args.endpoint))
+
+    if args.aux_config:
+        if len(args.aux_config) == 1 and Path(args.aux_config[0]).exists():
+            aux_config = Path(args.aux_config[0]).read_text()
+        else:
+            from clearml.utilities.pyhocon import ConfigFactory
+            aux_config = ConfigFactory.parse_string('\n'.join(args.aux_config)).as_plain_ordered_dict()
+    else:
+        aux_config = None
+
+    request_processor.deserialize(skip_sync=True)
+    if not request_processor.add_model_monitoring(
+        ModelMonitoring(
+            base_serving_url=args.endpoint,
+            engine_type=args.engine,
+            monitor_project=args.project,
+            monitor_name=args.name,
+            monitor_tags=args.tags or None,
+            only_published=args.published,
+            max_versions=args.max_versions,
+            input_size=args.input_size,
+            input_type=args.input_type,
+            input_name=args.input_name,
+            output_size=args.output_size,
+            output_type=args.output_type,
+            output_name=args.output_name,
+            auxiliary_cfg=aux_config,
+        ),
+        preprocess_code=args.preprocess
+    ):
+        raise ValueError("Could not find base endpoint URL: {}".format(args.endpoint))
+
+    print("Updating serving service")
+    request_processor.serialize()
+
+
+def func_model_endpoint_add(args):
+    request_processor = ModelRequestProcessor(task_id=args.id)
+    print("Serving service Task {}, Adding Model endpoint \'/{}/\'".format(
+        request_processor.get_id(), args.endpoint))
+    request_processor.deserialize(skip_sync=True)
+
+    if args.aux_config:
+        if len(args.aux_config) == 1 and Path(args.aux_config[0]).exists():
+            aux_config = Path(args.aux_config[0]).read_text()
+        else:
+            from clearml.utilities.pyhocon import ConfigFactory
+            aux_config = ConfigFactory.parse_string('\n'.join(args.aux_config)).as_plain_ordered_dict()
+    else:
+        aux_config = None
+
+    if not request_processor.add_endpoint(
+        ModelEndpoint(
+            engine_type=args.engine,
             serving_url=args.endpoint,
-            model_project=args.model_project,
-            model_name=args.model_name,
-            model_tags=args.model_tags,
-            model_ids=[args.model_id] if args.model_id else None,
-            config_file=args.config,
-            max_versions=args.versions,
-        )
+            version=args.version,
+            model_id=args.model_id,
+            input_size=args.input_size,
+            input_type=args.input_type,
+            input_name=args.input_name,
+            output_size=args.output_size,
+            output_type=args.output_type,
+            output_name=args.output_name,
+            auxiliary_cfg=aux_config,
+        ),
+        preprocess_code=args.preprocess,
+        model_name=args.name,
+        model_project=args.project,
+        model_tags=args.tags or None,
+        model_published=args.published,
+    ):
+        raise ValueError("Could not find base endpoint URL: {}".format(args.endpoint))
 
-    a_serving.serialize(force=True)
-    store_state(args)
+    print("Updating serving service")
+    request_processor.serialize()
 
 
-def cmd_launch(args):
-    print('Launching Serving Engine: service: {}, queue: {}'.format(args.id, args.queue))
-
-    if not args.id:
-        raise ValueError("Serving service must specify serving service ID, use --id <service_id>")
-
-    a_serving = ServingService(task_id=args.id)
-
-    if a_serving.get_engine_type() not in ('triton',):
-        raise ValueError("Error, serving engine type \'{}\' is not supported".format(a_serving.get_engine_type()))
-
-    # launch services queue
-    a_serving.launch(queue_name=args.service_queue)
-    # launch engine
-    a_serving.launch_engine(
-        queue_name=args.queue,
-        container=args.engine_container or None,
-        container_args=args.engine_container_args or None,
-    )
-
-
-def cli(verbosity):
+def cli():
     title = 'clearml-serving - CLI for launching ClearML serving engine'
     print(title)
     parser = ArgumentParser(prog='clearml-serving', description=title)
     parser.add_argument('--debug', action='store_true', help='Print debug messages')
+    parser.add_argument(
+        '--id', type=str,
+        help='Control plane Task ID to configure '
+             '(if not provided automatically detect the running control plane Task)')
     subparsers = parser.add_subparsers(help='Serving engine commands', dest='command')
 
-    # create the launch command
-    parser_launch = subparsers.add_parser('launch', help='Launch a previously configured serving service')
-    parser_launch.add_argument(
-        '--id', default=None, type=str,
-        help='Specify a previously configured service ID, if not provided use the last created service')
-    parser_launch.add_argument(
-        '--queue', default=None, type=str, required=True,
-        help='Specify the clearml queue to be used for the serving engine server')
-    parser_launch.add_argument(
-        '--engine-container', default=None, type=str, required=False,
-        help='Specify the serving engine container to use.')
-    parser_launch.add_argument(
-        '--engine-container-args', default=None, type=str, required=False,
-        help='Specify the serving engine container execution arguments (single string). '
-             'Notice: this will override any default container arguments')
-    parser_launch.add_argument(
-        '--service-queue', default='services', type=str,
-        help='Specify the service queue to be used for the serving service, default: services queue')
-    parser_launch.set_defaults(func=cmd_launch)
+    parser_list = subparsers.add_parser('list', help='List running Serving Service')
+    parser_list.set_defaults(func=func_list_services)
 
-    # create the parser for the "triton" command
-    parser_trt = subparsers.add_parser('triton', help='Nvidia Triton Serving Engine')
-    parser_trt.add_argument(
-        '--id', default=None, type=str,
-        help='Add configuration to running serving session, pass serving Task ID, '
-             'if passed ignore --name / --project')
-    parser_trt.add_argument(
-        '--name', default=None, type=str,
-        help='Give serving service a name, should be a unique name')
-    parser_trt.add_argument(
-        '--project', default='DevOps', type=str,
-        help='Serving service project name, default: DevOps')
-    parser_trt.add_argument(
-        '--endpoint', required=False, type=str,
-        help='Serving endpoint, one per model, unique ')
-    parser_trt.add_argument(
-        '--versions', type=int,
-        help='Serving endpoint, support multiple versions, '
-             'max versions to deploy (version number always increase). Default (no versioning).')
-    parser_trt.add_argument(
-        '--config', required=False, type=FileType('r'),
-        help='Model `config.pbtxt` file, one per model, order matching with models')
-    parser_trt.add_argument(
+    parser_create = subparsers.add_parser('create', help='Create a new Serving Service')
+    parser_create.add_argument(
+        '--name', type=str,
+        help='[Optional] name the new serving service. Default: Serving-Service')
+    parser_create.add_argument(
+        '--tags', type=str, nargs='+',
+        help='[Optional] Specify tags for the new serving service')
+    parser_create.add_argument(
+        '--project', type=str,
+        help='[Optional] Specify project for the serving service. Default: DevOps')
+    parser_create.set_defaults(func=func_create_service)
+
+    parser_metrics = subparsers.add_parser('metrics', help='Configure inference metrics Service')
+    parser_metrics.set_defaults(func=parser_metrics.print_help)
+
+    metric_cmd = parser_metrics.add_subparsers(help='model metric command help')
+
+    parser_metrics_add = metric_cmd.add_parser('add', help='Add/modify metric for a specific endpoint')
+    parser_metrics_add.add_argument(
+        '--endpoint', type=str, required=True,
+        help='metric endpoint name including version, e.g. "model/1" or a prefix "model/*" '
+             'Notice: it will override any previous endpoint logged metrics')
+    parser_metrics_add.add_argument(
+        '--log-freq', type=float,
+        help='Optional: logging request frequency, between 0.0 to 1.0 '
+             'example: 1.0 means all requests are logged, 0.5 means half of the requests are logged '
+             'if not specified, use global logging frequency, see `config --metric-log-freq`')
+    parser_metrics_add.add_argument(
+        '--variable-scalar', type=str, nargs='+',
+        help='Add float (scalar) argument to the metric logger, '
+             '<name>=<histogram> example with specific buckets: "x1=0,0.2,0.4,0.6,0.8,1" or '
+             'with min/max/num_buckets "x1=0.0/1.0/5"')
+    parser_metrics_add.add_argument(
+        '--variable-enum', type=str, nargs='+',
+        help='Add enum (string) argument to the metric logger, '
+             '<name>=<optional_values> example: "detect=cat,dog,sheep"')
+    parser_metrics_add.add_argument(
+        '--variable-value', type=str, nargs='+',
+        help='Add non-samples scalar argument to the metric logger, '
+             '<name> example: "latency"')
+    parser_metrics_add.set_defaults(func=func_metric_add)
+
+    parser_metrics_rm = metric_cmd.add_parser('remove', help='Remove metric from a specific endpoint')
+    parser_metrics_rm.add_argument(
+        '--endpoint', type=str, help='metric endpoint name including version, e.g. "model/1" or a prefix "model/*"')
+    parser_metrics_rm.add_argument(
+        '--variable', type=str, nargs='+',
+        help='Remove (scalar/enum) argument from the metric logger, <name> example: "x1"')
+    parser_metrics_rm.set_defaults(func=func_metric_rm)
+
+    parser_metrics_ls = metric_cmd.add_parser('list', help='list metrics logged on all endpoints')
+    parser_metrics_ls.set_defaults(func=func_metric_ls)
+
+    parser_config = subparsers.add_parser('config', help='Configure a new Serving Service')
+    parser_config.add_argument(
+        '--base-serving-url', type=str,
+        help='External base serving service url. example: http://127.0.0.1:8080/serve')
+    parser_config.add_argument(
+        '--triton-grpc-server', type=str,
+        help='External ClearML-Triton serving container gRPC address. example: 127.0.0.1:9001')
+    parser_config.add_argument(
+        '--kafka-metric-server', type=str,
+        help='External Kafka service url. example: 127.0.0.1:9092')
+    parser_config.add_argument(
+        '--metric-log-freq', type=float,  # argparse %-formats help text, so a literal percent must be '%%'
+        help='Set default metric logging frequency. 1.0 is 100%% of all requests are logged')
+    parser_config.set_defaults(func=func_config_service)
+
+    parser_model = subparsers.add_parser('model', help='Configure Model endpoints for an already running Service')
+    parser_model.set_defaults(func=parser_model.print_help)
+
+    model_cmd = parser_model.add_subparsers(help='model command help')
+
+    parser_model_ls = model_cmd.add_parser('list', help='List current models')
+    parser_model_ls.set_defaults(func=func_model_ls)
+
+    parser_model_rm = model_cmd.add_parser('remove', help='Remove model by its endpoint name')
+    parser_model_rm.add_argument(
+        '--endpoint', type=str, help='model endpoint name')
+    parser_model_rm.set_defaults(func=func_model_remove)
+
+    parser_model_upload = model_cmd.add_parser('upload', help='Upload and register model files/folder')
+    parser_model_upload.add_argument(
+        '--name', type=str, required=True,
+        help='Specifying the model name to be registered in')
+    parser_model_upload.add_argument(
+        '--tags', type=str, nargs='+',
+        help='Optional: Add tags to the newly created model')
+    parser_model_upload.add_argument(
+        '--project', type=str, required=True,
+        help='Specifying the project for the model to be registered in')
+    parser_model_upload.add_argument(
+        '--framework', type=str, choices=("scikit-learn", "xgboost", "lightgbm", "tensorflow", "pytorch"),
+        help='[Optional] Specify the model framework: "scikit-learn", "xgboost", "lightgbm", "tensorflow", "pytorch"')
+    parser_model_upload.add_argument(
+        '--publish', action='store_true',
+        help='[Optional] Publish the newly created model '
+             '(change model state to "published" i.e. locked and ready to deploy')
+    parser_model_upload.add_argument(
+        '--path', type=str,
+        help='Specifying a model file/folder to be uploaded and registered/')
+    parser_model_upload.add_argument(
+        '--url', type=str,
+        help='Optional, Specifying an already uploaded model url '
+             '(e.g. s3://bucket/model.bin, gs://bucket/model.bin, azure://bucket/model.bin, '
+             'https://domain/model.bin)')
+    parser_model_upload.add_argument(
+        '--destination', type=str,
+        help='Optional, Specifying the target destination for the model to be uploaded '
+             '(e.g. s3://bucket/folder/, gs://bucket/folder/, azure://bucket/folder/)')
+    parser_model_upload.set_defaults(func=func_model_upload)
+
+    parser_model_lb = model_cmd.add_parser('canary', help='Add model Canary/A/B endpoint')
+    parser_model_lb.add_argument(
+        '--endpoint', type=str, help='model canary serving endpoint name (e.g. my_model/latest)')
+    parser_model_lb.add_argument(
+        '--weights', type=float, nargs='+', help='model canary weights (order matching model ep), (e.g. 0.2 0.8)')
+    parser_model_lb.add_argument(
+        '--input-endpoints', type=str, nargs='+',
+        help='Model endpoint prefixes, can also include version (e.g. my_model, my_model/v1)')
+    parser_model_lb.add_argument(
+        '--input-endpoint-prefix', type=str,
+        help='Model endpoint prefix, lexicographic order or by version <int> (e.g. my_model/1, my_model/v1) '
+             'where the first weight matches the last version.')
+    parser_model_lb.set_defaults(func=func_canary_add)
+
+    parser_model_monitor = model_cmd.add_parser('auto-update', help='Add/Modify model auto update service')
+    parser_model_monitor.add_argument(
+        '--endpoint', type=str,
+        help='Base Model endpoint (must be unique)')
+    parser_model_monitor.add_argument(
+        '--engine', type=str, required=True,
+        help='Model endpoint serving engine (triton, sklearn, xgboost, lightgbm)')
+    parser_model_monitor.add_argument(
+        '--max-versions', type=int, default=1,
+        help='max versions to store (and create endpoints) for the model. highest number is the latest version')
+    parser_model_monitor.add_argument(
+        '--name', type=str,  # help hint fixed: regexp anchors are '^' (start) then '$' (end)
+        help='Specify Model Name to be selected and auto updated '
+             '(notice regexp selection use \"^name$\" for exact match)')
+    parser_model_monitor.add_argument(
+        '--tags', type=str, nargs='+',
+        help='Specify Tags to be selected and auto updated')
+    parser_model_monitor.add_argument(
+        '--project', type=str,
+        help='Specify Model Project to be selected and auto updated')
+    parser_model_monitor.add_argument(
+        '--published', action='store_true',
+        help='Only select published Model for the auto updated')
+    parser_model_monitor.add_argument(
+        '--preprocess', type=str,
+        help='Specify Pre/Post processing code to be used with the model (point to local file / folder) '
+             '- this should hold for all the models'
+    )
+    parser_model_monitor.add_argument(
+        '--input-size', type=int, nargs='+',
+        help='Optional: Specify the model matrix input size [Rows x Columns X Channels etc ...]'
+    )
+    parser_model_monitor.add_argument(
+        '--input-type', type=str,
+        help='Optional: Specify the model matrix input type, examples: uint8, float32, int16, float16 etc.'
+    )
+    parser_model_monitor.add_argument(
+        '--input-name', type=str,
+        help='Optional: Specify the model layer pushing input into, examples: layer_0'
+    )
+    parser_model_monitor.add_argument(
+        '--output-size', type=int, nargs='+',
+        help='Optional: Specify the model matrix output size [Rows x Columns X Channels etc ...]'
+    )
+    parser_model_monitor.add_argument(
+        '--output-type', '--output_type', type=str,  # dashed form matches siblings; underscore kept as alias
+        help='Optional: Specify the model matrix output type, examples: uint8, float32, int16, float16 etc.'
+    )
+    parser_model_monitor.add_argument(
+        '--output-name', type=str,
+        help='Optional: Specify the model layer pulling results from, examples: layer_99'
+    )
+    parser_model_monitor.add_argument(
+        '--aux-config', type=str, nargs='+',  # values are key=value strings or a file path, never ints
+        help='Specify additional engine specific auxiliary configuration in the form of key=value. '
+             'Example: platform=onnxruntime_onnx response_cache.enable=true max_batch_size=8 '
+             'Notice: you can also pass full configuration file (e.g. Triton "config.pbtxt")'
+    )
+    parser_model_monitor.set_defaults(func=func_model_auto_update_add)
+
+    parser_model_add = model_cmd.add_parser('add', help='Add/Update model')
+    parser_model_add.add_argument(
+        '--engine', type=str, required=True,
+        help='Model endpoint serving engine (triton, sklearn, xgboost, lightgbm)')
+    parser_model_add.add_argument(
+        '--endpoint', type=str, required=True,
+        help='Model endpoint (must be unique)')
+    parser_model_add.add_argument(
+        '--version', type=str, default=None,
+        help='Model endpoint version (default: None)')
+    parser_model_add.add_argument(
         '--model-id', type=str,
-        help='(Optional) Model ID to deploy, if passed model-project/model-name/model-tags are ignored')
-    parser_trt.add_argument(
-        '--model-project', type=str, help='Automatic model deployment and upgrade, select model project (exact match)')
-    parser_trt.add_argument(
-        '--model-name', type=str, help='Automatic model deployment and upgrade, select model name (exact match)')
-    parser_trt.add_argument(
-        '--model-tags', nargs='*', type=str,
-        help='Automatic model deployment and upgrade, select model name tags to include, '
-             'model has to have all tags to be deployed/upgraded')
-    parser_trt.set_defaults(func=cmd_triton)
+        help='Specify a Model ID to be served')
+    parser_model_add.add_argument(
+        '--preprocess', type=str,
+        help='Specify Pre/Post processing code to be used with the model (point to local file / folder)'
+    )
+    parser_model_add.add_argument(
+        '--input-size', type=int, nargs='+',
+        help='Optional: Specify the model matrix input size [Rows x Columns X Channels etc ...]'
+    )
+    parser_model_add.add_argument(
+        '--input-type', type=str,
+        help='Optional: Specify the model matrix input type, examples: uint8, float32, int16, float16 etc.'
+    )
+    parser_model_add.add_argument(
+        '--input-name', type=str,
+        help='Optional: Specify the model layer pushing input into, examples: layer_0'
+    )
+    parser_model_add.add_argument(
+        '--output-size', type=int, nargs='+',
+        help='Optional: Specify the model matrix output size [Rows x Columns X Channels etc ...]'
+    )
+    parser_model_add.add_argument(
+        '--output-type', type=str,
+        help='Specify the model matrix output type, examples: uint8, float32, int16, float16 etc.'
+    )
+    parser_model_add.add_argument(
+        '--output-name', type=str,
+        help='Optional: Specify the model layer pulling results from, examples: layer_99'
+    )
+    parser_model_add.add_argument(
+        '--aux-config', type=str, nargs='+',  # values are key=value strings or a file path, never ints
+        help='Specify additional engine specific auxiliary configuration in the form of key=value. '
+             'Example: platform=onnxruntime_onnx response_cache.enable=true max_batch_size=8 '
+             'Notice: you can also pass full configuration file (e.g. Triton "config.pbtxt")'
+    )
+    parser_model_add.add_argument(
+        '--name', type=str,
+        help='[Optional] Instead of specifying model-id select based on Model Name')
+    parser_model_add.add_argument(
+        '--tags', type=str, nargs='+',
+        help='[Optional] Instead of specifying model-id select based on Model Tags')
+    parser_model_add.add_argument(
+        '--project', type=str,
+        help='[Optional] Instead of specifying model-id select based on Model project')
+    parser_model_add.add_argument(
+        '--published', action='store_true',
+        help='[Optional] Instead of specifying model-id select based on Model published')
+    parser_model_add.set_defaults(func=func_model_endpoint_add)
 
     args = parser.parse_args()
-    verbosity['debug'] = args.debug
-    args = restore_state(args)
+    global verbosity
+    verbosity = args.debug
 
     if args.command:
-        args.func(args)
+        if args.command not in ("create", "list") and not args.id:
+            print("Notice! serving service ID not provided, selecting the first active service")
+
+        if getattr(args.func, '__self__', None) is not None:  # bound parser.print_help: takes no args
+            args.func()
+        else:  # plain func_* handler; its own AttributeErrors now propagate instead of being masked
+            args.func(args)
     else:
         parser.print_help()
 
 
 def main():
-    verbosity = dict(debug=False)
+    global verbosity
     try:
-        cli(verbosity)
+        cli()
     except KeyboardInterrupt:
         print('\nUser aborted')
     except Exception as ex:
         print('\nError: {}'.format(ex))
-        if verbosity.get('debug'):
+        if verbosity:
             raise ex
         exit(1)
 
diff --git a/clearml_serving/engines/__init__.py b/clearml_serving/engines/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/clearml_serving/engines/triton/Dockerfile b/clearml_serving/engines/triton/Dockerfile
new file mode 100644
index 0000000..162940c
--- /dev/null
+++ b/clearml_serving/engines/triton/Dockerfile
@@ -0,0 +1,22 @@
+
+FROM nvcr.io/nvidia/tritonserver:22.02-py3
+
+
+ENV LC_ALL=C.UTF-8
+
+# install base package
+RUN pip3 install clearml-serving
+
+# copy the execution code from the local build context (the git-clone alternative is kept commented out)
+# RUN cd $HOME && git clone https://github.com/allegroai/clearml-serving.git
+COPY clearml_serving /root/clearml/clearml_serving
+
+RUN pip3 install -r /root/clearml/clearml_serving/engines/triton/requirements.txt
+
+# default serving port
+EXPOSE 8001
+
+# environment variables to load the Task from: CLEARML_SERVING_TASK_ID, CLEARML_SERVING_PORT
+
+WORKDIR /root/clearml/
+ENTRYPOINT ["clearml_serving/engines/triton/entrypoint.sh"]
diff --git a/clearml_serving/engines/triton/__init__.py b/clearml_serving/engines/triton/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/clearml_serving/engines/triton/entrypoint.sh b/clearml_serving/engines/triton/entrypoint.sh
new file mode 100755
index 0000000..a896525
--- /dev/null
+++ b/clearml_serving/engines/triton/entrypoint.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+# print configuration
+echo CLEARML_SERVING_TASK_ID="$CLEARML_SERVING_TASK_ID"
+echo CLEARML_TRITON_POLL_FREQ="$CLEARML_TRITON_POLL_FREQ"
+echo CLEARML_TRITON_METRIC_FREQ="$CLEARML_TRITON_METRIC_FREQ"
+echo CLEARML_TRITON_HELPER_ARGS="$CLEARML_TRITON_HELPER_ARGS"
+echo EXTRA_PYTHON_PACKAGES="$EXTRA_PYTHON_PACKAGES"
+
+# we should also have clearml-server configurations
+
+if [ -n "$EXTRA_PYTHON_PACKAGES" ]; then
+      # unquoted: a space-separated list of packages splits into separate pip arguments
+      python3 -m pip install $EXTRA_PYTHON_PACKAGES
+fi
+
+# start service (helper args are word-split; the container arguments are forwarded verbatim via "$@")
+PYTHONPATH=$(pwd) python3 clearml_serving/engines/triton/triton_helper.py $CLEARML_TRITON_HELPER_ARGS "$@"
diff --git a/clearml_serving/engines/triton/requirements.txt b/clearml_serving/engines/triton/requirements.txt
new file mode 100644
index 0000000..1bc4db8
--- /dev/null
+++ b/clearml_serving/engines/triton/requirements.txt
@@ -0,0 +1,6 @@
+clearml >= 1.3.1
+clearml-serving
+tritonclient[grpc]>=2.18.0,<2.19
+grpcio
+Pillow>=9.0.1,<10
+pathlib2
\ No newline at end of file
diff --git a/clearml_serving/engines/triton/triton_helper.py b/clearml_serving/engines/triton/triton_helper.py
new file mode 100644
index 0000000..fd6b760
--- /dev/null
+++ b/clearml_serving/engines/triton/triton_helper.py
@@ -0,0 +1,515 @@
+import os
+import re
+import shutil
+import subprocess
+from argparse import ArgumentParser
+from time import time
+from typing import Optional
+
+import numpy as np
+from clearml import Task, Logger, InputModel
+from clearml.backend_api.utils import get_http_session_with_retry
+from clearml.utilities.pyhocon import ConfigFactory, ConfigTree, HOCONConverter
+from pathlib import Path
+
+from clearml_serving.serving.endpoints import ModelEndpoint
+from clearml_serving.serving.model_request_processor import ModelRequestProcessor
+
+
+class TritonHelper(object):
+    _metric_line_parsing = r"(\w+){(gpu_uuid=\"[\w\W]*\",)?model=\"(\w+)\",\s*version=\"(\d+)\"}\s*([0-9.]*)"
+    _default_metrics_port = 8002
+
+    def __init__(
+            self,
+            args,  # Any (argparse namespace; copied into a plain dict below)
+            task,  # type: Task
+            serving_id,  # type: str
+            metric_host=None,  # type: Optional[str]
+            metric_port=None,  # type: Optional[int]
+    ):
+        # type: (...) -> None
+        self._http_session = get_http_session_with_retry()
+        self.args = dict(**args.__dict__) if args else {}
+        self.task = task
+        self._serving_service_task_id = serving_id
+        self._serving_service_task = None  # type: Optional[ModelRequestProcessor]
+        self._current_endpoints = {}  # endpoints already synced by model_service_update_step()
+        self.metric_host = metric_host or '0.0.0.0'
+        self.metric_port = metric_port or self._default_metrics_port
+        self._parse_metric = re.compile(self._metric_line_parsing)
+        self._timestamp = time()  # reference time; report_metrics() reports seconds elapsed since this
+        self._last_update_step = None
+        print('Starting Triton Helper service\n{}\n'.format(self.args))
+
+    def report_metrics(self, remote_logger):
+        # type: (Optional[Logger]) -> bool
+        """Scrape the Triton metrics endpoint and report each parsed sample as a scalar; True on success."""
+        # iterations are seconds from start
+        iteration = int(time() - self._timestamp)
+
+        report_msg = "reporting metrics: relative time {} sec".format(iteration)
+        self.task.get_logger().report_text(report_msg)
+        if remote_logger:
+            remote_logger.report_text(report_msg, print_console=False)
+
+        # noinspection PyBroadException
+        try:
+            # this is inside the container
+            request = self._http_session.get('http://{}:{}/metrics'.format(self.metric_host, self.metric_port))  # noqa
+            if not request.ok:
+                return False
+            content = request.content.decode().split('\n')
+        except Exception:
+            return False
+
+        for line in content:
+            line = line.strip()
+            if not line or line.startswith('#'):
+                continue
+            # noinspection PyBroadException
+            try:
+                metric, _, variant, version, value = self._parse_metric.match(line).groups()
+                value = float(value)
+            except Exception:
+                continue
+            self.task.get_logger().report_scalar(
+                title=metric, series='{}.v{}'.format(variant, version), iteration=iteration, value=value
+            )
+            # on the remote logger we add our own Task ID (unique ID),
+            # to support multiple servers reporting to the same service controller
+            if remote_logger:
+                remote_logger.report_scalar(
+                    title=metric,
+                    series='{}.v{}.{}'.format(variant, version, self.task.id),
+                    iteration=iteration,
+                    value=value
+                )
+
+        return True
+
+    def model_service_update_step(self, model_repository_folder=None, verbose=True):
+        # type: (Optional[str], bool) -> bool
+        """Sync changed triton endpoints into the local model repository; True if the endpoint set changed."""
+        if not self._serving_service_task:
+            return False
+
+        active_endpoints = self._serving_service_task.get_synced_endpoints()
+
+        self._last_update_step = time()
+
+        # nothing to do
+        if self._current_endpoints == active_endpoints:
+            return False
+
+        if not model_repository_folder:
+            model_repository_folder = '/models/'
+
+        if verbose:
+            print('Updating local model folder: {}'.format(model_repository_folder))
+
+        for url, endpoint in active_endpoints.items():
+
+            # skip if there is no change
+            if url in self._current_endpoints and self._current_endpoints.get(url) == endpoint:
+                continue
+
+            # skip if this is not a triton engine endpoint:
+            if endpoint.engine_type != "triton":
+                continue
+
+            # the endpoint url becomes a single folder name inside the model repository
+            url = url.replace("/", "_")
+            folder = Path(model_repository_folder) / url
+            folder.mkdir(parents=True, exist_ok=True)
+
+            config_pbtxt = folder / 'config.pbtxt'
+            # download model versions
+            version = 1
+            model_id = endpoint.model_id
+
+            model_folder = folder / str(version)
+
+            model_folder.mkdir(parents=True, exist_ok=True)
+            model = None
+            # noinspection PyBroadException
+            try:
+                model = InputModel(model_id)
+                local_path = model.get_local_copy()
+            except Exception:
+                local_path = None
+            if not local_path:
+                print("Error retrieving model ID {} [{}]".format(model_id, model.url if model else ''))
+                continue
+
+            local_path = Path(local_path)
+
+            # prepare config.pbtxt
+            self.create_config_pbtxt(
+                endpoint, target_pbtxt_file=config_pbtxt.as_posix(), platform=model.framework
+            )
+
+            if verbose:
+                print('Update model v{} in {}'.format(version, model_folder))
+
+            # if this is a folder copy everything and delete the temp folder
+            if local_path.is_dir() and model and (
+                    str(model.framework).lower().startswith("tensorflow") or
+                    str(model.framework).lower().startswith("keras")
+            ):
+                # we assume we have a `tensorflow.savedmodel` folder
+                model_folder /= 'model.savedmodel'
+                model_folder.mkdir(parents=True, exist_ok=True)
+                # rename to old (copytree below requires that the target does not exist)
+                old_folder = None
+                if model_folder.exists():
+                    old_folder = model_folder.parent / '.old.{}'.format(model_folder.name)
+                    model_folder.replace(old_folder)
+                if verbose:
+                    print('copy model into {}'.format(model_folder))
+                shutil.copytree(
+                    local_path.as_posix(), model_folder.as_posix(), symlinks=False,
+                )
+                if old_folder:
+                    shutil.rmtree(path=old_folder.as_posix())
+                # delete temp folder
+                shutil.rmtree(local_path.as_posix())
+            else:
+                # single file should be moved
+                if model and str(model.framework).lower().startswith("pytorch"):
+                    target_path = model_folder / "model.pt"
+                else:
+                    target_path = model_folder / local_path.name
+
+                old_file = None
+                if target_path.exists():
+                    old_file = target_path.parent / '.old.{}'.format(target_path.name)
+                    target_path.replace(old_file)
+                shutil.move(local_path.as_posix(), target_path.as_posix())
+                if old_file:
+                    old_file.unlink()
+
+        # todo: trigger triton model reloading (instead of relying on current poll mechanism)
+        # based on the model endpoint changes
+
+        # update current state
+        self._current_endpoints = active_endpoints
+
+        return True
+
+    def maintenance_daemon(
+            self,
+            local_model_repo='/models',  # type: str
+            update_frequency_sec=60.0,  # type: float
+            metric_frequency_sec=60.0  # type: float
+    ):
+        # type: (...) -> None
+        """Start tritonserver and keep syncing models / reporting metrics until the server process exits."""
+        Path(local_model_repo).mkdir(parents=True, exist_ok=True)
+
+        self._serving_service_task = ModelRequestProcessor(task_id=self._serving_service_task_id)
+        self.model_service_update_step(model_repository_folder=local_model_repo, verbose=True)
+
+        # noinspection PyProtectedMember
+        remote_logger = self._serving_service_task._task.get_logger()
+
+        # todo: log triton server outputs when running locally
+
+        # we assume we can run the triton server
+        cmd = [
+            'tritonserver',
+            '--model-control-mode=poll',
+            '--model-repository={}'.format(local_model_repo),
+            '--repository-poll-secs={}'.format(update_frequency_sec),
+            '--metrics-port={}'.format(self.metric_port),  # same port report_metrics() scrapes
+            '--allow-metrics=true',
+            '--allow-gpu-metrics=true',
+        ]
+        for k, v in self.args.items():
+            if not v or not str(k).startswith('t_'):
+                continue
+            cmd.append('--{}={}'.format(str(k)[2:].replace('_', '-'), v))  # t_http_port -> --http-port
+
+        print('Starting server: {}'.format(cmd))
+        try:
+            proc = subprocess.Popen(cmd)
+        except FileNotFoundError:
+            raise ValueError(
+                "Triton Server Engine (tritonserver) could not be found!\n"
+                "Verify you running inside the `nvcr.io/nvidia/tritonserver` docker container")
+        base_freq = min(update_frequency_sec, metric_frequency_sec)
+        metric_tic = update_tic = time()
+        while True:
+            try:
+                error_code = proc.wait(timeout=base_freq)
+                if error_code == 0:
+                    print("triton-server process ended with error code {}".format(error_code))
+                    return
+                raise ValueError("triton-server process ended with error code {}".format(error_code))
+            except subprocess.TimeoutExpired:
+                # the server is still running after `base_freq` seconds: do the periodic maintenance
+                pass
+
+            # update models
+            if time() - update_tic > update_frequency_sec:
+                print("Info: syncing models from main serving service")
+                if self.model_service_update_step(model_repository_folder=local_model_repo, verbose=True):
+                    print("Info: Models updated from main serving service")
+                update_tic = time()
+
+            # update stats
+            if time() - metric_tic > metric_frequency_sec:
+                metric_tic = time()
+                self.report_metrics(remote_logger)
+
+    @classmethod
+    def create_config_pbtxt(cls, endpoint, target_pbtxt_file, platform=None):
+        # type: (ModelEndpoint, str, Optional[str]) -> bool
+        """
+        Write the Triton `config.pbtxt` for `endpoint` into `target_pbtxt_file`; always returns True.
+        Spec: https://github.com/triton-inference-server/server/blob/main/docs/model_configuration.md
+        """
+
+        def _convert_lists(config):
+            if isinstance(config, list):
+                return [_convert_lists(i) for i in config]
+
+            if not isinstance(config, ConfigTree):
+                return config
+
+            for k in list(config.keys()):
+                v = config[k]
+                # try to convert to list
+                if isinstance(v, (ConfigTree, list)):
+                    # noinspection PyBroadException
+                    try:
+                        a_list = config.get_list(k, [])
+                        if a_list:
+                            config[k] = _convert_lists(a_list)
+                            continue
+                    except Exception:
+                        pass
+
+                config[k] = _convert_lists(v)
+
+            return config
+
+        final_config_pbtxt = ""
+        config_dict = dict()
+
+        if endpoint.auxiliary_cfg and isinstance(endpoint.auxiliary_cfg, str):
+            final_config_pbtxt = endpoint.auxiliary_cfg + "\n"
+        elif endpoint.auxiliary_cfg and isinstance(endpoint.auxiliary_cfg, dict):
+            config_dict = dict(**endpoint.auxiliary_cfg)
+
+        config_dict = ConfigFactory.from_dict(config_dict)
+
+        # The framework for the model. Possible values are:
+        #   "tensorrt_plan", "tensorflow_graphdef",
+        #   "tensorflow_savedmodel", "onnxruntime_onnx",
+        #   "pytorch_libtorch".
+        # Default for TF: "tensorflow_savedmodel"
+
+        # replace ": [{" with ": [{" (currently not needed)
+        # pattern = re.compile(r"(?P<key>\w+)(?P<space>\s+)(?P<bracket>(\[)|({))")
+
+        if endpoint.input_size:
+            config_dict.put("input.0.dims", endpoint.input_size)
+
+        if endpoint.output_size:
+            config_dict.put("output.0.dims", endpoint.output_size)
+
+        input_type = None
+        if endpoint.input_type:
+            input_type = "TYPE_" + cls.np_to_triton_dtype(np.dtype(endpoint.input_type))
+            config_dict.put("input.0.data_type", input_type)
+
+        output_type = None
+        if endpoint.output_type:
+            output_type = "TYPE_" + cls.np_to_triton_dtype(np.dtype(endpoint.output_type))
+            config_dict.put("output.0.data_type", output_type)
+
+        if endpoint.input_name:
+            config_dict.put("input.0.name", endpoint.input_name)
+
+        if endpoint.output_name:
+            config_dict.put("output.0.name", endpoint.output_name)
+
+        if platform and not config_dict.get("platform", None) and not config_dict.get("backend", None):
+            platform = str(platform).lower()
+            if platform.startswith("tensorflow") or platform.startswith("keras"):
+                config_dict["platform"] = "tensorflow_savedmodel"
+            elif platform.startswith("pytorch") or platform.startswith("caffe"):
+                config_dict["backend"] = "pytorch"
+            elif platform.startswith("onnx"):
+                config_dict["platform"] = "onnxruntime_onnx"
+
+        # convert to lists anything that we can:
+        if config_dict:
+            config_dict = _convert_lists(config_dict)
+            # Convert HOCON standard to predefined message format
+            config_pbtxt = "\n" + HOCONConverter.to_hocon(config_dict). \
+                replace("=", ":").replace(" : ", ": ")
+            # conform types (remove string quotes)
+            if input_type:
+                config_pbtxt = config_pbtxt.replace(f"\"{input_type}\"", f"{input_type}")
+            if output_type:
+                config_pbtxt = config_pbtxt.replace(f"\"{output_type}\"", f"{output_type}")
+            # same for the KIND_CPU / KIND_GPU values (remove string quotes)
+            config_pbtxt = config_pbtxt.replace("\"KIND_CPU\"", "KIND_CPU").replace("\"KIND_GPU\"", "KIND_GPU")
+        else:
+            config_pbtxt = ""
+
+        # merge the two: a string auxiliary_cfg (if any) comes first, then the generated section
+        final_config_pbtxt += config_pbtxt
+        print("INFO: target config.pbtxt file for endpoint '{}':\n{}\n".format(
+            endpoint.serving_url, final_config_pbtxt))
+
+        with open(target_pbtxt_file, "w") as config_file:
+            config_file.write(final_config_pbtxt)
+
+        return True
+
+    @staticmethod
+    def np_to_triton_dtype(np_dtype):
+        # type: (np.dtype) -> Optional[str]
+        """
+        Map a numpy dtype to a Triton type name (e.g. "FP32"); returns None when there is no match.
+        copied from tritonclientutils import np_to_triton_dtype
+        """
+        if np_dtype == bool:
+            return "BOOL"
+        elif np_dtype == np.int8:
+            return "INT8"
+        elif np_dtype == np.int16:
+            return "INT16"
+        elif np_dtype == np.int32:
+            return "INT32"
+        elif np_dtype == np.int64:
+            return "INT64"
+        elif np_dtype == np.uint8:
+            return "UINT8"
+        elif np_dtype == np.uint16:
+            return "UINT16"
+        elif np_dtype == np.uint32:
+            return "UINT32"
+        elif np_dtype == np.uint64:
+            return "UINT64"
+        elif np_dtype == np.float16:
+            return "FP16"
+        elif np_dtype == np.float32:
+            return "FP32"
+        elif np_dtype == np.float64:
+            return "FP64"
+        elif np_dtype == np.object_ or np_dtype.type == np.bytes_:
+            return "BYTES"
+        return None
+
+
+def main():
+    """Parse CLI / environment configuration, create the engine Task and run the Triton maintenance loop."""
+    title = 'clearml-serving - Nvidia Triton Engine Controller'
+    print(title)
+    parser = ArgumentParser(prog='clearml-serving', description=title)
+    parser.add_argument(
+        '--serving-id', default=os.environ.get('CLEARML_SERVING_TASK_ID'), type=str,
+        help='Specify main serving service Task ID')
+    parser.add_argument(
+        '--project', default=None, type=str, help='Optional specify project for the serving engine Task')
+    parser.add_argument(
+        '--name', default='triton engine', type=str,
+        help='Optional specify task name for the serving engine Task')
+    parser.add_argument(
+        '--update-frequency', default=os.environ.get('CLEARML_TRITON_POLL_FREQ') or 10., type=float,
+        help='Model update frequency in minutes')
+    parser.add_argument(
+        '--metric-frequency', default=os.environ.get('CLEARML_TRITON_METRIC_FREQ') or 1., type=float,
+        help='Metric reporting update frequency in minutes')
+    parser.add_argument(
+        '--inference-task-id', default=None, type=str,
+        help='Optional: Specify the inference Task ID to report to. default: create a new one')
+    parser.add_argument(
+        '--t-http-port', type=str, help='<integer> The port for the server to listen on for HTTP requests')
+    parser.add_argument(
+        '--t-http-thread-count', type=str, help='<integer> Number of threads handling HTTP requests')
+    parser.add_argument(
+        '--t-allow-grpc', type=str, help='<integer> Allow the server to listen for GRPC requests')
+    parser.add_argument(
+        '--t-grpc-port', type=str, help='<integer> The port for the server to listen on for GRPC requests')
+    parser.add_argument(
+        '--t-grpc-infer-allocation-pool-size', type=str,
+        help='<integer> The maximum number of inference request/response objects that remain '
+             'allocated for reuse. As long as the number of in-flight requests doesn\'t exceed '
+             'this value there will be no allocation/deallocation of request/response objects')
+    parser.add_argument(
+        '--t-pinned-memory-pool-byte-size', type=str,
+        help='<integer> The total byte size that can be allocated as pinned system '
+             'memory. If GPU support is enabled, the server will allocate pinned '
+             'system memory to accelerate data transfer between host and devices '
+             'until it exceeds the specified byte size. This option will not affect '
+             'the allocation conducted by the backend frameworks. Default is 256 MB')
+    parser.add_argument(
+        '--t-cuda-memory-pool-byte-size', type=str,
+        help='<<integer>:<integer>> The total byte size that can be allocated as CUDA memory for '
+             'the GPU device. If GPU support is enabled, the server will allocate '
+             'CUDA memory to minimize data transfer between host and devices '
+             'until it exceeds the specified byte size. This option will not affect '
+             'the allocation conducted by the backend frameworks. The argument '
+             'should be 2 integers separated by colons in the format <GPU device '
+             'ID>:<pool byte size>. This option can be used multiple times, but only '
+             'once per GPU device. Subsequent uses will overwrite previous uses for '
+             'the same GPU device. Default is 64 MB')
+    parser.add_argument(
+        '--t-min-supported-compute-capability', type=str,
+        help='<float> The minimum supported CUDA compute capability. GPUs that '
+             'don\'t support this compute capability will not be used by the server')
+    parser.add_argument(
+        '--t-buffer-manager-thread-count', type=str,
+        help='<integer> The number of threads used to accelerate copies and other '
+             'operations required to manage input and output tensor contents. '
+             'Default is 0')
+
+    args = parser.parse_args()
+
+    # check Args OS overrides
+    prefix = "CLEARML_TRITON_"
+    for k, v in os.environ.items():
+        if not k.startswith(prefix):
+            continue
+        args_var = k.replace(prefix, "", 1).replace("-", "_").lower()
+        if args_var in args.__dict__:
+            # cast to the type of the current value; when it is None keep the raw environment string
+            current = getattr(args, args_var, None)
+            setattr(args, args_var, type(current)(v) if current is not None else v)
+    # NOTE(review): the control-plane task is looked up by --inference-task-id, not --serving-id — confirm
+    # noinspection PyProtectedMember
+    serving_task = ModelRequestProcessor._get_control_plane_task(task_id=args.inference_task_id)
+
+    task = Task.init(
+        project_name=args.project or serving_task.get_project_name() or "serving",
+        task_name="{} - {}".format(serving_task.name, args.name),
+        task_type=Task.TaskTypes.inference,
+        continue_last_task=args.inference_task_id or None
+    )
+    print("configuration args: {}".format(args))
+    helper = TritonHelper(args, task, serving_id=args.serving_id)
+
+    # safe casting (frequencies are given in minutes, the daemon expects seconds)
+    try:
+        update_frequency_sec = float(args.update_frequency) * 60.0
+    except (ValueError, TypeError):
+        update_frequency_sec = 600
+    try:
+        metric_frequency_sec = float(args.metric_frequency) * 60.0
+    except (ValueError, TypeError):
+        metric_frequency_sec = 60
+
+    # blocks until the triton server process exits (returns on exit code 0, raises otherwise)
+    helper.maintenance_daemon(
+        local_model_repo='/models',
+        update_frequency_sec=update_frequency_sec,
+        metric_frequency_sec=metric_frequency_sec,
+    )
+
+
+if __name__ == '__main__':
+    main()
diff --git a/clearml_serving/preprocess/preprocess_template.py b/clearml_serving/preprocess/preprocess_template.py
new file mode 100644
index 0000000..274a8c3
--- /dev/null
+++ b/clearml_serving/preprocess/preprocess_template.py
@@ -0,0 +1,101 @@
+from typing import Any, Optional, List, Callable
+
+
+# Preprocess class Must be named "Preprocess"
+# No need to inherit or to implement all methods
+class Preprocess(object):
+    """
+    Preprocess class Must be named "Preprocess"
+    Otherwise there are No limitations, No need to inherit or to implement all methods
+    Notice! This is not thread safe! the same instance may be accessed from multiple threads simultaneously
+    """
+
+    def __init__(self):
+        # set internal state, this will be called only once. (i.e. not per request)
+        pass
+
+    def load(self, local_file_name: str) -> Optional[Any]:  # noqa
+        """
+        Optional: provide loading method for the model
+        useful if we need to load a model in a specific way for the prediction engine to work
+        :param local_file_name: file name / path to load the model from
+        :return: Object that will be called with .predict() method for inference
+        """
+        pass
+
+    def preprocess(self, body: dict, collect_custom_statistics_fn: Optional[Callable[[dict], None]]) -> Any:  # noqa
+        """
+        Optional: do something with the request data, return any type of object.
+        The returned object will be passed as is to the inference engine
+
+        :param body: dictionary as received from the RestAPI
+        :param collect_custom_statistics_fn: Optional, if provided allows to send a custom set of key/values
+            to the statistics collector service.
+            None is passed if statistics collector is not configured, or if the current request should not be collected
+
+            Usage example:
+            >>> print(body)
+            {"x0": 1, "x1": 2}
+            >>> if collect_custom_statistics_fn:
+            >>>   collect_custom_statistics_fn({"x0": 1, "x1": 2})
+
+        :return: Object to be passed directly to the model inference
+        """
+        return body
+
+    def postprocess(self, data: Any, collect_custom_statistics_fn: Optional[Callable[[dict], None]]) -> dict:  # noqa
+        """
+        Optional: post process the data returned from the model inference engine
+        returned dict will be passed back as the request result as is.
+
+        :param data: object as received from the inference model function
+        :param collect_custom_statistics_fn: Optional, if provided allows to send a custom set of key/values
+            to the statistics collector service.
+            None is passed if statistics collector is not configured, or if the current request should not be collected
+
+            Usage example:
+            >>> if collect_custom_statistics_fn:
+            >>>   collect_custom_statistics_fn({"y": 1})
+
+        :return: Dictionary passed directly as the returned result of the RestAPI
+        """
+        return data
+
+    def process(self, data: Any, collect_custom_statistics_fn: Optional[Callable[[dict], None]]) -> Any:  # noqa
+        """
+        Optional: do something with the actual data, return any type of object.
+        The returned object will be passed as is to the postprocess function engine
+
+        :param data: object as received from the preprocessing function
+        :param collect_custom_statistics_fn: Optional, if provided allows to send a custom set of key/values
+            to the statistics collector service.
+            None is passed if statistics collector is not configured, or if the current request should not be collected
+
+            Usage example:
+            >>> if collect_custom_statistics_fn:
+            >>>   collect_custom_statistics_fn({"type": "classification"})
+
+        :return: Object to be passed to the post-processing function
+        """
+        return data
+
+    def send_request(  # noqa
+            self,
+            endpoint: str,
+            version: Optional[str] = None,
+            data: Optional[dict] = None
+    ) -> Optional[dict]:
+        """
+        NOTICE: This method will be replaced in runtime, by the inference service
+
+        Helper method to send model inference requests to the inference service itself.
+        This is designed to help with model ensemble, model pipelines, etc.
+        On request error return None, otherwise the request result data dictionary
+
+        Usage example:
+
+        >>> x0, x1 = 1, 2
+        >>> result = self.send_request(endpoint="test_model_sklearn", version="1", data={"x0": x0, "x1": x1})
+        >>> y = result["y"]
+        """
+        pass
diff --git a/clearml_serving/service.py b/clearml_serving/service.py
deleted file mode 100644
index 89de65f..0000000
--- a/clearml_serving/service.py
+++ /dev/null
@@ -1,17 +0,0 @@
-from time import sleep
-from clearml import Task
-from clearml_serving.serving_service import ServingService
-
-
-def main():
-    # we should only be running in remotely by an agent
-    task = Task.init()
-    serving = ServingService(task=task)
-    while True:
-        serving.update()
-        serving.stats()
-        sleep(60.)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/clearml_serving/serving/Dockerfile b/clearml_serving/serving/Dockerfile
new file mode 100644
index 0000000..7d6c8c7
--- /dev/null
+++ b/clearml_serving/serving/Dockerfile
@@ -0,0 +1,21 @@
+FROM python:3.9-bullseye
+
+
+ENV LC_ALL=C.UTF-8
+
+# install base package
+RUN pip3 install clearml-serving
+
+# copy the execution code from the local build context (the git-clone alternative is kept commented out)
+# RUN cd $HOME && git clone https://github.com/allegroai/clearml-serving.git
+COPY clearml_serving /root/clearml/clearml_serving
+
+RUN pip3 install -r /root/clearml/clearml_serving/serving/requirements.txt
+
+# default serving port
+EXPOSE 8080
+
+# environment variables to load the Task from: CLEARML_SERVING_TASK_ID, CLEARML_SERVING_PORT
+
+WORKDIR /root/clearml/
+ENTRYPOINT ["clearml_serving/serving/entrypoint.sh"]
diff --git a/clearml_serving/serving/__init__.py b/clearml_serving/serving/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/clearml_serving/serving/endpoints.py b/clearml_serving/serving/endpoints.py
new file mode 100644
index 0000000..67eaa61
--- /dev/null
+++ b/clearml_serving/serving/endpoints.py
@@ -0,0 +1,100 @@
+import numpy as np
+from attr import attrib, attrs, asdict, validators
+
+
+def _engine_validator(inst, attr, value):  # noqa
+    from .preprocess_service import BasePreprocessRequest  # imported at validation time, not at module import
+    if not BasePreprocessRequest.validate_engine_type(value):  # attrs validator: only known engine types pass
+        raise TypeError("{} not supported engine type".format(value))
+
+
+def _matrix_type_validator(inst, attr, value):  # noqa
+    if value and not np.dtype(value):  # None/empty is accepted; np.dtype() itself raises TypeError on unknown names
+        raise TypeError("{} not supported matrix type".format(value))
+
+
+@attrs
+class BaseStruct(object):
+    def as_dict(self, remove_null_entries=False):
+        if not remove_null_entries:
+            return asdict(self)
+        return {k: v for k, v in asdict(self).items() if v is not None}
+
+
+@attrs
+class ModelMonitoring(BaseStruct):
+    base_serving_url = attrib(type=str)  # serving point url prefix (example: "detect_cat")
+    engine_type = attrib(type=str, validator=_engine_validator)  # engine type
+    monitor_project = attrib(type=str, default=None)  # monitor model project (for model auto update)
+    monitor_name = attrib(type=str, default=None)  # monitor model name (for model auto update, regexp selection)
+    monitor_tags = attrib(type=list, default=[])  # monitor model tag (for model auto update)
+    only_published = attrib(type=bool, default=False)  # only select published models
+    max_versions = attrib(type=int, default=None)  # Maximum number of models to keep serving (latest X models)
+    input_size = attrib(type=list, default=None)  # optional,  model matrix size
+    input_type = attrib(type=str, default=None, validator=_matrix_type_validator)  # optional, model matrix type
+    input_name = attrib(type=str, default=None)  # optional, layer name to push the input to
+    output_size = attrib(type=list, default=None)  # optional, model matrix size
+    output_type = attrib(type=str, default=None, validator=_matrix_type_validator)  # optional, model matrix type
+    output_name = attrib(type=str, default=None)  # optional, layer name to pull the results from
+    preprocess_artifact = attrib(
+        type=str, default=None)  # optional artifact name storing the model preprocessing code
+    auxiliary_cfg = attrib(type=dict, default=None)  # Auxiliary configuration (e.g. triton conf), Union[str, dict]
+
+
+@attrs
+class ModelEndpoint(BaseStruct):  # definition of a single serving endpoint (one serving url, optionally versioned)
+    engine_type = attrib(type=str, validator=_engine_validator)  # engine type
+    serving_url = attrib(type=str)  # full serving point url (including version) example: "detect_cat/v1"
+    model_id = attrib(type=str, default=None)  # model ID to serve (and download)
+    version = attrib(type=str, default="")  # key (version string), default no version
+    preprocess_artifact = attrib(
+        type=str, default=None)  # optional artifact name storing the model preprocessing code
+    input_size = attrib(type=list, default=None)  # optional, model input matrix size
+    input_type = attrib(type=str, default=None, validator=_matrix_type_validator)  # optional, input matrix type
+    input_name = attrib(type=str, default=None)  # optional, layer name to push the input to
+    output_size = attrib(type=list, default=None)  # optional, model output matrix size
+    output_type = attrib(type=str, default=None, validator=_matrix_type_validator)  # optional, output matrix type
+    output_name = attrib(type=str, default=None)  # optional, layer name to pull the results from
+    auxiliary_cfg = attrib(type=dict, default=None)  # Optional: Auxiliary configuration (e.g. triton conf), [str, dict]
+
+
+@attrs
+class CanaryEP(BaseStruct):
+    endpoint = attrib(type=str)  # load balancer endpoint
+    weights = attrib(type=list)  # list of weights (order should be matching fixed_endpoints or prefix)
+    load_endpoints = attrib(type=list, default=[])  # list of endpoints to balance and route
+    load_endpoint_prefix = attrib(
+        type=str, default=None)  # endpoint prefix to list
+    # (any endpoint starting with this prefix will be listed, sorted lexicographically, or broken into /<int>)
+
+
+@attrs
+class EndpointMetricLogging(BaseStruct):  # per-endpoint selection of the variables to log as metrics
+    @attrs
+    class MetricType(BaseStruct):  # description of one logged variable
+        type = attrib(type=str, validator=validators.in_(("scalar", "enum", "value", "counter")))
+        buckets = attrib(type=list, default=None)
+
+    endpoint = attrib(type=str)  # Specific endpoint to log metrics w/ version (example: "model/1")
+    # If endpoint name ends with a "*" any endpoint with a matching prefix will be selected
+
+    log_frequency = attrib(type=float, default=None)  # Logging frequency for this specific endpoint
+    # (0.0 to 1.0, where 1.0 means 100% of all requests are logged)
+
+    metrics = attrib(
+        type=dict, default={},
+        converter=lambda x: {k: v if isinstance(v, EndpointMetricLogging.MetricType)
+        else EndpointMetricLogging.MetricType(**v) for k, v in x.items()})  # key=variable name, value=MetricType
+    # example (plain dict values are converted to MetricType by the converter above):
+    # {"x1": dict(type="scalar", buckets=[0,1,2,3]),
+    #  "y": dict(type="enum", buckets=["cat", "dog"]),
+    #  "latency": dict(type="value", buckets=[]),
+    #  }
+
+    def as_dict(self, remove_null_entries=False):  # same contract as BaseStruct.as_dict
+        if not remove_null_entries:  # NOTE(review): asdict() recurses by default, so the BaseStruct branch looks unused - confirm
+            return {k: v.as_dict(remove_null_entries) if isinstance(v, BaseStruct) else v
+                    for k, v in asdict(self).items()}
+
+        return {k: v.as_dict(remove_null_entries) if isinstance(v, BaseStruct) else v
+                for k, v in asdict(self).items() if v is not None}
diff --git a/clearml_serving/serving/entrypoint.sh b/clearml_serving/serving/entrypoint.sh
new file mode 100755
index 0000000..2e1bf71
--- /dev/null
+++ b/clearml_serving/serving/entrypoint.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+
+# print the raw configuration received from the environment
+echo CLEARML_SERVING_TASK_ID="$CLEARML_SERVING_TASK_ID"
+echo CLEARML_SERVING_PORT="$CLEARML_SERVING_PORT"
+echo CLEARML_USE_GUNICORN="$CLEARML_USE_GUNICORN"
+echo EXTRA_PYTHON_PACKAGES="$EXTRA_PYTHON_PACKAGES"
+echo CLEARML_SERVING_NUM_PROCESS="$CLEARML_SERVING_NUM_PROCESS"
+echo CLEARML_SERVING_POLL_FREQ="$CLEARML_SERVING_POLL_FREQ"
+echo CLEARML_DEFAULT_KAFKA_SERVE_URL="$CLEARML_DEFAULT_KAFKA_SERVE_URL"
+
+SERVING_PORT="${CLEARML_SERVING_PORT:-8080}"
+GUNICORN_NUM_PROCESS="${CLEARML_SERVING_NUM_PROCESS:-4}"
+GUNICORN_SERVING_TIMEOUT="${GUNICORN_SERVING_TIMEOUT:-600}"
+UVICORN_SERVE_LOOP="${UVICORN_SERVE_LOOP:-asyncio}"
+
+# set default internal serve endpoint (for request pipelining)
+# exported so the defaults are visible to the server process started below (a plain assignment is shell-local)
+export CLEARML_DEFAULT_BASE_SERVE_URL="${CLEARML_DEFAULT_BASE_SERVE_URL:-http://127.0.0.1:$SERVING_PORT/serve}"
+export CLEARML_DEFAULT_TRITON_GRPC_ADDR="${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-127.0.0.1:8001}"
+
+# print the effective configuration (after defaults were applied)
+echo WEB_CONCURRENCY="$WEB_CONCURRENCY"
+echo SERVING_PORT="$SERVING_PORT"
+echo GUNICORN_NUM_PROCESS="$GUNICORN_NUM_PROCESS"
+echo GUNICORN_SERVING_TIMEOUT="$GUNICORN_SERVING_TIMEOUT"
+echo GUNICORN_EXTRA_ARGS="$GUNICORN_EXTRA_ARGS"
+echo UVICORN_SERVE_LOOP="$UVICORN_SERVE_LOOP"
+echo UVICORN_EXTRA_ARGS="$UVICORN_EXTRA_ARGS"
+echo CLEARML_DEFAULT_BASE_SERVE_URL="$CLEARML_DEFAULT_BASE_SERVE_URL"
+echo CLEARML_DEFAULT_TRITON_GRPC_ADDR="$CLEARML_DEFAULT_TRITON_GRPC_ADDR"
+
+# runtime add extra python packages (left unquoted on purpose: the value is a space separated package list)
+if [ -n "$EXTRA_PYTHON_PACKAGES" ]
+then
+      python3 -m pip install $EXTRA_PYTHON_PACKAGES
+fi
+
+if [ -z "$CLEARML_USE_GUNICORN" ]
+then
+  echo "Starting Uvicorn server"
+  PYTHONPATH=$(pwd) python3 -m uvicorn \
+      clearml_serving.serving.main:app --host 0.0.0.0 --port $SERVING_PORT --loop $UVICORN_SERVE_LOOP \
+      $UVICORN_EXTRA_ARGS
+else
+  echo "Starting Gunicorn server"
+  # start service: gunicorn master with uvicorn workers
+  PYTHONPATH=$(pwd) python3 -m gunicorn \
+      --preload clearml_serving.serving.main:app \
+      --workers $GUNICORN_NUM_PROCESS \
+      --worker-class uvicorn.workers.UvicornWorker \
+      --timeout $GUNICORN_SERVING_TIMEOUT \
+      --bind 0.0.0.0:$SERVING_PORT \
+      $GUNICORN_EXTRA_ARGS
+fi
diff --git a/clearml_serving/serving/main.py b/clearml_serving/serving/main.py
new file mode 100644
index 0000000..f3473e7
--- /dev/null
+++ b/clearml_serving/serving/main.py
@@ -0,0 +1,102 @@
+import os
+from multiprocessing import Lock
+import gzip
+
+from fastapi import FastAPI, Request, Response, APIRouter, HTTPException
+from fastapi.routing import APIRoute
+
+from typing import Optional, Dict, Any, Callable
+
+from clearml import Task
+from clearml_serving.version import __version__
+from clearml_serving.serving.model_request_processor import ModelRequestProcessor
+from clearml_serving.serving.preprocess_service import BasePreprocessRequest
+
+
+class GzipRequest(Request):  # Request whose body() gunzips payloads sent with "Content-Encoding: gzip"
+    async def body(self) -> bytes:
+        if not hasattr(self, "_body"):  # read (and decompress) only once, later calls reuse self._body
+            body = await super().body()
+            if "gzip" in self.headers.getlist("Content-Encoding"):
+                body = gzip.decompress(body)
+            self._body = body  # noqa
+        return self._body
+
+
+class GzipRoute(APIRoute):  # route class that hands the handler a GzipRequest instead of the plain Request
+    def get_route_handler(self) -> Callable:
+        original_route_handler = super().get_route_handler()
+
+        async def custom_route_handler(request: Request) -> Response:
+            request = GzipRequest(request.scope, request.receive)  # re-wrap using the same scope/receive
+            return await original_route_handler(request)
+
+        return custom_route_handler
+
+
+# process Lock, so that we can have only a single process doing the model reloading at a time
+singleton_sync_lock = Lock()
+
+serving_service_task_id = os.environ.get("CLEARML_SERVING_TASK_ID", None)
+model_sync_frequency_secs = 5  # NOTE(review): multiplied by 60 in startup_event, so effectively minutes - confirm
+try:
+    model_sync_frequency_secs = float(os.environ.get("CLEARML_SERVING_POLL_FREQ", model_sync_frequency_secs))
+except (ValueError, TypeError):
+    pass  # unparsable CLEARML_SERVING_POLL_FREQ: keep the default above
+
+# get the serving controller task
+# noinspection PyProtectedMember
+serving_task = ModelRequestProcessor._get_control_plane_task(task_id=serving_service_task_id)
+# set to running (because we are here)
+if serving_task.status != "in_progress":
+    serving_task.started(force=True)
+# create a new serving instance (for visibility and monitoring)
+instance_task = Task.init(
+    project_name=serving_task.get_project_name(),
+    task_name="{} - serve instance".format(serving_task.name),
+    task_type="inference",
+)
+instance_task.set_system_tags(["service"])
+processor = None  # type: Optional[ModelRequestProcessor]
+# preload modules into memory before forking
+BasePreprocessRequest.load_modules()
+# start FastAPI app
+app = FastAPI(title="ClearML Serving Service", version=__version__, description="ClearML Service Service router")
+
+
+@app.on_event("startup")
+async def startup_event():
+    global processor  # the module-level instance later used by serve_model
+    print("Starting up ModelRequestProcessor [pid={}] [service_id={}]".format(os.getpid(), serving_service_task_id))
+    processor = ModelRequestProcessor(
+        task_id=serving_service_task_id, update_lock_guard=singleton_sync_lock,
+    )
+    print("ModelRequestProcessor [id={}] loaded".format(processor.get_id()))
+    processor.launch(poll_frequency_sec=model_sync_frequency_secs*60)  # configured value is scaled by 60 here
+
+
+router = APIRouter(
+    prefix="/serve",
+    tags=["models"],
+    responses={404: {"description": "Model Serving Endpoint Not found"}},
+    route_class=GzipRoute,  # comment this line out to drop support for GZip encoded request bodies
+)
+
+
+# cover all routing options for model version `/{model_id}`, `/{model_id}/123`, `/{model_id}?version=123`
+@router.post("/{model_id}/{version}")
+@router.post("/{model_id}/")
+@router.post("/{model_id}")
+def serve_model(model_id: str, version: Optional[str] = None, request: Dict[Any, Any] = None):
+    try:
+        return_value = processor.process_request(  # `processor` is the module global set in startup_event
+            base_url=model_id,
+            version=version,
+            request_body=request
+        )
+    except Exception as ex:  # NOTE(review): every failure, not only an unknown endpoint, is reported as HTTP 404
+        raise HTTPException(status_code=404, detail="Error processing request: {}".format(ex))
+    return return_value
+
+
+app.include_router(router)  # register the "/serve" routes defined above on the FastAPI app
diff --git a/clearml_serving/serving/model_request_processor.py b/clearml_serving/serving/model_request_processor.py
new file mode 100644
index 0000000..5d7d271
--- /dev/null
+++ b/clearml_serving/serving/model_request_processor.py
@@ -0,0 +1,1161 @@
+import json
+import os
+from pathlib import Path
+from queue import Queue
+from random import random
+from time import sleep, time
+from typing import Optional, Union, Dict, List
+import itertools
+import threading
+from multiprocessing import Lock
+from numpy.random import choice
+
+from clearml import Task, Model
+from clearml.storage.util import hash_dict
+from .preprocess_service import BasePreprocessRequest
+from .endpoints import ModelEndpoint, ModelMonitoring, CanaryEP, EndpointMetricLogging
+
+
+class FastWriteCounter(object):  # counter built from two itertools.count() streams, no explicit lock is taken
+    def __init__(self):
+        self._counter_inc = itertools.count()  # advanced once per inc()
+        self._counter_dec = itertools.count()  # advanced once per dec()
+
+    def inc(self):
+        next(self._counter_inc)
+
+    def dec(self):
+        next(self._counter_dec)
+
+    def value(self):  # reading advances BOTH streams by one, so the difference (#inc - #dec) is unchanged
+        return next(self._counter_inc) - next(self._counter_dec)
+
+
+class ModelRequestProcessor(object):
+    _system_tag = "serving-control-plane"
+    _kafka_topic = "clearml_inference_stats"
+    _config_key_serving_base_url = "serving_base_url"
+    _config_key_triton_grpc = "triton_grpc_server"
+    _config_key_kafka_stats = "kafka_service_server"
+    _config_key_def_metric_freq = "metric_logging_freq"
+
+    def __init__(
+            self,
+            task_id: Optional[str] = None,
+            update_lock_guard: Optional[Lock] = None,
+            name: Optional[str] = None,
+            project: Optional[str] = None,
+            tags: Optional[List[str]] = None,
+            force_create: bool = False,
+    ) -> None:
+        """Bind the processor to a serving control-plane Task and initialize empty in-memory state.
+        :param task_id: Optional, ID of an existing ServingService Task
+        :param update_lock_guard: If provided, use this external (usually multi-process) lock to guard updates,
+            otherwise a new threading.Lock is created
+        :param name: Optional name of the current serving service
+        :param project: Optional project of the current serving service
+        :param tags: Optional tags to add to the serving service
+        :param force_create: If True, ignore task_id and create a new serving Task
+        """
+        self._task = self._create_task(name=name, project=project, tags=tags) \
+            if force_create else self._get_control_plane_task(task_id=task_id, name=name, project=project, tags=tags)
+        self._endpoints = dict()  # type: Dict[str, ModelEndpoint]
+        self._model_monitoring = dict()  # type: Dict[str, ModelMonitoring]
+        self._model_monitoring_versions = dict()  # type: Dict[str, Dict[int, str]]
+        self._model_monitoring_endpoints = dict()  # type: Dict[str, ModelEndpoint]
+        self._model_monitoring_update_request = True
+        # NOTE(review): Dict[base_serve_url, Dict[version, model_id]] presumably documents _model_monitoring_versions
+        self._canary_endpoints = dict()  # type: Dict[str, CanaryEP]
+        self._canary_route = dict()  # type: Dict[str, dict]
+        self._engine_processor_lookup = dict()  # type: Dict[str, BasePreprocessRequest]
+        self._metric_logging = dict()  # type: Dict[str, EndpointMetricLogging]
+        self._endpoint_metric_logging = dict()  # type: Dict[str, EndpointMetricLogging]
+        self._last_update_hash = None
+        self._sync_daemon_thread = None
+        self._stats_sending_thread = None
+        self._stats_queue = Queue()
+        # this is used for Fast locking mechanisms (so we do not actually need to use Locks)
+        self._update_lock_flag = False
+        self._request_processing_state = FastWriteCounter()
+        self._update_lock_guard = update_lock_guard or threading.Lock()
+        self._instance_task = None
+        # serving server config
+        self._configuration = {}
+        # deserialized values go here
+        self._kafka_stats_url = None
+        self._triton_grpc = None
+        self._serving_base_url = None
+        self._metric_log_freq = None
+
+    def process_request(self, base_url: str, version: Optional[str], request_body: dict) -> dict:
+        """
+        Process an incoming request: resolve the endpoint (applying canary routing) and run its engine processor.
+        Raise ValueError if the url does not match any existing endpoint
+        """
+        self._request_processing_state.inc()  # register as in-flight BEFORE testing the stall flag
+        # check if we need to stall
+        if self._update_lock_flag:
+            self._request_processing_state.dec()  # step out so the waiting updater can see zero in-flight requests
+            while self._update_lock_flag:
+                sleep(1)
+            # retry to process
+            return self.process_request(base_url=base_url, version=version, request_body=request_body)
+
+        try:
+            # normalize url and version
+            url = self._normalize_endpoint_url(base_url, version)
+
+            # check canary
+            canary_url = self._process_canary(base_url=url)
+            if canary_url:
+                url = canary_url
+
+            ep = self._endpoints.get(url, None) or self._model_monitoring_endpoints.get(url, None)
+            if not ep:
+                raise ValueError("Model inference endpoint '{}' not found".format(url))
+
+            processor = self._engine_processor_lookup.get(url)
+            if not processor:  # first request for this url: build the engine processor and cache it
+                processor_cls = BasePreprocessRequest.get_engine_cls(ep.engine_type)
+                processor = processor_cls(model_endpoint=ep, task=self._task)
+                self._engine_processor_lookup[url] = processor
+
+            return_value = self._process_request(processor=processor, url=url, body=request_body)
+        finally:
+            self._request_processing_state.dec()  # always release the in-flight slot, also on errors
+
+        return return_value
+
+    def _process_canary(self, base_url: str) -> Optional[str]:
+        canary = self._canary_route.get(base_url)  # routing entry holding 'endpoints' and matching 'weights'
+        if not canary:
+            return None  # no canary configured for this url
+        # random choice, weighted by canary['weights'] (passed as probabilities to numpy.random.choice)
+        draw = choice(canary['endpoints'], 1, p=canary['weights'])
+        # the new endpoint to use
+        return draw[0]
+
+    def configure(
+            self,
+            external_serving_base_url: Optional[str] = None,
+            external_triton_grpc_server: Optional[str] = None,
+            external_kafka_service_server: Optional[str] = None,
+            default_metric_log_freq: Optional[float] = None,
+    ):
+        """
+        Set ModelRequestProcessor configuration arguments, stored as "General/<key>" parameters on the serving Task.
+        Arguments left as None are not written.
+        :param external_serving_base_url: Set the external base http endpoint for the serving service
+            This URL will be passed to user custom preprocess class,
+            allowing it to concatenate and combine multiple model requests into one
+        :param external_triton_grpc_server: set the external grpc tcp port of the Nvidia Triton clearml container.
+            Used by the clearml triton engine class to send inference requests
+        :param external_kafka_service_server: Optional, Kafka endpoint for the statistics controller collection.
+        :param default_metric_log_freq: Default request metric logging (0 to 1.0, 1. means 100% of requests are logged)
+        """
+        if external_serving_base_url is not None:
+            self._task.set_parameter(
+                name="General/{}".format(self._config_key_serving_base_url),
+                value=str(external_serving_base_url),
+                value_type="str",
+                description="external base http endpoint for the serving service"
+            )
+        if external_triton_grpc_server is not None:
+            self._task.set_parameter(
+                name="General/{}".format(self._config_key_triton_grpc),
+                value=str(external_triton_grpc_server),
+                value_type="str",
+                description="external grpc tcp port of the Nvidia Triton ClearML container running"
+            )
+        if external_kafka_service_server is not None:
+            self._task.set_parameter(
+                name="General/{}".format(self._config_key_kafka_stats),
+                value=str(external_kafka_service_server),
+                value_type="str",
+                description="external Kafka service url for the statistics controller server"
+            )
+        if default_metric_log_freq is not None:
+            self._task.set_parameter(
+                name="General/{}".format(self._config_key_def_metric_freq),
+                value=str(default_metric_log_freq),  # sent as a string, typed through value_type below
+                value_type="float",
+                description="Request metric logging frequency"
+            )
+
+    def get_configuration(self) -> dict:  # shallow copy of the serving configuration dictionary
+        return dict(**self._configuration)
+
+    def add_endpoint(
+            self,
+            endpoint: Union[ModelEndpoint, dict],
+            preprocess_code: Optional[str] = None,
+            model_name: Optional[str] = None,
+            model_project: Optional[str] = None,
+            model_tags: Optional[List[str]] = None,
+            model_published: Optional[bool] = None,
+    ) -> str:
+        """
+        Return the unique name of the endpoint (endpoint + version)
+        Overwrite existing endpoint if already exists  (outputs a warning)
+
+        :param endpoint: New endpoint to register (overwrite existing endpoint if exists)
+        :param preprocess_code: If provided upload local code as artifact
+        :param model_name: If model-id not provided on, search based on model name
+        :param model_project: If model-id not provided on, search based on model project
+        :param model_tags: If model-id not provided on, search based on model tags
+        :param model_published: If model-id not provided on, search based on model published state
+        """
+        if not isinstance(endpoint, ModelEndpoint):
+            endpoint = ModelEndpoint(**endpoint)
+
+        # make sure we have everything configured
+        self._validate_model(endpoint)
+
+        url = self._normalize_endpoint_url(endpoint.serving_url, endpoint.version)
+        if url in self._endpoints:
+            print("Warning: Model endpoint \'{}\' overwritten".format(url))
+
+        if not endpoint.model_id and any([model_project, model_name, model_tags]):
+            model_query = dict(
+                project_name=model_project,
+                model_name=model_name,
+                tags=model_tags,
+                only_published=bool(model_published),
+                include_archived=False,
+            )
+            models = Model.query_models(max_results=2, **model_query)
+            if not models:
+                raise ValueError("Could not fine any Model to serve {}".format(model_query))
+            if len(models) > 1:
+                print("Warning: Found multiple Models for \'{}\', selecting id={}".format(model_query, models[0].id))
+            endpoint.model_id = models[0].id
+        elif not endpoint.model_id:
+            print("Warning: No Model provided for \'{}\'".format(url))
+
+        # upload as new artifact
+        if preprocess_code:
+            if not Path(preprocess_code).exists():
+                raise ValueError("Preprocessing code \'{}\' could not be found".format(preprocess_code))
+            preprocess_artifact_name = "py_code_{}".format(url.replace("/", "_"))
+            self._task.upload_artifact(
+                name=preprocess_artifact_name, artifact_object=Path(preprocess_code), wait_on_upload=True)
+            endpoint.preprocess_artifact = preprocess_artifact_name
+
+        self._endpoints[url] = endpoint
+        return url
+
+    def add_model_monitoring(
+            self,
+            monitoring: Union[ModelMonitoring, dict],
+            preprocess_code: Optional[str] = None,
+    ) -> str:
+        """
+        Register a model monitoring (auto-update) entry and return its unique name (the base serving url).
+        Overwrite existing entry if already exists (outputs a warning)
+
+        :param monitoring: Model endpoint monitor (a dict is converted to ModelMonitoring)
+        :param preprocess_code: If provided, upload this local file as a Task artifact (ValueError if missing)
+        :return: Unique model monitoring ID (base_model_url)
+        """
+        if not isinstance(monitoring, ModelMonitoring):
+            monitoring = ModelMonitoring(**monitoring)
+
+        # make sure we actually have something to monitor
+        if not any([monitoring.monitor_project, monitoring.monitor_name, monitoring.monitor_tags]):
+            raise ValueError("Model monitoring requires at least a "
+                             "project / name / tag to monitor, none were provided.")
+
+        # make sure we have everything configured
+        self._validate_model(monitoring)
+
+        name = monitoring.base_serving_url
+        if name in self._model_monitoring:
+            print("Warning: Model monitoring \'{}\' overwritten".format(name))
+
+        # upload as new artifact (artifact name is derived from the base url, "/" replaced by "_")
+        if preprocess_code:
+            if not Path(preprocess_code).exists():
+                raise ValueError("Preprocessing code \'{}\' could not be found".format(preprocess_code))
+            preprocess_artifact_name = "py_code_{}".format(name.replace("/", "_"))
+            self._task.upload_artifact(
+                name=preprocess_artifact_name, artifact_object=Path(preprocess_code), wait_on_upload=True)
+            monitoring.preprocess_artifact = preprocess_artifact_name
+
+        self._model_monitoring[name] = monitoring
+        return name
+
+    def remove_model_monitoring(self, model_base_url: str) -> bool:
+        """
+        Remove model monitoring, use base_model_url as unique identifier
+        """
+        if model_base_url not in self._model_monitoring:
+            return False
+        self._model_monitoring.pop(model_base_url, None)
+        return True
+
+    def remove_endpoint(self, endpoint_url: str, version: Optional[str] = None) -> bool:
+        """
+        Remove specific model endpoint, use base_model_url as unique identifier
+        """
+        endpoint_url = self._normalize_endpoint_url(endpoint_url, version)
+        if endpoint_url not in self._endpoints:
+            return False
+        self._endpoints.pop(endpoint_url, None)
+        return True
+
+    def add_canary_endpoint(
+            self,
+            canary: Union[CanaryEP, dict],
+    ) -> str:
+        """
+        Return the unique name of the endpoint (endpoint + version)
+        Overwrite existing endpoint if already exists  (outputs a warning)
+
+        :param canary: Canary endpoint router (overwrite existing endpoint if exists)
+        :return: Unique canary ID (base_model_url)
+        """
+        if not isinstance(canary, CanaryEP):
+            canary = CanaryEP(**canary)
+        if canary.load_endpoints and canary.load_endpoint_prefix:
+            raise ValueError(
+                "Could not add canary endpoint with both "
+                "prefix ({}) and fixed set of endpoints ({})".format(
+                    canary.load_endpoints, canary.load_endpoint_prefix))
+        name = canary.endpoint
+        if name in self._canary_endpoints:
+            print("Warning: Model monitoring \'{}\' overwritten".format(name))
+
+        self._canary_endpoints[name] = canary
+        return name
+
+    def remove_canary_endpoint(self, endpoint_url: str) -> bool:
+        """
+        Remove specific canary model endpoint, use base_model_url as unique identifier
+        """
+        if endpoint_url not in self._canary_endpoints:
+            return False
+        self._canary_endpoints.pop(endpoint_url, None)
+        return True
+
+    def add_metric_logging(self, metric: Union[EndpointMetricLogging, dict]) -> bool:
+        """
+        Add metric logging to a specific endpoint
+        Valid metric variables are any variables on the request or response dictionary,
+        or a custom preprocess reported variable
+
+        When overwriting an existing endpoint logging entry, output a warning.
+        Raise ValueError if the endpoint is not registered and is not a "*" wildcard.
+        :param metric: Metric logging definition (a dict is converted to EndpointMetricLogging)
+        :return: True if successful
+        """
+        if not isinstance(metric, EndpointMetricLogging):
+            metric = EndpointMetricLogging(**metric)
+
+        name = str(metric.endpoint).strip("/")  # stored key has no leading/trailing "/"
+        metric.endpoint = name
+
+        if name not in self._endpoints and not name.endswith('*'):
+            raise ValueError("Metric logging \'{}\' references a nonexistent endpoint".format(name))
+
+        if name in self._metric_logging:
+            print("Warning: Metric logging \'{}\' overwritten".format(name))
+
+        self._metric_logging[name] = metric
+        return True
+
+    def remove_metric_logging(
+            self,
+            endpoint: str,
+            variable_name: str = None,
+    ) -> bool:
+        """
+        Remove existing logged metric variable. Use variable name and endpoint as unique identifier
+
+        :param endpoint: Endpoint name (including version, e.g. "model/1" or "model/*")
+        :param variable_name: Variable name (str), pass None to remove the entire endpoint logging
+
+        :return: True if removed, False if the endpoint (or the variable on it) is not logged
+        """
+
+        name = str(endpoint).strip("/")  # same key normalization as add_metric_logging
+
+        if name not in self._metric_logging or \
+                (variable_name and variable_name not in self._metric_logging[name].metrics):
+            return False
+
+        if not variable_name:
+            self._metric_logging.pop(name, None)  # drop the whole endpoint entry
+        else:
+            self._metric_logging[name].metrics.pop(variable_name, None)  # drop a single variable only
+
+        return True
+
+    def list_metric_logging(self) -> Dict[str, EndpointMetricLogging]:
+        """
+        List existing logged metric variables (as registered through add_metric_logging).
+
+        :return: Dictionary (shallow copy), key='endpoint/version' value=EndpointMetricLogging
+        """
+
+        return dict(**self._metric_logging)
+
+    def list_endpoint_logging(self) -> Dict[str, EndpointMetricLogging]:
+        """
+        List the current metric logging state of the endpoints (fully synced).
+
+        :return: Dictionary (shallow copy), key='endpoint/version' value=EndpointMetricLogging
+        """
+
+        return dict(**self._endpoint_metric_logging)
+
+    def deserialize(
+            self,
+            task: Task = None,
+            prefetch_artifacts: bool = False,
+            skip_sync: bool = False,
+            update_current_task: bool = True
+    ) -> bool:
+        """
+        Restore ModelRequestProcessor state from Task
+        return True if actually needed serialization, False nothing changed
+        :param task: Load data from Task
+        :param prefetch_artifacts: If True prefetch artifacts requested by the endpoints
+        :param skip_sync: If True do not update the canary/monitoring state
+        :param update_current_task: is not skip_sync, and is True,
+            update the current Task with the configuration synced from the serving service Task
+        """
+        if not task:
+            task = self._task
+        configuration = task.get_parameters_as_dict().get("General") or {}
+        endpoints = task.get_configuration_object_as_dict(name='endpoints') or {}
+        canary_ep = task.get_configuration_object_as_dict(name='canary') or {}
+        model_monitoring = task.get_configuration_object_as_dict(name='model_monitoring') or {}
+        metric_logging = task.get_configuration_object_as_dict(name='metric_logging') or {}
+
+        hashed_conf = hash_dict(
+            dict(endpoints=endpoints,
+                 canary_ep=canary_ep,
+                 model_monitoring=model_monitoring,
+                 metric_logging=metric_logging,
+                 configuration=configuration)
+        )
+        if self._last_update_hash == hashed_conf and not self._model_monitoring_update_request:
+            return False
+        print("Info: syncing model endpoint configuration, state hash={}".format(hashed_conf))
+        self._last_update_hash = hashed_conf
+
+        endpoints = {
+            k: ModelEndpoint(**{i: j for i, j in v.items() if hasattr(ModelEndpoint.__attrs_attrs__, i)})
+            for k, v in endpoints.items()
+        }
+        model_monitoring = {
+            k: ModelMonitoring(**{i: j for i, j in v.items() if hasattr(ModelMonitoring.__attrs_attrs__, i)})
+            for k, v in model_monitoring.items()
+        }
+        canary_endpoints = {
+            k: CanaryEP(**{i: j for i, j in v.items() if hasattr(CanaryEP.__attrs_attrs__, i)})
+            for k, v in canary_ep.items()
+        }
+        metric_logging = {
+            k: EndpointMetricLogging(**{i: j for i, j in v.items() if hasattr(EndpointMetricLogging.__attrs_attrs__, i)})
+            for k, v in metric_logging.items()
+        }
+
+        # if there is no need to sync Canary and Models we can just leave
+        if skip_sync:
+            self._endpoints = endpoints
+            self._model_monitoring = model_monitoring
+            self._canary_endpoints = canary_endpoints
+            self._metric_logging = metric_logging
+            self._deserialize_conf_dict(configuration)
+            return True
+
+        # make sure we only have one stall request at any given moment
+        with self._update_lock_guard:
+            # download artifacts
+            # todo: separate into two, download before lock, and overwrite inside lock
+            if prefetch_artifacts:
+                for item in list(endpoints.values()) + list(model_monitoring.values()):
+                    if item.preprocess_artifact:
+                        # noinspection PyBroadException
+                        try:
+                            self._task.artifacts[item.preprocess_artifact].get_local_copy(
+                                extract_archive=True,
+                            )
+                        except Exception:
+                            pass
+
+            # stall all requests
+            self._update_lock_flag = True
+            # wait until we have no request processed
+            while self._request_processing_state.value() != 0:
+                sleep(1)
+
+            self._endpoints = endpoints
+            self._model_monitoring = model_monitoring
+            self._canary_endpoints = canary_endpoints
+            self._metric_logging = metric_logging
+            self._deserialize_conf_dict(configuration)
+
+            # if we have models we need to sync, now is the time
+            self._sync_monitored_models()
+
+            self._update_canary_lookup()
+
+            self._sync_metric_logging()
+
+            # release stall lock
+            self._update_lock_flag = False
+
+            # update the state on the inference task
+            if update_current_task and Task.current_task() and Task.current_task().id != self._task.id:
+                self.serialize(task=Task.current_task())
+
+        return True
+
+    def serialize(self, task: Optional[Task] = None) -> None:
+        """
+        Store the endpoints, canary, model monitoring and metric logging tables as Task configuration objects
+        """
+        target = task if task else self._task
+        sections = (
+            ('endpoints', self._endpoints),
+            ('canary', self._canary_endpoints),
+            ('model_monitoring', self._model_monitoring),
+            ('metric_logging', self._metric_logging),
+        )
+        for section_name, entries in sections:
+            payload = {key: entry.as_dict(remove_null_entries=True) for key, entry in entries.items()}
+            target.set_configuration_object(name=section_name, config_dict=payload)
+
+    def _update_canary_lookup(self):
+        """
+        Rebuild self._canary_route ({canary: dict(endpoints=[...], weights=[...])}), weights normalized to sum to 1.
+        Canaries that are misconfigured, or whose weights sum to zero, are skipped with a warning.
+        """
+        canary_route = {}
+        for k, v in self._canary_endpoints.items():
+            if v.load_endpoint_prefix and v.load_endpoints:
+                print("Warning: Canary has both prefix and fixed endpoints, ignoring canary endpoint")
+                continue
+            if v.load_endpoints:
+                if len(v.load_endpoints) != len(v.weights):
+                    print("Warning: Canary \'{}\' weights [{}] do not match number of endpoints [{}], skipping!".format(
+                        k, v.weights, v.load_endpoints))
+                    continue
+                endpoints, weights = [], []
+                for w, ep in zip(v.weights, v.load_endpoints):
+                    if ep not in self._endpoints and ep not in self._model_monitoring_endpoints:
+                        print("Warning: Canary \'{}\' endpoint \'{}\' could not be found, skipping".format(k, ep))
+                        continue
+                    endpoints.append(ep)
+                    weights.append(float(w))
+            elif v.load_endpoint_prefix:
+                endpoints = [ep for ep in list(self._endpoints.keys()) + list(self._model_monitoring_endpoints.keys())
+                             if str(ep).startswith(v.load_endpoint_prefix)]
+                endpoints = sorted(
+                    endpoints,
+                    reverse=True,
+                    key=lambda x: '{}/{:0>9}'.format('/'.join(x.split('/')[:-1]), x.split('/')[-1]) if '/' in x else x
+                )
+                endpoints = endpoints[:len(v.weights)]
+                weights = [float(w) for w in v.weights[:len(endpoints)]]  # same conversion as the fixed branch
+            else:
+                continue
+            # normalize weights; an all-zero weight list would divide by zero, so skip such a canary
+            sum_weights = sum(weights)
+            if weights and not sum_weights:
+                print("Warning: Canary \'{}\' weights sum to zero, skipping!".format(k))
+                continue
+            canary_route[k] = dict(endpoints=endpoints, weights=[w/sum_weights for w in weights])
+            if v.load_endpoint_prefix:
+                self._report_text("Info: Canary endpoint \'{}\' selected [{}]".format(k, canary_route[k]))
+        self._canary_route = canary_route
+
+    def _sync_monitored_models(self, force: bool = False) -> bool:
+        """
+        Align self._model_monitoring_endpoints with the versions listed in self._model_monitoring_versions.
+        Runs only when an update was requested or force is True. Clears the update request and, if anything
+        changed, stores the endpoints in the 'model_monitoring_eps' configuration object of self._task.
+        :return: True if a monitored endpoint was added or removed
+        """
+        if not force and not self._model_monitoring_update_request:
+            return False
+        dirty = False
+        for serving_base_url, versions_model_id_dict in self._model_monitoring_versions.items():
+            # find existing endpoint versions
+            for ep_base_url in list(self._model_monitoring_endpoints.keys()):
+                # skip over endpoints that are not our own
+                if not ep_base_url.startswith(serving_base_url+"/"):
+                    continue
+                # the version is whatever follows "<base url>/" (the base url itself may contain "/")
+                version = ep_base_url[len(serving_base_url) + 1:]
+                if int(version) not in versions_model_id_dict:
+                    # remove old endpoint
+                    self._model_monitoring_endpoints.pop(ep_base_url, None)
+                    dirty = True
+            # add new endpoint
+            for version, model_id in versions_model_id_dict.items():
+                url = "{}/{}".format(serving_base_url, version)
+                if url in self._model_monitoring_endpoints:
+                    continue
+                model = self._model_monitoring.get(serving_base_url)
+                if not model:
+                    # this should never happen
+                    continue
+                ep = ModelEndpoint(
+                    engine_type=model.engine_type,
+                    serving_url=serving_base_url,
+                    model_id=model_id,
+                    version=str(version),
+                    preprocess_artifact=model.preprocess_artifact,
+                    input_size=model.input_size,
+                    input_type=model.input_type,
+                    output_size=model.output_size,
+                    output_type=model.output_type
+                )
+                self._model_monitoring_endpoints[url] = ep
+                dirty = True
+
+        # filter out old model monitoring endpoints
+        for ep_url in list(self._model_monitoring_endpoints.keys()):
+            if not any(True for url in self._model_monitoring_versions if ep_url.startswith(url+"/")):
+                self._model_monitoring_endpoints.pop(ep_url, None)
+                dirty = True
+        # reset flag
+        self._model_monitoring_update_request = False
+        if dirty:
+            config_dict = {k: v.as_dict(remove_null_entries=True) for k, v in self._model_monitoring_endpoints.items()}
+            self._task.set_configuration_object(name='model_monitoring_eps', config_dict=config_dict)
+        return dirty
+
+    def _update_monitored_models(self):
+        """
+        Query the model repository for every model monitoring entry and refresh
+        self._model_monitoring_versions ({base serving url: {version: model id}}).
+        A model that is already served keeps its version, new models get the next version numbers.
+        :return: True if a monitored-models sync is requested (set here or still pending), else False
+        """
+        for monitor in self._model_monitoring.values():
+            served = self._model_monitoring_versions.get(monitor.base_serving_url, {})
+            # To Do: sort by updated time ?
+            found_models = Model.query_models(
+                project_name=monitor.monitor_project or None,
+                model_name=monitor.monitor_name or None,
+                tags=monitor.monitor_tags or None,
+                only_published=monitor.only_published,
+                max_results=monitor.max_versions,
+                include_archived=False,
+            )
+
+            # reverse lookup of what is already served: model id -> version
+            version_by_model_id = {m_id: ver for ver, m_id in served.items()}
+            next_version = 1 + (max(served.keys()) if served else 0)
+
+            # the query is expected to return the most recently updated model first, walk it backwards
+            # so that the last updated model is handled last (and gets the highest version number)
+            versioned = []
+            for m_id in reversed([m.id for m in found_models]):
+                ver = version_by_model_id.get(m_id)
+                if ver is None:
+                    ver = next_version
+                    next_version += 1
+                versioned.append((ver, m_id))
+
+            # keep at most max_versions entries
+            new_versions = dict(versioned[:monitor.max_versions])
+
+            # request an endpoint sync if anything changed
+            if new_versions != served:
+                self._model_monitoring_update_request = True
+
+            # update model serving state
+            self._model_monitoring_versions[monitor.base_serving_url] = new_versions
+
+        # report only when a sync is pending
+        if self._model_monitoring_update_request:
+            self._report_text("INFO: Monitored Models updated: {}".format(
+                json.dumps(self._model_monitoring_versions, indent=2))
+            )
+            return True
+        return False
+
+    def _sync_metric_logging(self, force: bool = False) -> bool:
+        """
+        Resolve the metric logging rules per endpoint url into self._endpoint_metric_logging.
+        A rule key containing "*/" matches urls starting with the text before it, other keys match exactly.
+        :return: False if nothing was done (no rules and force is False), True otherwise
+        """
+        if not (force or self._metric_logging):
+            return False
+        exact_rules, prefix_rules = {}, {}
+        for key, rule in self._metric_logging.items():
+            if "*/" in key:
+                prefix_rules[key.split("*/")[0]] = rule
+            else:
+                exact_rules[key] = rule
+        resolved = {}
+        for url in list(self._endpoints.keys()) + list(self._model_monitoring_endpoints.keys()):
+            if url in resolved:
+                continue
+            # an exact rule takes precedence, otherwise the first matching prefix rule is used
+            matches = [exact_rules[url]] if url in exact_rules else [
+                rule for prefix, rule in prefix_rules.items() if url.startswith(prefix)]
+            if matches:
+                resolved[url] = matches[0]
+        self._endpoint_metric_logging = resolved
+        return True
+
+    def launch(self, poll_frequency_sec=300):
+        """
+        Load the serving state from the Task, then start the two background daemon threads:
+        the configuration / model monitoring sync loop and the statistics sending loop.
+        Does nothing if the sync thread already exists; returns right after starting the threads.
+        :param poll_frequency_sec: Sync every X seconds (default 300 seconds)
+        """
+        if self._sync_daemon_thread:
+            return
+
+        # read state, and read it again if the model monitoring pass requested an endpoint update
+        self.deserialize(self._task, prefetch_artifacts=True)
+        models_changed = self._update_monitored_models()
+        if models_changed:
+            self.deserialize(self._task, prefetch_artifacts=True)
+
+        # get the serving instance (for visibility and monitoring)
+        self._instance_task = Task.current_task()
+
+        with self._update_lock_guard:
+            # check again now that we hold the lock, so the threads are created only once
+            if self._sync_daemon_thread:
+                return
+            sync_thread = threading.Thread(
+                target=self._sync_daemon, args=(poll_frequency_sec, ), daemon=True)
+            stats_thread = threading.Thread(target=self._stats_send_loop, daemon=True)
+            self._sync_daemon_thread = sync_thread
+            self._stats_sending_thread = stats_thread
+
+            sync_thread.start()
+            stats_thread.start()
+        # we return immediately
+
+    def _sync_daemon(self, poll_frequency_sec: float = 300) -> None:
+        """
+        Background thread (never returns): every poll_frequency_sec seconds reload the Task and apply any changes.
+        """
+        poll_frequency_sec = float(poll_frequency_sec)
+        # force mark started on the main serving service task
+        self._task.mark_started(force=True)
+        self._report_text("Launching - configuration sync every {} sec".format(poll_frequency_sec))
+        cleanup = False  # True once a change was applied; consumed after the following sleep
+        self._update_serving_plot()  # NOTE(review): not inside a try, an exception here ends the thread
+        while True:
+            try:
+                # this should be the only place where we call deserialize
+                self._task.reload()
+                if self.deserialize(self._task):
+                    self._report_text("New configuration updated")
+                    # mark clean up for next round
+                    cleanup = True
+                # model monitoring sync
+                if self._update_monitored_models():
+                    self._report_text("Model monitoring synced")
+                    # update endpoints
+                    self.deserialize(self._task)
+                    # mark clean up for next round
+                    cleanup = True
+                # update serving layout plot
+                if cleanup:
+                    self._update_serving_plot()
+            except Exception as ex:  # log and keep looping
+                print("Exception occurred in monitoring thread: {}".format(ex))
+            sleep(poll_frequency_sec)
+            try:
+                # we assume that by now all old deleted endpoints requests already returned
+                if cleanup:
+                    cleanup = False
+                    for k in list(self._engine_processor_lookup.keys()):
+                        if k not in self._endpoints:  # only the static endpoints table is consulted here
+                            # atomic
+                            self._engine_processor_lookup.pop(k, None)
+            except Exception as ex:  # log and keep looping
+                print("Exception occurred in monitoring thread: {}".format(ex))
+
+    def _stats_send_loop(self) -> None:
+        """
+        Background thread for sending stats to Kafka service.
+        Returns at once if no Kafka url is configured. Otherwise retries every 30 seconds until the
+        producer is created, then sends every item taken from self._stats_queue to self._kafka_topic.
+        """
+        if not self._kafka_stats_url:
+            print("No Kafka Statistics service configured, shutting down statistics report")
+            return
+
+        print("Starting Kafka Statistics reporting: {}".format(self._kafka_stats_url))
+        from kafka import KafkaProducer  # noqa
+
+        while True:
+            try:
+                producer = KafkaProducer(
+                    bootstrap_servers=self._kafka_stats_url,  # ['localhost:9092'],
+                    value_serializer=lambda x: json.dumps(x).encode('utf-8'),
+                    compression_type='lz4',  # requires python lz4 package
+                )
+                break
+            except Exception as ex:
+                print("Error: failed opening Kafka producer [{}]: {}".format(self._kafka_stats_url, ex))
+                print("Retrying in 30 seconds")
+                sleep(30)
+
+        while True:
+            try:
+                stats_dict = self._stats_queue.get(block=True)
+            except Exception as ex:
+                print("Warning: Statistics thread exception: {}".format(ex))
+                break
+            # send into kafka service
+            try:
+                producer.send(self._kafka_topic, value=stats_dict).get()
+            except Exception as ex:
+                print("Warning: Failed to send statistics packet to Kafka service: {}".format(ex))
+
+    def get_id(self) -> str:
+        return self._task.id  # id of the serving service Task (self._task)
+
+    def get_endpoints(self) -> Dict[str, ModelEndpoint]:
+        """Return a merged copy of the static endpoints and the model monitoring endpoints
+        (a model monitoring entry wins when both tables hold the same key)"""
+        return {**self._endpoints, **self._model_monitoring_endpoints}
+
+    def get_synced_endpoints(self) -> Dict[str, ModelEndpoint]:
+        """
+        Reload the serving service Task and rebuild the endpoints stored in its 'endpoints' and
+        'model_monitoring_eps' configuration objects (fields unknown to ModelEndpoint are dropped)
+        """
+        self._task.reload()
+        synced = {}
+        for section in ('endpoints', 'model_monitoring_eps'):
+            stored = self._task.get_configuration_object_as_dict(name=section) or {}
+            for url, fields in stored.items():
+                synced[url] = ModelEndpoint(**{i: j for i, j in fields.items() if hasattr(ModelEndpoint.__attrs_attrs__, i)})
+        return synced
+
+    def get_canary_endpoints(self) -> dict:
+        return self._canary_endpoints  # the internal dict itself, not a copy
+
+    def get_model_monitoring(self) -> dict:
+        return self._model_monitoring  # the internal dict itself, not a copy
+
+    def _get_instance_id(self) -> Optional[str]:
+        return self._instance_task.id if self._instance_task else None  # None while no instance Task is set
+
+    def _report_text(self, text) -> Optional[str]:
+        message = "Instance [{}, pid={}]: {}".format(self._get_instance_id(), os.getpid(), text)
+        return self._task.get_logger().report_text(message)  # goes to the logger of self._task
+
+    def _update_serving_plot(self) -> None:
+        """
+        Update the endpoint serving graph on the serving instance Task:
+        a table with the endpoint details and a Sankey diagram linking the external root, the model
+        monitoring entries and the canary endpoints to the endpoints they route to.
+        Does nothing if there is no serving instance Task or no endpoint to draw.
+        """
+        if not self._instance_task:
+            return
+
+        # Generate configuration table and details
+        endpoints = list(self._endpoints.values()) + list(self._model_monitoring_endpoints.values())
+        if not endpoints:
+            # nothing to draw (a previously reported plot is left as is)
+            return
+
+        endpoints = [e.as_dict() for e in endpoints]
+        # first row holds the column names, one row per endpoint follows
+        table_values = [list(endpoints[0].keys())]
+        table_values += [[e[c] or "" for c in table_values[0]] for e in endpoints]
+        self._instance_task.get_logger().report_table(
+            title='Serving Endpoint Configuration', series='Details', iteration=0, table_plot=table_values,
+            extra_layout={"title": "Model Endpoints Details"})
+
+        # generate current endpoint view
+        sankey_node = dict(
+            label=[],
+            color=[],
+            customdata=[],
+            hovertemplate='%{customdata}<extra></extra>',
+            hoverlabel={"align": "left"},
+        )
+        sankey_link = dict(
+            source=[],
+            target=[],
+            value=[],
+            hovertemplate='<extra></extra>',
+        )
+        # root
+        sankey_node['color'].append("mediumpurple")
+        sankey_node['label'].append('{}'.format('external'))
+        sankey_node['customdata'].append("")
+
+        # "<serving url>/<version>" -> sankey node index
+        sankey_node_idx = {}
+
+        # draw all endpoints (static and model monitoring), node i + 1 since node 0 is the root
+        for i, ep in enumerate(endpoints):
+            serve_url = ep['serving_url']
+            full_url = '{}/{}'.format(serve_url, ep['version'] or "")
+            sankey_node['color'].append("blue")
+            sankey_node['label'].append("/{}/".format(full_url.strip("/")))
+            sankey_node['customdata'].append(
+                "model id: {}".format(ep['model_id'])
+            )
+            sankey_link['source'].append(0)
+            sankey_link['target'].append(i + 1)
+            # split over every drawn endpoint (self._endpoints alone may be empty -> division by zero)
+            sankey_link['value'].append(1. / len(endpoints))
+            sankey_node_idx[full_url] = i + 1
+
+        # draw all model monitoring
+        sankey_node['color'].append("mediumpurple")
+        sankey_node['label'].append('{}'.format('monitoring models'))
+        sankey_node['customdata'].append("")
+        monitoring_root_idx = len(sankey_node['customdata']) - 1
+
+        # one node per model monitoring entry, linked from the 'monitoring models' node
+        for i, m in enumerate(self._model_monitoring.values()):
+            serve_url = m.base_serving_url
+            sankey_node['color'].append("purple")
+            sankey_node['label'].append('{}'.format(serve_url))
+            sankey_node['customdata'].append(
+                "project: {}<br />name: {}<br />tags: {}".format(
+                    m.monitor_project or '', m.monitor_name or '', m.monitor_tags or '')
+            )
+            sankey_link['source'].append(monitoring_root_idx)
+            sankey_link['target'].append(monitoring_root_idx + i + 1)
+            sankey_link['value'].append(1. / len(self._model_monitoring))
+
+            # add links to the current models
+            serve_url = serve_url.rstrip("/") + "/"
+            for k in sankey_node_idx:
+                if k.startswith(serve_url):
+                    sankey_link['source'].append(monitoring_root_idx + i + 1)
+                    sankey_link['target'].append(sankey_node_idx[k])
+                    # guard against an unset / zero max_versions
+                    sankey_link['value'].append(1.0 / (m.max_versions or 1))
+
+        # add canary endpoints
+        # there is no dedicated canary root node, every canary node is linked straight from the
+        # external root; canary nodes are appended right after the last node added so far
+        canary_root_idx = len(sankey_node['customdata']) - 1
+
+        for i, c in enumerate(self._canary_endpoints.values()):
+            serve_url = c.endpoint
+            sankey_node['color'].append("green")
+            sankey_node['label'].append('CANARY: /{}/'.format(serve_url.strip("/")))
+            sankey_node['customdata'].append(
+                "outputs: {}".format(
+                    c.load_endpoints or c.load_endpoint_prefix)
+            )
+            sankey_link['source'].append(0)
+            sankey_link['target'].append(canary_root_idx + i + 1)
+            sankey_link['value'].append(1. / len(self._canary_endpoints))
+
+            # add links to the current models
+            if serve_url not in self._canary_route:
+                continue
+            for ep, w in zip(self._canary_route[serve_url]['endpoints'], self._canary_route[serve_url]['weights']):
+                idx = sankey_node_idx.get(ep)
+                if idx is None:
+                    continue
+                sankey_link['source'].append(canary_root_idx + i + 1)
+                sankey_link['target'].append(idx)
+                sankey_link['value'].append(w)
+
+        # create the sankey graph
+        dag_flow = dict(
+            link=sankey_link,
+            node=sankey_node,
+            textfont=dict(color='rgba(0,0,0,255)', size=10),
+            type='sankey',
+            orientation='h'
+        )
+        fig = dict(data=[dag_flow], layout={'xaxis': {'visible': False}, 'yaxis': {'visible': False}})
+
+        self._instance_task.get_logger().report_plotly(
+            title='Serving Endpoints Layout', series='', iteration=0, figure=fig)
+
+    def _deserialize_conf_dict(self, configuration: dict) -> None:
+        """
+        Resolve the service settings from `configuration`, falling back to the CLEARML_DEFAULT_* environment
+        variables, write the resolved values back into the dict and hand it to the preprocessing classes
+        """
+        self._configuration = configuration
+        self._kafka_stats_url = \
+            configuration.get(self._config_key_kafka_stats) or os.environ.get("CLEARML_DEFAULT_KAFKA_SERVE_URL")
+        self._triton_grpc = \
+            configuration.get(self._config_key_triton_grpc) or os.environ.get("CLEARML_DEFAULT_TRITON_GRPC_ADDR")
+        self._serving_base_url = \
+            configuration.get(self._config_key_serving_base_url) or os.environ.get("CLEARML_DEFAULT_BASE_SERVE_URL")
+        # a missing or empty frequency falls back to the environment (then 1.0), an explicit 0 is kept as is
+        metric_freq = configuration.get(self._config_key_def_metric_freq)
+        if metric_freq is None or metric_freq == "":
+            metric_freq = os.environ.get("CLEARML_DEFAULT_METRIC_LOG_FREQ") or 1.0
+        self._metric_log_freq = float(metric_freq)
+        # update back configuration
+        self._configuration[self._config_key_kafka_stats] = self._kafka_stats_url
+        self._configuration[self._config_key_triton_grpc] = self._triton_grpc
+        self._configuration[self._config_key_serving_base_url] = self._serving_base_url
+        self._configuration[self._config_key_def_metric_freq] = self._metric_log_freq
+        BasePreprocessRequest.set_server_config(self._configuration)
+
+    def _process_request(self, processor: BasePreprocessRequest, url: str, body: dict) -> dict:
+        """
+        Run preprocess -> process -> postprocess on the request and return the postprocess result.
+        When a Kafka statistics url is configured, a random sample of the requests (log frequency) also pushes
+        latency, count (1/frequency), url and the selected input/output metrics into self._stats_queue.
+        """
+        # collect statistics for this request
+        stats = {}
+        stats_collect_fn = None
+        collect_stats = False
+        freq = 1
+        # decide if we are collecting the stats
+        metric_endpoint = self._metric_logging.get(url)
+        if self._kafka_stats_url:
+            freq = metric_endpoint.log_frequency if metric_endpoint and metric_endpoint.log_frequency is not None \
+                else self._metric_log_freq
+            if freq and random() <= freq:
+                stats_collect_fn = stats.update
+                collect_stats = True
+
+        tic = time()
+        preprocessed = processor.preprocess(body, stats_collect_fn)
+        processed = processor.process(preprocessed, stats_collect_fn)
+        return_value = processor.postprocess(processed, stats_collect_fn)
+        tic = time() - tic
+        if collect_stats:
+            # 10th of a millisecond should be enough
+            stats['_latency'] = round(tic, 4)
+            stats['_count'] = int(1.0/freq)
+            stats['_url'] = url
+
+            # collect the requested metrics from the inputs, then the outputs; only dict payloads carry
+            # named values, anything else (list, array, ...) is skipped so it cannot fail the request
+            metrics = (metric_endpoint.metrics if metric_endpoint else None) or {}
+            for payload in (body, return_value):
+                if isinstance(payload, dict):
+                    stats.update({k: v for k, v in payload.items() if k in metrics})
+
+            # send stats in background, push it into a thread queue
+            # noinspection PyBroadException
+            try:
+                self._stats_queue.put(stats, block=False)
+            except Exception:
+                pass
+
+        return return_value
+
+    @classmethod
+    def list_control_plane_tasks(
+            cls,
+            task_id: Optional[str] = None,
+            name: Optional[str] = None,
+            project: Optional[str] = None,
+            tags: Optional[List[str]] = None
+    ) -> List[dict]:
+        """
+        List the serving service (control plane) Tasks: type "service", status "created" or "in_progress",
+        tagged with the serving system tag and optionally filtered by name / project / tags.
+        Each returned "project" is passed through Task._get_project_name. NOTE(review): task_id is unused
+        """
+        query_filter = {'type': ['service'],
+                        'status': ["created", "in_progress"],
+                        'system_tags': [cls._system_tag]}
+        found = Task.query_tasks(
+            task_name=name or None,
+            project_name=project or None,
+            tags=tags or None,
+            additional_return_fields=["id", "name", "project", "tags"],
+            task_filter=query_filter,
+        )  # type: List[dict]
+        for entry in found or []:
+            # noinspection PyProtectedMember
+            entry['project'] = Task._get_project_name(entry['project'])
+        return found or []
+
+    @classmethod
+    def _get_control_plane_task(
+            cls,
+            task_id: Optional[str] = None,
+            name: Optional[str] = None,
+            project: Optional[str] = None,
+            tags: Optional[List[str]] = None,
+            disable_change_state: bool = False,
+    ) -> Task:
+        """
+        Return the serving service (control plane) Task, by id if given, otherwise by name/project/tags query.
+        With task_id, a Task not in "created"/"in_progress" status is force-started, or rejected with
+        ValueError if disable_change_state is True. Raises ValueError if no Task is found.
+        """
+        if task_id:
+            task = Task.get_task(task_id=task_id)
+            if not task:
+                raise ValueError("Could not find Control Task ID={}".format(task_id))
+            task_status = task.status
+            if task_status not in ("created", "in_progress",):
+                if disable_change_state:
+                    raise ValueError(
+                        "Control Task ID={} status [{}] "
+                        "is not valid (only 'draft', 'running' are supported)".format(task_id, task_status))
+                task.mark_started(force=True)
+            return task
+        tasks = Task.query_tasks(
+            task_name=name or None,
+            project_name=project or None,
+            tags=tags or None,
+            task_filter={'type': ['service'],
+                         'status': ["created", "in_progress"],
+                         'system_tags': [cls._system_tag]}
+        )
+        if not tasks:
+            raise ValueError("Could not find any valid Control Tasks")
+        if len(tasks) > 1:
+            print("Warning: more than one valid Controller Tasks found, using Task ID={}".format(tasks[0]))
+        return Task.get_task(task_id=tasks[0])
+
+    @classmethod
+    def _create_task(
+            cls,
+            name: Optional[str] = None,
+            project: Optional[str] = None,
+            tags: Optional[List[str]] = None
+    ) -> Task:
+        """
+        Create a "service" Task (defaults: project "DevOps", name "Serving Service"), set the serving system tag
+        """
+        new_task = Task.create(
+            task_name=name or "Serving Service", project_name=project or "DevOps", task_type="service")
+        new_task.set_system_tags([cls._system_tag])
+        if tags:
+            new_task.set_tags(tags)
+        return new_task
+
+    @classmethod
+    def _normalize_endpoint_url(cls, endpoint: str, version: Optional[str] = None) -> str:
+        return "/".join((endpoint.rstrip("/"), str(version or ""))).rstrip("/")  # "<endpoint>/<version>", no trailing "/"
+
+    @classmethod
+    def _validate_model(cls, endpoint: Union[ModelEndpoint, ModelMonitoring]) -> bool:
+        """
+        Check that a "triton" endpoint describes its inputs and outputs, other engine types always pass.
+        :return: True; raises ValueError if fields are missing and there is no auxiliary_cfg
+        """
+        if endpoint.engine_type not in ("triton", ):
+            return True
+        # verify we have all the info we need
+        required = ('input_type', 'input_size', 'input_name',
+                    'output_type', 'output_size', 'output_name')
+        description = endpoint.as_dict()
+        missing = [field for field in required if not description.get(field)]
+        if missing and not endpoint.auxiliary_cfg:
+            raise ValueError("Triton engine requires input description - missing values in {}".format(missing))
+        return True
diff --git a/clearml_serving/serving/preprocess_service.py b/clearml_serving/serving/preprocess_service.py
new file mode 100644
index 0000000..0f1195c
--- /dev/null
+++ b/clearml_serving/serving/preprocess_service.py
@@ -0,0 +1,373 @@
+import os
+from typing import Optional, Any, Callable, List
+
+import numpy as np
+from clearml import Task, Model
+from clearml.binding.artifacts import Artifacts
+from clearml.storage.util import sha256sum
+from requests import post as request_post
+
+from .endpoints import ModelEndpoint
+
+
+class BasePreprocessRequest(object):
+    __preprocessing_lookup = {}  # engine name -> request class, filled by `register_engine`
+    __preprocessing_modules = set()  # optional module names that `load_modules` tries to import
+    _default_serving_base_url = "http://127.0.0.1:8080/serve/"  # used when no "base_serving_url" is configured
+    _server_config = {}  # externally configured by the serving inference service
+    _timeout = None  # timeout in seconds for the entire request, set in __init__
+
+    def __init__(self, model_endpoint: ModelEndpoint, task: Task = None):
+        """
+        Bind the endpoint and, when it names a preprocess artifact, load that code from `task`.
+        Notice this object is created once per Process (not per request), make sure it is always thread-safe
+
+        :param model_endpoint: endpoint description; its `preprocess_artifact` names the code artifact to load
+        :param task: Task holding the preprocessing artifact, required only when an artifact is named
+        :raises ValueError: artifact not found on `task` (or no task given), or its code failed to load
+        """
+        self.model_endpoint = model_endpoint
+        self._preprocess = None
+        self._model = None
+        if self._timeout is None:
+            # stored on the base class: `_preprocess_send_request` runs bound to the user object and reads it there
+            BasePreprocessRequest._timeout = int(float(os.environ.get('GUNICORN_SERVING_TIMEOUT', 600)) * 0.8)
+
+        # load preprocessing code here
+        if self.model_endpoint.preprocess_artifact:
+            if not task or self.model_endpoint.preprocess_artifact not in task.artifacts:
+                raise ValueError("Error: could not find preprocessing artifact \'{}\' on Task id={}".format(
+                    self.model_endpoint.preprocess_artifact, task.id if task else None))
+            try:
+                self._instantiate_custom_preprocess_cls(task)
+            except Exception as ex:
+                raise ValueError("Error: Failed loading preprocess code for \'{}\': {}".format(
+                    self.model_endpoint.preprocess_artifact, ex))
+
+    def _instantiate_custom_preprocess_cls(self, task: Task) -> None:  # load the artifact code, build Preprocess()
+        path = task.artifacts[self.model_endpoint.preprocess_artifact].get_local_copy()
+        # compare the local copy's hash with the one stored on the artifact, re-download on mismatch
+        # noinspection PyProtectedMember
+        file_hash, _ = sha256sum(path, block_size=Artifacts._hash_block_size)
+        if file_hash != task.artifacts[self.model_endpoint.preprocess_artifact].hash:
+            print("INFO: re-downloading artifact '{}' hash changed".format(
+                self.model_endpoint.preprocess_artifact))
+            path = task.artifacts[self.model_endpoint.preprocess_artifact].get_local_copy(
+                extract_archive=True,
+                force_download=True,
+            )
+        else:
+            # extract zip if we need to, otherwise it will be the same
+            path = task.artifacts[self.model_endpoint.preprocess_artifact].get_local_copy(
+                extract_archive=True,
+            )
+
+        import importlib.util  # execute the file as a module named "Preprocess"; it must define `Preprocess`
+        spec = importlib.util.spec_from_file_location("Preprocess", path)
+        _preprocess = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(_preprocess)
+        Preprocess = _preprocess.Preprocess  # noqa
+        # override `send_request` method (set on the class, before the instance is created)
+        Preprocess.send_request = BasePreprocessRequest._preprocess_send_request
+        # create preprocess class
+        self._preprocess = Preprocess()
+        # custom model load callback function, its return value becomes `self._model`
+        if callable(getattr(self._preprocess, 'load', None)):
+            self._model = self._preprocess.load(self._get_local_model_file())
+
+    def preprocess(self, request: dict, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Optional[Any]:
+        """
+        Raise exception to report an error
+        Return value will be passed to serving engine
+
+        :param request: dictionary as received from the RestAPI
+        :param collect_custom_statistics_fn: Optional, allows to send a custom set of key/values
+            to the statistics collector service
+
+            Usage example:
+            >>> print(request)
+            {"x0": 1, "x1": 2}
+            >>> collect_custom_statistics_fn({"x0": 1, "x1": 2})
+
+        :return: Object to be passed directly to the model inference
+        """
+        if self._preprocess is not None and hasattr(self._preprocess, 'preprocess'):
+            return self._preprocess.preprocess(request, collect_custom_statistics_fn)
+        return request  # no custom preprocess code: the request is passed through unchanged
+
+    def postprocess(self, data: Any, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Optional[dict]:
+        """
+        Raise exception to report an error
+        Return value will be used as the result of the RestAPI call
+
+        :param data: object as received from the inference model function
+        :param collect_custom_statistics_fn: Optional, allows to send a custom set of key/values
+            to the statistics collector service
+
+            Usage example:
+            >>> collect_custom_statistics_fn({"y": 1})
+
+        :return: Dictionary passed directly as the returned result of the RestAPI
+        """
+        if self._preprocess is not None and hasattr(self._preprocess, 'postprocess'):
+            return self._preprocess.postprocess(data, collect_custom_statistics_fn)
+        return data  # no custom postprocess code: the data is passed through unchanged
+
+    def process(self, data: Any, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any:
+        """
+        The actual processing function. Can be sent to an external service
+
+        :param data: object as received from the preprocessing function
+        :param collect_custom_statistics_fn: Optional, allows to send a custom set of key/values
+            to the statistics collector service
+
+            Usage example:
+            >>> collect_custom_statistics_fn({"type": "classification"})
+
+        :return: Object to be passed to the post-processing function
+        """
+        pass  # the base implementation does nothing and returns None
+
+    def _get_local_model_file(self):  # local copy of the model referenced by the endpoint `model_id`
+        model_repo_object = Model(model_id=self.model_endpoint.model_id)
+        return model_repo_object.get_local_copy()
+
+    @classmethod
+    def set_server_config(cls, server_config: dict) -> None:  # replace the class-level server configuration
+        cls._server_config = server_config
+
+    @classmethod
+    def get_server_config(cls) -> dict:  # the stored configuration object itself, not a copy
+        return cls._server_config
+
+    @classmethod
+    def validate_engine_type(cls, engine: str) -> bool:  # True if `engine` was registered with `register_engine`
+        return engine in cls.__preprocessing_lookup
+
+    @classmethod
+    def get_engine_cls(cls, engine: str) -> Callable:  # class registered for `engine`, None if not registered
+        return cls.__preprocessing_lookup.get(engine)
+
+    @staticmethod
+    def register_engine(engine_name: str, modules: Optional[List[str]] = None) -> Callable:
+        """
+        A class decorator registering the decorated class under `engine_name`; `modules` are kept for `load_modules`
+        """
+
+        def wrapper(cls):
+            cls.__preprocessing_lookup[engine_name] = cls  # name-mangled: resolves to the shared base class dict
+            return cls
+
+        if modules:
+            BasePreprocessRequest.__preprocessing_modules |= set(modules)
+
+        return wrapper
+
+    @staticmethod
+    def load_modules() -> None:  # try to import every module name collected by `register_engine`
+        for m in BasePreprocessRequest.__preprocessing_modules:
+            try:
+                # silently fail
+                import importlib
+                importlib.import_module(m)
+            except (ImportError, TypeError):
+                pass  # best effort: a module that cannot be imported is skipped
+
+    @staticmethod  # installed as `Preprocess.send_request`, `self` is then the custom Preprocess instance
+    def _preprocess_send_request(self, endpoint: str, version: str = None, data: dict = None) -> Optional[dict]:
+        endpoint = "{}/{}".format(endpoint.strip("/"), version.strip("/")) if version else endpoint.strip("/")
+        base_url = BasePreprocessRequest.get_server_config().get("base_serving_url")
+        base_url = (base_url or BasePreprocessRequest._default_serving_base_url).strip("/")
+        url = "{}/{}".format(base_url, endpoint.strip("/"))
+        return_value = request_post(url, json=data, timeout=BasePreprocessRequest._timeout)  # POST `data` as JSON
+        if not return_value.ok:  # a non-OK HTTP status is reported as None, not raised
+            return None
+        return return_value.json()
+
+
+@BasePreprocessRequest.register_engine("triton", modules=["grpc", "tritonclient"])
+class TritonPreprocessRequest(BasePreprocessRequest):
+    _content_lookup = {  # NumPy scalar type -> name of the request `contents` field carrying such values
+        np.uint8: 'uint_contents',
+        np.int8: 'int_contents',
+        np.int64: 'int64_contents',
+        np.uint64: 'uint64_contents',
+        np.int32: 'int_contents',  # was np.int: alias of builtin int, never a dtype scalar type, gone in 1.24
+        np.uint32: 'uint_contents',  # was np.uint: same object as np.uint64 on LP64, overriding the entry above
+        np.bool_: 'bool_contents',  # was np.bool: alias of builtin bool, never a dtype scalar type, gone in 1.24
+        np.float32: 'fp32_contents',
+        np.float64: 'fp64_contents',
+    }
+    _default_grpc_address = "127.0.0.1:8001"  # used when the server config has no "triton_grpc_server"
+    _ext_grpc = None  # this and the `_ext_*` entries below are lazily imported and stored by __init__
+    _ext_np_to_triton_dtype = None
+    _ext_service_pb2 = None
+    _ext_service_pb2_grpc = None
+
+    def __init__(self, model_endpoint: ModelEndpoint, task: Task = None):
+        super(TritonPreprocessRequest, self).__init__(
+            model_endpoint=model_endpoint, task=task)
+
+        # load Triton Module (imported here, not at module level, and stored on this instance)
+        if self._ext_grpc is None:
+            import grpc  # noqa
+            self._ext_grpc = grpc
+
+        if self._ext_np_to_triton_dtype is None:
+            from tritonclient.utils import np_to_triton_dtype  # noqa
+            self._ext_np_to_triton_dtype = np_to_triton_dtype
+
+        if self._ext_service_pb2 is None:
+            from tritonclient.grpc import service_pb2, service_pb2_grpc  # noqa
+            self._ext_service_pb2 = service_pb2
+            self._ext_service_pb2_grpc = service_pb2_grpc
+
+    def process(self, data: Any, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any:
+        """
+        The actual processing function.
+        Detect gRPC server and send the request to it
+
+        :param data: object as received from the preprocessing function
+        :param collect_custom_statistics_fn: Optional, allows to send a custom set of key/values
+            to the statistics collector service
+
+            Usage example:
+            >>> collect_custom_statistics_fn({"type": "classification"})
+
+        :return: Object to be passed to the post-processing function
+        """
+        # allow the custom preprocessing class to override the processing
+        if self._preprocess is not None and hasattr(self._preprocess, "process"):
+            return self._preprocess.process(data, collect_custom_statistics_fn)
+
+        # Create gRPC stub for communicating with the server
+        triton_server_address = self._server_config.get("triton_grpc_server") or self._default_grpc_address
+        if not triton_server_address:
+            raise ValueError("External Triton gRPC server is not configured!")
+        try:
+            channel = self._ext_grpc.insecure_channel(triton_server_address)  # a new channel for every call
+            grpc_stub = self._ext_service_pb2_grpc.GRPCInferenceServiceStub(channel)
+        except Exception as ex:
+            raise ValueError("External Triton gRPC server misconfigured [{}]: {}".format(triton_server_address, ex))
+
+        # Generate the request
+        request = self._ext_service_pb2.ModelInferRequest()
+        request.model_name = "{}/{}".format(self.model_endpoint.serving_url, self.model_endpoint.version).strip("/")
+        # we do not use the Triton model versions, we just assume a single version per endpoint
+        request.model_version = "1"
+
+        # take the input data, converted to the endpoint input type
+        input_data = np.array(data, dtype=self.model_endpoint.input_type)
+
+        # Populate the inputs in inference request
+        input0 = request.InferInputTensor()
+        input0.name = self.model_endpoint.input_name
+        input_dtype = np.dtype(self.model_endpoint.input_type).type
+        input0.datatype = self._ext_np_to_triton_dtype(input_dtype)
+        input0.shape.extend(self.model_endpoint.input_size)
+
+        # pick the `contents` field matching the input type and fill it with the flattened data
+        input_func = self._content_lookup.get(input_dtype)
+        if not input_func:
+            raise ValueError("Input type nt supported {}".format(input_dtype))
+        input_func = getattr(input0.contents, input_func)
+        input_func[:] = input_data.flatten()
+
+        # push into request
+        request.inputs.extend([input0])
+
+        # Populate the outputs in the inference request (a single requested output)
+        output0 = request.InferRequestedOutputTensor()
+        output0.name = self.model_endpoint.output_name
+
+        request.outputs.extend([output0])
+        response = grpc_stub.ModelInfer(
+            request,
+            compression=self._ext_grpc.Compression.Gzip,
+            timeout=self._timeout
+        )
+
+        output_results = []
+        index = 0  # position of the current output inside `raw_output_contents`
+        for output in response.outputs:
+            shape = []
+            for value in output.shape:
+                shape.append(value)
+            output_results.append(
+                np.frombuffer(response.raw_output_contents[index], dtype=self.model_endpoint.output_type))
+            output_results[-1] = np.resize(output_results[-1], shape)  # give the flat buffer the reported shape
+            index += 1
+
+        # if we have a single matrix, return it as is
+        return output_results[0] if index == 1 else output_results
+
+
+@BasePreprocessRequest.register_engine("sklearn", modules=["joblib", "sklearn"])
+class SKLearnPreprocessRequest(BasePreprocessRequest):
+    def __init__(self, model_endpoint: ModelEndpoint, task: Task = None):
+        super(SKLearnPreprocessRequest, self).__init__(
+            model_endpoint=model_endpoint, task=task)
+        if self._model is None:  # not already loaded by a custom `load` callback
+            # get model
+            import joblib  # noqa
+            self._model = joblib.load(filename=self._get_local_model_file())
+
+    def process(self, data: Any, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any:
+        """
+        The actual processing function.
+        We run the model in this context: returns `self._model.predict(data)`, the statistics callback is unused
+        """
+        return self._model.predict(data)
+
+
+@BasePreprocessRequest.register_engine("xgboost", modules=["xgboost"])
+class XGBoostPreprocessRequest(BasePreprocessRequest):
+    def __init__(self, model_endpoint: ModelEndpoint, task: Task = None):
+        super(XGBoostPreprocessRequest, self).__init__(
+            model_endpoint=model_endpoint, task=task)
+        if self._model is None:  # not already loaded by a custom `load` callback
+            # get model
+            import xgboost  # noqa
+            self._model = xgboost.Booster()
+            self._model.load_model(self._get_local_model_file())
+
+    def process(self, data: Any, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any:
+        """
+        The actual processing function.
+        We run the model in this context: returns `self._model.predict(data)`, the statistics callback is unused
+        """
+        return self._model.predict(data)
+
+
+@BasePreprocessRequest.register_engine("lightgbm", modules=["lightgbm"])
+class LightGBMPreprocessRequest(BasePreprocessRequest):
+    def __init__(self, model_endpoint: ModelEndpoint, task: Task = None):
+        super(LightGBMPreprocessRequest, self).__init__(
+            model_endpoint=model_endpoint, task=task)
+        if self._model is None:  # not already loaded by a custom `load` callback
+            # get model
+            import lightgbm  # noqa
+            self._model = lightgbm.Booster(model_file=self._get_local_model_file())
+
+    def process(self, data: Any, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any:
+        """
+        The actual processing function.
+        We run the model in this context: returns `self._model.predict(data)`, the statistics callback is unused
+        """
+        return self._model.predict(data)
+
+
+@BasePreprocessRequest.register_engine("custom")
+class CustomPreprocessRequest(BasePreprocessRequest):
+    def __init__(self, model_endpoint: ModelEndpoint, task: Task = None):
+        super(CustomPreprocessRequest, self).__init__(
+            model_endpoint=model_endpoint, task=task)
+
+    def process(self, data: Any, collect_custom_statistics_fn: Callable[[dict], None] = None) -> Any:
+        """
+        The actual processing function.
+        We run the custom code `process` in this context; returns None when no such code is loaded
+        """
+        if self._preprocess is not None and hasattr(self._preprocess, 'process'):
+            return self._preprocess.process(data, collect_custom_statistics_fn)
+        return None
diff --git a/clearml_serving/serving/requirements.txt b/clearml_serving/serving/requirements.txt
new file mode 100644
index 0000000..bf350bc
--- /dev/null
+++ b/clearml_serving/serving/requirements.txt
@@ -0,0 +1,17 @@
+clearml>=1.3.1
+attrs>=20.3.0,<21
+fastapi[all]>=0.75.0,<0.76
+uvicorn[standard]
+gunicorn>=20.1.0,<20.2
+asyncio>=3.4.3,<3.5
+aiocache>=0.11.1,<0.12
+tritonclient[grpc]>=2.18.0,<2.19
+numpy>=1.20,<1.24
+scikit-learn>=1.0.2,<1.1
+grpcio
+Pillow>=9.0.1,<10
+xgboost>=1.5.2,<1.6
+lightgbm>=3.3.2,<3.4
+requests>=2.25.1,<2.26
+kafka-python>=2.0.2,<2.1
+lz4>=4.0.0,<5
\ No newline at end of file
diff --git a/clearml_serving/serving_service.py b/clearml_serving/serving_service.py
deleted file mode 100644
index 419f7c1..0000000
--- a/clearml_serving/serving_service.py
+++ /dev/null
@@ -1,589 +0,0 @@
-import json
-import shutil
-from logging import getLogger
-from pathlib import Path as Path3
-from time import time
-from typing import Optional, Union, Dict, Sequence
-
-from attr import attrib, attrs, asdict
-from pathlib2 import Path
-
-from clearml import Task, Model, InputModel
-
-
-class ServingService(object):
-    _config_pbtxt_section = 'config.pbtxt'
-    _supported_serving_engines = ('triton', 'ovms', 'kfserving')
-
-    @attrs
-    class EndPoint(object):
-        serving_url = attrib(type=str)
-        model_ids = attrib(type=list)
-        model_project = attrib(type=str)
-        model_name = attrib(type=str)
-        model_tags = attrib(type=list)
-        model_config_blob = attrib(type=str, default=None)
-        max_num_revisions = attrib(type=int, default=None)
-        versions = attrib(type=dict, default={})
-
-        def as_dict(self):
-            return asdict(self)
-
-    def __init__(self, task_id=None, task_project=None, task_name=None, task=None, engine_type='triton'):
-        # type: (Optional[str], Optional[str], Optional[str], Optional[Task], Optional[str]) -> None
-        """
-        :param task_id: Optional specify existing Task ID of the ServingService
-        :param task_project: Select the project where the new ServingService task will be created
-        :param task_name: Specify the Task name for the newly created ServingService
-        :param task: Optional pass existing ServingService Task object
-        :param engine_type: Specify the serving engine Type. Examples: triton, ovms, kfserving
-        """
-        assert engine_type in self._supported_serving_engines
-
-        if task:
-            self._task = task
-        elif task_id:
-            self._task = Task.get_task(task_id=task_id)
-        else:
-            # try to get a Task if we can find one
-            self._task = None
-            try:
-                # noinspection PyProtectedMember
-                if Task._query_tasks(project_name=task_project, task_name=task_name):
-                    self._task = Task.get_task(project_name=task_project, task_name=task_name)
-            except ValueError:
-                pass
-
-            if not self._task:
-                self._task = Task.create(
-                    project_name=task_project, task_name=task_name, task_type=Task.TaskTypes.service,
-                    repo="https://github.com/allegroai/clearml-serving.git",
-                    branch="main",
-                    commit="ad049c51c146e9b7852f87e2f040e97d88848a1f",
-                    script="clearml_serving/service.py",
-                    working_directory=".",
-                    add_task_init_call=False,
-                )
-                self._task.set_system_tags(list(self._task.get_system_tags()) + ['serving'])
-
-        # self._current_serving_endpoints = {'an_enpoint_url': {1: 'model_id'}}
-        self._current_serving_endpoints = {}  # type: Dict[str, Dict[int, str]]
-        # self._endpoints = {'an_enpoint_url': ServingService.EndPoint()}
-        self._endpoints = {}  # type: Dict[str, ServingService.EndPoint]
-        self._engine_type = engine_type
-        self._dirty = False
-        self._last_update_step = None
-        # try to deserialize from Task
-        # noinspection PyBroadException
-        try:
-            self._deserialize()
-        except Exception:
-            pass
-
-    def add_model_serving(
-            self,
-            serving_url,  # type: str
-            model_ids=None,  # type: Optional[Sequence[str]]
-            model_project=None,  # type: Optional[str]
-            model_name=None,  # type: Optional[str]
-            model_tags=None,  # type: Optional[Sequence[str]]
-            config_file=None,  # type: Optional[Union[Path, Path3, str]]
-            max_versions=1,  # type: Optional[int]
-    ):
-        """
-        Add new model serving endpoint, automatically published
-
-        :param serving_url:
-        :param model_ids:
-        :param model_project:
-        :param model_name:
-        :param model_tags:
-        :param config_file:
-        :param max_versions:
-        :return:
-        """
-        if not serving_url:
-            raise ValueError("serving_url is required")
-
-        if model_tags and not isinstance(model_tags, (list, tuple)):
-            raise ValueError("model_tags must be a list of strings")
-
-        # normalize endpoint url
-        serving_url = str(serving_url).strip('/')
-
-        endpoint = self.EndPoint(
-            serving_url=serving_url,
-            model_ids=list(model_ids) if model_ids else None,
-            model_name=model_name,
-            model_project=model_project,
-            model_tags=model_tags,
-            max_num_revisions=max_versions or None,
-            versions={},
-            model_config_blob='',
-        )
-        # load config file
-        if config_file:
-            with open(str(config_file), 'rt') as f:
-                endpoint.model_config_blob = f.read()
-        else:
-            # Look for the config on the Model generated Task
-            found_models = Model.query_models(project_name=model_project, model_name=model_name, tags=model_tags) or []
-
-            selected_model = None
-            # find the first model with config.pbtxt configuration
-            # prefer published models
-            found_models = [m for m in found_models if m.published] + [m for m in found_models if not m.published]
-            for m in found_models:
-                task_id = m.task
-                task = Task.get_task(task_id=task_id)
-                config_pbtxt = task.get_configuration_object(self._config_pbtxt_section)
-                if config_pbtxt and str(config_pbtxt).strip():
-                    endpoint.model_config_blob = config_pbtxt
-                    selected_model = m
-                    break
-
-            if not selected_model:
-                raise ValueError(
-                    "Requested Model project={} name={} tags={} not found. 'config.pbtxt' could not be inferred. "
-                    "please provide specific config.pbtxt definition.".format(model_project, model_name, model_tags))
-            elif len(found_models) > 1:
-                getLogger('clearml-serving').warning(
-                    "Found more than one Model, using model id={}".format(selected_model.id))
-
-        self._endpoints[serving_url] = endpoint
-        self._dirty = True
-
-    def launch(self,  queue_name='services', queue_id=None, force=False, verbose=True):
-        # type: (Optional[str], Optional[str], bool, bool) -> None
-        """
-        Launch serving service on a remote machine using the specified queue
-
-        :param queue_name: Queue name to launch the serving service control plane
-        :param queue_id: specify queue id (unique stand stable) instead of queue_name
-        :param force: if False check if service Task is already running before enqueuing
-        :param verbose: If True print progress to console
-        """
-        # check if we are not already running
-        if not force and ((self._task.data.execution.queue and self._task.status == 'in_progress')
-                          or self._task.status == 'queued'):
-            if verbose:
-                print('Serving service already running')
-        else:
-            if verbose:
-                print('Launching Serving service on {} queue'.format(queue_id or queue_name))
-            self.update_endpoint_graph(force=True)
-            self.update_model_endpoint_state()
-            self.serialize()
-            self._task.flush(wait_for_uploads=True)
-            self._task.reset()
-            self._task.enqueue(task=self._task, queue_name=queue_name, queue_id=queue_id)
-
-    def launch_engine(self, queue_name, queue_id=None, container=None, container_args=None, verbose=True):
-        # type: (Optional[str], Optional[str], Optional[str], Optional[str], bool) -> None
-        """
-        Launch serving engine on a specific queue
-
-        :param queue_name: Queue name to launch the engine service running the inference on.
-        :param queue_id: specify queue id (unique stand stable) instead of queue_name
-        :param container: Optional: specify serving engine container.
-        :param container_args: Optional: specify serving engine container arguments.
-        Notice these arguments will override any default container arguments!
-        :param verbose: If True print progress to console
-        """
-
-        # todo: add more engines
-        if self._engine_type == 'triton':
-            engine_type_container = "nvcr.io/nvidia/tritonserver:21.03-py3"
-            engine_type_args = "--ipc=host -p 8000:8000 -p 8001:8001 -p 8002:8002"
-
-            # create the serving engine Task
-            engine_task = Task.create(
-                project_name=self._task.get_project_name(),
-                task_name="triton serving engine",
-                task_type=Task.TaskTypes.inference,
-                repo="https://github.com/allegroai/clearml-serving.git",
-                branch="main",
-                commit="ad049c51c146e9b7852f87e2f040e97d88848a1f",
-                script="clearml_serving/triton_helper.py",
-                working_directory=".",
-                docker=container or engine_type_container,
-                docker_args=container_args or engine_type_args,
-                argparse_args=[('serving_id', self._task.id), ],
-                add_task_init_call=False,
-                packages=[
-                    'clearml',
-                    'azure-storage-blob>=2.0.1,<=2.1',
-                    'google-cloud-storage>=1.13.2',
-                    'boto3>=1.9',
-                ],
-            )
-            if verbose:
-                print('Launching engine {} on queue {}'.format(self._engine_type, queue_id or queue_name))
-            engine_task.enqueue(task=engine_task, queue_name=queue_name, queue_id=queue_id)
-
-    def update_endpoint_graph(self, force=False):
-        # type: (bool) -> None
-        """
-        Update the endpoint serving graph
-
-        :param force: If True always update, otherwise skip if service was not changed since lat time
-        """
-        if not force and not self._dirty:
-            return
-
-        # Generate configuration table and details
-        table_values = [["Endpoint", "Model ID", "Model Project", "Model Name", "Model Tags", "Max Versions"]]
-        for endpoint in sorted(self._endpoints.keys()):
-            n = self._endpoints[endpoint]
-            table_values.append([
-                str(n.serving_url or ''),
-                str(n.model_ids or ''),
-                str(n.model_project or ''),
-                str(n.model_name or ''),
-                str(n.model_tags or ''),
-                str(n.max_num_revisions or '')
-            ])
-        self._task.get_logger().report_table(
-            title='Serving Endpoint Configuration', series='Details', iteration=0, table_plot=table_values,
-            extra_layout={"title": "Model Endpoints Details"})
-
-        # generate current endpoint view
-        sankey_node = dict(
-            label=[],
-            color=[],
-            customdata=[],
-            hovertemplate='%{customdata}<extra></extra>',
-            hoverlabel={"align": "left"},
-        )
-        sankey_link = dict(
-            source=[],
-            target=[],
-            value=[],
-            hovertemplate='<extra></extra>',
-        )
-        # root
-        sankey_node['color'].append("mediumpurple")
-        sankey_node['label'].append('{}'.format('serving'))
-        sankey_node['customdata'].append("")
-
-        # Generate table and details
-        table_values = [["Endpoint", "Version", "Model ID"]]
-        # noinspection PyProtectedMember
-        base_url = self._task._get_app_server() + '/projects/*/models/{model_id}/general'
-        for i, serve_url in enumerate(sorted(self._endpoints.keys())):
-            ep = self._endpoints[serve_url]
-            sankey_node['color'].append("blue")
-            sankey_node['label'].append('{}'.format(serve_url))
-            sankey_node['customdata'].append(
-                "project: {}<br />name: {}<br />tags: {}".format(
-                    ep.model_project or '', ep.model_name or '', ep.model_tags or '')
-            )
-            sankey_link['source'].append(0)
-            sankey_link['target'].append(i + 1)
-            sankey_link['value'].append(1. / len(self._endpoints))
-
-            for v in sorted(self._current_serving_endpoints.get(serve_url, [])):
-                model_id = self._current_serving_endpoints[serve_url][v]
-                href = '<a href="{}"> {} </a>'.format(base_url.format(model_id=model_id), model_id)
-                table_values.append([str(serve_url), str(v), href])
-                sankey_node['color'].append("lightblue")
-                sankey_node['label'].append('{}'.format(v))
-                sankey_node['customdata'].append(model_id)
-
-                sankey_link['source'].append(i + 1)
-                sankey_link['target'].append(len(sankey_node['color']) - 1)
-                sankey_link['value'].append(1. / len(self._current_serving_endpoints[serve_url]))
-
-        # create the sankey graph
-        dag_flow = dict(
-            link=sankey_link,
-            node=sankey_node,
-            textfont=dict(color='rgba(0,0,0,255)', size=10),
-            type='sankey',
-            orientation='h'
-        )
-        fig = dict(data=[dag_flow], layout={'xaxis': {'visible': False}, 'yaxis': {'visible': False}})
-
-        self._task.get_logger().report_plotly(
-            title='Model Serving Endpoints', series='', iteration=0, figure=fig)
-
-        # report detailed table
-        self._task.get_logger().report_table(
-            title='Serving Endpoint', series='Details', iteration=0, table_plot=table_values,
-            extra_layout={"title": "Model Endpoints Details"})
-
-        self._dirty = False
-
-    def update_model_endpoint_state(self):
-        # type: () -> bool
-        """
-        Update model endpoint state from the model repository
-
-        :return: True if endpoints were updated
-        """
-
-        for endpoint, node in self._endpoints.items():
-            # model ID supersedes everything
-            if node.model_ids:
-                model_ids = node.model_ids
-            else:
-                # get list of models sorted by descending update time
-                models = Model.query_models(
-                    project_name=node.model_project,
-                    model_name=node.model_name,
-                    tags=node.model_tags
-                )
-                # prefer published models
-                model_ids = [m.id for m in models if m.published] + [m.id for m in models if not m.published]
-
-            cur_endpoint = self._current_serving_endpoints.get(node.serving_url, {})
-            cur_endpoint = {int(k): v for k, v in cur_endpoint.items() if v in model_ids}
-            cur_endpoint_m_ids = list(cur_endpoint.values())
-            max_v = max(list(cur_endpoint.keys()) or [0])
-            for i, m_id in enumerate(model_ids):
-                # only pick the latest in the history
-                if node.max_num_revisions and max_v >= node.max_num_revisions:
-                    break
-
-                if m_id in cur_endpoint_m_ids:
-                    continue
-                max_v += 1
-                cur_endpoint[max_v] = m_id
-
-            # check if we need to update,
-            if self._current_serving_endpoints.get(node.serving_url) != cur_endpoint:
-                # set dirty flag
-                self._dirty = True
-                # store updated results
-                self._current_serving_endpoints[node.serving_url] = cur_endpoint
-
-        return self._dirty
-
-    def stats(self):
-        pass
-
-    def get_endpoints(self):
-        # type: () -> Dict[str, ServingService.EndPoint]
-        """
-        return the internal endpoints configuration
-
-        :return: dict where the keys is the endpoint url and the value is the endpoint configuration
-        """
-        return self._endpoints
-
-    def get_endpoint_version_model_id(self, serving_url):
-        # type: (str) -> Dict[int, str]
-        """
-        Return dict with model versions and model id for the specific serving url
-        If serving url is not found, return None
-
-        :param serving_url: sering url string
-
-        :return: dictionary keys are the versions (integers) and values are the model IDs (str)
-        """
-        return self._current_serving_endpoints.get(serving_url) or {}
-
-    def _serialize(self):
-        configuration = dict()
-        for name, ep in self._endpoints.items():
-            # noinspection PyProtectedMember
-            self._task.set_configuration_object(
-                name="model.{}".format(name),
-                description='Model Serving Configuration',
-                config_type='pbtxt',
-                config_text=ep.model_config_blob)
-            ep_conf = ep.as_dict()
-            ep_conf.pop('model_config_blob', None)
-            configuration['"{}"'.format(name)] = ep_conf
-        # noinspection PyProtectedMember
-        self._task._set_configuration(
-            config_dict=configuration, name='endpoints',
-            config_type='hocon', description='Serving Endpoints Configuration')
-        # set configuration of current served endpoints
-        # noinspection PyProtectedMember
-        self._task._set_configuration(
-            config_dict=self._current_serving_endpoints, name='serving_state',
-            config_type='hocon', description='Current Serving Endpoints State',
-        )
-        serving = dict(engine=self._engine_type)
-        self._task.connect(serving, name='serving')
-
-    def _deserialize(self):
-        # type: () -> bool
-        """
-        deserialize internal state from Task backend
-
-        :return: return True if new state a was updated.
-        """
-        # update if the task was updated
-        if self._endpoints:
-            last_update = self._task.data.last_update
-            try:
-                # noinspection PyProtectedMember
-                if last_update == self._task._get_last_update():
-                    return True
-            except AttributeError:
-                # support old clearml packages
-                pass
-
-        self._task.reload()
-
-        # noinspection PyProtectedMember
-        configuration = self._task._get_configuration_dict(name='endpoints')
-        if not configuration:
-            return False
-
-        self._endpoints = {}
-        self._current_serving_endpoints = {}
-        serving = dict(engine='')
-        task_parameters = self._task.get_parameters_as_dict()
-        serving.update(task_parameters.get('serving', {}))
-        self._engine_type = serving['engine']
-
-        for name, endpoint in configuration.items():
-            ep = self.EndPoint(model_config_blob='', **endpoint)
-            ep.model_config_blob = self._task.get_configuration_object(
-                name="model.{}".format(ep.serving_url))
-            self._endpoints[ep.serving_url] = ep
-
-        # get configuration of current served endpoints
-        # noinspection PyProtectedMember
-        self._current_serving_endpoints = self._task._get_configuration_dict(name='serving_state')
-
-        self._dirty = True
-        return True
-
-    def update(self, force=False):
-        # type: (bool) -> bool
-        """
-        Update internal endpoint state based on Task configuration and model repository
-
-        :param force: if True force update
-
-        :return: True if internal state updated.
-        """
-        if not self._task:
-            return False
-
-        # store current internal state
-        state_hash = self.__state_hash()
-
-        if not self._deserialize():
-            return False
-
-        # check if current internal state changed
-        if not force and state_hash == self.__state_hash():
-            print("Skipping update, nothing changed")
-            return False
-
-        return self.update_model_endpoint_state()
-
-    def get_id(self):
-        # type: () -> str
-        """
-        Return the Serving Service Task ID
-        :return: Unique Task ID (str)
-        """
-        return self._task.id
-
-    def get_engine_type(self):
-        # type: () -> str
-        """
-        return the engine type used ib the serving service
-        :return: engine type (str). example: triton, ovms, kfserving
-        """
-        return self._engine_type
-
-    def serialize(self, force=False):
-        # type: (bool) -> None
-        """
-        Serialize current service state to the Task
-
-        :param force: If True synchronize an aborted/completed Task
-        """
-        if force and self._task.status not in (Task.TaskStatusEnum.created, Task.TaskStatusEnum.in_progress):
-            self._task.mark_started(force=True)
-
-        self._serialize()
-
-    def triton_model_service_update_step(self, model_repository_folder=None, verbose=True):
-        # type: (Optional[str], bool) -> None
-
-        # check if something changed since last time
-        if not self.update(force=self._last_update_step is None):
-            return
-
-        self._last_update_step = time()
-
-        if not model_repository_folder:
-            model_repository_folder = '/models/'
-
-        if verbose:
-            print('Updating local model folder: {}'.format(model_repository_folder))
-
-        for url, endpoint in self.get_endpoints().items():
-            folder = Path(model_repository_folder) / url
-            folder.mkdir(parents=True, exist_ok=True)
-            with open((folder / 'config.pbtxt').as_posix(), 'wt') as f:
-                f.write(endpoint.model_config_blob)
-
-            # download model versions
-            for version, model_id in self.get_endpoint_version_model_id(serving_url=url).items():
-                model_folder = folder / str(version)
-
-                model_folder.mkdir(parents=True, exist_ok=True)
-                model = None
-                # noinspection PyBroadException
-                try:
-                    model = InputModel(model_id)
-                    local_path = model.get_local_copy()
-                except Exception:
-                    local_path = None
-                if not local_path:
-                    print("Error retrieving model ID {} []".format(model_id, model.url if model else ''))
-                    continue
-
-                local_path = Path(local_path)
-
-                if verbose:
-                    print('Update model v{} in {}'.format(version, model_folder))
-
-                # if this is a folder copy every and delete the temp folder
-                if local_path.is_dir():
-                    # we assume we have a `tensorflow.savedmodel` folder
-                    model_folder /= 'model.savedmodel'
-                    model_folder.mkdir(parents=True, exist_ok=True)
-                    # rename to old
-                    old_folder = None
-                    if model_folder.exists():
-                        old_folder = model_folder.parent / '.old.{}'.format(model_folder.name)
-                        model_folder.replace(old_folder)
-                    if verbose:
-                        print('copy model into {}'.format(model_folder))
-                    shutil.copytree(
-                        local_path.as_posix(), model_folder.as_posix(), symlinks=False,
-                    )
-                    if old_folder:
-                        shutil.rmtree(path=old_folder.as_posix())
-                    # delete temp folder
-                    shutil.rmtree(local_path.as_posix())
-                else:
-                    # single file should be moved
-                    target_path = model_folder / local_path.name
-                    old_file = None
-                    if target_path.exists():
-                        old_file = target_path.parent / '.old.{}'.format(target_path.name)
-                        target_path.replace(old_file)
-                    shutil.move(local_path.as_posix(), target_path.as_posix())
-                    if old_file:
-                        old_file.unlink()
-
-    def __state_hash(self):
-        # type: () -> int
-        """
-        Return Hash of the internal state (use only for in process comparison
-        :return: hash int
-        """
-        return hash(json.dumps(
-            [self._current_serving_endpoints, {k: v.as_dict() for k, v in self._endpoints.items()}],
-            sort_keys=True))
diff --git a/clearml_serving/statistics/Dockerfile b/clearml_serving/statistics/Dockerfile
new file mode 100644
index 0000000..e4e692d
--- /dev/null
+++ b/clearml_serving/statistics/Dockerfile
@@ -0,0 +1,21 @@
+FROM python:3.9-bullseye
+
+
+ENV LC_ALL=C.UTF-8
+
+# install base package
+RUN pip3 install clearml-serving
+
+# get latest execution code from the git repository
+# RUN cd $HOME && git clone https://github.com/allegroai/clearml-serving.git
+COPY clearml_serving /root/clearml/clearml_serving
+
+RUN pip3 install -r /root/clearml/clearml_serving/statistics/requirements.txt
+
+# default serving port
+EXPOSE 9999
+
+# environment variables used to load the Task: CLEARML_SERVING_TASK_ID, CLEARML_SERVING_PORT
+
+WORKDIR /root/clearml/
+ENTRYPOINT ["clearml_serving/statistics/entrypoint.sh"]
diff --git a/clearml_serving/statistics/__init__.py b/clearml_serving/statistics/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/clearml_serving/statistics/entrypoint.sh b/clearml_serving/statistics/entrypoint.sh
new file mode 100755
index 0000000..2ed724b
--- /dev/null
+++ b/clearml_serving/statistics/entrypoint.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# print configuration
+echo CLEARML_SERVING_TASK_ID="$CLEARML_SERVING_TASK_ID"
+echo CLEARML_SERVING_PORT="$CLEARML_SERVING_PORT"
+echo EXTRA_PYTHON_PACKAGES="$EXTRA_PYTHON_PACKAGES"
+echo CLEARML_SERVING_POLL_FREQ="$CLEARML_SERVING_POLL_FREQ"
+echo CLEARML_DEFAULT_KAFKA_SERVE_URL="$CLEARML_DEFAULT_KAFKA_SERVE_URL"
+
+SERVING_PORT="${CLEARML_SERVING_PORT:-9999}"
+
+# set default internal serve endpoint (for request pipelining)
+CLEARML_DEFAULT_BASE_SERVE_URL="${CLEARML_DEFAULT_BASE_SERVE_URL:-http://127.0.0.1:$SERVING_PORT/serve}"
+CLEARML_DEFAULT_TRITON_GRPC_ADDR="${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-127.0.0.1:8001}"
+
+# print configuration
+echo SERVING_PORT="$SERVING_PORT"
+
+# runtime add extra python packages
+if [ ! -z "$EXTRA_PYTHON_PACKAGES" ]
+then
+      python3 -m pip install $EXTRA_PYTHON_PACKAGES
+fi
+
+echo "Starting Statistics Controller server"
+PYTHONPATH=$(pwd) python3 clearml_serving/statistics/main.py
diff --git a/clearml_serving/statistics/main.py b/clearml_serving/statistics/main.py
new file mode 100644
index 0000000..eef9fcb
--- /dev/null
+++ b/clearml_serving/statistics/main.py
@@ -0,0 +1,41 @@
+import os
+
+import prometheus_client
+from clearml import Task
+
+from clearml_serving.serving.model_request_processor import ModelRequestProcessor
+from clearml_serving.statistics.metrics import StatisticsController
+
+
+def main():
+    """Start the statistics controller: Prometheus HTTP exporter plus the (blocking) Kafka consumer loop."""
+    serving_service_task_id = os.environ.get("CLEARML_SERVING_TASK_ID", None)
+    model_sync_frequency_secs = 5
+    try:
+        model_sync_frequency_secs = float(os.environ.get("CLEARML_SERVING_POLL_FREQ", model_sync_frequency_secs))
+    except (ValueError, TypeError):
+        pass
+
+    # noinspection PyProtectedMember
+    serving_task = ModelRequestProcessor._get_control_plane_task(task_id=serving_service_task_id)
+    # create a new serving instance (for visibility and monitoring)
+    instance_task = Task.init(
+        project_name=serving_task.get_project_name(),
+        task_name="{} - statistics controller".format(serving_task.name),
+        task_type="monitor",
+    )
+    instance_task.set_system_tags(["service"])
+    kafka_server_url = os.environ.get("CLEARML_DEFAULT_KAFKA_SERVE_URL", "localhost:9092")
+    stats_controller = StatisticsController(
+        task=instance_task,
+        kafka_server_url=kafka_server_url,
+        serving_id=serving_service_task_id,
+        poll_frequency_min=model_sync_frequency_secs
+    )
+    # an empty CLEARML_SERVING_PORT falls back to the default port (same as `:-9999` in entrypoint.sh)
+    prometheus_client.start_http_server(int(os.environ.get("CLEARML_SERVING_PORT") or 9999))
+    stats_controller.start()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/clearml_serving/statistics/metrics.py b/clearml_serving/statistics/metrics.py
new file mode 100644
index 0000000..befb49e
--- /dev/null
+++ b/clearml_serving/statistics/metrics.py
@@ -0,0 +1,355 @@
+import json
+import os
+import re
+from copy import deepcopy
+from functools import partial
+from threading import Event, Thread
+from time import time, sleep
+
+from clearml import Task
+from typing import Optional, Dict, Any, Iterable
+
+from prometheus_client import Histogram, Enum, Gauge, Counter, values
+from kafka import KafkaConsumer
+from prometheus_client.metrics import MetricWrapperBase, _validate_exemplar
+from prometheus_client.registry import REGISTRY
+from prometheus_client.samples import Exemplar, Sample
+from prometheus_client.context_managers import Timer
+from prometheus_client.utils import floatToGoString
+
+from ..serving.endpoints import EndpointMetricLogging
+from ..serving.model_request_processor import ModelRequestProcessor
+
+
+class ScalarHistogram(Histogram):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def observe(self, amount, exemplar=None):
+        """Observe the given amount.
+
+        The amount is usually positive or zero. Negative values are
+        accepted but prevent current versions of Prometheus from
+        properly detecting counter resets in the sum of
+        observations. See
+        https://prometheus.io/docs/practices/histograms/#count-and-sum-of-observations
+        for details.
+        """
+        self._raise_if_not_observable()
+        if not isinstance(amount, (list, tuple)):
+            amount = [amount]
+        self._sum.inc(len(amount))
+        for v in amount:
+            for i, bound in enumerate(self._upper_bounds):
+                if v <= bound:
+                    self._buckets[i].inc(1)
+                    if exemplar:
+                        _validate_exemplar(exemplar)
+                        self._buckets[i].set_exemplar(Exemplar(exemplar, v, time()))
+                    break
+
+    def _child_samples(self) -> Iterable[Sample]:
+        samples = []
+        for i, bound in enumerate(self._upper_bounds):
+            acc = self._buckets[i].get()
+            samples.append(
+                Sample('_bucket', {'le': floatToGoString(bound)}, acc, None, self._buckets[i].get_exemplar())
+            )
+            samples.append(Sample('_sum', {'le': floatToGoString(bound)}, self._sum.get(), None, None))
+
+        return tuple(samples)
+
+
+class EnumHistogram(MetricWrapperBase):
+    """A Histogram tracks the size and number of events in buckets.
+
+    You can use Histograms for aggregatable calculation of quantiles.
+
+    Example use cases:
+    - Response latency
+    - Request size
+
+    Example for a Histogram:
+
+        from prometheus_client import Histogram
+
+        h = Histogram('request_size_bytes', 'Request size (bytes)')
+        h.observe(512)  # Observe 512 (bytes)
+
+    Example for a Histogram using time:
+
+        from prometheus_client import Histogram
+
+        REQUEST_TIME = Histogram('response_latency_seconds', 'Response latency (seconds)')
+
+        @REQUEST_TIME.time()
+        def create_response(request):
+          '''A dummy function'''
+          time.sleep(1)
+
+    Example of using the same Histogram object as a context manager:
+
+        with REQUEST_TIME.time():
+            pass  # Logic to be timed
+
+    The default buckets are intended to cover a typical web/rpc request from milliseconds to seconds.
+    They can be overridden by passing `buckets` keyword argument to `Histogram`.
+    """
+    _type = 'histogram'
+
+    def __init__(self,
+                 name,
+                 documentation,
+                 buckets,
+                 labelnames=(),
+                 namespace='',
+                 subsystem='',
+                 unit='',
+                 registry=REGISTRY,
+                 _labelvalues=None,
+                 ):
+        self._prepare_buckets(buckets)
+        super().__init__(
+            name=name,
+            documentation=documentation,
+            labelnames=labelnames,
+            namespace=namespace,
+            subsystem=subsystem,
+            unit=unit,
+            registry=registry,
+            _labelvalues=_labelvalues,
+        )
+        self._kwargs['buckets'] = buckets
+
+    def _prepare_buckets(self, buckets):
+        buckets = [str(b) for b in buckets]
+        if buckets != sorted(buckets):
+            # This is probably an error on the part of the user,
+            # so raise rather than sorting for them.
+            raise ValueError('Buckets not in sorted order')
+
+        if len(buckets) < 2:
+            raise ValueError('Must have at least two buckets')
+        self._upper_bounds = buckets
+
+    def _metric_init(self):
+        self._buckets = {}
+        self._created = time()
+        bucket_labelnames = self._upper_bounds
+        self._sum = values.ValueClass(
+            self._type, self._name, self._name + '_sum', self._labelnames, self._labelvalues)
+        for b in self._upper_bounds:
+            self._buckets[b] = values.ValueClass(
+                self._type,
+                self._name,
+                self._name + '_bucket',
+                bucket_labelnames,
+                self._labelvalues + (b,))
+
+    def observe(self, amount, exemplar=None):
+        """Count one or more observed enum values.
+
+        :param amount: a single enum value, or a list/tuple of enum values. Each value
+            is matched against the configured buckets by its string form (the buckets
+            are stored as strings); an unknown value raises KeyError.
+        :param exemplar: optional exemplar labels attached to every updated bucket.
+        Note: `_sum` is incremented by the number of observed values (not their sum).
+        """
+        self._raise_if_not_observable()
+        if not isinstance(amount, (list, tuple)):
+            amount = [amount]
+        self._sum.inc(len(amount))
+        for v in amount:
+            bucket = self._buckets[str(v)]  # buckets are keyed by str(bucket), see _prepare_buckets
+            bucket.inc(1)
+            if exemplar:
+                _validate_exemplar(exemplar)
+                bucket.set_exemplar(Exemplar(exemplar, 1, time()))
+
+    def time(self):
+        """Time a block of code or function, and observe the duration in seconds.
+
+        Can be used as a function decorator or context manager.
+        """
+        return Timer(self, 'observe')
+
+    def _child_samples(self) -> Iterable[Sample]:
+        samples = []
+        for i in self._buckets:
+            acc = self._buckets[i].get()
+            samples.append(Sample(
+                '_bucket', {'enum': i}, acc, None, self._buckets[i].get_exemplar()))
+            samples.append(Sample('_sum', {'enum': i}, self._sum.get(), None, None))
+
+        return tuple(samples)
+
+
+class StatisticsController(object):
+    _reserved = {
+        '_latency': partial(ScalarHistogram, buckets=(.005, .01, .025, .05, .075, .1, .25, .5, .75, 1.0, 2.5, 5.0)),
+        '_count': Counter
+    }
+    _metric_type_class = {"scalar": ScalarHistogram, "enum": EnumHistogram, "value": Gauge, "counter": Counter}
+
+    def __init__(
+            self,
+            task: Task,
+            kafka_server_url: str,
+            serving_id: Optional[str],
+            poll_frequency_min: float = 5
+    ):
+        self.task = task
+        self._serving_service_task_id = serving_id
+        self._poll_frequency_min = float(poll_frequency_min)
+        self._serving_service = None  # type: Optional[ModelRequestProcessor]
+        self._current_endpoints = {}  # type: Optional[Dict[str, EndpointMetricLogging]]
+        self._prometheus_metrics = {}  # type: Optional[Dict[str, Dict[str, MetricWrapperBase]]]
+        self._timestamp = time()
+        self._sync_thread = None
+        self._last_sync_time = time()
+        self._dirty = False
+        self._sync_event = Event()
+        self._sync_threshold_sec = 30
+        self._kafka_server = kafka_server_url
+        # noinspection PyProtectedMember
+        self._kafka_topic = ModelRequestProcessor._kafka_topic
+
+    def start(self):
+        self._serving_service = ModelRequestProcessor(task_id=self._serving_service_task_id)
+
+        if not self._sync_thread:
+            self._sync_thread = Thread(target=self._sync_daemon, daemon=True)
+            self._sync_thread.start()
+
+        # noinspection PyProtectedMember
+        kafka_server = \
+            self._serving_service.get_configuration().get(ModelRequestProcessor._config_key_kafka_stats) or \
+            self._kafka_server
+
+        print("Starting Kafka Statistics processing: {}".format(kafka_server))
+
+        while True:
+            try:
+                consumer = KafkaConsumer(self._kafka_topic, bootstrap_servers=kafka_server)
+                break
+            except Exception as ex:
+                print("Error: failed opening Kafka consumer [{}]: {}".format(kafka_server, ex))
+                print("Retrying in 30 seconds")
+                sleep(30)
+
+        # we will never leave this loop
+        for message in consumer:
+            # noinspection PyBroadException
+            try:
+                data = json.loads(message.value.decode("utf-8"))
+            except Exception:
+                print("Warning: failed to decode kafka stats message")
+                continue
+            try:
+                url = data.pop("_url", None)
+                if not url:
+                    # should not happen
+                    continue
+                endpoint_metric = self._current_endpoints.get(url)
+                if not endpoint_metric:
+                    # add default one, we will just log the reserved valued:
+                    endpoint_metric = dict()
+                    self._current_endpoints[url] = EndpointMetricLogging(endpoint=url)
+                    # we should sync,
+                    if time()-self._last_sync_time > self._sync_threshold_sec:
+                        self._last_sync_time = time()
+                        self._sync_event.set()
+
+                metric_url_log = self._prometheus_metrics.get(url)
+                if not metric_url_log:
+                    # create a new one
+                    metric_url_log = dict()
+                    self._prometheus_metrics[url] = metric_url_log
+
+                # check if we have the prometheus_logger
+                for k, v in data.items():
+                    prometheus_logger = metric_url_log.get(k)
+                    if not prometheus_logger:
+                        prometheus_logger = self._create_prometheus_logger_class(url, k, endpoint_metric)
+                        if not prometheus_logger:
+                            continue
+                        metric_url_log[k] = prometheus_logger
+
+                    self._report_value(prometheus_logger, v)
+
+            except Exception as ex:
+                print("Warning: failed to report stat to Prometheus: {}".format(ex))
+                continue
+
+    @staticmethod
+    def _report_value(prometheus_logger: Optional[MetricWrapperBase], v: Any) -> bool:
+        if not prometheus_logger:
+            # this means no one configured the variable to log
+            return False
+        elif isinstance(prometheus_logger, (Histogram, EnumHistogram)):
+            prometheus_logger.observe(amount=v)
+        elif isinstance(prometheus_logger, Gauge):
+            prometheus_logger.set(value=v)
+        elif isinstance(prometheus_logger, Counter):
+            prometheus_logger.inc(amount=v)
+        elif isinstance(prometheus_logger, Enum):
+            prometheus_logger.state(state=v)
+        else:
+            # we should not get here
+            return False
+
+        return True
+
+    def _create_prometheus_logger_class(
+            self,
+            url: str,
+            variable_name: str,
+            endpoint_config: EndpointMetricLogging
+    ) -> Optional[MetricWrapperBase]:
+        """Create the Prometheus metric for `url`:`variable_name`, or None if it is not configured."""
+        reserved_cls = self._reserved.get(variable_name)
+        # Prometheus metric names only allow [a-zA-Z0-9_:], replace anything else (parentheses included)
+        name = re.sub(r"[^a-zA-Z0-9_:]", "_", "{}:{}".format(url, variable_name))
+        if reserved_cls:
+            return reserved_cls(name=name, documentation="Built in {}".format(variable_name))
+
+        if not endpoint_config:
+            # no endpoint metric configuration: only the reserved variables are logged
+            return None
+
+        metric_ = endpoint_config.metrics.get(variable_name)
+        if not metric_:
+            return None
+        metric_cls = self._metric_type_class.get(metric_.type)
+        if not metric_cls:
+            return None
+        documentation = "User defined metric {}".format(metric_.type)
+        # `scalar` maps to ScalarHistogram (a Histogram subclass), so test by subclass and not identity,
+        # otherwise the user configured buckets are silently dropped
+        if issubclass(metric_cls, (Histogram, EnumHistogram)):
+            return metric_cls(name=name, documentation=documentation, buckets=metric_.buckets)
+        return metric_cls(name=name, documentation=documentation)
+
+    def _sync_daemon(self):
+        """Poll the serving service forever, refreshing the endpoint metric configuration."""
+        self._last_sync_time = time()
+        poll_freq_sec = self._poll_frequency_min*60
+        print("Instance [{}, pid={}]: Launching - configuration sync every {} sec".format(
+            self.task.id, os.getpid(), poll_freq_sec))
+        while True:
+            try:
+                self._serving_service.deserialize()
+                endpoint_metrics = self._serving_service.list_endpoint_logging()
+                self._last_sync_time = time()
+                if self._current_endpoints != endpoint_metrics:
+                    # update metrics:
+                    self._dirty = True
+                    self._current_endpoints = deepcopy(endpoint_metrics)
+                    print("New configuration synced")
+                    continue
+            except Exception as ex:
+                print("Warning: failed to sync state from serving service Task: {}".format(ex))
+            # nothing changed or the sync failed: sleep until next poll / sync request (no busy retry)
+            self._sync_event.wait(timeout=poll_freq_sec)
+            self._sync_event.clear()
diff --git a/clearml_serving/statistics/requirements.txt b/clearml_serving/statistics/requirements.txt
new file mode 100644
index 0000000..1f153d9
--- /dev/null
+++ b/clearml_serving/statistics/requirements.txt
@@ -0,0 +1,6 @@
+clearml>=1.3.1
+numpy>=1.20,<1.24
+requests>=2.25.1,<2.26
+kafka-python>=2.0.2,<2.1
+prometheus_client>=0.13.1,<0.14
+lz4>=4.0.0,<5
diff --git a/clearml_serving/triton_helper.py b/clearml_serving/triton_helper.py
deleted file mode 100644
index c878541..0000000
--- a/clearml_serving/triton_helper.py
+++ /dev/null
@@ -1,219 +0,0 @@
-import re
-import subprocess
-from argparse import ArgumentParser
-from time import time
-from typing import Optional
-
-from pathlib2 import Path
-
-from clearml import Task, Logger
-from clearml.backend_api.utils import get_http_session_with_retry
-from clearml_serving.serving_service import ServingService
-
-
-class TritonHelper(object):
-    _metric_line_parsing = r"(\w+){(gpu_uuid=\"[\w\W]*\",)?model=\"(\w+)\",\s*version=\"(\d+)\"}\s*([0-9.]*)"
-    _default_metrics_port = 8002
-
-    def __init__(
-            self,
-            args,  # Any
-            task,  # type: Task
-            serving_id,  # type: str
-            metric_host=None,  # type: Optional[str]
-            metric_port=None,  # type: int
-    ):
-        # type: (...) -> None
-        self._http_session = get_http_session_with_retry()
-        self.args = dict(**args.__dict__) if args else {}
-        self.task = task
-        self.serving_id = serving_id
-        self.metric_host = metric_host or '0.0.0.0'
-        self.metric_port = metric_port or self._default_metrics_port
-        self._parse_metric = re.compile(self._metric_line_parsing)
-        self._timestamp = time()
-        print('String Triton Helper service\n{}\n'.format(self.args))
-
-    def report_metrics(self, remote_logger):
-        # type: (Optional[Logger]) -> bool
-        # iterations are seconds from start
-        iteration = int(time() - self._timestamp)
-
-        report_msg = "reporting metrics: relative time {} sec".format(iteration)
-        self.task.get_logger().report_text(report_msg)
-        if remote_logger:
-            remote_logger.report_text(report_msg)
-
-        # noinspection PyBroadException
-        try:
-            request = self._http_session.get('http://{}:{}/metrics'.format(
-                self.metric_host, self.metric_port))
-            if not request.ok:
-                return False
-            content = request.content.decode().split('\n')
-        except Exception:
-            return False
-
-        for line in content:
-            line = line.strip()
-            if not line or line.startswith('#'):
-                continue
-            # noinspection PyBroadException
-            try:
-                metric, gpu_uuid, variant, version, value = self._parse_metric.match(line).groups()
-                value = float(value)
-            except Exception:
-                continue
-            self.task.get_logger().report_scalar(
-                title=metric,
-                series='{}.v{}'.format(variant, version),
-                iteration=iteration,
-                value=value
-            )
-            # on the remote logger we add our own Task ID (unique ID),
-            # to support multiple servers reporting to the same service controller
-            if remote_logger:
-                remote_logger.report_scalar(
-                    title=metric,
-                    series='{}.v{}.{}'.format(variant, version, self.task.id),
-                    iteration=iteration,
-                    value=value
-                )
-
-    def maintenance_daemon(
-            self,
-            local_model_repo='/models',  # type: str
-            update_frequency_sec=60.0,  # type: float
-            metric_frequency_sec=60.0  # type: float
-    ):
-        # type: (...) -> None
-
-        Path(local_model_repo).mkdir(parents=True, exist_ok=True)
-
-        a_service = ServingService(task_id=self.serving_id)
-        a_service.triton_model_service_update_step(model_repository_folder=local_model_repo)
-
-        # noinspection PyProtectedMember
-        remote_logger = a_service._task.get_logger()
-
-        # todo: log triton server outputs when running locally
-
-        # we assume we can run the triton server
-        cmd = [
-            'tritonserver',
-            '--model-control-mode=poll',
-            '--model-repository={}'.format(local_model_repo),
-            '--repository-poll-secs={}'.format(update_frequency_sec),
-            '--metrics-port={}'.format(self._default_metrics_port),
-            '--allow-metrics=true',
-            '--allow-gpu-metrics=true',
-        ]
-        for k, v in self.args.items():
-            if not v or not str(k).startswith('t_'):
-                continue
-            cmd.append('--{}={}'.format(k, v))
-
-        print('Starting server: {}'.format(cmd))
-        try:
-            proc = subprocess.Popen(cmd)
-        except FileNotFoundError:
-            raise ValueError(
-                "Triton Server Engine (tritonserver) could not be found!\n"
-                "Verify you running inside the `nvcr.io/nvidia/tritonserver` docker container")
-        base_freq = min(update_frequency_sec, metric_frequency_sec)
-        metric_tic = update_tic = time()
-        while True:
-            try:
-                error_code = proc.wait(timeout=base_freq)
-                if error_code == 0:
-                    print("triton-server process ended with error code {}".format(error_code))
-                    return
-                raise ValueError("triton-server process ended with error code {}".format(error_code))
-            except subprocess.TimeoutExpired:
-                pass
-            pass
-
-            # update models
-            if time() - update_tic > update_frequency_sec:
-                a_service.triton_model_service_update_step(model_repository_folder=local_model_repo)
-                update_tic = time()
-
-            # update stats
-            if time() - metric_tic > metric_frequency_sec:
-                metric_tic = time()
-                self.report_metrics(remote_logger)
-
-
-def main():
-    title = 'clearml-serving - Nvidia Triton Engine Helper'
-    print(title)
-    parser = ArgumentParser(prog='clearml-serving', description=title)
-    parser.add_argument(
-        '--serving-id', default=None, type=str, required=True,
-        help='Specify main serving service Task ID')
-    parser.add_argument(
-        '--project', default='serving', type=str,
-        help='Optional specify project for the serving engine Task')
-    parser.add_argument(
-        '--name', default='nvidia-triton', type=str,
-        help='Optional specify task name for the serving engine Task')
-    parser.add_argument(
-        '--update-frequency', default=10, type=float,
-        help='Model update frequency in minutes')
-    parser.add_argument(
-        '--metric-frequency', default=1, type=float,
-        help='Metric reporting update frequency in minutes')
-    parser.add_argument(
-        '--t-http-port', type=str, help='<integer> The port for the server to listen on for HTTP requests')
-    parser.add_argument(
-        '--t-http-thread-count', type=str, help='<integer> Number of threads handling HTTP requests')
-    parser.add_argument(
-        '--t-allow-grpc', type=str, help='<integer> Allow the server to listen for GRPC requests')
-    parser.add_argument(
-        '--t-grpc-port', type=str, help='<integer> The port for the server to listen on for GRPC requests')
-    parser.add_argument(
-        '--t-grpc-infer-allocation-pool-size', type=str,
-        help='<integer> The maximum number of inference request/response objects that remain '
-             'allocated for reuse. As long as the number of in-flight requests doesn\'t exceed '
-             'this value there will be no allocation/deallocation of request/response objects')
-    parser.add_argument(
-        '--t-pinned-memory-pool-byte-size', type=str,
-        help='<integer> The total byte size that can be allocated as pinned system '
-             'memory. If GPU support is enabled, the server will allocate pinned '
-             'system memory to accelerate data transfer between host and devices '
-             'until it exceeds the specified byte size. This option will not affect '
-             'the allocation conducted by the backend frameworks. Default is 256 MB')
-    parser.add_argument(
-        '--t-cuda-memory-pool-byte-size', type=str,
-        help='<<integer>:<integer>> The total byte size that can be allocated as CUDA memory for '
-             'the GPU device. If GPU support is enabled, the server will allocate '
-             'CUDA memory to minimize data transfer between host and devices '
-             'until it exceeds the specified byte size. This option will not affect '
-             'the allocation conducted by the backend frameworks. The argument '
-             'should be 2 integers separated by colons in the format <GPU device'
-             'ID>:<pool byte size>. This option can be used multiple times, but only '
-             'once per GPU device. Subsequent uses will overwrite previous uses for '
-             'the same GPU device. Default is 64 MB')
-    parser.add_argument(
-        '--t-min-supported-compute-capability', type=str,
-        help='<float> The minimum supported CUDA compute capability. GPUs that '
-             'don\'t support this compute capability will not be used by the server')
-    parser.add_argument(
-        '--t-buffer-manager-thread-count', type=str,
-        help='<integer> The number of threads used to accelerate copies and other'
-             'operations required to manage input and output tensor contents.'
-             'Default is 0')
-
-    args = parser.parse_args()
-    task = Task.init(project_name=args.project, task_name=args.name, task_type=Task.TaskTypes.inference)
-    helper = TritonHelper(args, task, serving_id=args.serving_id)
-    # this function will never end
-    helper.maintenance_daemon(
-        local_model_repo='/models',
-        update_frequency_sec=args.update_frequency*60.0,
-        metric_frequency_sec=args.metric_frequency*60.0,
-    )
-
-
-if __name__ == '__main__':
-    main()
diff --git a/clearml_serving/version.py b/clearml_serving/version.py
index 80eb7f9..e4e49b3 100644
--- a/clearml_serving/version.py
+++ b/clearml_serving/version.py
@@ -1 +1 @@
-__version__ = '0.3.3'
+__version__ = '0.9.0'
diff --git a/docker/datasource.yml b/docker/datasource.yml
new file mode 100644
index 0000000..1e91d21
--- /dev/null
+++ b/docker/datasource.yml
@@ -0,0 +1,8 @@
+apiVersion: 1
+
+datasources:
+  - name: Prometheus
+    type: prometheus
+    # Access mode - proxy (server in the UI) or direct (browser in the UI).
+    access: proxy
+    url: http://clearml-serving-prometheus:9090
diff --git a/docker/docker-compose-triton-gpu.yml b/docker/docker-compose-triton-gpu.yml
new file mode 100644
index 0000000..073eee3
--- /dev/null
+++ b/docker/docker-compose-triton-gpu.yml
@@ -0,0 +1,151 @@
+version: "3"
+
+services:
+  zookeeper:
+    image: bitnami/zookeeper:3.7.0
+    container_name: clearml-serving-zookeeper
+    # ports:
+      # - "2181:2181"
+    environment:
+      - ALLOW_ANONYMOUS_LOGIN=yes
+    networks:
+      - clearml-serving-backend
+
+  kafka:
+    image: bitnami/kafka:3.1.0
+    container_name: clearml-serving-kafka
+    # ports:
+      # - "9092:9092"
+    environment:
+      - KAFKA_BROKER_ID=1
+      - KAFKA_CFG_LISTENERS=PLAINTEXT://clearml-serving-kafka:9092
+      - KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://clearml-serving-kafka:9092
+      - KAFKA_CFG_ZOOKEEPER_CONNECT=clearml-serving-zookeeper:2181
+      - ALLOW_PLAINTEXT_LISTENER=yes
+      - KAFKA_CREATE_TOPICS="topic_test:1:1"
+    depends_on:
+      - zookeeper
+    networks:
+      - clearml-serving-backend
+
+  prometheus:
+    image: prom/prometheus:v2.34.0
+    container_name: clearml-serving-prometheus
+    volumes:
+      - ./prometheus.yml:/prometheus.yml
+    command:
+      - '--config.file=/prometheus.yml'
+      - '--storage.tsdb.path=/prometheus'
+      - '--web.console.libraries=/etc/prometheus/console_libraries'
+      - '--web.console.templates=/etc/prometheus/consoles'
+      - '--storage.tsdb.retention.time=200h'
+      - '--web.enable-lifecycle'
+    restart: unless-stopped
+    # ports:
+      # - "9090:9090"
+    depends_on:
+      - clearml-serving-statistics
+    networks:
+      - clearml-serving-backend
+
+  alertmanager:
+    image: prom/alertmanager:v0.23.0
+    container_name: clearml-serving-alertmanager
+    restart: unless-stopped
+    # ports:
+      # - "9093:9093"
+    depends_on:
+      - prometheus
+      - grafana
+    networks:
+      - clearml-serving-backend
+
+  grafana:
+    image: grafana/grafana:8.4.4-ubuntu
+    container_name: clearml-serving-grafana
+    volumes:
+      - './datasource.yml:/etc/grafana/provisioning/datasources/datasource.yaml'
+    restart: unless-stopped
+    ports:
+      - "3000:3000"
+    depends_on:
+      - prometheus
+    networks:
+      - clearml-serving-backend
+
+
+  clearml-serving-inference:
+    image: allegroai/clearml-serving-inference:latest
+    container_name: clearml-serving-inference
+    restart: unless-stopped
+    ports:
+      - "8080:8080"
+    environment:
+      CLEARML_WEB_HOST: ${CLEARML_WEB_HOST:-https://app.clear.ml}
+      CLEARML_API_HOST: ${CLEARML_API_HOST:-https://api.clear.ml}
+      CLEARML_FILES_HOST: ${CLEARML_FILES_HOST:-https://files.clear.ml}
+      CLEARML_API_ACCESS_KEY: ${CLEARML_API_ACCESS_KEY}
+      CLEARML_API_SECRET_KEY: ${CLEARML_API_SECRET_KEY}
+      CLEARML_SERVING_TASK_ID: ${CLEARML_SERVING_TASK_ID:-}
+      CLEARML_SERVING_PORT: ${CLEARML_SERVING_PORT:-8080}
+      CLEARML_SERVING_POLL_FREQ: ${CLEARML_SERVING_POLL_FREQ:-1.0}
+      CLEARML_DEFAULT_BASE_SERVE_URL: ${CLEARML_DEFAULT_BASE_SERVE_URL:-http://127.0.0.1:8080/serve}
+      CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092}
+      CLEARML_DEFAULT_TRITON_GRPC_ADDR: ${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-clearml-serving-triton:8001}
+      CLEARML_USE_GUNICORN: ${CLEARML_USE_GUNICORN:-}
+      CLEARML_SERVING_NUM_PROCESS: ${CLEARML_SERVING_NUM_PROCESS:-}
+    depends_on:
+      - kafka
+      - clearml-serving-triton
+    networks:
+      - clearml-serving-backend
+
+  clearml-serving-triton:
+    image: allegroai/clearml-serving-triton:latest
+    container_name: clearml-serving-triton
+    restart: unless-stopped
+    # ports:
+      # - "8001:8001"
+    environment:
+      CLEARML_WEB_HOST: ${CLEARML_WEB_HOST:-https://app.clear.ml}
+      CLEARML_API_HOST: ${CLEARML_API_HOST:-https://api.clear.ml}
+      CLEARML_FILES_HOST: ${CLEARML_FILES_HOST:-https://files.clear.ml}
+      CLEARML_API_ACCESS_KEY: ${CLEARML_API_ACCESS_KEY}
+      CLEARML_API_SECRET_KEY: ${CLEARML_API_SECRET_KEY}
+      CLEARML_SERVING_TASK_ID: ${CLEARML_SERVING_TASK_ID:-}
+      CLEARML_SERVING_POLL_FREQ: ${CLEARML_SERVING_POLL_FREQ:-1.0}
+      CLEARML_TRITON_METRIC_FREQ: ${CLEARML_TRITON_METRIC_FREQ:-1.0}
+    depends_on:
+      - kafka
+    networks:
+      - clearml-serving-backend
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - capabilities: [gpu]
+
+  clearml-serving-statistics:
+    image: allegroai/clearml-serving-statistics:latest
+    container_name: clearml-serving-statistics
+    restart: unless-stopped
+    # ports:
+      # - "9999:9999"
+    environment:
+      CLEARML_WEB_HOST: ${CLEARML_WEB_HOST:-https://app.clear.ml}
+      CLEARML_API_HOST: ${CLEARML_API_HOST:-https://api.clear.ml}
+      CLEARML_FILES_HOST: ${CLEARML_FILES_HOST:-https://files.clear.ml}
+      CLEARML_API_ACCESS_KEY: ${CLEARML_API_ACCESS_KEY}
+      CLEARML_API_SECRET_KEY: ${CLEARML_API_SECRET_KEY}
+      CLEARML_SERVING_TASK_ID: ${CLEARML_SERVING_TASK_ID:-}
+      CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092}
+      CLEARML_SERVING_POLL_FREQ: ${CLEARML_SERVING_POLL_FREQ:-1.0}
+    depends_on:
+      - kafka
+    networks:
+      - clearml-serving-backend
+
+
+networks:
+  clearml-serving-backend:
+    driver: bridge
diff --git a/docker/docker-compose-triton.yml b/docker/docker-compose-triton.yml
new file mode 100644
index 0000000..f62b1c4
--- /dev/null
+++ b/docker/docker-compose-triton.yml
@@ -0,0 +1,146 @@
+version: "3"
+
+services:
+  zookeeper:
+    image: bitnami/zookeeper:3.7.0
+    container_name: clearml-serving-zookeeper
+    # ports:
+      # - "2181:2181"
+    environment:
+      - ALLOW_ANONYMOUS_LOGIN=yes
+    networks:
+      - clearml-serving-backend
+
+  kafka:
+    image: bitnami/kafka:3.1.0
+    container_name: clearml-serving-kafka
+    # ports:
+      # - "9092:9092"
+    environment:
+      - KAFKA_BROKER_ID=1
+      - KAFKA_CFG_LISTENERS=PLAINTEXT://clearml-serving-kafka:9092
+      - KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://clearml-serving-kafka:9092
+      - KAFKA_CFG_ZOOKEEPER_CONNECT=clearml-serving-zookeeper:2181
+      - ALLOW_PLAINTEXT_LISTENER=yes
+      - KAFKA_CREATE_TOPICS="topic_test:1:1"
+    depends_on:
+      - zookeeper
+    networks:
+      - clearml-serving-backend
+
+  prometheus:
+    image: prom/prometheus:v2.34.0
+    container_name: clearml-serving-prometheus
+    volumes:
+      - ./prometheus.yml:/prometheus.yml
+    command:
+      - '--config.file=/prometheus.yml'
+      - '--storage.tsdb.path=/prometheus'
+      - '--web.console.libraries=/etc/prometheus/console_libraries'
+      - '--web.console.templates=/etc/prometheus/consoles'
+      - '--storage.tsdb.retention.time=200h'
+      - '--web.enable-lifecycle'
+    restart: unless-stopped
+    # ports:
+      # - "9090:9090"
+    depends_on:
+      - clearml-serving-statistics
+    networks:
+      - clearml-serving-backend
+
+  alertmanager:
+    image: prom/alertmanager:v0.23.0
+    container_name: clearml-serving-alertmanager
+    restart: unless-stopped
+    # ports:
+      # - "9093:9093"
+    depends_on:
+      - prometheus
+      - grafana
+    networks:
+      - clearml-serving-backend
+
+  grafana:
+    image: grafana/grafana:8.4.4-ubuntu
+    container_name: clearml-serving-grafana
+    volumes:
+      - './datasource.yml:/etc/grafana/provisioning/datasources/datasource.yaml'
+    restart: unless-stopped
+    ports:
+      - "3000:3000"
+    depends_on:
+      - prometheus
+    networks:
+      - clearml-serving-backend
+
+
+  clearml-serving-inference:
+    image: allegroai/clearml-serving-inference:latest
+    container_name: clearml-serving-inference
+    restart: unless-stopped
+    ports:
+      - "8080:8080"
+    environment:
+      CLEARML_WEB_HOST: ${CLEARML_WEB_HOST:-https://app.clear.ml}
+      CLEARML_API_HOST: ${CLEARML_API_HOST:-https://api.clear.ml}
+      CLEARML_FILES_HOST: ${CLEARML_FILES_HOST:-https://files.clear.ml}
+      CLEARML_API_ACCESS_KEY: ${CLEARML_API_ACCESS_KEY}
+      CLEARML_API_SECRET_KEY: ${CLEARML_API_SECRET_KEY}
+      CLEARML_SERVING_TASK_ID: ${CLEARML_SERVING_TASK_ID:-}
+      CLEARML_SERVING_PORT: ${CLEARML_SERVING_PORT:-8080}
+      CLEARML_SERVING_POLL_FREQ: ${CLEARML_SERVING_POLL_FREQ:-1.0}
+      CLEARML_DEFAULT_BASE_SERVE_URL: ${CLEARML_DEFAULT_BASE_SERVE_URL:-http://127.0.0.1:8080/serve}
+      CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092}
+      CLEARML_DEFAULT_TRITON_GRPC_ADDR: ${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-clearml-serving-triton:8001}
+      CLEARML_USE_GUNICORN: ${CLEARML_USE_GUNICORN:-}
+      CLEARML_SERVING_NUM_PROCESS: ${CLEARML_SERVING_NUM_PROCESS:-}
+    depends_on:
+      - kafka
+      - clearml-serving-triton
+    networks:
+      - clearml-serving-backend
+
+  clearml-serving-triton:
+    image: allegroai/clearml-serving-triton:latest
+    container_name: clearml-serving-triton
+    restart: unless-stopped
+    # ports:
+      # - "8001:8001"
+    environment:
+      CLEARML_WEB_HOST: ${CLEARML_WEB_HOST:-https://app.clear.ml}
+      CLEARML_API_HOST: ${CLEARML_API_HOST:-https://api.clear.ml}
+      CLEARML_FILES_HOST: ${CLEARML_FILES_HOST:-https://files.clear.ml}
+      CLEARML_API_ACCESS_KEY: ${CLEARML_API_ACCESS_KEY}
+      CLEARML_API_SECRET_KEY: ${CLEARML_API_SECRET_KEY}
+      CLEARML_SERVING_TASK_ID: ${CLEARML_SERVING_TASK_ID:-}
+      CLEARML_SERVING_POLL_FREQ: ${CLEARML_SERVING_POLL_FREQ:-1.0}
+      CLEARML_TRITON_METRIC_FREQ: ${CLEARML_TRITON_METRIC_FREQ:-1.0}
+    depends_on:
+      - kafka
+    networks:
+      - clearml-serving-backend
+
+  clearml-serving-statistics:
+    image: allegroai/clearml-serving-statistics:latest
+    container_name: clearml-serving-statistics
+    restart: unless-stopped
+    # ports:
+      # - "9999:9999"
+    environment:
+      CLEARML_WEB_HOST: ${CLEARML_WEB_HOST:-https://app.clear.ml}
+      CLEARML_API_HOST: ${CLEARML_API_HOST:-https://api.clear.ml}
+      CLEARML_FILES_HOST: ${CLEARML_FILES_HOST:-https://files.clear.ml}
+      CLEARML_API_ACCESS_KEY: ${CLEARML_API_ACCESS_KEY}
+      CLEARML_API_SECRET_KEY: ${CLEARML_API_SECRET_KEY}
+      CLEARML_SERVING_TASK_ID: ${CLEARML_SERVING_TASK_ID:-}
+      CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092}
+      CLEARML_SERVING_POLL_FREQ: ${CLEARML_SERVING_POLL_FREQ:-1.0}
+    depends_on:
+      - kafka
+    networks:
+      - clearml-serving-backend
+
+
+networks:
+  clearml-serving-backend:
+    driver: bridge
diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml
new file mode 100644
index 0000000..54f4f32
--- /dev/null
+++ b/docker/docker-compose.yml
@@ -0,0 +1,125 @@
+version: "3"
+
+services:
+  zookeeper:
+    image: bitnami/zookeeper:3.7.0
+    container_name: clearml-serving-zookeeper
+    # ports:
+      # - "2181:2181"
+    environment:
+      - ALLOW_ANONYMOUS_LOGIN=yes
+    networks:
+      - clearml-serving-backend
+
+  kafka:
+    image: bitnami/kafka:3.1.0
+    container_name: clearml-serving-kafka
+    # ports:
+      # - "9092:9092"
+    environment:
+      - KAFKA_BROKER_ID=1
+      - KAFKA_CFG_LISTENERS=PLAINTEXT://clearml-serving-kafka:9092
+      - KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://clearml-serving-kafka:9092
+      - KAFKA_CFG_ZOOKEEPER_CONNECT=clearml-serving-zookeeper:2181
+      - ALLOW_PLAINTEXT_LISTENER=yes
+      - KAFKA_CREATE_TOPICS="topic_test:1:1"
+    depends_on:
+      - zookeeper
+    networks:
+      - clearml-serving-backend
+
+  prometheus:
+    image: prom/prometheus:v2.34.0
+    container_name: clearml-serving-prometheus
+    volumes:
+      - ./prometheus.yml:/prometheus.yml
+    command:
+      - '--config.file=/prometheus.yml'
+      - '--storage.tsdb.path=/prometheus'
+      - '--web.console.libraries=/etc/prometheus/console_libraries'
+      - '--web.console.templates=/etc/prometheus/consoles'
+      - '--storage.tsdb.retention.time=200h'
+      - '--web.enable-lifecycle'
+    restart: unless-stopped
+    # ports:
+      # - "9090:9090"
+    depends_on:
+      - clearml-serving-statistics
+    networks:
+      - clearml-serving-backend
+
+  alertmanager:
+    image: prom/alertmanager:v0.23.0
+    container_name: clearml-serving-alertmanager
+    restart: unless-stopped
+    # ports:
+      # - "9093:9093"
+    depends_on:
+      - prometheus
+      - grafana
+    networks:
+      - clearml-serving-backend
+
+  grafana:
+    image: grafana/grafana:8.4.4-ubuntu
+    container_name: clearml-serving-grafana
+    volumes:
+      - './datasource.yml:/etc/grafana/provisioning/datasources/datasource.yaml'
+    restart: unless-stopped
+    ports:
+      - "3000:3000"
+    depends_on:
+      - prometheus
+    networks:
+      - clearml-serving-backend
+
+
+  clearml-serving-inference:
+    image: allegroai/clearml-serving-inference:latest
+    container_name: clearml-serving-inference
+    restart: unless-stopped
+    ports:
+      - "8080:8080"
+    environment:
+      CLEARML_WEB_HOST: ${CLEARML_WEB_HOST:-https://app.clear.ml}
+      CLEARML_API_HOST: ${CLEARML_API_HOST:-https://api.clear.ml}
+      CLEARML_FILES_HOST: ${CLEARML_FILES_HOST:-https://files.clear.ml}
+      CLEARML_API_ACCESS_KEY: ${CLEARML_API_ACCESS_KEY}
+      CLEARML_API_SECRET_KEY: ${CLEARML_API_SECRET_KEY}
+      CLEARML_SERVING_TASK_ID: ${CLEARML_SERVING_TASK_ID:-}
+      CLEARML_SERVING_PORT: ${CLEARML_SERVING_PORT:-8080}
+      CLEARML_SERVING_POLL_FREQ: ${CLEARML_SERVING_POLL_FREQ:-1.0}
+      CLEARML_DEFAULT_BASE_SERVE_URL: ${CLEARML_DEFAULT_BASE_SERVE_URL:-http://127.0.0.1:8080/serve}
+      CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092}
+      CLEARML_DEFAULT_TRITON_GRPC_ADDR: ${CLEARML_DEFAULT_TRITON_GRPC_ADDR:-}
+      CLEARML_USE_GUNICORN: ${CLEARML_USE_GUNICORN:-}
+      CLEARML_SERVING_NUM_PROCESS: ${CLEARML_SERVING_NUM_PROCESS:-}
+    depends_on:
+      - kafka
+    networks:
+      - clearml-serving-backend
+
+  clearml-serving-statistics:
+    image: allegroai/clearml-serving-statistics:latest
+    container_name: clearml-serving-statistics
+    restart: unless-stopped
+    # ports:
+      # - "9999:9999"
+    environment:
+      CLEARML_WEB_HOST: ${CLEARML_WEB_HOST:-https://app.clear.ml}
+      CLEARML_API_HOST: ${CLEARML_API_HOST:-https://api.clear.ml}
+      CLEARML_FILES_HOST: ${CLEARML_FILES_HOST:-https://files.clear.ml}
+      CLEARML_API_ACCESS_KEY: ${CLEARML_API_ACCESS_KEY}
+      CLEARML_API_SECRET_KEY: ${CLEARML_API_SECRET_KEY}
+      CLEARML_SERVING_TASK_ID: ${CLEARML_SERVING_TASK_ID:-}
+      CLEARML_DEFAULT_KAFKA_SERVE_URL: ${CLEARML_DEFAULT_KAFKA_SERVE_URL:-clearml-serving-kafka:9092}
+      CLEARML_SERVING_POLL_FREQ: ${CLEARML_SERVING_POLL_FREQ:-1.0}
+    depends_on:
+      - kafka
+    networks:
+      - clearml-serving-backend
+
+
+networks:
+  clearml-serving-backend:
+    driver: bridge
diff --git a/docker/example.env b/docker/example.env
new file mode 100644
index 0000000..8b38660
--- /dev/null
+++ b/docker/example.env
@@ -0,0 +1,6 @@
+CLEARML_WEB_HOST="https://app.clear.ml"
+CLEARML_API_HOST="https://api.clear.ml"
+CLEARML_FILES_HOST="https://files.clear.ml"
+CLEARML_API_ACCESS_KEY="<access_key_here>"
+CLEARML_API_SECRET_KEY="<secret_key_here>"
+CLEARML_SERVING_TASK_ID="<serving_service_id_here>"
diff --git a/docker/prometheus.yml b/docker/prometheus.yml
new file mode 100644
index 0000000..469e220
--- /dev/null
+++ b/docker/prometheus.yml
@@ -0,0 +1,22 @@
+global:
+  scrape_interval:     15s # By default, scrape targets every 15 seconds.
+  evaluation_interval:     15s # By default, evaluate rules every 15 seconds.
+  external_labels:
+    monitor: 'clearml-serving'
+
+scrape_configs:
+  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
+  - job_name: 'prometheus'
+
+    scrape_interval: 5s
+
+    static_configs:
+      - targets: ['localhost:9090']
+
+  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
+  - job_name: 'clearml-inference-stats'
+
+    scrape_interval: 5s
+
+    static_configs:
+      - targets: ['clearml-serving-statistics:9999']
diff --git a/docs/design_diagram.png b/docs/design_diagram.png
new file mode 100644
index 0000000..e75c0ec
Binary files /dev/null and b/docs/design_diagram.png differ
diff --git a/docs/grafana_screenshot.png b/docs/grafana_screenshot.png
new file mode 100644
index 0000000..37ef655
Binary files /dev/null and b/docs/grafana_screenshot.png differ
diff --git a/examples/clearml_serving_simple_http_inference_request/68747470733a2f2f646174616d61646e6573732e6769746875622e696f2f6173736574732f696d616765732f74665f66696c655f666565642f4d4e4953545f64696769742e706e67.png b/examples/clearml_serving_simple_http_inference_request/68747470733a2f2f646174616d61646e6573732e6769746875622e696f2f6173736574732f696d616765732f74665f66696c655f666565642f4d4e4953545f64696769742e706e67.png
deleted file mode 100644
index 0aafd0b..0000000
Binary files a/examples/clearml_serving_simple_http_inference_request/68747470733a2f2f646174616d61646e6573732e6769746875622e696f2f6173736574732f696d616765732f74665f66696c655f666565642f4d4e4953545f64696769742e706e67.png and /dev/null differ
diff --git a/examples/clearml_serving_simple_http_inference_request/client.py b/examples/clearml_serving_simple_http_inference_request/client.py
deleted file mode 100644
index 462145f..0000000
--- a/examples/clearml_serving_simple_http_inference_request/client.py
+++ /dev/null
@@ -1,88 +0,0 @@
-#!/usr/bin/env python
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-#  * Redistributions of source code must retain the above copyright
-#    notice, this list of conditions and the following disclaimer.
-#  * Redistributions in binary form must reproduce the above copyright
-#    notice, this list of conditions and the following disclaimer in the
-#    documentation and/or other materials provided with the distribution.
-#  * Neither the name of NVIDIA CORPORATION nor the names of its
-#    contributors may be used to endorse or promote products derived
-#    from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import argparse
-from PIL import Image
-import numpy as np
-
-from http_triton import InferenceServerClient, InferInput
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('-v',
-                        '--verbose',
-                        action="store_true",
-                        required=False,
-                        default=False,
-                        help='Enable verbose output')
-    parser.add_argument('-u',
-                        '--url',
-                        type=str,
-                        required=False,
-                        default='localhost:8000',
-                        help='Inference server URL. Default localhost:8000')
-
-    FLAGS = parser.parse_args()
-
-    model_name = "keras_mnist"
-    model_version = "1"
-
-    input_name = "dense_input"
-    shape = (1, 784)
-    datatype = 'FP32'
-
-    output_name = 'activation_2'
-
-    # Path of an image
-    image_path = '68747470733a2f2f646174616d61646e6573732e6769746875622e696f2f6173736574732f696d616765732f74665f66696c655f666565642f4d4e4953545f64696769742e706e67.png'
-
-    # The image is opened using Pillow, then converted to grayscale since the model deployed is trained on grayscale images
-    image = Image.open(image_path).convert('L')
-
-    # The image is resized to 28x28 pixels
-    image = image.resize(shape, Image.ANTIALIAS)
-
-    # The image is converted to a numpy array and data type is converted to float32 since the model is trained on float32
-    np_image = np.array(image).astype(np.float32)
-
-    # The image is reshaped to fit the model
-    np_image = np_image.reshape(-1, 784)
-
-    # Create an InferInput object with the input name, its data type and its shape defined
-    inferInput = InferInput(name=input_name, datatype=datatype, shape=shape)
-
-    # Set the data inside the InferInput object to the image in numpy format
-    inferInput.set_data_from_numpy(np_image)
-
-    # Create an InferenceServerClient and pass to it the url of the server
-    client = InferenceServerClient(url=FLAGS.url, verbose=FLAGS.verbose)
-
-    # Call client.infer(), pass the model name, version and the InferInput object inside a list since there can be multiple inputs
-    inferResult = client.infer(model_name=model_name, inputs=[inferInput], model_version=model_version)
-
-    # Print the output of the model in numpy format, pass in the name of the output layer
-    print(inferResult.as_numpy(output_name))
\ No newline at end of file
diff --git a/examples/clearml_serving_simple_http_inference_request/http_triton.py b/examples/clearml_serving_simple_http_inference_request/http_triton.py
deleted file mode 100644
index de0e103..0000000
--- a/examples/clearml_serving_simple_http_inference_request/http_triton.py
+++ /dev/null
@@ -1,1970 +0,0 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-#  * Redistributions of source code must retain the above copyright
-#    notice, this list of conditions and the following disclaimer.
-#  * Redistributions in binary form must reproduce the above copyright
-#    notice, this list of conditions and the following disclaimer in the
-#    documentation and/or other materials provided with the distribution.
-#  * Neither the name of NVIDIA CORPORATION nor the names of its
-#    contributors may be used to endorse or promote products derived
-#    from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-try:
-    from geventhttpclient import HTTPClient
-    from geventhttpclient.url import URL
-    import gevent
-    import gevent.pool
-    from urllib.parse import quote, quote_plus
-    import rapidjson as json
-    import numpy as np
-    import struct
-    import gzip, zlib
-except ModuleNotFoundError as error:
-    raise RuntimeError(
-        'The installation does not include http support. Specify \'http\' or \'all\' while installing the tritonclient package to include the support'
-    ) from error
-
-from tritonclient.utils import *
-
-
-def _get_error(response):
-    """
-    Returns the InferenceServerException object if response
-    indicates the error. If no error then return None
-    """
-    if response.status_code != 200:
-        error_response = json.loads(response.read())
-        return InferenceServerException(msg=error_response["error"])
-    else:
-        return None
-
-
-def _raise_if_error(response):
-    """
-    Raise InferenceServerException if received non-Success
-    response from the server
-    """
-    error = _get_error(response)
-    if error is not None:
-        raise error
-
-
-def _get_query_string(query_params):
-    params = []
-    for key, value in query_params.items():
-        if isinstance(value, list):
-            for item in value:
-                params.append("%s=%s" %
-                              (quote_plus(key), quote_plus(str(item))))
-        else:
-            params.append("%s=%s" % (quote_plus(key), quote_plus(str(value))))
-    if params:
-        return "&".join(params)
-    return ''
-
-
-def _get_inference_request(inputs, request_id, outputs, sequence_id,
-                           sequence_start, sequence_end, priority, timeout):
-    infer_request = {}
-    parameters = {}
-    if request_id != "":
-        infer_request['id'] = request_id
-    if sequence_id != 0 and sequence_id != "":
-        parameters['sequence_id'] = sequence_id
-        parameters['sequence_start'] = sequence_start
-        parameters['sequence_end'] = sequence_end
-    if priority != 0:
-        parameters['priority'] = priority
-    if timeout is not None:
-        parameters['timeout'] = timeout
-
-    infer_request['inputs'] = [
-        this_input._get_tensor() for this_input in inputs
-    ]
-    if outputs:
-        infer_request['outputs'] = [
-            this_output._get_tensor() for this_output in outputs
-        ]
-    else:
-        # no outputs specified so set 'binary_data_output' True in the
-        # request so that all outputs are returned in binary format
-        parameters['binary_data_output'] = True
-
-    if parameters:
-        infer_request['parameters'] = parameters
-
-    request_body = json.dumps(infer_request)
-    json_size = len(request_body)
-    binary_data = None
-    for input_tensor in inputs:
-        raw_data = input_tensor._get_binary_data()
-        if raw_data is not None:
-            if binary_data is not None:
-                binary_data += raw_data
-            else:
-                binary_data = raw_data
-
-    if binary_data is not None:
-        request_body = struct.pack(
-            '{}s{}s'.format(len(request_body), len(binary_data)),
-            request_body.encode(), binary_data)
-        return request_body, json_size
-
-    return request_body, None
-
-
-class InferenceServerClient:
-    """An InferenceServerClient object is used to perform any kind of
-    communication with the InferenceServer using http protocol. None
-    of the methods are thread safe. The object is intended to be used
-    by a single thread and simultaneously calling different methods
-    with different threads is not supported and will cause undefined
-    behavior.
-
-    Parameters
-    ----------
-    url : str
-        The inference server name, port and optional base path 
-        in the following format: host:port/<base-path>, e.g.
-        'localhost:8000'.
-
-    verbose : bool
-        If True generate verbose output. Default value is False.
-    concurrency : int
-        The number of connections to create for this client.
-        Default value is 1.
-    connection_timeout : float
-        The timeout value for the connection. Default value
-        is 60.0 sec.
-    network_timeout : float
-        The timeout value for the network. Default value is
-        60.0 sec
-    max_greenlets : int
-        Determines the maximum allowed number of worker greenlets
-        for handling asynchronous inference requests. Default value
-        is None, which means there will be no restriction on the
-        number of greenlets created.
-    ssl : bool
-        If True, channels the requests to encrypted https scheme.
-        Some improper settings may cause connection to prematurely
-        terminate with an unsuccessful handshake. See
-        `ssl_context_factory` option for using secure default
-        settings. Default value for this option is False.
-    ssl_options : dict
-        Any options supported by `ssl.wrap_socket` specified as
-        dictionary. The argument is ignored if 'ssl' is specified
-        False.
-    ssl_context_factory : SSLContext callable
-        It must be a callbable that returns a SSLContext. Set to
-        `gevent.ssl.create_default_context` to use contexts with
-        secure default settings. This should most likely resolve
-        connection issues in a secure way. The default value for
-        this option is None which directly wraps the socket with
-        the options provided via `ssl_options`. The argument is
-        ignored if 'ssl' is specified False.
-    insecure : bool
-        If True, then does not match the host name with the certificate.
-        Default value is False. The argument is ignored if 'ssl' is
-        specified False.
-
-    Raises
-        ------
-        Exception
-            If unable to create a client.
-
-    """
-
-    def __init__(self,
-                 url,
-                 verbose=False,
-                 concurrency=1,
-                 connection_timeout=60.0,
-                 network_timeout=60.0,
-                 max_greenlets=None,
-                 ssl=False,
-                 ssl_options=None,
-                 ssl_context_factory=None,
-                 insecure=False):
-        if url.startswith("http://") or url.startswith("https://"):
-            raise_error("url should not include the scheme")
-        scheme = "https://" if ssl else "http://"
-        self._parsed_url = URL(scheme + url)
-        self._base_uri = self._parsed_url.request_uri.rstrip('/')
-        self._client_stub = HTTPClient.from_url(
-            self._parsed_url,
-            concurrency=concurrency,
-            connection_timeout=connection_timeout,
-            network_timeout=network_timeout,
-            ssl_options=ssl_options,
-            ssl_context_factory=ssl_context_factory,
-            insecure=insecure)
-        self._pool = gevent.pool.Pool(max_greenlets)
-        self._verbose = verbose
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, type, value, traceback):
-        self.close()
-
-    def __del__(self):
-        self.close()
-
-    def close(self):
-        """Close the client. Any future calls to server
-        will result in an Error.
-
-        """
-        self._pool.join()
-        self._client_stub.close()
-
-    def _get(self, request_uri, headers, query_params):
-        """Issues the GET request to the server
-
-         Parameters
-        ----------
-        request_uri: str
-            The request URI to be used in GET request.
-        headers: dict
-            Additional HTTP headers to include in the request.
-        query_params: dict
-            Optional url query parameters to use in network
-            transaction.
-
-        Returns
-        -------
-        geventhttpclient.response.HTTPSocketPoolResponse
-            The response from server.
-        """
-        if self._base_uri is not None:
-            request_uri = self._base_uri + "/" + request_uri
-
-        if query_params is not None:
-            request_uri = request_uri + "?" + _get_query_string(query_params)
-
-        if self._verbose:
-            print("GET {}, headers {}".format(request_uri, headers))
-
-        if headers is not None:
-            response = self._client_stub.get(request_uri, headers=headers)
-        else:
-            response = self._client_stub.get(request_uri)
-
-        if self._verbose:
-            print(response)
-
-        return response
-
-    def _post(self, request_uri, request_body, headers, query_params):
-        """Issues the POST request to the server
-
-        Parameters
-        ----------
-        request_uri: str
-            The request URI to be used in POST request.
-        request_body: str
-            The body of the request
-        headers: dict
-            Additional HTTP headers to include in the request.
-        query_params: dict
-            Optional url query parameters to use in network
-            transaction.
-
-        Returns
-        -------
-        geventhttpclient.response.HTTPSocketPoolResponse
-            The response from server.
-        """
-        if self._base_uri is not None:
-            request_uri = self._base_uri + "/" + request_uri
-
-        if query_params is not None:
-            request_uri = request_uri + "?" + _get_query_string(query_params)
-
-        if self._verbose:
-            print("POST {}, headers {}\n{}".format(request_uri, headers,
-                                                   request_body))
-
-        if headers is not None:
-            response = self._client_stub.post(request_uri=request_uri,
-                                              body=request_body,
-                                              headers=headers)
-        else:
-            response = self._client_stub.post(request_uri=request_uri,
-                                              body=request_body)
-
-        if self._verbose:
-            print(response)
-
-        return response
-
-    def is_server_live(self, headers=None, query_params=None):
-        """Contact the inference server and get liveness.
-
-        Parameters
-        ----------
-        headers: dict
-            Optional dictionary specifying additional HTTP
-            headers to include in the request.
-        query_params: dict
-            Optional url query parameters to use in network
-            transaction.
-
-        Returns
-        -------
-        bool
-            True if server is live, False if server is not live.
-
-        Raises
-        ------
-        Exception
-            If unable to get liveness.
-
-        """
-
-        request_uri = "v2/health/live"
-        response = self._get(request_uri=request_uri,
-                             headers=headers,
-                             query_params=query_params)
-
-        return response.status_code == 200
-
-    def is_server_ready(self, headers=None, query_params=None):
-        """Contact the inference server and get readiness.
-
-        Parameters
-        ----------
-        headers: dict
-            Optional dictionary specifying additional HTTP
-            headers to include in the request.
-        query_params: dict
-            Optional url query parameters to use in network
-            transaction.
-
-        Returns
-        -------
-        bool
-            True if server is ready, False if server is not ready.
-
-        Raises
-        ------
-        Exception
-            If unable to get readiness.
-
-        """
-        request_uri = "v2/health/ready"
-        response = self._get(request_uri=request_uri,
-                             headers=headers,
-                             query_params=query_params)
-
-        return response.status_code == 200
-
-    def is_model_ready(self,
-                       model_name,
-                       model_version="",
-                       headers=None,
-                       query_params=None):
-        """Contact the inference server and get the readiness of specified model.
-
-        Parameters
-        ----------
-        model_name: str
-            The name of the model to check for readiness.
-        model_version: str
-            The version of the model to check for readiness. The default value
-            is an empty string which means then the server will choose a version
-            based on the model and internal policy.
-        headers: dict
-            Optional dictionary specifying additional HTTP
-            headers to include in the request.
-        query_params: dict
-            Optional url query parameters to use in network
-            transaction.
-
-        Returns
-        -------
-        bool
-            True if the model is ready, False if not ready.
-
-        Raises
-        ------
-        Exception
-            If unable to get model readiness.
-
-        """
-        if type(model_version) != str:
-            raise_error("model version must be a string")
-        if model_version != "":
-            request_uri = "v2/models/{}/versions/{}/ready".format(
-                quote(model_name), model_version)
-        else:
-            request_uri = "v2/models/{}/ready".format(quote(model_name))
-
-        response = self._get(request_uri=request_uri,
-                             headers=headers,
-                             query_params=query_params)
-
-        return response.status_code == 200
-
-    def get_server_metadata(self, headers=None, query_params=None):
-        """Contact the inference server and get its metadata.
-
-        Parameters
-        ----------
-        headers: dict
-            Optional dictionary specifying additional HTTP
-            headers to include in the request.
-        query_params: dict
-            Optional url query parameters to use in network
-            transaction.
-
-        Returns
-        -------
-        dict
-            The JSON dict holding the metadata.
-
-        Raises
-        ------
-        InferenceServerException
-            If unable to get server metadata.
-
-        """
-        request_uri = "v2"
-        response = self._get(request_uri=request_uri,
-                             headers=headers,
-                             query_params=query_params)
-        _raise_if_error(response)
-
-        content = response.read()
-        if self._verbose:
-            print(content)
-
-        return json.loads(content)
-
-    def get_model_metadata(self,
-                           model_name,
-                           model_version="",
-                           headers=None,
-                           query_params=None):
-        """Contact the inference server and get the metadata for specified model.
-
-        Parameters
-        ----------
-        model_name: str
-            The name of the model
-        model_version: str
-            The version of the model to get metadata. The default value
-            is an empty string which means then the server will choose
-            a version based on the model and internal policy.
-        headers: dict
-            Optional dictionary specifying additional
-            HTTP headers to include in the request
-        query_params: dict
-            Optional url query parameters to use in network
-            transaction
-
-        Returns
-        -------
-        dict
-            The JSON dict holding the metadata.
-
-        Raises
-        ------
-        InferenceServerException
-            If unable to get model metadata.
-
-        """
-        if type(model_version) != str:
-            raise_error("model version must be a string")
-        if model_version != "":
-            request_uri = "v2/models/{}/versions/{}".format(
-                quote(model_name), model_version)
-        else:
-            request_uri = "v2/models/{}".format(quote(model_name))
-
-        response = self._get(request_uri=request_uri,
-                             headers=headers,
-                             query_params=query_params)
-        _raise_if_error(response)
-
-        content = response.read()
-        if self._verbose:
-            print(content)
-
-        return json.loads(content)
-
-    def get_model_config(self,
-                         model_name,
-                         model_version="",
-                         headers=None,
-                         query_params=None):
-        """Contact the inference server and get the configuration for specified model.
-
-        Parameters
-        ----------
-        model_name: str
-            The name of the model
-        model_version: str
-            The version of the model to get configuration. The default value
-            is an empty string which means then the server will choose
-            a version based on the model and internal policy.
-        headers: dict
-            Optional dictionary specifying additional
-            HTTP headers to include in the request
-        query_params: dict
-            Optional url query parameters to use in network
-            transaction
-
-        Returns
-        -------
-        dict
-            The JSON dict holding the model config.
-
-        Raises
-        ------
-        InferenceServerException
-            If unable to get model configuration.
-
-        """
-        if model_version != "":
-            request_uri = "v2/models/{}/versions/{}/config".format(
-                quote(model_name), model_version)
-        else:
-            request_uri = "v2/models/{}/config".format(quote(model_name))
-
-        response = self._get(request_uri=request_uri,
-                             headers=headers,
-                             query_params=query_params)
-        _raise_if_error(response)
-
-        content = response.read()
-        if self._verbose:
-            print(content)
-
-        return json.loads(content)
-
-    def get_model_repository_index(self, headers=None, query_params=None):
-        """Get the index of model repository contents
-
-        Parameters
-        ----------
-        headers: dict
-            Optional dictionary specifying additional
-            HTTP headers to include in the request
-        query_params: dict
-            Optional url query parameters to use in network
-            transaction
-
-        Returns
-        -------
-        dict
-            The JSON dict holding the model repository index.
-
-        Raises
-        ------
-        InferenceServerException
-            If unable to get the repository index.
-
-        """
-        request_uri = "v2/repository/index"
-        response = self._post(request_uri=request_uri,
-                              request_body="",
-                              headers=headers,
-                              query_params=query_params)
-        _raise_if_error(response)
-
-        content = response.read()
-        if self._verbose:
-            print(content)
-
-        return json.loads(content)
-
-    def load_model(self, model_name, headers=None, query_params=None):
-        """Request the inference server to load or reload specified model.
-
-        Parameters
-        ----------
-        model_name : str
-            The name of the model to be loaded.
-        headers: dict
-            Optional dictionary specifying additional
-            HTTP headers to include in the request
-        query_params: dict
-            Optional url query parameters to use in network
-            transaction
-
-        Raises
-        ------
-        InferenceServerException
-            If unable to load the model.
-
-        """
-        request_uri = "v2/repository/models/{}/load".format(quote(model_name))
-        response = self._post(request_uri=request_uri,
-                              request_body="",
-                              headers=headers,
-                              query_params=query_params)
-        _raise_if_error(response)
-        if self._verbose:
-            print("Loaded model '{}'".format(model_name))
-
-    def unload_model(self,
-                     model_name,
-                     headers=None,
-                     query_params=None,
-                     unload_dependents=False):
-        """Request the inference server to unload specified model.
-
-        Parameters
-        ----------
-        model_name : str
-            The name of the model to be unloaded.
-        headers: dict
-            Optional dictionary specifying additional
-            HTTP headers to include in the request
-        query_params: dict
-            Optional url query parameters to use in network
-            transaction
-        unload_dependents : bool
-            Whether the dependents of the model should also be unloaded.
-
-        Raises
-        ------
-        InferenceServerException
-            If unable to unload the model.
-
-        """
-        request_uri = "v2/repository/models/{}/unload".format(quote(model_name))
-        unload_request = {
-            "parameters": {
-                "unload_dependents": unload_dependents
-            }
-        }
-        response = self._post(request_uri=request_uri,
-                              request_body=json.dumps(unload_request),
-                              headers=headers,
-                              query_params=query_params)
-        _raise_if_error(response)
-        if self._verbose:
-            print("Loaded model '{}'".format(model_name))
-
-    def get_inference_statistics(self,
-                                 model_name="",
-                                 model_version="",
-                                 headers=None,
-                                 query_params=None):
-        """Get the inference statistics for the specified model name and
-        version.
-
-        Parameters
-        ----------
-        model_name : str
-            The name of the model to get statistics. The default value is
-            an empty string, which means statistics of all models will
-            be returned.
-        model_version: str
-            The version of the model to get inference statistics. The
-            default value is an empty string which means then the server
-            will return the statistics of all available model versions.
-        headers: dict
-            Optional dictionary specifying additional HTTP
-            headers to include in the request.
-        query_params: dict
-            Optional url query parameters to use in network
-            transaction
-
-        Returns
-        -------
-        dict
-            The JSON dict holding the model inference statistics.
-
-        Raises
-        ------
-        InferenceServerException
-            If unable to get the model inference statistics.
-
-        """
-
-        if model_name != "":
-            if type(model_version) != str:
-                raise_error("model version must be a string")
-            if model_version != "":
-                request_uri = "v2/models/{}/versions/{}/stats".format(
-                    quote(model_name), model_version)
-            else:
-                request_uri = "v2/models/{}/stats".format(quote(model_name))
-        else:
-            request_uri = "v2/models/stats"
-
-        response = self._get(request_uri=request_uri,
-                             headers=headers,
-                             query_params=query_params)
-        _raise_if_error(response)
-
-        content = response.read()
-        if self._verbose:
-            print(content)
-
-        return json.loads(content)
-
-    def get_system_shared_memory_status(self,
-                                        region_name="",
-                                        headers=None,
-                                        query_params=None):
-        """Request system shared memory status from the server.
-
-        Parameters
-        ----------
-        region_name : str
-            The name of the region to query status. The default
-            value is an empty string, which means that the status
-            of all active system shared memory will be returned.
-        headers: dict
-            Optional dictionary specifying additional HTTP
-            headers to include in the request
-        query_params: dict
-            Optional url query parameters to use in network
-            transaction
-
-        Returns
-        -------
-        dict
-            The JSON dict holding system shared memory status.
-
-        Raises
-        ------
-        InferenceServerException
-            If unable to get the status of specified shared memory.
-
-        """
-        if region_name != "":
-            request_uri = "v2/systemsharedmemory/region/{}/status".format(
-                quote(region_name))
-        else:
-            request_uri = "v2/systemsharedmemory/status"
-
-        response = self._get(request_uri=request_uri,
-                             headers=headers,
-                             query_params=query_params)
-        _raise_if_error(response)
-
-        content = response.read()
-        if self._verbose:
-            print(content)
-
-        return json.loads(content)
-
-    def register_system_shared_memory(self,
-                                      name,
-                                      key,
-                                      byte_size,
-                                      offset=0,
-                                      headers=None,
-                                      query_params=None):
-        """Request the server to register a system shared memory with the
-        following specification.
-
-        Parameters
-        ----------
-        name : str
-            The name of the region to register.
-        key : str
-            The key of the underlying memory object that contains the
-            system shared memory region.
-        byte_size : int
-            The size of the system shared memory region, in bytes.
-        offset : int
-            Offset, in bytes, within the underlying memory object to
-            the start of the system shared memory region. The default
-            value is zero.
-        headers: dict
-            Optional dictionary specifying additional
-            HTTP headers to include in the request
-        query_params: dict
-            Optional url query parameters to use in network
-            transaction
-
-        Raises
-        ------
-        InferenceServerException
-            If unable to register the specified system shared memory.
-
-        """
-        request_uri = "v2/systemsharedmemory/region/{}/register".format(
-            quote(name))
-
-        register_request = {
-            'key': key,
-            'offset': offset,
-            'byte_size': byte_size
-        }
-        request_body = json.dumps(register_request)
-
-        response = self._post(request_uri=request_uri,
-                              request_body=request_body,
-                              headers=headers,
-                              query_params=query_params)
-        _raise_if_error(response)
-        if self._verbose:
-            print("Registered system shared memory with name '{}'".format(name))
-
-    def unregister_system_shared_memory(self,
-                                        name="",
-                                        headers=None,
-                                        query_params=None):
-        """Request the server to unregister a system shared memory with the
-        specified name.
-
-        Parameters
-        ----------
-        name : str
-            The name of the region to unregister. The default value is empty
-            string which means all the system shared memory regions will be
-            unregistered.
-        headers: dict
-            Optional dictionary specifying additional
-            HTTP headers to include in the request
-        query_params: dict
-            Optional url query parameters to use in network
-            transaction
-
-        Raises
-        ------
-        InferenceServerException
-            If unable to unregister the specified system shared memory region.
-
-        """
-        if name != "":
-            request_uri = "v2/systemsharedmemory/region/{}/unregister".format(
-                quote(name))
-        else:
-            request_uri = "v2/systemsharedmemory/unregister"
-
-        response = self._post(request_uri=request_uri,
-                              request_body="",
-                              headers=headers,
-                              query_params=query_params)
-        _raise_if_error(response)
-        if self._verbose:
-            if name != "":
-                print("Unregistered system shared memory with name '{}'".format(
-                    name))
-            else:
-                print("Unregistered all system shared memory regions")
-
-    def get_cuda_shared_memory_status(self,
-                                      region_name="",
-                                      headers=None,
-                                      query_params=None):
-        """Request cuda shared memory status from the server.
-
-        Parameters
-        ----------
-        region_name : str
-            The name of the region to query status. The default
-            value is an empty string, which means that the status
-            of all active cuda shared memory will be returned.
-        headers: dict
-            Optional dictionary specifying additional
-            HTTP headers to include in the request
-        query_params: dict
-            Optional url query parameters to use in network
-            transaction
-
-        Returns
-        -------
-        dict
-            The JSON dict holding cuda shared memory status.
-
-        Raises
-        ------
-        InferenceServerException
-            If unable to get the status of specified shared memory.
-
-        """
-        if region_name != "":
-            request_uri = "v2/cudasharedmemory/region/{}/status".format(
-                quote(region_name))
-        else:
-            request_uri = "v2/cudasharedmemory/status"
-
-        response = self._get(request_uri=request_uri,
-                             headers=headers,
-                             query_params=query_params)
-        _raise_if_error(response)
-
-        content = response.read()
-        if self._verbose:
-            print(content)
-
-        return json.loads(content)
-
-    def register_cuda_shared_memory(self,
-                                    name,
-                                    raw_handle,
-                                    device_id,
-                                    byte_size,
-                                    headers=None,
-                                    query_params=None):
-        """Request the server to register a system shared memory with the
-        following specification.
-
-        Parameters
-        ----------
-        name : str
-            The name of the region to register.
-        raw_handle : bytes
-            The raw serialized cudaIPC handle in base64 encoding.
-        device_id : int
-            The GPU device ID on which the cudaIPC handle was created.
-        byte_size : int
-            The size of the cuda shared memory region, in bytes.
-        headers: dict
-            Optional dictionary specifying additional
-            HTTP headers to include in the request
-        query_params: dict
-            Optional url query parameters to use in network
-            transaction
-
-        Raises
-        ------
-        InferenceServerException
-            If unable to register the specified cuda shared memory.
-
-        """
-        request_uri = "v2/cudasharedmemory/region/{}/register".format(
-            quote(name))
-
-        register_request = {
-            'raw_handle': {
-                'b64': raw_handle
-            },
-            'device_id': device_id,
-            'byte_size': byte_size
-        }
-        request_body = json.dumps(register_request)
-
-        response = self._post(request_uri=request_uri,
-                              request_body=request_body,
-                              headers=headers,
-                              query_params=query_params)
-        _raise_if_error(response)
-        if self._verbose:
-            print("Registered cuda shared memory with name '{}'".format(name))
-
-    def unregister_cuda_shared_memory(self,
-                                      name="",
-                                      headers=None,
-                                      query_params=None):
-        """Request the server to unregister a cuda shared memory with the
-        specified name.
-
-        Parameters
-        ----------
-        name : str
-            The name of the region to unregister. The default value is empty
-            string which means all the cuda shared memory regions will be
-            unregistered.
-        headers: dict
-            Optional dictionary specifying additional
-            HTTP headers to include in the request
-        query_params: dict
-            Optional url query parameters to use in network
-            transaction
-
-        Raises
-        ------
-        InferenceServerException
-            If unable to unregister the specified cuda shared memory region.
-
-        """
-        if name != "":
-            request_uri = "v2/cudasharedmemory/region/{}/unregister".format(
-                quote(name))
-        else:
-            request_uri = "v2/cudasharedmemory/unregister"
-
-        response = self._post(request_uri=request_uri,
-                              request_body="",
-                              headers=headers,
-                              query_params=query_params)
-        _raise_if_error(response)
-        if self._verbose:
-            if name != "":
-                print("Unregistered cuda shared memory with name '{}'".format(
-                    name))
-            else:
-                print("Unregistered all cuda shared memory regions")
-
-    @staticmethod
-    def generate_request_body(inputs,
-                  outputs=None,
-                  request_id="",
-                  sequence_id=0,
-                  sequence_start=False,
-                  sequence_end=False,
-                  priority=0,
-                  timeout=None):
-        """Generate a request body for inference using the supplied 'inputs'
-        requesting the outputs specified by 'outputs'.
-
-        Parameters
-        ----------
-        inputs : list
-            A list of InferInput objects, each describing data for a input
-            tensor required by the model.
-        outputs : list
-            A list of InferRequestedOutput objects, each describing how the output
-            data must be returned. If not specified all outputs produced
-            by the model will be returned using default settings.
-        request_id: str
-            Optional identifier for the request. If specified will be returned
-            in the response. Default value is an empty string which means no
-            request_id will be used.
-        sequence_id : int or str
-            The unique identifier for the sequence being represented by the
-            object. A value of 0 or "" means that the request does not
-            belong to a sequence. Default is 0.
-        sequence_start: bool
-            Indicates whether the request being added marks the start of the
-            sequence. Default value is False. This argument is ignored if
-            'sequence_id' is 0.
-        sequence_end: bool
-            Indicates whether the request being added marks the end of the
-            sequence. Default value is False. This argument is ignored if
-            'sequence_id' is 0.
-        priority : int
-            Indicates the priority of the request. Priority value zero
-            indicates that the default priority level should be used
-            (i.e. same behavior as not specifying the priority parameter).
-            Lower value priorities indicate higher priority levels. Thus
-            the highest priority level is indicated by setting the parameter
-            to 1, the next highest is 2, etc. If not provided, the server
-            will handle the request using default setting for the model.
-        timeout : int
-            The timeout value for the request, in microseconds. If the request
-            cannot be completed within the time the server can take a
-            model-specific action such as terminating the request. If not
-            provided, the server will handle the request using default setting
-            for the model.
-
-        Returns
-        -------
-        Bytes
-            The request body of the inference.
-        Int
-            The byte size of the inference request header in the request body.
-            Returns None if the whole request body constitutes the request header.
-            
-
-        Raises
-        ------
-        InferenceServerException
-            If server fails to perform inference.
-        """
-        return _get_inference_request(inputs=inputs,
-                                      request_id=request_id,
-                                      outputs=outputs,
-                                      sequence_id=sequence_id,
-                                      sequence_start=sequence_start,
-                                      sequence_end=sequence_end,
-                                      priority=priority,
-                                      timeout=timeout)
-
-    @staticmethod
-    def parse_response_body(response_body,
-                          verbose=False,
-                          header_length=None,
-                          content_encoding=None):
-        """Generate a InferResult object from the given 'response_body'
-
-        Parameters
-        ----------
-        response_body : bytes
-            The inference response from the server
-        verbose : bool
-            If True generate verbose output. Default value is False.
-        header_length : int
-            The length of the inference header if the header does not occupy
-            the whole response body. Default value is None.
-        content_encoding : string
-            The encoding of the response body if it is compressed.
-            Default value is None.
-        
-        Returns
-        -------
-        InferResult
-            The InferResult object generated from the response body
-        """
-        return InferResult.from_response_body(response_body, verbose,
-                                              header_length, content_encoding)
-
-    def infer(self,
-              model_name,
-              inputs,
-              model_version="",
-              outputs=None,
-              request_id="",
-              sequence_id=0,
-              sequence_start=False,
-              sequence_end=False,
-              priority=0,
-              timeout=None,
-              headers=None,
-              query_params=None,
-              request_compression_algorithm=None,
-              response_compression_algorithm=None):
-        """Run synchronous inference using the supplied 'inputs' requesting
-        the outputs specified by 'outputs'.
-
-        Parameters
-        ----------
-        model_name: str
-            The name of the model to run inference.
-        inputs : list
-            A list of InferInput objects, each describing data for a input
-            tensor required by the model.
-        model_version: str
-            The version of the model to run inference. The default value
-            is an empty string which means then the server will choose
-            a version based on the model and internal policy.
-        outputs : list
-            A list of InferRequestedOutput objects, each describing how the output
-            data must be returned. If not specified all outputs produced
-            by the model will be returned using default settings.
-        request_id: str
-            Optional identifier for the request. If specified will be returned
-            in the response. Default value is an empty string which means no
-            request_id will be used.
-        sequence_id : int
-            The unique identifier for the sequence being represented by the
-            object. Default value is 0 which means that the request does not
-            belong to a sequence.
-        sequence_start: bool
-            Indicates whether the request being added marks the start of the
-            sequence. Default value is False. This argument is ignored if
-            'sequence_id' is 0.
-        sequence_end: bool
-            Indicates whether the request being added marks the end of the
-            sequence. Default value is False. This argument is ignored if
-            'sequence_id' is 0.
-        priority : int
-            Indicates the priority of the request. Priority value zero
-            indicates that the default priority level should be used
-            (i.e. same behavior as not specifying the priority parameter).
-            Lower value priorities indicate higher priority levels. Thus
-            the highest priority level is indicated by setting the parameter
-            to 1, the next highest is 2, etc. If not provided, the server
-            will handle the request using default setting for the model.
-        timeout : int
-            The timeout value for the request, in microseconds. If the request
-            cannot be completed within the time the server can take a
-            model-specific action such as terminating the request. If not
-            provided, the server will handle the request using default setting
-            for the model.
-        headers: dict
-            Optional dictionary specifying additional HTTP
-            headers to include in the request.
-        query_params: dict
-            Optional url query parameters to use in network
-            transaction.
-        request_compression_algorithm : str
-            Optional HTTP compression algorithm to use for the request body on client side.
-            Currently supports "deflate", "gzip" and None. By default, no
-            compression is used.
-        response_compression_algorithm : str
-            Optional HTTP compression algorithm to request for the response body.
-            Note that the response may not be compressed if the server does not
-            support the specified algorithm. Currently supports "deflate",
-            "gzip" and None. By default, no compression is requested.
-
-        Returns
-        -------
-        InferResult
-            The object holding the result of the inference.
-
-        Raises
-        ------
-        InferenceServerException
-            If server fails to perform inference.
-        """
-
-        request_body, json_size = _get_inference_request(
-            inputs=inputs,
-            request_id=request_id,
-            outputs=outputs,
-            sequence_id=sequence_id,
-            sequence_start=sequence_start,
-            sequence_end=sequence_end,
-            priority=priority,
-            timeout=timeout)
-
-        if request_compression_algorithm == "gzip":
-            if headers is None:
-                headers = {}
-            headers["Content-Encoding"] = "gzip"
-            request_body = gzip.compress(request_body)
-        elif request_compression_algorithm == 'deflate':
-            if headers is None:
-                headers = {}
-            headers["Content-Encoding"] = "deflate"
-            # "Content-Encoding: deflate" actually means compressing in zlib structure
-            # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Encoding
-            request_body = zlib.compress(request_body)
-
-        if response_compression_algorithm == "gzip":
-            if headers is None:
-                headers = {}
-            headers["Accept-Encoding"] = "gzip"
-        elif response_compression_algorithm == 'deflate':
-            if headers is None:
-                headers = {}
-            headers["Accept-Encoding"] = "deflate"
-
-        if json_size is not None:
-            if headers is None:
-                headers = {}
-            headers["Inference-Header-Content-Length"] = json_size
-
-        if type(model_version) != str:
-            raise_error("model version must be a string")
-        if model_version != "":
-            request_uri = "v2/models/{}/versions/{}/infer".format(
-                quote(model_name), model_version)
-        else:
-            request_uri = "v2/models/{}/infer".format(quote(model_name))
-
-        response = self._post(request_uri=request_uri,
-                              request_body=request_body,
-                              headers=headers,
-                              query_params=query_params)
-        _raise_if_error(response)
-
-        return InferResult(response, self._verbose)
-
-    def async_infer(self,
-                    model_name,
-                    inputs,
-                    model_version="",
-                    outputs=None,
-                    request_id="",
-                    sequence_id=0,
-                    sequence_start=False,
-                    sequence_end=False,
-                    priority=0,
-                    timeout=None,
-                    headers=None,
-                    query_params=None,
-                    request_compression_algorithm=None,
-                    response_compression_algorithm=None):
-        """Run asynchronous inference using the supplied 'inputs' requesting
-        the outputs specified by 'outputs'. Even though this call is
-        non-blocking, however, the actual number of concurrent requests to
-        the server will be limited by the 'concurrency' parameter specified
-        while creating this client. In other words, if the inflight
-        async_infer exceeds the specified 'concurrency', the delivery of
-        the exceeding request(s) to server will be blocked till the slot is
-        made available by retrieving the results of previously issued requests.
-
-        Parameters
-        ----------
-        model_name: str
-            The name of the model to run inference.
-        inputs : list
-            A list of InferInput objects, each describing data for a input
-            tensor required by the model.
-        model_version: str
-            The version of the model to run inference. The default value
-            is an empty string which means then the server will choose
-            a version based on the model and internal policy.
-        outputs : list
-            A list of InferRequestedOutput objects, each describing how the output
-            data must be returned. If not specified all outputs produced
-            by the model will be returned using default settings.
-        request_id: str
-            Optional identifier for the request. If specified will be returned
-            in the response. Default value is 'None' which means no request_id
-            will be used.
-        sequence_id : int
-            The unique identifier for the sequence being represented by the
-            object. Default value is 0 which means that the request does not
-            belong to a sequence.
-        sequence_start: bool
-            Indicates whether the request being added marks the start of the
-            sequence. Default value is False. This argument is ignored if
-            'sequence_id' is 0.
-        sequence_end: bool
-            Indicates whether the request being added marks the end of the
-            sequence. Default value is False. This argument is ignored if
-            'sequence_id' is 0.
-        priority : int
-            Indicates the priority of the request. Priority value zero
-            indicates that the default priority level should be used
-            (i.e. same behavior as not specifying the priority parameter).
-            Lower value priorities indicate higher priority levels. Thus
-            the highest priority level is indicated by setting the parameter
-            to 1, the next highest is 2, etc. If not provided, the server
-            will handle the request using default setting for the model.
-        timeout : int
-            The timeout value for the request, in microseconds. If the request
-            cannot be completed within the time the server can take a
-            model-specific action such as terminating the request. If not
-            provided, the server will handle the request using default setting
-            for the model.
-        headers: dict
-            Optional dictionary specifying additional HTTP
-            headers to include in the request
-        query_params: dict
-            Optional url query parameters to use in network
-            transaction.
-        request_compression_algorithm : str
-            Optional HTTP compression algorithm to use for the request body on client side.
-            Currently supports "deflate", "gzip" and None. By default, no
-            compression is used.
-        response_compression_algorithm : str
-            Optional HTTP compression algorithm to request for the response body.
-            Note that the response may not be compressed if the server does not
-            support the specified algorithm. Currently supports "deflate",
-            "gzip" and None. By default, no compression is requested.
-
-        Returns
-        -------
-        InferAsyncRequest object
-            The handle to the asynchronous inference request.
-
-        Raises
-        ------
-        InferenceServerException
-            If server fails to issue inference.
-        """
-
-        def wrapped_post(request_uri, request_body, headers, query_params):
-            return self._post(request_uri, request_body, headers, query_params)
-
-        request_body, json_size = _get_inference_request(
-            inputs=inputs,
-            request_id=request_id,
-            outputs=outputs,
-            sequence_id=sequence_id,
-            sequence_start=sequence_start,
-            sequence_end=sequence_end,
-            priority=priority,
-            timeout=timeout)
-
-        if request_compression_algorithm == "gzip":
-            if headers is None:
-                headers = {}
-            headers["Content-Encoding"] = "gzip"
-            request_body = gzip.compress(request_body)
-        elif request_compression_algorithm == 'deflate':
-            if headers is None:
-                headers = {}
-            headers["Content-Encoding"] = "deflate"
-            # "Content-Encoding: deflate" actually means compressing in zlib structure
-            # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Encoding
-            request_body = zlib.compress(request_body)
-
-        if response_compression_algorithm == "gzip":
-            if headers is None:
-                headers = {}
-            headers["Accept-Encoding"] = "gzip"
-        elif response_compression_algorithm == 'deflate':
-            if headers is None:
-                headers = {}
-            headers["Accept-Encoding"] = "deflate"
-
-        if json_size is not None:
-            if headers is None:
-                headers = {}
-            headers["Inference-Header-Content-Length"] = json_size
-
-        if type(model_version) != str:
-            raise_error("model version must be a string")
-        if model_version != "":
-            request_uri = "v2/models/{}/versions/{}/infer".format(
-                quote(model_name), model_version)
-        else:
-            request_uri = "v2/models/{}/infer".format(quote(model_name))
-
-        g = self._pool.apply_async(
-            wrapped_post, (request_uri, request_body, headers, query_params))
-
-        # Schedule the greenlet to run in this loop iteration
-        g.start()
-
-        # Relinquish control to greenlet loop. Using non-zero
-        # value to ensure the control is transferred to the
-        # event loop.
-        gevent.sleep(0.01)
-
-        if self._verbose:
-            verbose_message = "Sent request"
-            if request_id != "":
-                verbose_message = verbose_message + " '{}'".format(request_id)
-            print(verbose_message)
-
-        return InferAsyncRequest(g, self._verbose)
-
-
-class InferAsyncRequest:
-    """An object of InferAsyncRequest class is used to describe
-    a handle to an ongoing asynchronous inference request.
-
-    Parameters
-    ----------
-    greenlet : gevent.Greenlet
-        The greenlet object which will provide the results.
-        For further details about greenlets refer
-        http://www.gevent.org/api/gevent.greenlet.html.
-
-    verbose : bool
-        If True generate verbose output. Default value is False.
-    """
-
-    def __init__(self, greenlet, verbose=False):
-        self._greenlet = greenlet
-        self._verbose = verbose
-
-    def get_result(self, block=True, timeout=None):
-        """Get the results of the associated asynchronous inference.
-        Parameters
-        ----------
-        block : bool
-            If block is True, the function will wait till the
-            corresponding response is received from the server.
-            Default value is True.
-        timeout : int
-            The maximum wait time for the function. This setting is
-            ignored if the block is set False. Default is None,
-            which means the function will block indefinitely till
-            the corresponding response is received.
-
-        Returns
-        -------
-        InferResult
-            The object holding the result of the async inference.
-
-        Raises
-        ------
-        InferenceServerException
-            If server fails to perform inference or failed to respond
-            within specified timeout.
-        """
-
-        try:
-            response = self._greenlet.get(block=block, timeout=timeout)
-        except gevent.Timeout as e:
-            raise_error("failed to obtain inference response")
-
-        _raise_if_error(response)
-        return InferResult(response, self._verbose)
-
-
-class InferInput:
-    """An object of InferInput class is used to describe
-    input tensor for an inference request.
-
-    Parameters
-    ----------
-    name : str
-        The name of input whose data will be described by this object
-    shape : list
-        The shape of the associated input.
-    datatype : str
-        The datatype of the associated input.
-    """
-
-    def __init__(self, name, shape, datatype):
-        self._name = name
-        self._shape = shape
-        self._datatype = datatype
-        self._parameters = {}
-        self._data = None
-        self._raw_data = None
-
-    def name(self):
-        """Get the name of input associated with this object.
-
-        Returns
-        -------
-        str
-            The name of input
-        """
-        return self._name
-
-    def datatype(self):
-        """Get the datatype of input associated with this object.
-
-        Returns
-        -------
-        str
-            The datatype of input
-        """
-        return self._datatype
-
-    def shape(self):
-        """Get the shape of input associated with this object.
-
-        Returns
-        -------
-        list
-            The shape of input
-        """
-        return self._shape
-
-    def set_shape(self, shape):
-        """Set the shape of input.
-
-        Parameters
-        ----------
-        shape : list
-            The shape of the associated input.
-        """
-        self._shape = shape
-
-    def set_data_from_numpy(self, input_tensor, binary_data=True):
-        """Set the tensor data from the specified numpy array for
-        input associated with this object.
-
-        Parameters
-        ----------
-        input_tensor : numpy array
-            The tensor data in numpy array format
-        binary_data : bool
-            Indicates whether to set data for the input in binary format
-            or explicit tensor within JSON. The default value is True,
-            which means the data will be delivered as binary data in the
-            HTTP body after the JSON object.
-
-        Raises
-        ------
-        InferenceServerException
-            If failed to set data for the tensor.
-        """
-        if not isinstance(input_tensor, (np.ndarray,)):
-            raise_error("input_tensor must be a numpy array")
-        dtype = np_to_triton_dtype(input_tensor.dtype)
-        if self._datatype != dtype:
-            raise_error(
-                "got unexpected datatype {} from numpy array, expected {}".
-                format(dtype, self._datatype))
-        valid_shape = True
-        if len(self._shape) != len(input_tensor.shape):
-            valid_shape = False
-        else:
-            for i in range(len(self._shape)):
-                if self._shape[i] != input_tensor.shape[i]:
-                    valid_shape = False
-        if not valid_shape:
-            raise_error(
-                "got unexpected numpy array shape [{}], expected [{}]".format(
-                    str(input_tensor.shape)[1:-1],
-                    str(self._shape)[1:-1]))
-
-        self._parameters.pop('shared_memory_region', None)
-        self._parameters.pop('shared_memory_byte_size', None)
-        self._parameters.pop('shared_memory_offset', None)
-
-        if not binary_data:
-            self._parameters.pop('binary_data_size', None)
-            self._raw_data = None
-            if self._datatype == "BYTES":
-                self._data = []
-                try:
-                    if input_tensor.size > 0:
-                        for obj in np.nditer(input_tensor,
-                                             flags=["refs_ok"],
-                                             order='C'):
-                            # We need to convert the object to string using utf-8,
-                            # if we want to use the binary_data=False. JSON requires
-                            # the input to be a UTF-8 string.
-                            if input_tensor.dtype == np.object_:
-                                if type(obj.item()) == bytes:
-                                    self._data.append(
-                                        str(obj.item(), encoding='utf-8'))
-                                else:
-                                    self._data.append(str(obj.item()))
-                            else:
-                                self._data.append(
-                                    str(obj.item(), encoding='utf-8'))
-                except UnicodeDecodeError:
-                    raise_error(
-                        f'Failed to encode "{obj.item()}" using UTF-8. Please use binary_data=True, if'
-                        ' you want to pass a byte array.')
-            else:
-                self._data = [val.item() for val in input_tensor.flatten()]
-        else:
-            self._data = None
-            if self._datatype == "BYTES":
-                serialized_output = serialize_byte_tensor(input_tensor)
-                if serialized_output.size > 0:
-                    self._raw_data = serialized_output.item()
-                else:
-                    self._raw_data = b''
-            else:
-                self._raw_data = input_tensor.tobytes()
-            self._parameters['binary_data_size'] = len(self._raw_data)
-
-    def set_shared_memory(self, region_name, byte_size, offset=0):
-        """Set the tensor data from the specified shared memory region.
-
-        Parameters
-        ----------
-        region_name : str
-            The name of the shared memory region holding tensor data.
-        byte_size : int
-            The size of the shared memory region holding tensor data.
-        offset : int
-            The offset, in bytes, into the region where the data for
-            the tensor starts. The default value is 0.
-
-        """
-        self._data = None
-        self._raw_data = None
-        self._parameters.pop('binary_data_size', None)
-
-        self._parameters['shared_memory_region'] = region_name
-        self._parameters['shared_memory_byte_size'] = byte_size
-        if offset != 0:
-            self._parameters['shared_memory_offset'].int64_param = offset
-
-    def _get_binary_data(self):
-        """Returns the raw binary data if available
-
-        Returns
-        -------
-        bytes
-            The raw data for the input tensor
-        """
-        return self._raw_data
-
-    def _get_tensor(self):
-        """Retrieve the underlying input as json dict.
-
-        Returns
-        -------
-        dict
-            The underlying tensor specification as dict
-        """
-        tensor = {
-            'name': self._name,
-            'shape': self._shape,
-            'datatype': self._datatype
-        }
-        if self._parameters:
-            tensor['parameters'] = self._parameters
-
-        if self._parameters.get('shared_memory_region') is None and \
-                self._raw_data is None:
-            if self._data is not None:
-                tensor['data'] = self._data
-        return tensor
-
-
-class InferRequestedOutput:
-    """An object of InferRequestedOutput class is used to describe a
-    requested output tensor for an inference request.
-
-    Parameters
-    ----------
-    name : str
-        The name of output tensor to associate with this object.
-    binary_data : bool
-        Indicates whether to return result data for the output in
-        binary format or explicit tensor within JSON. The default
-        value is True, which means the data will be delivered as
-        binary data in the HTTP body after JSON object. This field
-        will be unset if shared memory is set for the output.
-    class_count : int
-        The number of classifications to be requested. The default
-        value is 0 which means the classification results are not
-        requested.
-    """
-
-    def __init__(self, name, binary_data=True, class_count=0):
-        self._name = name
-        self._parameters = {}
-        if class_count != 0:
-            self._parameters['classification'] = class_count
-        self._binary = binary_data
-        self._parameters['binary_data'] = binary_data
-
-    def name(self):
-        """Get the name of output associated with this object.
-
-        Returns
-        -------
-        str
-            The name of output
-        """
-        return self._name
-
-    def set_shared_memory(self, region_name, byte_size, offset=0):
-        """Marks the output to return the inference result in
-        specified shared memory region.
-
-        Parameters
-        ----------
-        region_name : str
-            The name of the shared memory region to hold tensor data.
-        byte_size : int
-            The size of the shared memory region to hold tensor data.
-        offset : int
-            The offset, in bytes, into the region where the data for
-            the tensor starts. The default value is 0.
-
-        """
-        if 'classification' in self._parameters:
-            raise_error("shared memory can't be set on classification output")
-        if self._binary:
-            self._parameters['binary_data'] = False
-
-        self._parameters['shared_memory_region'] = region_name
-        self._parameters['shared_memory_byte_size'] = byte_size
-        if offset != 0:
-            self._parameters['shared_memory_offset'] = offset
-
-    def unset_shared_memory(self):
-        """Clears the shared memory option set by the last call to
-        InferRequestedOutput.set_shared_memory(). After call to this
-        function requested output will no longer be returned in a
-        shared memory region.
-        """
-
-        self._parameters['binary_data'] = self._binary
-        self._parameters.pop('shared_memory_region', None)
-        self._parameters.pop('shared_memory_byte_size', None)
-        self._parameters.pop('shared_memory_offset', None)
-
-    def _get_tensor(self):
-        """Retrieve the underlying input as json dict.
-
-        Returns
-        -------
-        dict
-            The underlying tensor as a dict
-        """
-        tensor = {'name': self._name}
-        if self._parameters:
-            tensor['parameters'] = self._parameters
-        return tensor
-
-
-class InferResult:
-    """An object of InferResult class holds the response of
-    an inference request and provide methods to retrieve
-    inference results.
-
-    Parameters
-    ----------
-    response : geventhttpclient.response.HTTPSocketPoolResponse
-        The inference response from the server
-    verbose : bool
-        If True generate verbose output. Default value is False.
-    """
-
-    def __init__(self, response, verbose):
-        header_length = response.get('Inference-Header-Content-Length')
-
-        # Internal class that simulate the interface of 'response'
-        class DecompressedResponse:
-
-            def __init__(self, decompressed_data):
-                self.decompressed_data_ = decompressed_data
-                self.offset_ = 0
-
-            def read(self, length=-1):
-                if length == -1:
-                    return self.decompressed_data_[self.offset_:]
-                else:
-                    prev_offset = self.offset_
-                    self.offset_ += length
-                    return self.decompressed_data_[prev_offset:self.offset_]
-
-        content_encoding = response.get('Content-Encoding')
-        if content_encoding is not None:
-            if content_encoding == "gzip":
-                response = DecompressedResponse(gzip.decompress(
-                    response.read()))
-            elif content_encoding == 'deflate':
-                response = DecompressedResponse(zlib.decompress(
-                    response.read()))
-        if header_length is None:
-            content = response.read()
-            if verbose:
-                print(content)
-            try:
-                self._result = json.loads(content)
-            except UnicodeDecodeError as e:
-                raise_error(
-                    f'Failed to encode using UTF-8. Please use binary_data=True, if'
-                    f' you want to pass a byte array. UnicodeError: {e}')
-        else:
-            header_length = int(header_length)
-            content = response.read(length=header_length)
-            if verbose:
-                print(content)
-            self._result = json.loads(content)
-
-            # Maps the output name to the index in buffer for quick retrieval
-            self._output_name_to_buffer_map = {}
-            # Read the remaining data off the response body.
-            self._buffer = response.read()
-            buffer_index = 0
-            for output in self._result['outputs']:
-                parameters = output.get("parameters")
-                if parameters is not None:
-                    this_data_size = parameters.get("binary_data_size")
-                    if this_data_size is not None:
-                        self._output_name_to_buffer_map[
-                            output['name']] = buffer_index
-                        buffer_index = buffer_index + this_data_size
-
-    @classmethod
-    def from_response_body(cls,
-                           response_body,
-                           verbose=False,
-                           header_length=None,
-                           content_encoding=None):
-        """A class method to construct InferResult object
-        from a given 'response_body'.
-
-        Parameters
-        ----------
-        response_body : bytes
-            The inference response from the server
-        verbose : bool
-            If True generate verbose output. Default value is False.
-        header_length : int
-            The length of the inference header if the header does not occupy
-            the whole response body. Default value is None.
-        content_encoding : string
-            The encoding of the response body if it is compressed.
-            Default value is None.
-        
-        Returns
-        -------
-        InferResult
-            The InferResult object generated from the response body
-        """
-
-        # Internal class that simulate the interface of 'response'
-        class Response:
-
-            def __init__(self, response_body, header_length, content_encoding):
-                self.response_body_ = response_body
-                self.offset_ = 0
-                self.parameters_ = {
-                    'Inference-Header-Content-Length': header_length,
-                    'Content-Encoding': content_encoding
-                }
-
-            def get(self, key):
-                return self.parameters_.get(key)
-
-            def read(self, length=-1):
-                if length == -1:
-                    return self.response_body_[self.offset_:]
-                else:
-                    prev_offset = self.offset_
-                    self.offset_ += length
-                    return self.response_body_[prev_offset:self.offset_]
-
-        return cls(Response(response_body, header_length, content_encoding),
-                   verbose)
-
-    def as_numpy(self, name):
-        """Get the tensor data for output associated with this object
-        in numpy format
-
-        Parameters
-        ----------
-        name : str
-            The name of the output tensor whose result is to be retrieved.
-
-        Returns
-        -------
-        numpy array
-            The numpy array containing the response data for the tensor or
-            None if the data for specified tensor name is not found.
-        """
-        if self._result.get('outputs') is not None:
-            for output in self._result['outputs']:
-                if output['name'] == name:
-                    datatype = output['datatype']
-                    has_binary_data = False
-                    parameters = output.get("parameters")
-                    if parameters is not None:
-                        this_data_size = parameters.get("binary_data_size")
-                        if this_data_size is not None:
-                            has_binary_data = True
-                            if this_data_size != 0:
-                                start_index = self._output_name_to_buffer_map[
-                                    name]
-                                end_index = start_index + this_data_size
-                                if datatype == 'BYTES':
-                                    # String results contain a 4-byte string length
-                                    # followed by the actual string characters. Hence,
-                                    # need to decode the raw bytes to convert into
-                                    # array elements.
-                                    np_array = deserialize_bytes_tensor(
-                                        self._buffer[start_index:end_index])
-                                else:
-                                    np_array = np.frombuffer(
-                                        self._buffer[start_index:end_index],
-                                        dtype=triton_to_np_dtype(datatype))
-                            else:
-                                np_array = np.empty(0)
-                    if not has_binary_data:
-                        np_array = np.array(output['data'],
-                                            dtype=triton_to_np_dtype(datatype))
-                    np_array = np_array.reshape(output['shape'])
-                    return np_array
-        return None
-
-    def get_output(self, name):
-        """Retrieves the output tensor corresponding to the named ouput.
-
-        Parameters
-        ----------
-        name : str
-            The name of the tensor for which Output is to be
-            retrieved.
-
-        Returns
-        -------
-        Dict
-            If an output tensor with specified name is present in
-            the infer resonse then returns it as a json dict,
-            otherwise returns None.
-        """
-        for output in self._result['outputs']:
-            if output['name'] == name:
-                return output
-
-        return None
-
-    def get_response(self):
-        """Retrieves the complete response
-
-        Returns
-        -------
-        dict
-            The underlying response dict.
-        """
-        return self._result
\ No newline at end of file
diff --git a/examples/clearml_serving_simple_http_inference_request/sample_image.webp b/examples/clearml_serving_simple_http_inference_request/sample_image.webp
deleted file mode 100644
index 9258c91..0000000
Binary files a/examples/clearml_serving_simple_http_inference_request/sample_image.webp and /dev/null differ
diff --git a/examples/ensemble/preprocess.py b/examples/ensemble/preprocess.py
new file mode 100644
index 0000000..6ba648c
--- /dev/null
+++ b/examples/ensemble/preprocess.py
@@ -0,0 +1,19 @@
+from typing import Any
+
+import numpy as np
+
+
+# Notice Preprocess class Must be named "Preprocess"
+class Preprocess(object):
+    def __init__(self):
+        # set internal state, this will be called only once. (i.e. not per request)
+        pass
+
+    def preprocess(self, body: dict, collect_custom_statistics_fn=None) -> Any:
+        # we expect to get two valid numbers on the dict: x0 and x1
+        return [[body.get("x0", None), body.get("x1", None)], ]
+
+    def postprocess(self, data: Any, collect_custom_statistics_fn=None) -> dict:
+        # post process the data returned from the model inference engine
+        # data is the return value from model.predict; we put it inside the returned dict under the key "y"
+        return dict(y=data.tolist() if isinstance(data, np.ndarray) else data)
diff --git a/examples/ensemble/readme.md b/examples/ensemble/readme.md
new file mode 100644
index 0000000..68a80d4
--- /dev/null
+++ b/examples/ensemble/readme.md
@@ -0,0 +1,32 @@
+# Train and Deploy Scikit-Learn model ensemble
+
+## training mock voting regression model
+
+Run the mock python training code
+```bash
+pip install -r examples/ensemble/requirements.txt 
+python examples/ensemble/train_ensemble.py
+```
+
+The output will be a model created on the project "serving examples", by the name "train model ensemble"
+
+## setting up the serving service
+
+1. Create serving Service: `clearml-serving create --name "serving example"` (write down the service ID)
+2. Create model endpoint: 
+`clearml-serving --id <service_id> model add --engine sklearn --endpoint "test_model_ensemble" --preprocess "examples/ensemble/preprocess.py" --name "train model ensemble" --project "serving examples"`
+
+Or auto update 
+
+`clearml-serving --id <service_id> model auto-update --engine sklearn --endpoint "test_model_ensemble_auto" --preprocess "examples/ensemble/preprocess.py" --name "train model ensemble" --project "serving examples" --max-versions 2`
+
+Or add Canary endpoint
+
+`clearml-serving --id <service_id> model canary --endpoint "test_model_ensemble_auto" --weights 0.1 0.9 --input-endpoint-prefix test_model_ensemble_auto`
+
+3. Run the clearml-serving container `docker run -v ~/clearml.conf:/root/clearml.conf -p 8080:8080 -e CLEARML_SERVING_TASK_ID=<service_id> clearml-serving:latest`
+4. Test new endpoint: `curl -X POST "http://127.0.0.1:8080/serve/test_model_ensemble" -H "accept: application/json" -H "Content-Type: application/json" -d '{"x0": 1, "x1": 2}'`
+
+> **_Notice:_**  You can also change the serving service while it is already running!
+This includes adding/removing endpoints, adding canary model routing etc.
+by default new endpoints/models will be automatically updated after 1 minute
diff --git a/examples/ensemble/requirements.txt b/examples/ensemble/requirements.txt
new file mode 100644
index 0000000..eb862f7
--- /dev/null
+++ b/examples/ensemble/requirements.txt
@@ -0,0 +1,2 @@
+clearml >= 1.1.6
+scikit-learn
diff --git a/examples/ensemble/train_ensemble.py b/examples/ensemble/train_ensemble.py
new file mode 100644
index 0000000..518673b
--- /dev/null
+++ b/examples/ensemble/train_ensemble.py
@@ -0,0 +1,23 @@
+from sklearn.neighbors import KNeighborsRegressor
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.ensemble import VotingRegressor
+from sklearn.datasets import make_blobs
+from joblib import dump
+from clearml import Task
+
+task = Task.init(project_name="serving examples", task_name="train model ensemble", output_uri=True)
+
+# generate 2d classification dataset
+X, y = make_blobs(n_samples=100, centers=2, n_features=2, random_state=1)
+
+knn = KNeighborsRegressor(n_neighbors=5)
+knn.fit(X, y)
+
+rf = RandomForestRegressor(n_estimators=50)
+rf.fit(X, y)
+
+estimators = [("knn", knn), ("rf", rf), ]
+ensemble = VotingRegressor(estimators)
+ensemble.fit(X, y)
+
+dump(ensemble, filename="ensemble-vr.pkl", compress=9)
diff --git a/examples/keras/preprocess.py b/examples/keras/preprocess.py
new file mode 100644
index 0000000..b87d3e8
--- /dev/null
+++ b/examples/keras/preprocess.py
@@ -0,0 +1,36 @@
+from typing import Any
+
+import numpy as np
+from PIL import Image, ImageOps
+
+
+# Notice Preprocess class Must be named "Preprocess"
+from clearml import StorageManager
+
+
+class Preprocess(object):
+    def __init__(self):
+        # set internal state, this will be called only once. (i.e. not per request)
+        pass
+
+    def preprocess(self, body: dict, collect_custom_statistics_fn=None) -> Any:
+        # we expect the request body dict to contain a "url" entry (http/s link to an image)
+        url = body.get("url")
+        if not url:
+            raise ValueError("'url' entry not provided, expected http/s link to image")
+
+        local_file = StorageManager.get_local_copy(remote_url=url)
+        image = Image.open(local_file)
+        image = ImageOps.grayscale(image).resize((28, 28))
+
+        return np.array(image).flatten()
+
+    def postprocess(self, data: Any, collect_custom_statistics_fn=None) -> dict:
+        # post process the data returned from the model inference engine
+        # data is the return value from model.predict; we return the predicted class index under the key "digit"
+        if not isinstance(data, np.ndarray):
+            # this should not happen
+            return dict(digit=-1)
+
+        # data is returned as probability per class (10 class/digits)
+        return dict(digit=int(data.flatten().argmax()))
diff --git a/examples/keras/readme.md b/examples/keras/readme.md
new file mode 100644
index 0000000..5a9f84c
--- /dev/null
+++ b/examples/keras/readme.md
@@ -0,0 +1,41 @@
+# Train and Deploy Keras model with Nvidia Triton Engine
+
+## training mnist digit classifier model
+
+Run the mock python training code
+```bash
+pip install -r examples/keras/requirements.txt 
+python examples/keras/train_keras_mnist.py
+```
+
+The output will be a model created on the project "serving examples", by the name "train keras model"
+
+## setting up the serving service
+
+Prerequisites, Keras/Tensorflow models require Triton engine support, please use `docker-compose-triton.yml` / `docker-compose-triton-gpu.yml` or if running on Kubernetes, the matching helm chart.
+
+1. Create serving Service: `clearml-serving create --name "serving example"` (write down the service ID)
+2. Create model endpoint: 
+
+ `clearml-serving --id <service_id> model add --engine triton --endpoint "test_model_keras" --preprocess "examples/keras/preprocess.py" --name "train keras model" --project "serving examples" --input-size 1 784 --input-name "dense_input" --input-type float32 --output-size -1 10 --output-name "activation_2" --output-type float32   
+`
+
+Or auto update
+
+`clearml-serving --id <service_id> model auto-update --engine triton --endpoint "test_model_auto" --preprocess "examples/keras/preprocess.py" --name "train keras model" --project "serving examples" --max-versions 2
+  --input-size 1 784 --input-name "dense_input" --input-type float32   
+  --output-size -1 10 --output-name "activation_2" --output-type float32`
+
+Or add Canary endpoint
+
+`clearml-serving --id <service_id> model canary --endpoint "test_model_auto" --weights 0.1 0.9 --input-endpoint-prefix test_model_auto`
+   
+3. Run the Triton Engine `docker run -v ~/clearml.conf:/root/clearml.conf -p 8001:8001 -e CLEARML_SERVING_TASK_ID=<service_id> clearml-serving-triton:latest`
+4. Configure the Triton Engine IP on the Serving Service (if running on k8s, the gRPC ingest of the triton container)
+`clearml-serving --id <service_id> config --triton-grpc-server <local_ip_here>:8001`
+5. Run the clearml-serving container `docker run -v ~/clearml.conf:/root/clearml.conf -p 8080:8080 -e CLEARML_SERVING_TASK_ID=<service_id> clearml-serving:latest`
+6. Test new endpoint: `curl -X POST "http://127.0.0.1:8080/serve/test_model_keras" -H "accept: application/json" -H "Content-Type: application/json" -d '{"url": "https://camo.githubusercontent.com/8385ca52c9cba1f6e629eb938ab725ec8c9449f12db81f9a34e18208cd328ce9/687474703a2f2f706574722d6d6172656b2e636f6d2f77702d636f6e74656e742f75706c6f6164732f323031372f30372f6465636f6d707265737365642e6a7067"}'`
+
+> **_Notice:_**  You can also change the serving service while it is already running!
+This includes adding/removing endpoints, adding canary model routing etc.
+by default new endpoints/models will be automatically updated after 1 minute
\ No newline at end of file
diff --git a/examples/keras/requirements.txt b/examples/keras/requirements.txt
index 68a3593..d2043d9 100644
--- a/examples/keras/requirements.txt
+++ b/examples/keras/requirements.txt
@@ -1,2 +1,3 @@
 tensorflow>=2.0
 clearml
+Pillow
\ No newline at end of file
diff --git a/examples/keras/keras_mnist.py b/examples/keras/train_keras_mnist.py
similarity index 75%
rename from examples/keras/keras_mnist.py
rename to examples/keras/train_keras_mnist.py
index 90972ec..05ada4e 100644
--- a/examples/keras/keras_mnist.py
+++ b/examples/keras/train_keras_mnist.py
@@ -48,41 +48,6 @@ class TensorBoardImage(TensorBoard):
         self.writer.add_summary(summary, epoch)
 
 
-def create_config_pbtxt(model, config_pbtxt_file):
-    platform = "tensorflow_savedmodel"
-    input_name = model.input_names[0]
-    output_name = model.output_names[0]
-    input_data_type = "TYPE_FP32"
-    output_data_type = "TYPE_FP32"
-    input_dims = str(model.input.shape.as_list()).replace("None", "-1")
-    output_dims = str(model.output.shape.as_list()).replace("None", "-1")
-
-    config_pbtxt = """
-        platform: "%s"
-        input [
-            {
-                name: "%s"
-                data_type: %s
-                dims: %s
-            }
-        ]
-        output [
-            {
-                name: "%s"
-                data_type: %s
-                dims: %s
-            }
-        ]
-    """ % (
-        platform,
-        input_name, input_data_type, input_dims,
-        output_name, output_data_type, output_dims
-    )
-
-    with open(config_pbtxt_file, "w") as config_file:
-        config_file.write(config_pbtxt)
-
-
 def main():
     parser = argparse.ArgumentParser(description='Keras MNIST Example - training CNN classification model')
     parser.add_argument('--batch-size', type=int, default=128, help='input batch size for training (default: 128)')
@@ -126,7 +91,7 @@ def main():
 
     # Connecting ClearML with the current process,
     # from here on everything is logged automatically
-    task = Task.init(project_name='examples', task_name='Keras MNIST serve example', output_uri=True)
+    task = Task.init(project_name='serving examples', task_name='train keras model', output_uri=True)
 
     # Advanced: setting model class enumeration
     labels = dict(('digit_%d' % i, i) for i in range(10))
@@ -155,12 +120,6 @@ def main():
     # store the model in a format that can be served
     model.save('serving_model', include_optimizer=False)
 
-    # create the config.pbtxt for triton to be able to serve the model
-    create_config_pbtxt(model=model, config_pbtxt_file='config.pbtxt')
-    # store the configuration on the creating Task,
-    # this will allow us to skip over manually setting the config.pbtxt for `clearml-serving`
-    task.connect_configuration(configuration=Path('config.pbtxt'), name='config.pbtxt')
-
     print('Test score: {}'.format(score[0]))
     print('Test accuracy: {}'.format(score[1]))
 
diff --git a/examples/lightgbm/preprocess.py b/examples/lightgbm/preprocess.py
new file mode 100644
index 0000000..e89f563
--- /dev/null
+++ b/examples/lightgbm/preprocess.py
@@ -0,0 +1,23 @@
+from typing import Any
+
+import numpy as np
+
+
+# Notice Preprocess class Must be named "Preprocess"
+class Preprocess(object):
+    def __init__(self):
+        # set internal state, this will be called only once. (i.e. not per request)
+        pass
+
+    def preprocess(self, body: dict, collect_custom_statistics_fn=None) -> Any:
+        # we expect to get four valid numbers on the dict: x0, x1, x2, x3
+        return np.array(
+            [[body.get("x0", None), body.get("x1", None), body.get("x2", None), body.get("x3", None)], ],
+            dtype=np.float32
+        )
+
+    def postprocess(self, data: Any, collect_custom_statistics_fn=None) -> dict:
+        # post process the data returned from the model inference engine
+        # data is the return value from model.predict; we put it inside the returned dict under the key "y"
+        # we pick the most probable class and return the class index (argmax)
+        return dict(y=int(np.argmax(data)) if isinstance(data, np.ndarray) else data)
diff --git a/examples/lightgbm/readme.md b/examples/lightgbm/readme.md
new file mode 100644
index 0000000..701c409
--- /dev/null
+++ b/examples/lightgbm/readme.md
@@ -0,0 +1,34 @@
+# Train and Deploy LightGBM model
+
+## training iris classifier model
+
+Run the mock python training code
+```bash
+pip install -r examples/lightgbm/requirements.txt 
+python examples/lightgbm/train_model.py
+```
+
+The output will be a model created on the project "serving examples", by the name "train lightgbm model"
+
+## setting up the serving service
+
+1. Create serving Service: `clearml-serving create --name "serving example"` (write down the service ID)
+
+2. Create model endpoint: 
+
+`clearml-serving --id <service_id> model add --engine lightgbm --endpoint "test_model_lgbm" --preprocess "examples/lightgbm/preprocess.py" --name "train lightgbm model" --project "serving examples"`
+
+Or auto-update 
+
+`clearml-serving --id <service_id> model auto-update --engine lightgbm --endpoint "test_model_auto" --preprocess "examples/lightgbm/preprocess.py" --name "train lightgbm model" --project "serving examples" --max-versions 2`
+
+Or add Canary endpoint
+
+`clearml-serving --id <service_id> model canary --endpoint "test_model_auto" --weights 0.1 0.9 --input-endpoint-prefix test_model_auto`
+
+3. Run the clearml-serving container `docker run -v ~/clearml.conf:/root/clearml.conf -p 8080:8080 -e CLEARML_SERVING_TASK_ID=<service_id> clearml-serving:latest`
+
+4. Test new endpoint: `curl -X POST "http://127.0.0.1:8080/serve/test_model_lgbm" -H "accept: application/json" -H "Content-Type: application/json" -d '{"x0": 1, "x1": 2, "x2": 3, "x3": 4}'`
+
+> **_Notice:_**  You can also change the serving service while it is already running!
+This includes adding/removing endpoints, adding canary model routing etc.
diff --git a/examples/lightgbm/requirements.txt b/examples/lightgbm/requirements.txt
new file mode 100644
index 0000000..ddc5c29
--- /dev/null
+++ b/examples/lightgbm/requirements.txt
@@ -0,0 +1,3 @@
+clearml >= 1.1.6
+lightgbm
+
diff --git a/examples/lightgbm/train_model.py b/examples/lightgbm/train_model.py
new file mode 100644
index 0000000..ab378d9
--- /dev/null
+++ b/examples/lightgbm/train_model.py
@@ -0,0 +1,22 @@
+import lightgbm as lgb
+from sklearn.datasets import load_iris
+from sklearn.model_selection import train_test_split
+
+from clearml import Task
+
+task = Task.init(project_name="serving examples", task_name="train lightgbm model", output_uri=True)
+
+iris = load_iris()
+y = iris['target']
+X = iris['data']
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
+dtrain = lgb.Dataset(X_train, label=y_train)
+
+params = {
+    'objective': 'multiclass',
+    'metric': 'softmax',
+    'num_class': 3
+}
+lgb_model = lgb.train(params=params, train_set=dtrain)
+
+lgb_model.save_model("lgbm_model")
diff --git a/examples/pipeline/preprocess.py b/examples/pipeline/preprocess.py
new file mode 100644
index 0000000..bcfd8a1
--- /dev/null
+++ b/examples/pipeline/preprocess.py
@@ -0,0 +1,32 @@
+from typing import Any, List
+
+
+# Notice Preprocess class Must be named "Preprocess"
+class Preprocess(object):
+    def __init__(self):
+        # set internal state, this will be called only once. (i.e. not per request)
+        pass
+
+    def postprocess(self, data: List[dict], collect_custom_statistics_fn=None) -> dict:
+        # we will here average the results and return the new value
+        # assume data is a list of at least two dicts (one result per model endpoint)
+
+        # average result
+        return dict(y=0.5 * data[0]['y'][0] + 0.5 * data[1]['y'][0])
+
+    def process(self, data: Any, collect_custom_statistics_fn=None) -> Any:
+        """
+        do something with the actual data, return any type of object.
+        The returned object will be passed as is to the postprocess function engine
+        """
+        predict_a = self.send_request(endpoint="/test_model_sklearn_a/", version=None, data=data)
+        predict_b = self.send_request(endpoint="/test_model_sklearn_b/", version=None, data=data)
+        if not predict_b or not predict_a:
+            raise ValueError("Error requesting inference endpoint test_model_sklearn a/b")
+
+        return [predict_a, predict_b]
+
+    def send_request(self, endpoint, version, data) -> List[dict]:
+        # Mock Function!
+        # replaced by real send request function when constructed by the inference service
+        pass
diff --git a/examples/pipeline/readme.md b/examples/pipeline/readme.md
new file mode 100644
index 0000000..1a25846
--- /dev/null
+++ b/examples/pipeline/readme.md
@@ -0,0 +1,27 @@
+# Deploy a model inference pipeline 
+
+## prerequisites 
+
+Training a scikit-learn model (see examples/sklearn)
+
+## setting up the serving service
+
+1. Create serving Service (if not already running): 
+`clearml-serving create --name "serving example"` (write down the service ID)
+
+2. Create the two base model endpoints:
+`clearml-serving --id <service_id> model add --engine sklearn --endpoint "test_model_sklearn_a" --preprocess "examples/sklearn/preprocess.py" --name "train sklearn model" --project "serving examples"`
+
+`clearml-serving --id <service_id> model add --engine sklearn --endpoint "test_model_sklearn_b" --preprocess "examples/sklearn/preprocess.py" --name "train sklearn model" --project "serving examples"`
+
+3. Create pipeline model endpoint: 
+`clearml-serving --id <service_id> model add --engine custom --endpoint "test_model_pipeline" --preprocess "examples/pipeline/preprocess.py"`
+
+4. Run the clearml-serving container `docker run -v ~/clearml.conf:/root/clearml.conf -p 8080:8080 -e CLEARML_SERVING_TASK_ID=<service_id> clearml-serving:latest`
+
+5. Test new endpoint: `curl -X POST "http://127.0.0.1:8080/serve/test_model_pipeline" -H "accept: application/json" -H "Content-Type: application/json" -d '{"x0": 1, "x1": 2}'`
+
+
+> **_Notice:_**  You can also change the serving service while it is already running!
+This includes adding/removing endpoints, adding canary model routing etc.
+by default new endpoints/models will be automatically updated after 1 minute
diff --git a/examples/pytorch/preprocess.py b/examples/pytorch/preprocess.py
new file mode 100644
index 0000000..75d6815
--- /dev/null
+++ b/examples/pytorch/preprocess.py
@@ -0,0 +1,35 @@
+from typing import Any
+
+import numpy as np
+from PIL import Image, ImageOps
+
+
+# Notice Preprocess class Must be named "Preprocess"
+from clearml import StorageManager
+
+
+class Preprocess(object):
+    def __init__(self):
+        # set internal state, this will be called only once. (i.e. not per request)
+        pass
+
+    def preprocess(self, body: dict, collect_custom_statistics_fn=None) -> Any:
+        # we expect the request body dict to contain a "url" entry (http/s link to an image)
+        url = body.get("url")
+        if not url:
+            raise ValueError("'url' entry not provided, expected http/s link to image")
+
+        local_file = StorageManager.get_local_copy(remote_url=url)
+        image = Image.open(local_file)
+        image = ImageOps.grayscale(image).resize((28, 28))
+        return np.array(image).flatten()
+
+    def postprocess(self, data: Any, collect_custom_statistics_fn=None) -> dict:
+        # post process the data returned from the model inference engine
+        # data is the return value from model.predict; we return the predicted class index under the key "digit"
+        if not isinstance(data, np.ndarray):
+            # this should not happen
+            return dict(digit=-1)
+
+        # data is returned as probability per class (10 class/digits)
+        return dict(digit=int(data.flatten().argmax()))
diff --git a/examples/pytorch/readme.md b/examples/pytorch/readme.md
new file mode 100644
index 0000000..0b1a064
--- /dev/null
+++ b/examples/pytorch/readme.md
@@ -0,0 +1,45 @@
+# Train and Deploy PyTorch model with Nvidia Triton Engine
+
+## training mnist digit classifier model
+
+Run the mock python training code
+```bash
+pip install -r examples/pytorch/requirements.txt 
+python examples/pytorch/train_pytorch_mnist.py
+```
+
+The output will be a model created on the project "serving examples", by the name "train pytorch model"
+*Notice* Only TorchScript models are supported by Triton server
+
+## setting up the serving service
+
+
+Prerequisites, PyTorch models require Triton engine support, please use `docker-compose-triton.yml` / `docker-compose-triton-gpu.yml` or if running on Kubernetes, the matching helm chart.
+
+1. Create serving Service: `clearml-serving create --name "serving example"` (write down the service ID)
+2. Create model endpoint:
+
+`clearml-serving --id <service_id> model add --engine triton --endpoint "test_model_pytorch" --preprocess "examples/pytorch/preprocess.py" --name "train pytorch model" --project "serving examples"
+  --input-size 1 28 28 --input-name "INPUT__0" --input-type float32   
+  --output-size -1 10 --output-name "OUTPUT__0" --output-type float32   
+`
+
+Or auto update 
+
+`clearml-serving --id <service_id> model auto-update --engine triton --endpoint "test_model_pytorch_auto" --preprocess "examples/pytorch/preprocess.py" --name "train pytorch model" --project "serving examples" --max-versions 2
+  --input-size 1 28 28 --input-name "INPUT__0" --input-type float32   
+  --output-size -1 10 --output-name "OUTPUT__0" --output-type float32`
+  
+Or add Canary endpoint
+
+`clearml-serving --id <service_id> model canary --endpoint "test_model_pytorch_auto" --weights 0.1 0.9 --input-endpoint-prefix test_model_pytorch_auto`
+   
+3. Run the Triton Engine `docker run -v ~/clearml.conf:/root/clearml.conf -p 8001:8001 -e CLEARML_SERVING_TASK_ID=<service_id> clearml-serving-triton:latest`
+4. Configure the Triton Engine IP on the Serving Service (if running on k8s, the gRPC ingress of the triton container)
+`clearml-serving --id <service_id> config --triton-grpc-server <local_ip_here>:8001`
+5. Run the clearml-serving container `docker run -v ~/clearml.conf:/root/clearml.conf -p 8080:8080 -e CLEARML_SERVING_TASK_ID=<service_id> clearml-serving:latest`
+6. Test new endpoint: `curl -X POST "http://127.0.0.1:8080/serve/test_model_pytorch" -H "accept: application/json" -H "Content-Type: application/json" -d '{"url": "https://camo.githubusercontent.com/8385ca52c9cba1f6e629eb938ab725ec8c9449f12db81f9a34e18208cd328ce9/687474703a2f2f706574722d6d6172656b2e636f6d2f77702d636f6e74656e742f75706c6f6164732f323031372f30372f6465636f6d707265737365642e6a7067"}'`
+
+> **_Notice:_**  You can also change the serving service while it is already running!
+This includes adding/removing endpoints, adding canary model routing, etc.
+By default, new endpoints/models will be automatically updated after 1 minute.
diff --git a/examples/pytorch/requirements.txt b/examples/pytorch/requirements.txt
new file mode 100644
index 0000000..878ee74
--- /dev/null
+++ b/examples/pytorch/requirements.txt
@@ -0,0 +1,5 @@
+torchvision
+torch
+clearml
+Pillow
+setuptools<58
diff --git a/examples/pytorch/train_pytorch_mnist.py b/examples/pytorch/train_pytorch_mnist.py
new file mode 100644
index 0000000..e03aca6
--- /dev/null
+++ b/examples/pytorch/train_pytorch_mnist.py
@@ -0,0 +1,142 @@
+# ClearML - Example of pytorch with tensorboard>=v1.14
+#
+from __future__ import print_function
+
+import argparse
+import os
+from tempfile import gettempdir
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from torchvision import datasets, transforms
+from torch.autograd import Variable
+from torch.utils.tensorboard import SummaryWriter
+
+from clearml import Task, OutputModel
+
+
+class Net(nn.Module):
+    # Small CNN: two conv layers followed by two fully connected layers, 10 outputs.
+    def __init__(self):
+        super(Net, self).__init__()
+        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)  # 1 input channel -> 10 feature maps
+        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)  # 10 -> 20 feature maps
+        self.conv2_drop = nn.Dropout2d()
+        self.fc1 = nn.Linear(320, 50)  # 320 must match the x.view(-1, 320) flatten in forward()
+        self.fc2 = nn.Linear(50, 10)  # one output per class
+
+    def forward(self, x):
+        x = F.relu(F.max_pool2d(self.conv1(x), 2))  # conv -> 2x2 max-pool -> ReLU
+        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))  # conv -> dropout -> 2x2 max-pool -> ReLU
+        x = x.view(-1, 320)  # flatten to (N, 320); 20 * 4 * 4 for 1x28x28 inputs
+        x = F.relu(self.fc1(x))
+        x = F.dropout(x, training=self.training)  # only applied while self.training is True
+        x = self.fc2(x)
+        return F.log_softmax(x, dim=1)  # log-probabilities over dim 1; paired with F.nll_loss in train()/test()
+
+
+def train(model, epoch, train_loader, args, optimizer, writer):  # one pass over train_loader
+    model.train()
+    for batch_idx, (data, target) in enumerate(train_loader):
+        if args.cuda:
+            data, target = data.cuda(), target.cuda()
+        data, target = Variable(data), Variable(target)  # NOTE(review): Variable is deprecated in current PyTorch - confirm it is still needed
+        optimizer.zero_grad()
+        output = model(data)
+        loss = F.nll_loss(output, target)
+        loss.backward()
+        optimizer.step()
+        if batch_idx % args.log_interval == 0:  # report every args.log_interval batches
+            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
+                epoch, batch_idx * len(data), len(train_loader.dataset),
+                100. * batch_idx / len(train_loader), loss.data.item()))
+            niter = epoch*len(train_loader)+batch_idx  # global step; main() starts epoch at 1, so the first step is len(train_loader)
+            writer.add_scalar('Train/Loss', loss.data.item(), niter)
+
+
+def test(model, test_loader, args, optimizer, writer):  # evaluation pass; `optimizer` is not used in this function
+    model.eval()
+    test_loss = 0
+    correct = 0
+    for niter, (data, target) in enumerate(test_loader):  # NOTE(review): not wrapped in torch.no_grad() - confirm whether intended
+        if args.cuda:
+            data, target = data.cuda(), target.cuda()
+        data, target = Variable(data), Variable(target)
+        output = model(data)
+        test_loss += F.nll_loss(output, target, reduction='sum').data.item()  # sum up batch loss
+        pred = output.data.max(1)[1]  # get the index of the max log-probability
+        pred = pred.eq(target.data).cpu().sum()  # pred now holds the number of correct predictions in this batch
+        writer.add_scalar('Test/Loss', pred, niter)  # NOTE(review): logs the correct-prediction count under the 'Test/Loss' tag, not a loss
+        correct += pred
+        if niter % 100 == 0:
+            writer.add_image('test', data[0, :, :, :], niter)  # first sample of the batch
+
+    test_loss /= len(test_loader.dataset)  # average loss per sample
+    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
+        test_loss, correct, len(test_loader.dataset),
+        100. * correct / len(test_loader.dataset)))
+
+
+def main():
+    # Training settings
+    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
+    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
+                        help='input batch size for training (default: 64)')
+    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
+                        help='input batch size for testing (default: 1000)')
+    parser.add_argument('--epochs', type=int, default=10, metavar='N',
+                        help='number of epochs to train (default: 10)')
+    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
+                        help='learning rate (default: 0.01)')
+    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
+                        help='SGD momentum (default: 0.5)')
+    parser.add_argument('--no-cuda', action='store_true', default=False,
+                        help='disables CUDA training')
+    parser.add_argument('--seed', type=int, default=1, metavar='S',
+                        help='random seed (default: 1)')
+    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
+                        help='how many batches to wait before logging training status')
+    args = parser.parse_args()
+
+    # Connecting ClearML with the current process,
+    # from here on everything is logged automatically
+    task = Task.init(project_name='serving examples', task_name='train pytorch model', output_uri=True)  # noqa: F841
+    writer = SummaryWriter('runs')  # TensorBoard event files are written under ./runs
+    writer.add_text('TEXT', 'This is some text', 0)
+    args.cuda = not args.no_cuda and torch.cuda.is_available()  # CUDA only when not disabled and available
+
+    torch.manual_seed(args.seed)
+    if args.cuda:
+        torch.cuda.manual_seed(args.seed)
+
+    kwargs = {'num_workers': 4, 'pin_memory': True} if args.cuda else {}  # extra DataLoader options only on CUDA
+    train_loader = torch.utils.data.DataLoader(datasets.MNIST('./data', train=True, download=True,
+                                                              transform=transforms.Compose([
+                                                                  transforms.ToTensor(),
+                                                                  transforms.Normalize((0.1307,), (0.3081,))])),
+                                               batch_size=args.batch_size, shuffle=True, **kwargs)
+    test_loader = torch.utils.data.DataLoader(datasets.MNIST('./data', train=False,  # no download=True: relies on the download above
+                                                             transform=transforms.Compose([
+                                                                 transforms.ToTensor(),
+                                                                 transforms.Normalize((0.1307,), (0.3081,))])),
+                                              batch_size=args.test_batch_size, shuffle=True, **kwargs)
+
+    model = Net()
+    if args.cuda:
+        model.cuda()
+
+    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
+
+    for epoch in range(1, args.epochs + 1):  # epochs are 1-based
+        train(model, epoch, train_loader, args, optimizer, writer)
+
+    # store in a way we can easily load into triton without having to have the model class
+    torch.jit.script(model).save('serving_model.pt')  # TorchScript export
+    OutputModel().update_weights('serving_model.pt')  # presumably registers the file as this task's output model - confirm
+    test(model, test_loader, args, optimizer, writer)  # evaluation runs after the model has been exported
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/sklearn/preprocess.py b/examples/sklearn/preprocess.py
new file mode 100644
index 0000000..6ba648c
--- /dev/null
+++ b/examples/sklearn/preprocess.py
@@ -0,0 +1,19 @@
+from typing import Any
+
+import numpy as np
+
+
+# Notice Preprocess class Must be named "Preprocess"
+class Preprocess(object):
+    def __init__(self):
+        # set internal state, this will be called only once. (i.e. not per request)
+        pass
+
+    def preprocess(self, body: dict, collect_custom_statistics_fn=None) -> Any:
+        # we expect to get two valid numbers on the dict: x0 and x1
+        return [[body.get("x0", None), body.get("x1", None)], ]  # single-row 2-D list; missing keys become None
+
+    def postprocess(self, data: Any, collect_custom_statistics_fn=None) -> dict:
+        # post process the data returned from the model inference engine
+        # data is the return value from model.predict, we will put it inside a return value under the "y" key
+        return dict(y=data.tolist() if isinstance(data, np.ndarray) else data)  # ndarray -> plain list
diff --git a/examples/sklearn/readme.md b/examples/sklearn/readme.md
new file mode 100644
index 0000000..33b802f
--- /dev/null
+++ b/examples/sklearn/readme.md
@@ -0,0 +1,32 @@
+# Train and Deploy Scikit-Learn model
+
+## training mock logistic regression model
+
+Run the mock python training code
+```bash
+pip install -r examples/sklearn/requirements.txt 
+python examples/sklearn/train_model.py
+```
+
+The output will be a model created on the project "serving examples", by the name "train sklearn model"
+
+## setting up the serving service
+
+1. Create serving Service: `clearml-serving create --name "serving example"` (write down the service ID)
+2. Create model endpoint: 
+`clearml-serving --id <service_id> model add --engine sklearn --endpoint "test_model_sklearn" --preprocess "examples/sklearn/preprocess.py" --name "train sklearn model" --project "serving examples"`
+
+Or auto update 
+
+`clearml-serving --id <service_id> model auto-update --engine sklearn --endpoint "test_model_sklearn_auto" --preprocess "examples/sklearn/preprocess.py" --name "train sklearn model" --project "serving examples" --max-versions 2`
+
+Or add Canary endpoint
+
+`clearml-serving --id <service_id> model canary --endpoint "test_model_sklearn_auto" --weights 0.1 0.9 --input-endpoint-prefix test_model_sklearn_auto`
+
+3. Run the clearml-serving container `docker run -v ~/clearml.conf:/root/clearml.conf -p 8080:8080 -e CLEARML_SERVING_TASK_ID=<service_id> clearml-serving:latest`
+4. Test new endpoint: `curl -X POST "http://127.0.0.1:8080/serve/test_model_sklearn" -H "accept: application/json" -H "Content-Type: application/json" -d '{"x0": 1, "x1": 2}'`
+
+> **_Notice:_**  You can also change the serving service while it is already running!
+This includes adding/removing endpoints, adding canary model routing, etc.
+By default, new endpoints/models will be automatically updated after 1 minute.
diff --git a/examples/sklearn/requirements.txt b/examples/sklearn/requirements.txt
new file mode 100644
index 0000000..eb862f7
--- /dev/null
+++ b/examples/sklearn/requirements.txt
@@ -0,0 +1,2 @@
+clearml >= 1.1.6
+scikit-learn
diff --git a/examples/sklearn/train_model.py b/examples/sklearn/train_model.py
new file mode 100644
index 0000000..94edb00
--- /dev/null
+++ b/examples/sklearn/train_model.py
@@ -0,0 +1,15 @@
+from sklearn.linear_model import LogisticRegression
+from sklearn.datasets import make_blobs
+from joblib import dump
+from clearml import Task
+
+task = Task.init(project_name="serving examples", task_name="train sklearn model", output_uri=True)
+
+# generate 2d classification dataset (100 samples, 2 centers, 2 features, fixed seed)
+X, y = make_blobs(n_samples=100, centers=2, n_features=2, random_state=1)
+# fit final model on the full dataset (no train/test split in this mock example)
+model = LogisticRegression()
+model.fit(X, y)
+
+dump(model, filename="sklearn-model.pkl", compress=9)  # presumably this dump is what ClearML records as the output model - confirm
+
diff --git a/examples/xgboost/preprocess.py b/examples/xgboost/preprocess.py
new file mode 100644
index 0000000..e3a1771
--- /dev/null
+++ b/examples/xgboost/preprocess.py
@@ -0,0 +1,21 @@
+from typing import Any
+
+import numpy as np
+import xgboost as xgb
+
+
+# Notice Preprocess class Must be named "Preprocess"
+class Preprocess(object):
+    def __init__(self):
+        # set internal state, this will be called only once. (i.e. not per request)
+        pass
+
+    def preprocess(self, body: dict, collect_custom_statistics_fn=None) -> Any:
+        # we expect to get four valid numbers on the dict: x0, x1, x2, x3
+        return xgb.DMatrix(  # single-row matrix; missing keys are passed as None
+            [[body.get("x0", None), body.get("x1", None), body.get("x2", None), body.get("x3", None)]])
+
+    def postprocess(self, data: Any, collect_custom_statistics_fn=None) -> dict:
+        # post process the data returned from the model inference engine
+        # data is the return value from model.predict, we will put it inside a return value under the "y" key
+        return dict(y=data.tolist() if isinstance(data, np.ndarray) else data)  # ndarray -> plain list
diff --git a/examples/xgboost/readme.md b/examples/xgboost/readme.md
new file mode 100644
index 0000000..52d39ca
--- /dev/null
+++ b/examples/xgboost/readme.md
@@ -0,0 +1,32 @@
+# Train and Deploy XGBoost model
+
+## training iris classifier model
+
+Run the mock python training code
+```bash
+pip install -r examples/xgboost/requirements.txt 
+python examples/xgboost/train_model.py
+```
+
+The output will be a model created on the project "serving examples", by the name "train xgboost model"
+
+## setting up the serving service
+
+1. Create serving Service: `clearml-serving create --name "serving example"` (write down the service ID)
+2. Create model endpoint: 
+
+`clearml-serving --id <service_id> model add --engine xgboost --endpoint "test_model_xgb" --preprocess "examples/xgboost/preprocess.py" --name "train xgboost model" --project "serving examples"`
+
+Or auto update 
+
+`clearml-serving --id <service_id> model auto-update --engine xgboost --endpoint "test_model_xgb_auto" --preprocess "examples/xgboost/preprocess.py" --name "train xgboost model" --project "serving examples" --max-versions 2`
+
+Or add Canary endpoint
+
+`clearml-serving --id <service_id> model canary --endpoint "test_model_xgb_auto" --weights 0.1 0.9 --input-endpoint-prefix test_model_xgb_auto`
+
+3. Run the clearml-serving container `docker run -v ~/clearml.conf:/root/clearml.conf -p 8080:8080 -e CLEARML_SERVING_TASK_ID=<service_id> clearml-serving:latest`
+4. Test new endpoint: `curl -X POST "http://127.0.0.1:8080/serve/test_model_xgb" -H "accept: application/json" -H "Content-Type: application/json" -d '{"x0": 1, "x1": 2, "x2": 3, "x3": 4}'`
+
+> **_Notice:_**  You can also change the serving service while it is already running!
+This includes adding/removing endpoints, adding canary model routing etc.
diff --git a/examples/xgboost/requirements.txt b/examples/xgboost/requirements.txt
new file mode 100644
index 0000000..0b0fe4b
--- /dev/null
+++ b/examples/xgboost/requirements.txt
@@ -0,0 +1,3 @@
+clearml >= 1.1.6
+xgboost
+
diff --git a/examples/xgboost/train_model.py b/examples/xgboost/train_model.py
new file mode 100644
index 0000000..cb91cf3
--- /dev/null
+++ b/examples/xgboost/train_model.py
@@ -0,0 +1,28 @@
+import xgboost as xgb
+from sklearn.datasets import load_iris
+from sklearn.model_selection import train_test_split
+
+from clearml import Task
+
+task = Task.init(project_name="serving examples", task_name="train xgboost model", output_uri=True)
+
+X, y = load_iris(return_X_y=True)
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.2, random_state=100  # 80/20 split with a fixed seed
+)
+
+dtrain = xgb.DMatrix(X_train, label=y_train)
+dtest = xgb.DMatrix(X_test, label=y_test)
+
+params = {"objective": "reg:squarederror", "eval_metric": "rmse"}  # NOTE(review): regression objective on iris class labels, though the readme calls this a classifier - confirm
+
+
+bst = xgb.train(
+    params,
+    dtrain,
+    num_boost_round=100,
+    evals=[(dtrain, "train"), (dtest, "test")],  # both splits are evaluated each round
+    verbose_eval=0,
+)
+
+bst.save_model("xgb_model")  # presumably this saved file is what ClearML records as the output model - confirm
diff --git a/requirements.txt b/requirements.txt
index 7c60ed3..caf94e5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1 @@
-clearml >= 0.17.6rc1
+clearml >= 1.3.1
diff --git a/setup.py b/setup.py
index 8575ed3..b6d0f16 100644
--- a/setup.py
+++ b/setup.py
@@ -39,8 +39,8 @@ setup(
     long_description_content_type='text/markdown',
     # The project's main homepage.
     url='https://github.com/allegroai/clearml-serving.git',
-    author='Allegroai',
-    author_email='clearml@allegro.ai',
+    author='ClearML',
+    author_email='support@clear.ml',
     license='Apache License 2.0',
     classifiers=[
         'Development Status :: 4 - Beta',
@@ -54,7 +54,6 @@ setup(
         'Topic :: Software Development :: Version Control',
         'Topic :: System :: Logging',
         'Topic :: System :: Monitoring',
-        'Programming Language :: Python :: 3.5',
         'Programming Language :: Python :: 3.6',
         'Programming Language :: Python :: 3.7',
         'Programming Language :: Python :: 3.8',