From 255cabfd7f9b44d8748a1fe756f888baf5fccb45 Mon Sep 17 00:00:00 2001 From: Uzmar Gomez Date: Wed, 15 May 2024 15:19:40 +0100 Subject: [PATCH] Feature/multiplequeues (#286) * copy changes * add workflow when push * update readme * update readme * delete test workflow * bump chart version to 5.2.0 * test ci * delete test ci * bump kubeversion * update readme * Update Chart.yaml --------- Co-authored-by: Valeriano Manassero <14011549+valeriano-manassero@users.noreply.github.com> --- charts/clearml-agent/Chart.yaml | 8 +++++--- charts/clearml-agent/README.md | 9 +++++---- .../clearml-agent/templates/agentk8sglue-deployment.yaml | 6 ++++-- charts/clearml-agent/values.yaml | 4 +++- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/charts/clearml-agent/Chart.yaml b/charts/clearml-agent/Chart.yaml index 4e27e2b..bdcbfe8 100644 --- a/charts/clearml-agent/Chart.yaml +++ b/charts/clearml-agent/Chart.yaml @@ -2,9 +2,9 @@ apiVersion: v2 name: clearml-agent description: MLOps platform Task running agent type: application -version: "5.1.5" +version: "5.2.0" appVersion: "1.24" -kubeVersion: ">= 1.21.0-0 < 1.30.0-0" +kubeVersion: ">= 1.21.0-0 < 1.31.0-0" home: https://clear.ml icon: https://raw.githubusercontent.com/allegroai/clearml/master/docs/clearml-logo.svg sources: @@ -21,4 +21,6 @@ keywords: annotations: artifacthub.io/changes: | - kind: fixed - description: Use existingClearmlConfigSecret in deployment template + description: Support for 1.30 kubernetes + - kind: added + description: Support for multiple queues diff --git a/charts/clearml-agent/README.md b/charts/clearml-agent/README.md index b25b7ef..3fe171e 100644 --- a/charts/clearml-agent/README.md +++ b/charts/clearml-agent/README.md @@ -1,6 +1,6 @@ # ClearML Kubernetes Agent -![Version: 5.1.5](https://img.shields.io/badge/Version-5.1.5-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 
1.24](https://img.shields.io/badge/AppVersion-1.24-informational?style=flat-square) +![Version: 5.2.0](https://img.shields.io/badge/Version-5.2.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.24](https://img.shields.io/badge/AppVersion-1.24-informational?style=flat-square) MLOps platform Task running agent @@ -47,13 +47,13 @@ Before issuing helm upgrade: ## Requirements -Kubernetes: `>= 1.21.0-0 < 1.30.0-0` +Kubernetes: `>= 1.21.0-0 < 1.31.0-0` ## Values | Key | Type | Default | Description | |-----|------|---------|-------------| -| agentk8sglue | object | `{"additionalClusterRoleBindings":[],"additionalRoleBindings":[],"affinity":{},"annotations":{},"apiServerUrlReference":"https://api.clear.ml","basePodTemplate":{"affinity":{},"annotations":{},"containerSecurityContext":{},"env":[],"fileMounts":[],"hostAliases":[],"initContainers":[],"labels":{},"nodeSelector":{},"podSecurityContext":{},"priorityClassName":"","resources":{},"schedulerName":"","tolerations":[],"volumeMounts":[],"volumes":[]},"clearmlcheckCertificate":true,"containerSecurityContext":{},"defaultContainerImage":"ubuntu:18.04","extraEnvs":[],"fileMounts":[],"fileServerUrlReference":"https://files.clear.ml","image":{"registry":"","repository":"allegroai/clearml-agent-k8s-base","tag":"1.24-21"},"initContainers":{"resources":{}},"labels":{},"nodeSelector":{},"podSecurityContext":{},"queue":"default","replicaCount":1,"resources":{},"serviceExistingAccountName":"","tolerations":[],"volumeMounts":[],"volumes":[],"webServerUrlReference":"https://app.clear.ml"}` | This agent will spawn queued experiments in new pods, a good use case is to combine this with GPU autoscaling nodes. 
https://github.com/allegroai/clearml-agent/tree/master/docker/k8s-glue | +| agentk8sglue | object | `{"additionalClusterRoleBindings":[],"additionalRoleBindings":[],"affinity":{},"annotations":{},"apiServerUrlReference":"https://api.clear.ml","basePodTemplate":{"affinity":{},"annotations":{},"containerSecurityContext":{},"env":[],"fileMounts":[],"hostAliases":[],"initContainers":[],"labels":{},"nodeSelector":{},"podSecurityContext":{},"priorityClassName":"","resources":{},"schedulerName":"","tolerations":[],"volumeMounts":[],"volumes":[]},"clearmlcheckCertificate":true,"containerSecurityContext":{},"createQueueIfNotExists":true,"defaultContainerImage":"ubuntu:18.04","extraEnvs":[],"fileMounts":[],"fileServerUrlReference":"https://files.clear.ml","image":{"registry":"","repository":"allegroai/clearml-agent-k8s-base","tag":"1.24-21"},"initContainers":{"resources":{}},"labels":{},"nodeSelector":{},"podSecurityContext":{},"queue":"default","replicaCount":1,"resources":{},"serviceExistingAccountName":"","tolerations":[],"volumeMounts":[],"volumes":[],"webServerUrlReference":"https://app.clear.ml"}` | This agent will spawn queued experiments in new pods, a good use case is to combine this with GPU autoscaling nodes. https://github.com/allegroai/clearml-agent/tree/master/docker/k8s-glue | | agentk8sglue.additionalClusterRoleBindings | list | `[]` | additional existing ClusterRoleBindings | | agentk8sglue.additionalRoleBindings | list | `[]` | additional existing RoleBindings | | agentk8sglue.affinity | object | `{}` | affinity setup for Agent pod (example in values.yaml comments) | @@ -78,6 +78,7 @@ Kubernetes: `>= 1.21.0-0 < 1.30.0-0` | agentk8sglue.basePodTemplate.volumes | list | `[]` | volumes definition for pods spawned to consume ClearML Task (example in values.yaml comments) | | agentk8sglue.clearmlcheckCertificate | bool | `true` | Check certificates validity for evefry UrlReference below. 
| | agentk8sglue.containerSecurityContext | object | `{}` | container securityContext setup for Agent pod (example in values.yaml comments) | +| agentk8sglue.createQueueIfNotExists | bool | `true` | if the ClearML queue does not exist, it will be created when this value is set to true | | agentk8sglue.defaultContainerImage | string | `"ubuntu:18.04"` | default container image for ClearML Task pod | | agentk8sglue.extraEnvs | list | `[]` | Extra Environment variables for Glue Agent | | agentk8sglue.fileMounts | list | `[]` | file definition for Glue Agent (example in values.yaml comments) | @@ -88,7 +89,7 @@ Kubernetes: `>= 1.21.0-0 < 1.30.0-0` | agentk8sglue.labels | object | `{}` | labels setup for Agent pod (example in values.yaml comments) | | agentk8sglue.nodeSelector | object | `{}` | nodeSelector setup for Agent pod (example in values.yaml comments) | | agentk8sglue.podSecurityContext | object | `{}` | container securityContext setup for Agent pod (example in values.yaml comments) | -| agentk8sglue.queue | string | `"default"` | ClearML queue this agent will consume | +| agentk8sglue.queue | string | `"default"` | ClearML queue this agent will consume.
Multiple queues can be specified with the following format: queue1,queue2,queue3 | | agentk8sglue.replicaCount | int | `1` | Glue Agent number of pods | | agentk8sglue.resources | object | `{}` | Glue Agent pod resources | | agentk8sglue.serviceExistingAccountName | string | `""` | if set, don't create a serviceAccountName but use defined existing one | diff --git a/charts/clearml-agent/templates/agentk8sglue-deployment.yaml b/charts/clearml-agent/templates/agentk8sglue-deployment.yaml index f5bf546..61d4fb6 100644 --- a/charts/clearml-agent/templates/agentk8sglue-deployment.yaml +++ b/charts/clearml-agent/templates/agentk8sglue-deployment.yaml @@ -98,10 +98,12 @@ spec: value: "--namespace {{ .Release.Namespace }} --template-yaml /root/template/template.yaml \ --ports-mode --num-of-services {{ .Values.sessions.maxServices }} \ --base-port {{ .Values.sessions.startingPort }} \ - --gateway-address {{ .Values.sessions.externalIP }}" + --gateway-address {{ .Values.sessions.externalIP }} + {{- if .Values.agentk8sglue.createQueueIfNotExists }} --create-queue{{- end }}" {{- else}} - name: K8S_GLUE_EXTRA_ARGS - value: "--namespace {{ .Release.Namespace }} --template-yaml /root/template/template.yaml" + value: "--namespace {{ .Release.Namespace }} --template-yaml /root/template/template.yaml + {{- if .Values.agentk8sglue.createQueueIfNotExists }} --create-queue{{- end }}" {{- end }} {{ if or (.Values.clearml.clearmlConfig) (.Values.clearml.existingClearmlConfigSecret) }} - name: CLEARML_CONFIG_FILE diff --git a/charts/clearml-agent/values.yaml b/charts/clearml-agent/values.yaml index d87b34a..535d34c 100644 --- a/charts/clearml-agent/values.yaml +++ b/charts/clearml-agent/values.yaml @@ -80,8 +80,10 @@ agentk8sglue: # -- default container image for ClearML Task pod defaultContainerImage: ubuntu:18.04 - # -- ClearML queue this agent will consume + # -- ClearML queue this agent will consume.
Multiple queues can be specified with the following format: queue1,queue2,queue3 queue: default + # -- if the ClearML queue does not exist, it will be created when this value is set to true + createQueueIfNotExists: true # -- labels setup for Agent pod (example in values.yaml comments) labels: {} # schedulerName: scheduler