From 7041c62f44c96aab41607d407076cfcc2b2fb787 Mon Sep 17 00:00:00 2001 From: Valeriano Manassero <14011549+valeriano-manassero@users.noreply.github.com> Date: Wed, 4 Jan 2023 09:45:23 +0100 Subject: [PATCH] Clearml agent enterprise features (#121) * Added: enterprise features alignment * Changed: version bump * Fixed: trailing spaces * Fixed: comment starting space * Changed: owner-token feature * Fixed: secret reference name * Changed: owner-token enterprise reference --- charts/clearml-agent/Chart.yaml | 2 +- charts/clearml-agent/README.md | 58 ++++- charts/clearml-agent/templates/_helpers.tpl | 49 ++-- .../templates/agentk8sglue-configmap.yaml | 224 +++++++++++++++++- .../templates/agentk8sglue-deployment.yaml | 147 +++++++++--- .../templates/agentk8sglue-rbac.yaml | 61 ++++- .../templates/clearml-secrets.yaml | 14 +- .../templates/service-secret.yaml | 37 +++ .../templates/service-sessions.yaml | 32 +++ charts/clearml-agent/values.yaml | 151 ++++++++++-- 10 files changed, 652 insertions(+), 123 deletions(-) create mode 100644 charts/clearml-agent/templates/service-secret.yaml create mode 100644 charts/clearml-agent/templates/service-sessions.yaml diff --git a/charts/clearml-agent/Chart.yaml b/charts/clearml-agent/Chart.yaml index b61d52a..67b5a11 100644 --- a/charts/clearml-agent/Chart.yaml +++ b/charts/clearml-agent/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: clearml-agent description: MLOps platform type: application -version: "2.0.2" +version: "3.0.0" appVersion: "1.24" kubeVersion: ">= 1.19.0-0 < 1.26.0-0" home: https://clear.ml diff --git a/charts/clearml-agent/README.md b/charts/clearml-agent/README.md index e80da4b..154cd49 100644 --- a/charts/clearml-agent/README.md +++ b/charts/clearml-agent/README.md @@ -1,6 +1,6 @@ # ClearML Kubernetes Agent -![Version: 2.0.2](https://img.shields.io/badge/Version-2.0.2-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 
1.24](https://img.shields.io/badge/AppVersion-1.24-informational?style=flat-square) +![Version: 3.0.0](https://img.shields.io/badge/Version-3.0.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.24](https://img.shields.io/badge/AppVersion-1.24-informational?style=flat-square) MLOps platform @@ -30,25 +30,35 @@ Kubernetes: `>= 1.19.0-0 < 1.26.0-0` | Key | Type | Default | Description | |-----|------|---------|-------------| -| agentk8sglue | object | `{"apiServerUrlReference":"https://api.clear.ml","clearmlcheckCertificate":true,"defaultContainerImage":"ubuntu:18.04","extraEnvs":[],"fileServerUrlReference":"https://files.clear.ml","id":"k8s-agent","image":{"repository":"allegroai/clearml-agent-k8s-base","tag":"1.24-21"},"maxPods":10,"podTemplate":{"env":[],"nodeSelector":{},"resources":{},"tolerations":[],"volumeMounts":[],"volumes":[]},"queue":"default","replicaCount":1,"serviceAccountName":"default","webServerUrlReference":"https://app.clear.ml"}` | This agent will spawn queued experiments in new pods, a good use case is to combine this with GPU autoscaling nodes. 
https://github.com/allegroai/clearml-agent/tree/master/docker/k8s-glue | +| agentk8sglue | object | `{"apiServerUrlReference":"https://api.clear.ml","basePodTemplate":{"env":[],"fileMounts":[],"hostAliases":{},"initContainers":[],"labels":{},"nodeSelector":{},"resources":{},"schedulerName":"","securityContext":{},"tolerations":[],"volumeMounts":[],"volumes":[]},"clearmlcheckCertificate":true,"containerCustomBashScript":"","customBashScript":"","debugMode":false,"defaultContainerImage":"ubuntu:18.04","extraEnvs":[],"fileMounts":[],"fileServerUrlReference":"https://files.clear.ml","image":{"repository":"allegroai/clearml-agent-k8s-base","tag":"1.24-21"},"queue":"default","replicaCount":1,"serviceExistingAccountName":"","volumeMounts":[],"volumes":[],"webServerUrlReference":"https://app.clear.ml"}` | This agent will spawn queued experiments in new pods, a good use case is to combine this with GPU autoscaling nodes. https://github.com/allegroai/clearml-agent/tree/master/docker/k8s-glue | | agentk8sglue.apiServerUrlReference | string | `"https://api.clear.ml"` | Reference to Api server url | +| agentk8sglue.basePodTemplate | object | `{"env":[],"fileMounts":[],"hostAliases":{},"initContainers":[],"labels":{},"nodeSelector":{},"resources":{},"schedulerName":"","securityContext":{},"tolerations":[],"volumeMounts":[],"volumes":[]}` | base template for pods spawned to consume ClearML Task | +| agentk8sglue.basePodTemplate.env | list | `[]` | environment variables for pods spawned to consume ClearML Task (example in values.yaml comments) | +| agentk8sglue.basePodTemplate.fileMounts | list | `[]` | file definition for pods spawned to consume ClearML Task (example in values.yaml comments) | +| agentk8sglue.basePodTemplate.hostAliases | object | `{}` | hostAliases setup for pods spawned to consume ClearML Task (example in values.yaml comments) | +| agentk8sglue.basePodTemplate.initContainers | list | `[]` | initContainers definition for pods spawned to consume ClearML Task 
(example in values.yaml comments) | +| agentk8sglue.basePodTemplate.labels | object | `{}` | labels setup for pods spawned to consume ClearML Task (example in values.yaml comments) | +| agentk8sglue.basePodTemplate.nodeSelector | object | `{}` | nodeSelector setup for pods spawned to consume ClearML Task (example in values.yaml comments) | +| agentk8sglue.basePodTemplate.resources | object | `{}` | resources declaration for pods spawned to consume ClearML Task (example in values.yaml comments) | +| agentk8sglue.basePodTemplate.schedulerName | string | `""` | schedulerName setup for pods spawned to consume ClearML Task | +| agentk8sglue.basePodTemplate.securityContext | object | `{}` | securityContext setup for pods spawned to consume ClearML Task (example in values.yaml comments) | +| agentk8sglue.basePodTemplate.tolerations | list | `[]` | tolerations setup for pods spawned to consume ClearML Task (example in values.yaml comments) | +| agentk8sglue.basePodTemplate.volumeMounts | list | `[]` | volume mounts definition for pods spawned to consume ClearML Task (example in values.yaml comments) | +| agentk8sglue.basePodTemplate.volumes | list | `[]` | volumes definition for pods spawned to consume ClearML Task (example in values.yaml comments) | | agentk8sglue.clearmlcheckCertificate | bool | `true` | Check certificates validity for evefry UrlReference below. 
| +| agentk8sglue.containerCustomBashScript | string | `""` | Custom Bash script for the Task Pods ran by Glue Agent | +| agentk8sglue.customBashScript | string | `""` | Custom Bash script for the Glue Agent | +| agentk8sglue.debugMode | bool | `false` | Enable Debugging logs for Agent pod | | agentk8sglue.defaultContainerImage | string | `"ubuntu:18.04"` | default container image for ClearML Task pod | -| agentk8sglue.extraEnvs | list | `[]` | Environment variables to be exposed in the agentk8sglue pods | +| agentk8sglue.extraEnvs | list | `[]` | Extra Environment variables for Glue Agent | +| agentk8sglue.fileMounts | list | `[]` | file definition for Glue Agent (example in values.yaml comments) | | agentk8sglue.fileServerUrlReference | string | `"https://files.clear.ml"` | Reference to File server url | -| agentk8sglue.id | string | `"k8s-agent"` | ClearML worker ID (must be unique across the entire ClearMLenvironment) | | agentk8sglue.image | object | `{"repository":"allegroai/clearml-agent-k8s-base","tag":"1.24-21"}` | Glue Agent image configuration | -| agentk8sglue.maxPods | int | `10` | maximum concurrent consume ClearML Task pod | -| agentk8sglue.podTemplate | object | `{"env":[],"nodeSelector":{},"resources":{},"tolerations":[],"volumeMounts":[],"volumes":[]}` | template for pods spawned to consume ClearML Task | -| agentk8sglue.podTemplate.env | list | `[]` | environment variables for pods spawned to consume ClearML Task (example in values.yaml comments) | -| agentk8sglue.podTemplate.nodeSelector | object | `{}` | nodeSelector setup for pods spawned to consume ClearML Task (example in values.yaml comments) | -| agentk8sglue.podTemplate.resources | object | `{}` | resources declaration for pods spawned to consume ClearML Task (example in values.yaml comments) | -| agentk8sglue.podTemplate.tolerations | list | `[]` | tolerations setup for pods spawned to consume ClearML Task (example in values.yaml comments) | -| agentk8sglue.podTemplate.volumeMounts | 
list | `[]` | volumeMounts definition for pods spawned to consume ClearML Task (example in values.yaml comments) | -| agentk8sglue.podTemplate.volumes | list | `[]` | volumes definition for pods spawned to consume ClearML Task (example in values.yaml comments) | | agentk8sglue.queue | string | `"default"` | ClearML queue this agent will consume | | agentk8sglue.replicaCount | int | `1` | Glue Agent number of pods | -| agentk8sglue.serviceAccountName | string | `"default"` | serviceAccountName for pods spawned to consume ClearML Task | +| agentk8sglue.serviceExistingAccountName | string | `""` | if set, don't create a serviceAccountName but use defined existing one | +| agentk8sglue.volumeMounts | list | `[]` | volume mounts definition for Glue Agent (example in values.yaml comments) | +| agentk8sglue.volumes | list | `[]` | volumes definition for Glue Agent (example in values.yaml comments) | | agentk8sglue.webServerUrlReference | string | `"https://app.clear.ml"` | Reference to Web server url | | clearml | object | `{"agentk8sglueKey":"ACCESSKEY","agentk8sglueSecret":"SECRETKEY","clearmlConfig":"sdk {\n}","existingAgentk8sglueSecret":"","existingClearmlConfigSecret":""}` | ClearMl generic configurations | | clearml.agentk8sglueKey | string | `"ACCESSKEY"` | Agent k8s Glue basic auth key | @@ -56,6 +66,19 @@ Kubernetes: `>= 1.19.0-0 < 1.26.0-0` | clearml.clearmlConfig | string | `"sdk {\n}"` | ClearML configuration file | | clearml.existingAgentk8sglueSecret | string | `""` | If this is set, chart will not generate a secret but will use what is defined here | | clearml.existingClearmlConfigSecret | string | `""` | If this is set, chart will not generate a secret but will use what is defined here | +| enterpriseFeatures | object | 
`{"applyVaultEnvVars":true,"enabled":false,"maxPods":10,"monitoredResources":{"maxResources":0,"maxResourcesFieldName":"resources|limits|nvidia.com/gpu","minResourcesFieldName":"resources|limits|nvidia.com/gpu"},"queues":{"default":{"templateOverrides":{}}},"serviceAccountClusterAccess":false,"useOwnerToken":true}` | Enterprise features (work only with an Enterprise license) | +| enterpriseFeatures.applyVaultEnvVars | bool | `true` | push env vars from Clear.ML Vault to task pods | +| enterpriseFeatures.enabled | bool | `false` | Enable/Disable Enterprise features | +| enterpriseFeatures.maxPods | int | `10` | maximum concurrent consume ClearML Task pod | +| enterpriseFeatures.monitoredResources | object | `{"maxResources":0,"maxResourcesFieldName":"resources|limits|nvidia.com/gpu","minResourcesFieldName":"resources|limits|nvidia.com/gpu"}` | GPU resource general counters | +| enterpriseFeatures.monitoredResources.maxResources | int | `0` | Maximum resources counter | +| enterpriseFeatures.monitoredResources.maxResourcesFieldName | string | `"resources|limits|nvidia.com/gpu"` | Field name used by Agent to count maximum resources | +| enterpriseFeatures.monitoredResources.minResourcesFieldName | string | `"resources|limits|nvidia.com/gpu"` | Field name used by Agent to count minimum resources | +| enterpriseFeatures.queues | object | `{"default":{"templateOverrides":{}}}` | ClearML queues and related template OVERRIDES used this agent will consume | +| enterpriseFeatures.queues.default | object | `{"templateOverrides":{}}` | name of the queue will be used for this template | +| enterpriseFeatures.queues.default.templateOverrides | object | `{}` | overrides of the base template for this queue (must be declared even if empty!) 
| +| enterpriseFeatures.serviceAccountClusterAccess | bool | `false` | service account access every namespace flag | +| enterpriseFeatures.useOwnerToken | bool | `true` | Agent must use owner Token | | imageCredentials | object | `{"email":"someone@host.com","enabled":false,"existingSecret":"","password":"pwd","registry":"docker.io","username":"someone"}` | Private image registry configuration | | imageCredentials.email | string | `"someone@host.com"` | Email | | imageCredentials.enabled | bool | `false` | Use private authentication mode | @@ -63,6 +86,15 @@ Kubernetes: `>= 1.19.0-0 < 1.26.0-0` | imageCredentials.password | string | `"pwd"` | Registry password | | imageCredentials.registry | string | `"docker.io"` | Registry name | | imageCredentials.username | string | `"someone"` | Registry username | +| sessions | object | `{"dynamicSvcs":false,"externalIP":"0.0.0.0","maxServices":20,"portModeEnabled":false,"setInteractiveQueuesTag":true,"startingPort":30000,"svcAnnotations":{},"svcType":"NodePort"}` | Sessions internal service configuration | +| sessions.dynamicSvcs | bool | `false` | Enable/Disable dynamic svc for sessions pods | +| sessions.externalIP | string | `"0.0.0.0"` | External IP sessions clients can connect to | +| sessions.maxServices | int | `20` | maximum number of NodePorts exposed | +| sessions.portModeEnabled | bool | `false` | Enable/Disable sessions portmode WARNING: only one Agent deployment can have this set to true | +| sessions.setInteractiveQueuesTag | bool | `true` | set interactive queue tags | +| sessions.startingPort | int | `30000` | starting range of exposed NodePorts | +| sessions.svcAnnotations | object | `{}` | specific annotations for session services | +| sessions.svcType | string | `"NodePort"` | service type ("NodePort" or "ClusterIP" or "LoadBalancer") | # Upgrading Chart diff --git a/charts/clearml-agent/templates/_helpers.tpl b/charts/clearml-agent/templates/_helpers.tpl index 8b733b4..9126650 100644 --- 
a/charts/clearml-agent/templates/_helpers.tpl +++ b/charts/clearml-agent/templates/_helpers.tpl @@ -2,32 +2,14 @@ Expand the name of the chart. */}} {{- define "clearml.name" -}} -{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} -{{- end }} - -{{/* -Create a default fully qualified app name. -We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). -If release name contains chart name it will be used as a full name. -*/}} -{{- define "clearml.fullname" -}} -{{- if .Values.fullnameOverride }} -{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} -{{- else }} -{{- $name := default .Chart.Name .Values.nameOverride }} -{{- if contains $name .Release.Name }} -{{- .Release.Name | trunc 63 | trimSuffix "-" }} -{{- else }} -{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} -{{- end }} -{{- end }} +{{- .Release.Name | trunc 59 | trimSuffix "-" }} {{- end }} {{/* Create chart name and version as used by the chart label. */}} {{- define "clearml.chart" -}} -{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 59 | trimSuffix "-" }} {{- end }} {{/* @@ -50,29 +32,22 @@ app.kubernetes.io/name: {{ include "clearml.name" . }} app.kubernetes.io/instance: {{ .Release.Name }} {{- end }} -{{/* -Reference Name (agentk8sglue) -*/}} -{{- define "agentk8sglue.referenceName" -}} -{{- include "clearml.fullname" . }}-agentk8sglue -{{- end }} - {{/* Selector labels (agentk8sglue) */}} {{- define "agentk8sglue.selectorLabels" -}} app.kubernetes.io/name: {{ include "clearml.name" . }} -app.kubernetes.io/instance: {{ include "agentk8sglue.referenceName" . }} +app.kubernetes.io/instance: {{ include "clearml.name" . 
}} {{- end }} {{/* Create the name of the service account to use */}} {{- define "clearml.serviceAccountName" -}} -{{- if .Values.serviceAccount.create }} -{{- default (include "clearml.fullname" .) .Values.serviceAccount.name }} +{{- if .Values.agentk8sglue.serviceExistingAccountName }} +{{- .Values.agentk8sglue.serviceExistingAccountName }} {{- else }} -{{- default "default" .Values.serviceAccount.name }} +{{- include "clearml.name" . }}-sa {{- end }} {{- end }} @@ -84,3 +59,15 @@ Create secret to access docker registry {{- printf "{\"auths\":{\"%s\":{\"username\":\"%s\",\"password\":\"%s\",\"email\":\"%s\",\"auth\":\"%s\"}}}" .registry .username .password .email (printf "%s:%s" .username .password | b64enc) | b64enc }} {{- end }} {{- end }} + + +{{/* +Create a string composed by queue names +*/}} +{{- define "agentk8sglue.queues" -}} +{{- $list := list }} +{{- range $key, $value := .Values.enterpriseFeatures.queues }} +{{- $list = append $list (printf "%s" $key) }} +{{- end }} +{{- join " " $list }} +{{- end }} diff --git a/charts/clearml-agent/templates/agentk8sglue-configmap.yaml b/charts/clearml-agent/templates/agentk8sglue-configmap.yaml index 9813599..b2b6628 100644 --- a/charts/clearml-agent/templates/agentk8sglue-configmap.yaml +++ b/charts/clearml-agent/templates/agentk8sglue-configmap.yaml @@ -1,8 +1,178 @@ apiVersion: v1 kind: ConfigMap metadata: - name: {{ include "agentk8sglue.referenceName" . }}-k8sagent-pod-template + name: {{ include "clearml.name" . 
}}-pt data: +{{- if .Values.enterpriseFeatures.enabled }} + template.yaml: | + {{- range $key, $value := $.Values.enterpriseFeatures.queues }} + {{ $key }}: + apiVersion: v1 + metadata: + namespace: {{ $.Release.Namespace }} + {{- if $value.templateOverrides.labels }} + labels: + {{- toYaml $value.templateOverrides.labels | nindent 10 }} + {{- else if $.Values.agentk8sglue.basePodTemplate.labels }} + labels: + {{- toYaml $.Values.agentk8sglue.basePodTemplate.labels | nindent 10 }} + {{- end}} + spec: + {{- if $.Values.imageCredentials.enabled }} + imagePullSecrets: + {{- if $.Values.imageCredentials.existingSecret }} + - name: {{ $.Values.imageCredentials.existingSecret }} + {{- else }} + - name: {{ include "clearml.name" $ }}-ark + {{- end }} + {{- end }} + {{- if $value.templateOverrides.schedulerName }} + schedulerName: {{ $value.templateOverrides.schedulerName }} + {{- else if $.Values.agentk8sglue.basePodTemplate.schedulerName }} + schedulerName: {{ $.Values.agentk8sglue.basePodTemplate.schedulerName }} + {{- end}} + restartPolicy: Never + {{- if $value.templateOverrides.securityContext }} + securityContext: + {{- toYaml $value.templateOverrides.securityContext | nindent 10 }} + {{- else if $.Values.agentk8sglue.basePodTemplate.securityContext }} + securityContext: + {{- toYaml $.Values.agentk8sglue.basePodTemplate.securityContext | nindent 10 }} + {{- end}} + {{- if $value.templateOverrides.hostAliases }} + {{- with $value.templateOverrides.hostAliases }} + hostAliases: + {{- toYaml . | nindent 10 }} + {{- end }} + {{- else if $.Values.agentk8sglue.basePodTemplate.hostAliases }} + {{- with $.Values.agentk8sglue.basePodTemplate.hostAliases }} + hostAliases: + {{- toYaml . 
| nindent 10 }} + {{- end }} + {{- end }} + volumes: + {{- if $value.templateOverrides.volumes }} + {{- toYaml $value.templateOverrides.volumes | nindent 10 }} + {{- else if $.Values.agentk8sglue.basePodTemplate.volumes }} + {{- toYaml $.Values.agentk8sglue.basePodTemplate.volumes | nindent 10 }} + {{- end }} + {{- if $value.templateOverrides.fileMounts }} + - name: filemounts + secret: + secretName: {{ include "clearml.name" $ }}-{{ $key }}-fm + {{- else if $.Values.agentk8sglue.basePodTemplate.fileMounts }} + - name: filemounts + secret: + secretName: {{ include "clearml.name" $ }}-fm + {{- end }} + {{- if $value.templateOverrides.initContainers }} + initContainers: + {{- toYaml $value.templateOverrides.initContainers | nindent 10 }} + {{- else if $.Values.agentk8sglue.basePodTemplate.initContainers }} + initContainers: + {{- toYaml $.Values.agentk8sglue.basePodTemplate.initContainers | nindent 10 }} + {{- end }} + containers: + - resources: + {{- if $value.templateOverrides.resources }} + {{- toYaml $value.templateOverrides.resources | nindent 12 }} + {{- else if $.Values.agentk8sglue.basePodTemplate.resources }} + {{- toYaml $.Values.agentk8sglue.basePodTemplate.resources | nindent 12 }} + {{- end}} + ports: + - containerPort: 10022 + volumeMounts: + {{- if $value.templateOverrides.volumeMounts }} + {{- toYaml $value.templateOverrides.volumeMounts | nindent 12 }} + {{- else if $.Values.agentk8sglue.basePodTemplate.volumeMounts }} + {{- toYaml $.Values.agentk8sglue.basePodTemplate.volumeMounts | nindent 12 }} + {{- end }} + {{- if $value.templateOverrides.fileMounts }} + {{- range $value.templateOverrides.fileMounts }} + - name: filemounts + mountPath: "{{ .folderPath }}/{{ .name }}" + subPath: "{{ .name }}" + readOnly: true + {{- end }} + {{- else if $.Values.agentk8sglue.basePodTemplate.fileMounts }} + {{- range $.Values.agentk8sglue.basePodTemplate.fileMounts }} + - name: filemounts + mountPath: "{{ .folderPath }}/{{ .name }}" + subPath: "{{ .name }}" + 
readOnly: true + {{- end }} + {{- end }} + env: + - name: CLEARML_API_HOST + value: {{ $.Values.agentk8sglue.apiServerUrlReference }} + - name: CLEARML_WEB_HOST + value: {{ $.Values.agentk8sglue.webServerUrlReference }} + - name: CLEARML_FILES_HOST + value: {{ $.Values.agentk8sglue.fileServerUrlReference }} + {{- if not $.Values.enterpriseFeatures.useOwnerToken }} + - name: CLEARML_API_ACCESS_KEY + valueFrom: + secretKeyRef: + {{- if $.Values.clearml.existingAgentk8sglueSecret }} + name: {{ $.Values.clearml.existingAgentk8sglueSecret }} + {{- else }} + name: {{ include "clearml.name" $ }}-ac + {{- end }} + key: agentk8sglue_key + - name: CLEARML_API_SECRET_KEY + valueFrom: + secretKeyRef: + {{- if $.Values.clearml.existingAgentk8sglueSecret }} + name: {{ $.Values.clearml.existingAgentk8sglueSecret }} + {{- else }} + name: {{ include "clearml.name" $ }}-ac + {{- end }} + key: agentk8sglue_secret + {{- end }} + - name: PYTHONUNBUFFERED + value: "x" + {{- if not $.Values.agentk8sglue.clearmlcheckCertificate }} + - name: CLEARML_API_HOST_VERIFY_CERT + value: "false" + {{- end }} + {{- if $value.templateOverrides.env }} + {{- toYaml $value.templateOverrides.env | nindent 12 }} + {{- else if $.Values.agentk8sglue.basePodTemplate.env }} + {{- toYaml $.Values.agentk8sglue.basePodTemplate.env | nindent 12 }} + {{- end }} + {{- if $value.templateOverrides.nodeSelector }} + {{- with $value.templateOverrides.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 10 }} + {{- end }} + {{- else if $.Values.agentk8sglue.basePodTemplate.nodeSelector }} + {{- with $.Values.agentk8sglue.basePodTemplate.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 10 }} + {{- end }} + {{- end }} + {{- if $value.templateOverrides.tolerations }} + {{- with $value.templateOverrides.tolerations }} + tolerations: + {{- toYaml . 
| nindent 10 }} + {{- end }} + {{- else if $.Values.agentk8sglue.basePodTemplate.tolerations }} + {{- with $.Values.agentk8sglue.basePodTemplate.tolerations }} + tolerations: + {{- toYaml . | nindent 10 }} + {{- end }} + {{- end }} + {{- end }} + secrets.yaml: | + {{- range $key, $value := $.Values.enterpriseFeatures.queues }} + {{ $key }}: + {{- if $value.templateOverrides.fileMounts }} + - {{ include "clearml.name" $ }}-{{ $key }}-fm + {{- else if $.Values.agentk8sglue.basePodTemplate.fileMounts }} + - {{ include "clearml.name" $ }}-fm + {{- end }} + {{- end }} +{{- else }} template.yaml: | apiVersion: v1 metadata: @@ -13,20 +183,19 @@ data: {{- if .Values.imageCredentials.existingSecret }} - name: {{.Values.imageCredentials.existingSecret}} {{- else }} - - name: {{ include "agentk8sglue.referenceName" . }}-clearml-agent-registry-key + - name: {{ include "clearml.name" $ }}-ark {{- end }} {{- end }} - serviceAccountName: {{ .Values.agentk8sglue.serviceAccountName }} - {{- with .Values.agentk8sglue.podTemplate.volumes }} + {{- with .Values.agentk8sglue.basePodTemplate.volumes }} volumes: {{- toYaml . | nindent 8 }} {{- end }} containers: - resources: - {{- toYaml .Values.agentk8sglue.podTemplate.resources | nindent 10 }} + {{- toYaml .Values.agentk8sglue.basePodTemplate.resources | nindent 10 }} ports: - containerPort: 10022 - {{- with .Values.agentk8sglue.podTemplate.volumeMounts }} + {{- with .Values.agentk8sglue.basePodTemplate.volumeMounts }} volumeMounts: {{- toYaml . | nindent 10 }} {{- end }} @@ -43,7 +212,7 @@ data: {{- if .Values.clearml.existingAgentk8sglueSecret }} name: {{ .Values.clearml.existingAgentk8sglueSecret }} {{- else }} - name: {{ include "agentk8sglue.referenceName" . }}-clearml-agent-k8sglue + name: {{ include "clearml.name" . 
}}-ac {{- end }} key: agentk8sglue_key - name: CLEARML_API_SECRET_KEY @@ -52,17 +221,48 @@ data: {{- if .Values.clearml.existingAgentk8sglueSecret }} name: {{ .Values.clearml.existingAgentk8sglueSecret }} {{- else }} - name: {{ include "agentk8sglue.referenceName" . }}-clearml-agent-k8sglue + name: {{ include "clearml.name" . }}-ac {{- end }} key: agentk8sglue_secret - {{- if .Values.agentk8sglue.podTemplate.env }} - {{ toYaml .Values.agentk8sglue.podTemplate.env | nindent 8 }} + {{- if .Values.agentk8sglue.basePodTemplate.env }} + {{ toYaml .Values.agentk8sglue.basePodTemplate.env | nindent 8 }} {{- end }} - {{- with .Values.agentk8sglue.podTemplate.nodeSelector}} + {{- with .Values.agentk8sglue.basePodTemplate.nodeSelector}} nodeSelector: {{- toYaml . | nindent 8 }} {{- end }} - {{- with .Values.agentk8sglue.podTemplate.tolerations }} + {{- with .Values.agentk8sglue.basePodTemplate.tolerations }} tolerations: {{- toYaml . | nindent 8 }} {{- end }} +{{- end }} +{{- if .Values.sessions.portModeEnabled }} +{{- range untilStep 1 ( ( add .Values.sessions.maxServices 1 ) | int ) 1 }} + services-{{ . }}.yaml: | + apiVersion: v1 + kind: Service + metadata: + name: clearml-session-{{ . }} + labels: + {{- include "clearml.labels" $ | nindent 8 }} + {{- with $.Values.sessions.svcAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + type: {{ $.Values.sessions.svcType }} + ports: + - targetPort: 10022 + {{- if eq $.Values.sessions.svcType "NodePort" }} + port: 10022 + {{- else }} + port: {{ add $.Values.sessions.startingPort . }} + {{- end }} + protocol: TCP + {{- if eq $.Values.sessions.svcType "NodePort" }} + nodePort: {{ add $.Values.sessions.startingPort . }} + {{- end }} + selector: + ai-allegro-agent-serial: pod-{{ . 
}} +{{- end }} +{{- end }} diff --git a/charts/clearml-agent/templates/agentk8sglue-deployment.yaml b/charts/clearml-agent/templates/agentk8sglue-deployment.yaml index 4303425..e1a9788 100644 --- a/charts/clearml-agent/templates/agentk8sglue-deployment.yaml +++ b/charts/clearml-agent/templates/agentk8sglue-deployment.yaml @@ -1,7 +1,7 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: {{ include "agentk8sglue.referenceName" . }} + name: {{ include "clearml.name" . }} labels: {{- include "clearml.labels" . | nindent 4 }} spec: @@ -19,25 +19,60 @@ spec: {{- if .Values.imageCredentials.enabled }} imagePullSecrets: {{- if .Values.imageCredentials.existingSecret }} - - name: "{{.Values.imageCredentials.existingSecret}}" + - name: "{{ .Values.imageCredentials.existingSecret }}" {{- else }} - - name: {{ include "agentk8sglue.referenceName" . }}-clearml-agent-registry-key + - name: {{ include "clearml.name" . }}-ark {{- end }} {{- end }} + serviceAccountName: {{ include "clearml.serviceAccountName" . 
}} + initContainers: + - name: init-k8s-glue + image: "{{ .Values.agentk8sglue.image.repository }}:{{ .Values.agentk8sglue.image.tag }}" + command: + - /bin/sh + - -c + - > + set -x; + while [ $(curl {{ if not .Values.agentk8sglue.clearmlcheckCertificate }}--insecure{{ end }} -sw '%{http_code}' "{{.Values.agentk8sglue.apiServerUrlReference}}/debug.ping" -o /dev/null) -ne 200 ] ; do + echo "waiting for apiserver" ; + sleep 5 ; + done; + while [[ $(curl {{ if not .Values.agentk8sglue.clearmlcheckCertificate }}--insecure{{ end }} -sw '%{http_code}' "{{.Values.agentk8sglue.fileServerUrlReference}}/" -o /dev/null) =~ 403|405 ]] ; do + echo "waiting for fileserver" ; + sleep 5 ; + done; + while [ $(curl {{ if not .Values.agentk8sglue.clearmlcheckCertificate }}--insecure{{ end }} -sw '%{http_code}' "{{.Values.agentk8sglue.webServerUrlReference}}/" -o /dev/null) -ne 200 ] ; do + echo "waiting for webserver" ; + sleep 5 ; + done containers: - name: k8s-glue image: "{{ .Values.agentk8sglue.image.repository }}:{{ .Values.agentk8sglue.image.tag }}" imagePullPolicy: IfNotPresent - command: ["/bin/bash", "-c", "export PATH=$PATH:$HOME/bin; source /root/.bashrc && /root/entrypoint.sh"] + command: + - /bin/bash + - -c + - > + export PATH=$PATH:$HOME/bin; + source /root/.bashrc && /root/entrypoint.sh volumeMounts: - - name: {{ include "agentk8sglue.referenceName" . }}-k8sagent-pod-template + - name: {{ include "clearml.name" . 
}}-pt mountPath: /root/template - {{- if or .Values.clearml.clearmlConfig .Values.clearml.existingClearmlConfigSecret }} + {{ if .Values.clearml.clearmlConfig }} - name: k8sagent-clearml-conf-volume mountPath: /root/clearml.conf subPath: clearml.conf readOnly: true {{- end }} + {{- if .Values.agentk8sglue.volumeMounts }} + {{- toYaml .Values.agentk8sglue.volumeMounts | nindent 10 }} + {{- end }} + {{- range .Values.agentk8sglue.fileMounts }} + - name: filemounts + mountPath: "{{ .folderPath }}/{{ .name }}" + subPath: "{{ .name }}" + readOnly: true + {{- end }} env: - name: CLEARML_API_HOST value: "{{.Values.agentk8sglue.apiServerUrlReference}}" @@ -45,56 +80,104 @@ spec: value: "{{.Values.agentk8sglue.webServerUrlReference}}" - name: CLEARML_FILES_HOST value: "{{.Values.agentk8sglue.fileServerUrlReference}}" - - name: K8S_GLUE_MAX_PODS - value: "{{.Values.agentk8sglue.maxPods}}" - - name: K8S_GLUE_QUEUE - value: "{{.Values.agentk8sglue.queue}}" + {{- if not .Values.agentk8sglue.clearmlcheckCertificate }} + - name: CLEARML_API_HOST_VERIFY_CERT + value: "false" + {{- end }} + {{- if .Values.sessions.portModeEnabled }} - name: K8S_GLUE_EXTRA_ARGS - value: "--namespace {{ .Release.Namespace }} --template-yaml /root/template/template.yaml" + value: "--namespace {{ .Release.Namespace }} --template-yaml /root/template/template.yaml \ + --ports-mode --num-of-services {{ .Values.sessions.maxServices }} \ + --base-port {{ .Values.sessions.startingPort }} \ + --gateway-address {{ .Values.sessions.externalIP }}{{ if .Values.enterpriseFeatures.enabled }}{{ if .Values.enterpriseFeatures.useOwnerToken }} --use-owner-token{{ end }}{{ end }}" + {{- if .Values.sessions.dynamicSvcs }} + - name: CLEARML_K8S_GLUE_POD_POST_APPLY_CMD + value: "kubectl -n {namespace} apply -f ~/template/services-{pod_number}.yaml ; kubectl -n {namespace} label svc clearml-session-{pod_number} service-for={pod_name}" + - name: CLEARML_K8S_GLUE_POD_POST_DELETE_CMD + value: "kubectl -n {namespace} delete svc 
-l service-for={pod_name}" + {{- end }} + {{- else}} + - name: K8S_GLUE_EXTRA_ARGS + value: "--namespace {{ .Release.Namespace }} --template-yaml /root/template/template.yaml \ + --max-pods {{.Values.enterpriseFeatures.maxPods}}{{ if .Values.enterpriseFeatures.enabled }}{{ if .Values.enterpriseFeatures.useOwnerToken }} --use-owner-token{{ end }}{{ end }}" + {{- end }} + - name: CLEARML_K8S_GLUE_LIMIT_POD_LABEL + value: "ai-allegro-agent-serial=pod-{pod_number}" + - name: CLEARML_K8S_SECRETS_LIST_FILE + value: /root/template/secrets.yaml - name: K8S_DEFAULT_NAMESPACE value: "{{ .Release.Namespace }}" - name: CLEARML_API_ACCESS_KEY valueFrom: secretKeyRef: - {{- if .Values.clearml.existingAgentk8sglueSecret }} - name: {{ .Values.clearml.existingAgentk8sglueSecret }} - {{- else }} - name: {{ include "agentk8sglue.referenceName" . }}-clearml-agent-k8sglue - {{- end }} + name: {{ include "clearml.name" . }}-ac key: agentk8sglue_key - name: CLEARML_API_SECRET_KEY valueFrom: secretKeyRef: - {{- if .Values.clearml.existingAgentk8sglueSecret }} - name: {{ .Values.clearml.existingAgentk8sglueSecret }} - {{- else }} - name: {{ include "agentk8sglue.referenceName" . }}-clearml-agent-k8sglue - {{- end }} + name: {{ include "clearml.name" . }}-ac key: agentk8sglue_secret - name: CLEARML_WORKER_ID - value: "{{.Values.agentk8sglue.id}}" + value: {{ include "clearml.name" . 
}} - name: CLEARML_AGENT_UPDATE_REPO value: "" - name: FORCE_CLEARML_AGENT_REPO - value: "" + value: "" - name: CLEARML_DOCKER_IMAGE value: "{{.Values.agentk8sglue.defaultContainerImage}}" + {{ if .Values.agentk8sglue.customBashScript }} + - name: CLEARML_K8S_GLUE_EXTRA_BASH_SCRIPT + value: "{{.Values.agentk8sglue.customBashScript}}" + {{- end }} + {{ if .Values.agentk8sglue.containerCustomBashScript }} + - name: CLEARML_K8S_GLUE_POD_BASH_SCRIPT + value: "{{.Values.agentk8sglue.containerCustomBashScript}}" + {{- end }} + {{- if .Values.agentk8sglue.debugMode }} + - name: "CLEARML_K8S_GLUE_DEBUG" + value: "1" + {{- end }} {{- if .Values.agentk8sglue.extraEnvs }} {{ toYaml .Values.agentk8sglue.extraEnvs | nindent 10 }} {{- end }} + {{- if .Values.sessions.portModeEnabled }} + {{- if .Values.sessions.setInteractiveQueuesTag }} + - name: "CLEARML_K8S_GLUE_SET_QUEUE_SYSTEM_TAGS" + value: "interactive" + {{- end }} + {{- end }} + {{- if .Values.enterpriseFeatures.enabled }} + - name: K8S_GLUE_QUEUE + value: {{ include "agentk8sglue.queues" . | quote }} + - name: CLEARML_K8S_GLUE_APPLY_VAULT_ENV_VARS + value: {{ .Values.enterpriseFeatures.applyVaultEnvVars | quote }} + - name: "CLEARML_K8S_GLUE_POD_MIN_RES_FIELD" + value: {{ .Values.enterpriseFeatures.monitoredResources.minResourcesFieldName | quote }} + - name: "CLEARML_K8S_GLUE_MAX_RESOURCES" + value: "{{.Values.enterpriseFeatures.monitoredResources.maxResources}}" + - name: "CLEARML_K8S_GLUE_POD_MAX_RES_FIELD" + value: {{ .Values.enterpriseFeatures.monitoredResources.maxResourcesFieldName | quote }} + {{- else }} + - name: K8S_GLUE_QUEUE + value: {{ .Values.agentk8sglue.queue | quote }} + {{- end }} volumes: - - name: {{ include "agentk8sglue.referenceName" . }}-k8sagent-pod-template - configMap: - name: {{ include "agentk8sglue.referenceName" . }}-k8sagent-pod-template - {{- if or .Values.clearml.clearmlConfig .Values.clearml.existingClearmlConfigSecret }} + - name: {{ include "clearml.name" . }}-pt + configMap: + name: {{ include "clearml.name" . 
}}-pt + {{ if .Values.clearml.clearmlConfig }} - name: k8sagent-clearml-conf-volume secret: - {{- if .Values.clearml.existingClearmlConfigSecret }} - secretName: {{ .Values.clearml.existingClearmlConfigSecret }} - {{- else }} - secretName: {{ include "agentk8sglue.referenceName" . }}-clearml-agent-conf - {{- end }} + secretName: {{ include "clearml.name" . }}-ac items: - key: clearml.conf path: clearml.conf {{ end }} + {{- if .Values.agentk8sglue.volumes }} + {{- toYaml .Values.agentk8sglue.volumes | nindent 8 }} + {{- end }} + {{ if .Values.agentk8sglue.fileMounts }} + - name: filemounts + secret: + secretName: {{ include "clearml.name" . }}-afm + {{- end }} diff --git a/charts/clearml-agent/templates/agentk8sglue-rbac.yaml b/charts/clearml-agent/templates/agentk8sglue-rbac.yaml index d961603..7b19d9c 100644 --- a/charts/clearml-agent/templates/agentk8sglue-rbac.yaml +++ b/charts/clearml-agent/templates/agentk8sglue-rbac.yaml @@ -1,23 +1,72 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role +{{- if not .Values.agentk8sglue.serviceExistingAccountName }} +apiVersion: v1 +kind: ServiceAccount metadata: - name: {{ include "agentk8sglue.referenceName" . }}-k8sagent-pods-access + name: {{ include "clearml.serviceAccountName" . }} + namespace: {{ .Release.Namespace }} +{{- end }} +{{- if .Values.enterpriseFeatures.serviceAccountClusterAccess }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "clearml.name" . }}-kpa rules: - apiGroups: - "" resources: - pods + - secrets + - services verbs: ["get", "list", "watch", "create", "patch", "delete"] + - apiGroups: + - "" + resources: + - namespaces + verbs: ["list"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "clearml.name" . }}-kpa +subjects: + - kind: ServiceAccount + name: {{ include "clearml.serviceAccountName" . 
}} + namespace: {{ .Release.Namespace }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "clearml.name" . }}-kpa +{{- else }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ include "clearml.name" . }}-kpa +rules: + - apiGroups: + - "" + resources: + - pods + - secrets + - services + verbs: ["get", "list", "watch", "create", "patch", "delete"] + - apiGroups: + - "" + resources: + - namespaces + verbs: ["list"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: - name: {{ include "agentk8sglue.referenceName" . }}-k8sagent-pods-access + name: {{ include "clearml.name" . }}-kpa subjects: - kind: ServiceAccount - name: default + name: {{ include "clearml.serviceAccountName" . }} namespace: {{ .Release.Namespace }} roleRef: apiGroup: rbac.authorization.k8s.io kind: Role - name: {{ include "agentk8sglue.referenceName" . }}-k8sagent-pods-access + name: {{ include "clearml.name" . }}-kpa +{{- end }} diff --git a/charts/clearml-agent/templates/clearml-secrets.yaml b/charts/clearml-agent/templates/clearml-secrets.yaml index 7f0a54f..718ba59 100644 --- a/charts/clearml-agent/templates/clearml-secrets.yaml +++ b/charts/clearml-agent/templates/clearml-secrets.yaml @@ -1,28 +1,18 @@ -{{- if not .Values.clearml.existingAgentk8sglueSecret }} apiVersion: v1 kind: Secret metadata: - name: {{ include "agentk8sglue.referenceName" . }}-clearml-agent-k8sglue + name: {{ include "clearml.name" . }}-ac data: agentk8sglue_key: {{ .Values.clearml.agentk8sglueKey | b64enc }} agentk8sglue_secret: {{ .Values.clearml.agentk8sglueSecret | b64enc }} -{{- end }} ---- -{{- if not .Values.clearml.existingClearmlConfigSecret }} -apiVersion: v1 -kind: Secret -metadata: - name: {{ include "agentk8sglue.referenceName" . 
}}-clearml-agent-conf -data: clearml.conf: {{ .Values.clearml.clearmlConfig | b64enc }} --- -{{- end }} {{- if .Values.imageCredentials.enabled }} {{- if not .Values.imageCredentials.existingSecret }} apiVersion: v1 kind: Secret metadata: - name: {{ include "agentk8sglue.referenceName" . }}-clearml-agent-registry-key + name: {{ include "clearml.name" . }}-ark type: kubernetes.io/dockerconfigjson data: .dockerconfigjson: {{ template "imagePullSecret" . }} diff --git a/charts/clearml-agent/templates/service-secret.yaml b/charts/clearml-agent/templates/service-secret.yaml new file mode 100644 index 0000000..8434a0c --- /dev/null +++ b/charts/clearml-agent/templates/service-secret.yaml @@ -0,0 +1,37 @@ +{{ if .Values.agentk8sglue.fileMounts }} +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "clearml.name" . }}-afm +data: + {{- range .Values.agentk8sglue.fileMounts }} + {{ .name }}: {{ .fileContent | b64enc }} + {{- end }} +{{ end }} +--- +{{- if .Values.enterpriseFeatures.enabled }} +{{ if .Values.agentk8sglue.basePodTemplate.fileMounts }} +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "clearml.name" . 
}}-fm +data: + {{- range .Values.agentk8sglue.basePodTemplate.fileMounts }} + {{ .name }}: {{ .fileContent | b64enc }} + {{- end }} +{{ end }} +--- +{{- range $key, $value := $.Values.agentk8sglue.queues }} +{{ if .templateOverrides.fileMounts }} +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "clearml.name" $ }}-{{ $key }}-fm +data: + {{- range .templateOverrides.fileMounts }} + {{ .name }}: {{ .fileContent | b64enc }} + {{- end }} +{{ end }} +--- +{{- end }} +{{- end }} diff --git a/charts/clearml-agent/templates/service-sessions.yaml b/charts/clearml-agent/templates/service-sessions.yaml new file mode 100644 index 0000000..3602291 --- /dev/null +++ b/charts/clearml-agent/templates/service-sessions.yaml @@ -0,0 +1,32 @@ +{{- if .Values.sessions.portModeEnabled }} +{{- if not .Values.sessions.dynamicSvcs }} +{{- range untilStep 1 ( ( add .Values.sessions.maxServices 1 ) | int ) 1 }} +--- +apiVersion: v1 +kind: Service +metadata: + name: clearml-session-{{ . }} + labels: + {{- include "clearml.labels" $ | nindent 4 }} + {{- with $.Values.sessions.svcAnnotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + type: {{ $.Values.sessions.svcType }} + ports: + - targetPort: 10022 + {{- if eq $.Values.sessions.svcType "NodePort" }} + port: 10022 + {{- else }} + port: {{ add $.Values.sessions.startingPort . }} + {{- end }} + protocol: TCP + {{- if eq $.Values.sessions.svcType "NodePort" }} + nodePort: {{ add $.Values.sessions.startingPort . }} + {{- end }} + selector: + ai.allegro.agent.serial: pod-{{ . 
}} +{{- end }} +{{- end }} +{{- end }} diff --git a/charts/clearml-agent/values.yaml b/charts/clearml-agent/values.yaml index 00b3277..ec8a2f8 100644 --- a/charts/clearml-agent/values.yaml +++ b/charts/clearml-agent/values.yaml @@ -41,9 +41,15 @@ agentk8sglue: # -- Glue Agent number of pods replicaCount: 1 + # -- if set, don't create a serviceAccountName but use defined existing one + serviceExistingAccountName: "" + # -- Check certificates validity for evefry UrlReference below. clearmlcheckCertificate: true + # -- Enable Debugging logs for Agent pod + debugMode: false + # -- Reference to Api server url apiServerUrlReference: "https://api.clear.ml" # -- Reference to File server url @@ -51,32 +57,78 @@ agentk8sglue: # -- Reference to Web server url webServerUrlReference: "https://app.clear.ml" - # -- serviceAccountName for pods spawned to consume ClearML Task - serviceAccountName: default - # -- maximum concurrent consume ClearML Task pod - maxPods: 10 # -- default container image for ClearML Task pod defaultContainerImage: ubuntu:18.04 # -- ClearML queue this agent will consume queue: default - - # -- ClearML worker ID (must be unique across the entire ClearMLenvironment) - id: k8s-agent - - # -- Environment variables to be exposed in the agentk8sglue pods + # -- Custom Bash script for the Glue Agent + customBashScript: "" + # -- Custom Bash script for the Task Pods ran by Glue Agent + containerCustomBashScript: "" + # -- Extra Environment variables for Glue Agent extraEnvs: [] + # - name: PYTHONPATH + # value: "somepath" - # -- template for pods spawned to consume ClearML Task - podTemplate: + # -- volumes definition for Glue Agent (example in values.yaml comments) + volumes: [] + # - name: "yourvolume" + # nfs: + # server: 192.168.0.1 + # path: /var/nfs/mount + # -- volume mounts definition for Glue Agent (example in values.yaml comments) + volumeMounts: [] + # - name: yourvolume + # mountPath: /yourpath + # subPath: userfolder + + # -- file definition for Glue 
Agent (example in values.yaml comments) + fileMounts: [] + # - name: "integration.py" + # folderPath: "/mnt/python" + # fileContent: |- + # def get_template(*args, **kwargs): + # print("args: {}".format(args)) + # print("kwargs: {}".format(kwargs)) + # return { + # "template": { + # } + # } + + # -- base template for pods spawned to consume ClearML Task + basePodTemplate: + # -- initContainers definition for pods spawned to consume ClearML Task (example in values.yaml comments) + initContainers: [] + # - name: volume-dirs-init-cntr + # image: busybox:1.35 + # command: + # - /bin/bash + # - -c + # - > + # /bin/echo "this is an init"; + # -- labels setup for pods spawned to consume ClearML Task (example in values.yaml comments) + labels: {} + # schedulerName: scheduler + # -- schedulerName setup for pods spawned to consume ClearML Task + schedulerName: "" # -- volumes definition for pods spawned to consume ClearML Task (example in values.yaml comments) volumes: [] # - name: "yourvolume" - # persistentVolumeClaim: - # claimName: "yourvolume" - # -- volumeMounts definition for pods spawned to consume ClearML Task (example in values.yaml comments) + # nfs: + # server: 192.168.0.1 + # path: /var/nfs/mount + # -- volume mounts definition for pods spawned to consume ClearML Task (example in values.yaml comments) volumeMounts: [] - # - name: "yourvolume" - # mountPath: "/yourpath" + # - name: yourvolume + # mountPath: /yourpath + # subPath: userfolder + # -- file definition for pods spawned to consume ClearML Task (example in values.yaml comments) + fileMounts: [] + # - name: "mounted-file.txt" + # folderPath: "/mnt/" + # fileContent: |- + # this is a test file + # with test content # -- environment variables for pods spawned to consume ClearML Task (example in values.yaml comments) env: [] # # to setup access to private repo, setup secret with git credentials: @@ -87,6 +139,10 @@ agentk8sglue: # secretKeyRef: # name: git-password # key: git-password + # - name: 
CURL_CA_BUNDLE + # value: "" + # - name: PYTHONWARNINGS + # value: "=\"ignore:Unverified HTTPS request\"" # -- resources declaration for pods spawned to consume ClearML Task (example in values.yaml comments) resources: {} # limits: @@ -99,3 +155,66 @@ agentk8sglue: # -- nodeSelector setup for pods spawned to consume ClearML Task (example in values.yaml comments) nodeSelector: {} # fleet: gpu-nodes + # -- securityContext setup for pods spawned to consume ClearML Task (example in values.yaml comments) + securityContext: {} + # runAsUser: 1000 + # -- hostAliases setup for pods spawned to consume ClearML Task (example in values.yaml comments) + hostAliases: [] + # - ip: "127.0.0.1" + # hostnames: + # - "foo.local" + # - "bar.local" + +# -- Sessions internal service configuration +sessions: + # -- Enable/Disable sessions portmode WARNING: only one Agent deployment can have this set to true + portModeEnabled: false + # -- Enable/Disable dynamic svc for sessions pods + dynamicSvcs: false + # -- specific annotations for session services + svcAnnotations: {} + # -- service type ("NodePort" or "ClusterIP" or "LoadBalancer") + svcType: "NodePort" + # -- External IP sessions clients can connect to + externalIP: 0.0.0.0 + # -- starting range of exposed NodePorts + startingPort: 30000 + # -- maximum number of NodePorts exposed + maxServices: 20 + # -- set interactive queue tags + setInteractiveQueuesTag: true + +# -- Enterprise features (work only with an Enterprise license) +enterpriseFeatures: + # -- Enable/Disable Enterprise features + enabled: false + # -- service account access every namespace flag + serviceAccountClusterAccess: false + # -- push env vars from Clear.ML Vault to task pods + applyVaultEnvVars: true + # -- GPU resource general counters + monitoredResources: + # -- Field name used by Agent to count minimum resources + minResourcesFieldName: "resources|limits|nvidia.com/gpu" + # -- Maximum resources counter + maxResources: 0 + # -- Field name used by Agent to 
count maximum resources + maxResourcesFieldName: "resources|limits|nvidia.com/gpu" + # -- maximum concurrent consume ClearML Task pod + maxPods: 10 + # -- Agent must use owner Token + useOwnerToken: true + # -- ClearML queues and related template OVERRIDES used this agent will consume + queues: + # -- name of the queue will be used for this template + default: + # -- overrides of the base template for this queue (must be declared even if empty!) + templateOverrides: {} + ## -- name of the queue will be used for this template + # default-gpu: + # # -- overrides of the base template for this queue + # templateOverrides: + # # -- resources declaration for pods spawned to consume ClearML Task + # resources: + # limits: + # nvidia.com/gpu: 1