146 agent add affinity config (#147)

* Added: affinity parameter

* Changed: bump version
This commit is contained in:
Valeriano Manassero 2023-02-02 12:20:06 +01:00 committed by GitHub
parent 12baef0d75
commit c7b3a28989
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 40 additions and 10 deletions

View File

@ -2,7 +2,7 @@ apiVersion: v2
name: clearml-agent name: clearml-agent
description: MLOps platform Task running agent description: MLOps platform Task running agent
type: application type: application
version: "3.2.0" version: "3.3.0"
appVersion: "1.24" appVersion: "1.24"
kubeVersion: ">= 1.21.0-0 < 1.27.0-0" kubeVersion: ">= 1.21.0-0 < 1.27.0-0"
home: https://clear.ml home: https://clear.ml
@ -21,6 +21,4 @@ keywords:
annotations: annotations:
artifacthub.io/changes: | artifacthub.io/changes: |
- kind: added - kind: added
description: securityContext parameter for agent pod description: affinity parameter
- kind: added
description: support for kubernetes 1.26

View File

@ -1,6 +1,6 @@
# ClearML Kubernetes Agent # ClearML Kubernetes Agent
![Version: 3.2.0](https://img.shields.io/badge/Version-3.2.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.24](https://img.shields.io/badge/AppVersion-1.24-informational?style=flat-square) ![Version: 3.3.0](https://img.shields.io/badge/Version-3.3.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.24](https://img.shields.io/badge/AppVersion-1.24-informational?style=flat-square)
MLOps platform Task running agent MLOps platform Task running agent
@ -30,10 +30,12 @@ Kubernetes: `>= 1.21.0-0 < 1.27.0-0`
| Key | Type | Default | Description | | Key | Type | Default | Description |
|-----|------|---------|-------------| |-----|------|---------|-------------|
| agentk8sglue | object | `{"annotations":{},"apiServerUrlReference":"https://api.clear.ml","basePodTemplate":{"annotations":{},"env":[],"fileMounts":[],"hostAliases":{},"initContainers":[],"labels":{},"nodeSelector":{},"resources":{},"schedulerName":"","securityContext":{},"tolerations":[],"volumeMounts":[],"volumes":[]},"clearmlcheckCertificate":true,"containerCustomBashScript":"","customBashScript":"","debugMode":false,"defaultContainerImage":"ubuntu:18.04","extraEnvs":[],"fileMounts":[],"fileServerUrlReference":"https://files.clear.ml","image":{"repository":"allegroai/clearml-agent-k8s-base","tag":"1.24-21"},"labels":{},"nodeSelector":{},"queue":"default","replicaCount":1,"securityContext":{},"serviceExistingAccountName":"","volumeMounts":[],"volumes":[],"webServerUrlReference":"https://app.clear.ml"}` | This agent will spawn queued experiments in new pods, a good use case is to combine this with GPU autoscaling nodes. https://github.com/allegroai/clearml-agent/tree/master/docker/k8s-glue | | agentk8sglue | object | `{"affinity":{},"annotations":{},"apiServerUrlReference":"https://api.clear.ml","basePodTemplate":{"affinity":{},"annotations":{},"env":[],"fileMounts":[],"hostAliases":{},"initContainers":[],"labels":{},"nodeSelector":{},"resources":{},"schedulerName":"","securityContext":{},"tolerations":[],"volumeMounts":[],"volumes":[]},"clearmlcheckCertificate":true,"containerCustomBashScript":"","customBashScript":"","debugMode":false,"defaultContainerImage":"ubuntu:18.04","extraEnvs":[],"fileMounts":[],"fileServerUrlReference":"https://files.clear.ml","image":{"repository":"allegroai/clearml-agent-k8s-base","tag":"1.24-21"},"labels":{},"nodeSelector":{},"queue":"default","replicaCount":1,"securityContext":{},"serviceExistingAccountName":"","tolerations":[],"volumeMounts":[],"volumes":[],"webServerUrlReference":"https://app.clear.ml"}` | This agent will spawn queued experiments in new pods, a good use case is to combine this with GPU autoscaling nodes. 
https://github.com/allegroai/clearml-agent/tree/master/docker/k8s-glue |
| agentk8sglue.affinity | object | `{}` | affinity setup for Agent pod (example in values.yaml comments) |
| agentk8sglue.annotations | object | `{}` | annotations setup for Agent pod (example in values.yaml comments) | | agentk8sglue.annotations | object | `{}` | annotations setup for Agent pod (example in values.yaml comments) |
| agentk8sglue.apiServerUrlReference | string | `"https://api.clear.ml"` | Reference to Api server url | | agentk8sglue.apiServerUrlReference | string | `"https://api.clear.ml"` | Reference to Api server url |
| agentk8sglue.basePodTemplate | object | `{"annotations":{},"env":[],"fileMounts":[],"hostAliases":{},"initContainers":[],"labels":{},"nodeSelector":{},"resources":{},"schedulerName":"","securityContext":{},"tolerations":[],"volumeMounts":[],"volumes":[]}` | base template for pods spawned to consume ClearML Task | | agentk8sglue.basePodTemplate | object | `{"affinity":{},"annotations":{},"env":[],"fileMounts":[],"hostAliases":{},"initContainers":[],"labels":{},"nodeSelector":{},"resources":{},"schedulerName":"","securityContext":{},"tolerations":[],"volumeMounts":[],"volumes":[]}` | base template for pods spawned to consume ClearML Task |
| agentk8sglue.basePodTemplate.affinity | object | `{}` | affinity setup for pods spawned to consume ClearML Task |
| agentk8sglue.basePodTemplate.annotations | object | `{}` | annotations setup for pods spawned to consume ClearML Task (example in values.yaml comments) | | agentk8sglue.basePodTemplate.annotations | object | `{}` | annotations setup for pods spawned to consume ClearML Task (example in values.yaml comments) |
| agentk8sglue.basePodTemplate.env | list | `[]` | environment variables for pods spawned to consume ClearML Task (example in values.yaml comments) | | agentk8sglue.basePodTemplate.env | list | `[]` | environment variables for pods spawned to consume ClearML Task (example in values.yaml comments) |
| agentk8sglue.basePodTemplate.fileMounts | list | `[]` | file definition for pods spawned to consume ClearML Task (example in values.yaml comments) | | agentk8sglue.basePodTemplate.fileMounts | list | `[]` | file definition for pods spawned to consume ClearML Task (example in values.yaml comments) |
@ -61,6 +63,7 @@ Kubernetes: `>= 1.21.0-0 < 1.27.0-0`
| agentk8sglue.replicaCount | int | `1` | Glue Agent number of pods | | agentk8sglue.replicaCount | int | `1` | Glue Agent number of pods |
| agentk8sglue.securityContext | object | `{}` | Agent pod security context | | agentk8sglue.securityContext | object | `{}` | Agent pod security context |
| agentk8sglue.serviceExistingAccountName | string | `""` | if set, don't create a serviceAccountName but use defined existing one | | agentk8sglue.serviceExistingAccountName | string | `""` | if set, don't create a serviceAccountName but use defined existing one |
| agentk8sglue.tolerations | list | `[]` | tolerations setup for Agent pod (example in values.yaml comments) |
| agentk8sglue.volumeMounts | list | `[]` | volume mounts definition for Glue Agent (example in values.yaml comments) | | agentk8sglue.volumeMounts | list | `[]` | volume mounts definition for Glue Agent (example in values.yaml comments) |
| agentk8sglue.volumes | list | `[]` | volumes definition for Glue Agent (example in values.yaml comments) | | agentk8sglue.volumes | list | `[]` | volumes definition for Glue Agent (example in values.yaml comments) |
| agentk8sglue.webServerUrlReference | string | `"https://app.clear.ml"` | Reference to Web server url | | agentk8sglue.webServerUrlReference | string | `"https://app.clear.ml"` | Reference to Web server url |

View File

@ -172,6 +172,17 @@ data:
{{- toYaml . | nindent 10 }} {{- toYaml . | nindent 10 }}
{{- end }} {{- end }}
{{- end }} {{- end }}
{{- if $value.templateOverrides.affinity }}
{{- with $value.templateOverrides.affinity }}
affinity:
{{- toYaml . | nindent 10 }}
{{- end }}
{{- else if $.Values.agentk8sglue.basePodTemplate.affinity }}
{{- with $.Values.agentk8sglue.basePodTemplate.affinity }}
affinity:
{{- toYaml . | nindent 10 }}
{{- end }}
{{- end }}
{{- end }} {{- end }}
secrets.yaml: | secrets.yaml: |
{{- range $key, $value := $.Values.enterpriseFeatures.queues }} {{- range $key, $value := $.Values.enterpriseFeatures.queues }}
@ -250,6 +261,10 @@ data:
tolerations: tolerations:
{{- toYaml . | nindent 8 }} {{- toYaml . | nindent 8 }}
{{- end }} {{- end }}
{{- with .Values.agentk8sglue.basePodTemplate.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }} {{- end }}
{{- if .Values.sessions.portModeEnabled }} {{- if .Values.sessions.portModeEnabled }}
{{- range untilStep 1 ( ( add .Values.sessions.maxServices 1 ) | int ) 1 }} {{- range untilStep 1 ( ( add .Values.sessions.maxServices 1 ) | int ) 1 }}

View File

@ -177,6 +177,14 @@ spec:
nodeSelector: nodeSelector:
{{- toYaml . | nindent 8 }} {{- toYaml . | nindent 8 }}
{{- end }} {{- end }}
{{- with .Values.agentk8sglue.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.agentk8sglue.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
volumes: volumes:
- name: {{ include "clearml.name" . }}-pt - name: {{ include "clearml.name" . }}-pt
configMap: configMap:

View File

@ -82,6 +82,10 @@ agentk8sglue:
# -- nodeSelector setup for Agent pod (example in values.yaml comments) # -- nodeSelector setup for Agent pod (example in values.yaml comments)
nodeSelector: {} nodeSelector: {}
# fleet: agent-nodes # fleet: agent-nodes
# -- tolerations setup for Agent pod (example in values.yaml comments)
tolerations: []
# -- affinity setup for Agent pod (example in values.yaml comments)
affinity: {}
# -- volumes definition for Glue Agent (example in values.yaml comments) # -- volumes definition for Glue Agent (example in values.yaml comments)
volumes: [] volumes: []
# - name: "yourvolume" # - name: "yourvolume"
@ -162,14 +166,16 @@ agentk8sglue:
resources: {} resources: {}
# limits: # limits:
# nvidia.com/gpu: 1 # nvidia.com/gpu: 1
# -- nodeSelector setup for pods spawned to consume ClearML Task (example in values.yaml comments)
nodeSelector: {}
# fleet: gpu-nodes
# -- tolerations setup for pods spawned to consume ClearML Task (example in values.yaml comments) # -- tolerations setup for pods spawned to consume ClearML Task (example in values.yaml comments)
tolerations: [] tolerations: []
# - key: "nvidia.com/gpu" # - key: "nvidia.com/gpu"
# operator: Exists # operator: Exists
# effect: "NoSchedule" # effect: "NoSchedule"
# -- nodeSelector setup for pods spawned to consume ClearML Task (example in values.yaml comments) # -- affinity setup for pods spawned to consume ClearML Task
nodeSelector: {} affinity: {}
# fleet: gpu-nodes
# -- securityContext setup for pods spawned to consume ClearML Task (example in values.yaml comments) # -- securityContext setup for pods spawned to consume ClearML Task (example in values.yaml comments)
securityContext: {} securityContext: {}
# runAsUser: 1001 # runAsUser: 1001