From cd7f22f7d83a7aec5f36ec4da012a4eaf0385c30 Mon Sep 17 00:00:00 2001
From: Niels ten Boom
Date: Tue, 18 Jan 2022 23:27:12 +0100
Subject: [PATCH] feat: Add k8s glue agent deployment (#49)

Adds an optional k8s glue agent to the chart, gated by
agentk8sglue.enabled (default: false):

- a Deployment running the glue agent image,
- a ConfigMap holding the pod template passed to the agent via
  --template-yaml,
- a Role/RoleBinding granting the namespace's default ServiceAccount
  get/list/watch/create/patch on pods.

---
 charts/clearml/Chart.yaml                     |  2 +-
 charts/clearml/README.md                      | 11 +++-
 .../configmap-agentk8s-template.yaml          | 35 +++++++++++
 .../templates/deployment-agentk8s.yaml        | 62 +++++++++++++++++++
 charts/clearml/templates/rbac-agentk8s.yaml   | 25 ++++++++
 charts/clearml/values.yaml                    | 21 +++++++
 6 files changed, 154 insertions(+), 2 deletions(-)
 create mode 100644 charts/clearml/templates/configmap-agentk8s-template.yaml
 create mode 100644 charts/clearml/templates/deployment-agentk8s.yaml
 create mode 100644 charts/clearml/templates/rbac-agentk8s.yaml

diff --git a/charts/clearml/Chart.yaml b/charts/clearml/Chart.yaml
index 00b8e3d..5e12c9c 100644
--- a/charts/clearml/Chart.yaml
+++ b/charts/clearml/Chart.yaml
@@ -2,7 +2,7 @@ apiVersion: v2
 name: clearml
 description: MLOps platform
 type: application
-version: "3.3.1"
+version: "3.4.0"
 appVersion: "1.1.1"
 home: https://clear.ml
 icon: https://raw.githubusercontent.com/allegroai/clearml/master/docs/clearml-logo.svg
diff --git a/charts/clearml/README.md b/charts/clearml/README.md
index 893adaf..0037722 100644
--- a/charts/clearml/README.md
+++ b/charts/clearml/README.md
@@ -1,6 +1,6 @@
 # ClearML Ecosystem for Kubernetes
 
-![Version: 3.3.1](https://img.shields.io/badge/Version-3.3.1-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.1.1](https://img.shields.io/badge/AppVersion-1.1.1-informational?style=flat-square)
+![Version: 3.4.0](https://img.shields.io/badge/Version-3.4.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.1.1](https://img.shields.io/badge/AppVersion-1.1.1-informational?style=flat-square)
 
 MLOps platform
 
@@ -163,6 +163,15 @@ For detailed instructions, see the [Optional Configuration](https://github.com/a
 | agentGroups.agent-group-gpu.replicaCount | int | `0` |  |
 | agentGroups.agent-group-gpu.tolerations | list | `[]` |  |
 | agentGroups.agent-group-gpu.updateStrategy | string | `"Recreate"` |  |
+| agentk8sglue.defaultDockerImage | string | `"nvidia/cuda:11.3.1-cudnn8-runtime-ubuntu18.04"` |  |
+| agentk8sglue.enabled | bool | `false` |  |
+| agentk8sglue.id | string | `"k8s-agent"` |  |
+| agentk8sglue.image.repository | string | `"allegroai/clearml-agent-k8s"` |  |
+| agentk8sglue.image.tag | string | `"aws-latest-1.21"` |  |
+| agentk8sglue.maxPods | int | `10` |  |
+| agentk8sglue.podTemplate.nodeSelector | object | `{}` |  |
+| agentk8sglue.podTemplate.tolerations | list | `[]` |  |
+| agentk8sglue.queue | string | `"aws-instances"` |  |
 | agentservices.affinity | object | `{}` |  |
 | agentservices.agentVersion | string | `""` |  |
 | agentservices.awsAccessKeyId | string | `nil` |  |
diff --git a/charts/clearml/templates/configmap-agentk8s-template.yaml b/charts/clearml/templates/configmap-agentk8s-template.yaml
new file mode 100644
index 0000000..7cccdeb
--- /dev/null
+++ b/charts/clearml/templates/configmap-agentk8s-template.yaml
@@ -0,0 +1,35 @@
+{{- if .Values.agentk8sglue.enabled }}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: k8sagent-pod-template
+data:
+  template.yaml: |
+    apiVersion: v1
+    metadata:
+      namespace: {{ .Release.Namespace }}
+    spec:
+      containers:
+      - env:
+        - name: CLEARML_API_HOST
+          value: "http://{{ include "clearml.fullname" . }}-apiserver:{{ .Values.clearml.apiserver.service.port }}"
+        - name: CLEARML_WEB_HOST
+          value: "http://{{ include "clearml.fullname" . }}-webserver"
+        - name: CLEARML_FILES_HOST
+          value: "http://{{ include "clearml.fullname" . }}-fileserver:{{ .Values.clearml.fileserver.service.port }}"
+        - name: CLEARML_API_ACCESS_KEY
+          valueFrom:
+            secretKeyRef:
+              name: clearml-conf
+              key: apiserver_key
+        - name: CLEARML_API_SECRET_KEY
+          valueFrom:
+            secretKeyRef:
+              name: clearml-conf
+              key: apiserver_secret
+      tolerations:
+        {{- toYaml .Values.agentk8sglue.podTemplate.tolerations | nindent 8 }}
+      nodeSelector:
+        {{- toYaml .Values.agentk8sglue.podTemplate.nodeSelector | nindent 8 }}
+{{- end }}
+
diff --git a/charts/clearml/templates/deployment-agentk8s.yaml b/charts/clearml/templates/deployment-agentk8s.yaml
new file mode 100644
index 0000000..c65116b
--- /dev/null
+++ b/charts/clearml/templates/deployment-agentk8s.yaml
@@ -0,0 +1,62 @@
+{{- if .Values.agentk8sglue.enabled }}
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: "{{ include "clearml.fullname" . }}-k8sagent"
+  labels:
+    app: k8sagent
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: k8sagent
+  template:
+    metadata:
+      labels:
+        app: k8sagent
+    spec:
+      containers:
+        - name: k8s-glue-container
+          image: "{{ .Values.agentk8sglue.image.repository }}:{{ .Values.agentk8sglue.image.tag }}"
+          imagePullPolicy: Always
+          command: ["/bin/bash", "-c", "export PATH=$PATH:$HOME/bin; source /root/.bashrc && /root/entrypoint.sh"]
+          volumeMounts:
+            - name: k8sagent-pod-template
+              mountPath: /root/template
+          env:
+            - name: CLEARML_API_HOST
+              value: "http://{{ include "clearml.fullname" . }}-apiserver:{{ .Values.clearml.apiserver.service.port }}"
+            - name: CLEARML_WEB_HOST
+              value: "http://{{ include "clearml.fullname" . }}-webserver"
+            - name: CLEARML_FILES_HOST
+              value: "http://{{ include "clearml.fullname" . }}-fileserver:{{ .Values.clearml.fileserver.service.port }}"
+            - name: K8S_GLUE_MAX_PODS
+              value: "{{.Values.agentk8sglue.maxPods}}"
+            - name: K8S_GLUE_QUEUE
+              value: "{{.Values.agentk8sglue.queue}}"
+            - name: K8S_GLUE_EXTRA_ARGS
+              value: "--template-yaml /root/template/template.yaml"
+            - name: CLEARML_API_ACCESS_KEY
+              valueFrom:
+                secretKeyRef:
+                  name: clearml-conf
+                  key: apiserver_key
+            - name: CLEARML_API_SECRET_KEY
+              valueFrom:
+                secretKeyRef:
+                  name: clearml-conf
+                  key: apiserver_secret
+            - name: CLEARML_WORKER_ID
+              value: "{{.Values.agentk8sglue.id}}"
+            - name: CLEARML_AGENT_UPDATE_REPO
+              value: ""
+            - name: FORCE_CLEARML_AGENT_REPO
+              value: ""
+            - name: CLEARML_DOCKER_IMAGE
+              value: "{{.Values.agentk8sglue.defaultDockerImage}}"
+      volumes:
+        - name: k8sagent-pod-template
+          configMap:
+            name: k8sagent-pod-template
+{{- end }}
+
diff --git a/charts/clearml/templates/rbac-agentk8s.yaml b/charts/clearml/templates/rbac-agentk8s.yaml
new file mode 100644
index 0000000..5610ff2
--- /dev/null
+++ b/charts/clearml/templates/rbac-agentk8s.yaml
@@ -0,0 +1,25 @@
+{{- if .Values.agentk8sglue.enabled }}
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: k8sagent-pods-access
+rules:
+  - apiGroups:
+      - ""
+    resources:
+      - pods
+    verbs: ["get", "list", "watch", "create", "patch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: k8sagent-pods-access
+subjects:
+  - kind: ServiceAccount
+    name: default
+    namespace: {{ .Release.Namespace }}  # Helm built-in objects are capitalized
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: k8sagent-pods-access
+{{- end }}
diff --git a/charts/clearml/values.yaml b/charts/clearml/values.yaml
index feb2b91..d9544d1 100644
--- a/charts/clearml/values.yaml
+++ b/charts/clearml/values.yaml
@@ -294,6 +294,27 @@ agentGroups:
     affinity: {}
 
 
+# This agent will spawn queued experiments in new pods, a good use case is to combine this with
+# GPU autoscaling nodes.
+# https://github.com/allegroai/clearml-agent/tree/master/docker/k8s-glue
+agentk8sglue:
+  enabled: false
+  image:
+    repository: "allegroai/clearml-agent-k8s"
+    tag: "aws-latest-1.21"
+  maxPods: 10
+  defaultDockerImage: nvidia/cuda:11.3.1-cudnn8-runtime-ubuntu18.04  # default docker image that is spawned as new pod
+  queue: aws-instances  # create this queue manually in the UI first for it to work
+  id: k8s-agent
+  podTemplate:
+    tolerations: []  # list of Kubernetes tolerations, e.g.:
+      # - key: "nvidia.com/gpu"
+      #   operator: Exists
+      #   effect: "NoSchedule"
+    nodeSelector: {}
+      # fleet: gpu-nodes
+
+
 externalServices:
   # -- Existing ElasticSearch Hostname to use if elasticsearch.enabled is false
   elasticsearchHost: ""