feat: Add k8s glue agent deployment (#49)

This commit is contained in:
Niels ten Boom 2022-01-18 23:27:12 +01:00 committed by GitHub
parent 078e394e24
commit cd7f22f7d8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 154 additions and 2 deletions

View File

@ -2,7 +2,7 @@ apiVersion: v2
name: clearml name: clearml
description: MLOps platform description: MLOps platform
type: application type: application
version: "3.3.1" version: "3.4.0"
appVersion: "1.1.1" appVersion: "1.1.1"
home: https://clear.ml home: https://clear.ml
icon: https://raw.githubusercontent.com/allegroai/clearml/master/docs/clearml-logo.svg icon: https://raw.githubusercontent.com/allegroai/clearml/master/docs/clearml-logo.svg

View File

@ -1,6 +1,6 @@
# ClearML Ecosystem for Kubernetes # ClearML Ecosystem for Kubernetes
![Version: 3.3.1](https://img.shields.io/badge/Version-3.3.1-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.1.1](https://img.shields.io/badge/AppVersion-1.1.1-informational?style=flat-square) ![Version: 3.4.0](https://img.shields.io/badge/Version-3.4.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.1.1](https://img.shields.io/badge/AppVersion-1.1.1-informational?style=flat-square)
MLOps platform MLOps platform
@ -163,6 +163,15 @@ For detailed instructions, see the [Optional Configuration](https://github.com/a
| agentGroups.agent-group-gpu.replicaCount | int | `0` | | | agentGroups.agent-group-gpu.replicaCount | int | `0` | |
| agentGroups.agent-group-gpu.tolerations | list | `[]` | | | agentGroups.agent-group-gpu.tolerations | list | `[]` | |
| agentGroups.agent-group-gpu.updateStrategy | string | `"Recreate"` | | | agentGroups.agent-group-gpu.updateStrategy | string | `"Recreate"` | |
| agentk8sglue.defaultDockerImage | string | `"nvidia/cuda:11.3.1-cudnn8-runtime-ubuntu18.04"` | |
| agentk8sglue.enabled | bool | `false` | |
| agentk8sglue.id | string | `"k8s-agent"` | |
| agentk8sglue.image.repository | string | `"allegroai/clearml-agent-k8s"` | |
| agentk8sglue.image.tag | string | `"aws-latest-1.21"` | |
| agentk8sglue.maxPods | int | `10` | |
| agentk8sglue.podTemplate.nodeSelector | object | `{}` | |
| agentk8sglue.podTemplate.tolerations | object | `{}` | |
| agentk8sglue.queue | string | `"aws-instances"` | |
| agentservices.affinity | object | `{}` | | | agentservices.affinity | object | `{}` | |
| agentservices.agentVersion | string | `""` | | | agentservices.agentVersion | string | `""` | |
| agentservices.awsAccessKeyId | string | `nil` | | | agentservices.awsAccessKeyId | string | `nil` | |

View File

@ -0,0 +1,35 @@
{{- if .Values.agentk8sglue.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
name: k8sagent-pod-template
data:
template.yaml: |
apiVersion: v1
metadata:
namespace: {{ .Release.namespace }}
spec:
containers:
- env:
- name: CLEARML_API_HOST
value: "http://{{ include "clearml.fullname" . }}-apiserver:{{ .Values.clearml.apiserver.service.port }}"
- name: CLEARML_WEB_HOST
value: "http://{{ include "clearml.fullname" . }}-webserver"
- name: CLEARML_FILES_HOST
value: "http://{{ include "clearml.fullname" . }}-fileserver:{{ .Values.clearml.fileserver.service.port }}"
- name: CLEARML_API_ACCESS_KEY
valueFrom:
secretKeyRef:
name: clearml-conf
key: apiserver_key
- name: CLEARML_API_SECRET_KEY
valueFrom:
secretKeyRef:
name: clearml-conf
key: apiserver_secret
tolerations:
{{- toYaml .Values.agentk8sglue.podTemplate.tolerations | nindent 8 }}
nodeSelector:
{{- toYaml .Values.agentk8sglue.podTemplate.nodeSelector | nindent 8 }}
{{- end }}

View File

@ -0,0 +1,62 @@
{{- if .Values.agentk8sglue.enabled }}
apiVersion: apps/v1
kind: Deployment
metadata:
name: "{{ include "clearml.fullname" . }}-k8sagent"
labels:
app: k8sagent
spec:
replicas: 1
selector:
matchLabels:
app: k8sagent
template:
metadata:
labels:
app: k8sagent
spec:
containers:
- name: k8s-glue-container
image: "{{ .Values.agentk8sglue.image.repository }}:{{ .Values.agentk8sglue.image.tag }}"
imagePullPolicy: Always
command: ["/bin/bash", "-c", "export PATH=$PATH:$HOME/bin; source /root/.bashrc && /root/entrypoint.sh"]
volumeMounts:
- name: k8sagent-pod-template
mountPath: /root/template
env:
- name: CLEARML_API_HOST
value: "http://{{ include "clearml.fullname" . }}-apiserver:{{ .Values.clearml.apiserver.service.port }}"
- name: CLEARML_WEB_HOST
value: "http://{{ include "clearml.fullname" . }}-webserver"
- name: CLEARML_FILES_HOST
value: "http://{{ include "clearml.fullname" . }}-fileserver:{{ .Values.clearml.fileserver.service.port }}"
- name: K8S_GLUE_MAX_PODS
value: "{{.Values.agentk8sglue.maxPods}}"
- name: K8S_GLUE_QUEUE
value: "{{.Values.agentk8sglue.queue}}"
- name: K8S_GLUE_EXTRA_ARGS
value: "--template-yaml /root/template/template.yaml"
- name: CLEARML_API_ACCESS_KEY
valueFrom:
secretKeyRef:
name: clearml-conf
key: apiserver_key
- name: CLEARML_API_SECRET_KEY
valueFrom:
secretKeyRef:
name: clearml-conf
key: apiserver_secret
- name: CLEARML_WORKER_ID
value: "{{.Values.agentk8sglue.id}}"
- name: CLEARML_AGENT_UPDATE_REPO
value: ""
- name: FORCE_CLEARML_AGENT_REPO
value: ""
- name: CLEARML_DOCKER_IMAGE
value: "{{.Values.agentk8sglue.defaultDockerImage}}"
volumes:
- name: k8sagent-pod-template
configMap:
name: k8sagent-pod-template
{{- end }}

View File

@ -0,0 +1,25 @@
{{- if .Values.agentk8sglue.enabled }}
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: k8sagent-pods-access
rules:
- apiGroups:
- ""
resources:
- pods
verbs: ["get", "list", "watch", "create", "patch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: k8sagent-pods-access
subjects:
- kind: ServiceAccount
name: default
namespace: {{ .Release.namespace }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: k8sagent-pods-access
{{- end }}

View File

@ -294,6 +294,27 @@ agentGroups:
affinity: {} affinity: {}
# This agent will spawn queued experiments in new pods, a good use case is to combine this with
# GPU autoscaling nodes.
# https://github.com/allegroai/clearml-agent/tree/master/docker/k8s-glue
agentk8sglue:
enabled: false
image:
repository: "allegroai/clearml-agent-k8s"
tag: "aws-latest-1.21"
maxPods: 10
defaultDockerImage: nvidia/cuda:11.3.1-cudnn8-runtime-ubuntu18.04 # default docker image that is spawned as new pod
queue: aws-instances # create this queue manually in the UI first for it to work
id: k8s-agent
podTemplate:
tolerations: {}
# - key: "nvidia.com/gpu"
# operator: Exists
# effect: "NoSchedule"
nodeSelector: {}
# fleet: gpu-nodes
externalServices: externalServices:
# -- Existing ElasticSearch Hostname to use if elasticsearch.enabled is false # -- Existing ElasticSearch Hostname to use if elasticsearch.enabled is false
elasticsearchHost: "" elasticsearchHost: ""