Compare commits

...

3 Commits

Author SHA1 Message Date
Niels ten Boom
9c15a8a348 fix: faulty service values references in k8s agent (#50)
* add k8s glue deployment

* more docs

* bump

* disabled by default

* run helm-docs

* fix service references

* fix readme

* add values file where k8sagent enabled

* empty files

* newline

* fix linter

Co-authored-by: Valeriano Manassero <14011549+valeriano-manassero@users.noreply.github.com>
2022-01-21 16:15:09 +01:00
Niels ten Boom
cd7f22f7d8 feat: Add k8s glue agent deployment (#49) 2022-01-18 23:27:12 +01:00
Shaun Howell
078e394e24 update ingress templates to accept per-service annotation overrides (#48)
* update ingress yamls to accept annotation overrides

* bump version to 3.3.1

* update readme via helm-docs
2022-01-18 18:06:01 +01:00
12 changed files with 192 additions and 11 deletions

View File

@@ -2,7 +2,7 @@ apiVersion: v2
name: clearml
description: MLOps platform
type: application
version: "3.3.0"
version: "3.4.1"
appVersion: "1.1.1"
home: https://clear.ml
icon: https://raw.githubusercontent.com/allegroai/clearml/master/docs/clearml-logo.svg

View File

@@ -1,6 +1,6 @@
# ClearML Ecosystem for Kubernetes
![Version: 3.3.0](https://img.shields.io/badge/Version-3.3.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.1.1](https://img.shields.io/badge/AppVersion-1.1.1-informational?style=flat-square)
![Version: 3.4.1](https://img.shields.io/badge/Version-3.4.1-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.1.1](https://img.shields.io/badge/AppVersion-1.1.1-informational?style=flat-square)
MLOps platform
@@ -163,6 +163,16 @@ For detailed instructions, see the [Optional Configuration](https://github.com/a
| agentGroups.agent-group-gpu.replicaCount | int | `0` | |
| agentGroups.agent-group-gpu.tolerations | list | `[]` | |
| agentGroups.agent-group-gpu.updateStrategy | string | `"Recreate"` | |
| agentk8sglue.defaultDockerImage | string | `"nvidia/cuda:11.3.1-cudnn8-runtime-ubuntu18.04"` | |
| agentk8sglue.enabled | bool | `false` | |
| agentk8sglue.id | string | `"k8s-agent"` | |
| agentk8sglue.image.repository | string | `"allegroai/clearml-agent-k8s"` | |
| agentk8sglue.image.tag | string | `"aws-latest-1.21"` | |
| agentk8sglue.maxPods | int | `10` | |
| agentk8sglue.podTemplate.nodeSelector | object | `{}` | |
| agentk8sglue.podTemplate.resources | object | `{}` | |
| agentk8sglue.podTemplate.tolerations | object | `{}` | |
| agentk8sglue.queue | string | `"aws-instances"` | |
| agentservices.affinity | object | `{}` | |
| agentservices.agentVersion | string | `""` | |
| agentservices.awsAccessKeyId | string | `nil` | |
@@ -268,11 +278,14 @@ For detailed instructions, see the [Optional Configuration](https://github.com/a
| fileserver.storage.data.size | string | `"50Gi"` | |
| fileserver.tolerations | list | `[]` | |
| ingress.annotations | object | `{}` | |
| ingress.api.annotations | object | `{}` | |
| ingress.api.hostName | string | `"api.clearml.127-0-0-1.nip.io"` | |
| ingress.api.tlsSecretName | string | `""` | |
| ingress.app.annotations | object | `{}` | |
| ingress.app.hostName | string | `"app.clearml.127-0-0-1.nip.io"` | |
| ingress.app.tlsSecretName | string | `""` | |
| ingress.enabled | bool | `false` | |
| ingress.files.annotations | object | `{}` | |
| ingress.files.hostName | string | `"files.clearml.127-0-0-1.nip.io"` | |
| ingress.files.tlsSecretName | string | `""` | |
| ingress.name | string | `"clearml-server-ingress"` | |

View File

@@ -0,0 +1,7 @@
Place values files with different values in this directory to ensure these cases are tested by the CI as well.
https://github.com/helm/chart-testing/blob/main/doc/ct_install.md
```
"Charts may have multiple custom values files matching the glob pattern '*-values.yaml' in a directory named 'ci' in the root of the chart's directory. The chart is installed and tested for each of these files. If no custom values file is present, the chart is installed and tested with defaults."
```

View File

@@ -0,0 +1 @@
# empty so default values.yaml gets tested

View File

@@ -0,0 +1,2 @@
agentk8sglue:
enabled: true

View File

@@ -0,0 +1,37 @@
{{- if .Values.agentk8sglue.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
name: k8sagent-pod-template
data:
template.yaml: |
apiVersion: v1
metadata:
namespace: {{ .Release.namespace }}
spec:
containers:
- resources:
{{- toYaml .Values.agentk8sglue.podTemplate.resources | nindent 10 }}
env:
- name: CLEARML_API_HOST
value: "http://{{ include "clearml.fullname" . }}-apiserver:{{ .Values.apiserver.service.port }}"
- name: CLEARML_WEB_HOST
value: "http://{{ include "clearml.fullname" . }}-webserver"
- name: CLEARML_FILES_HOST
value: "http://{{ include "clearml.fullname" . }}-fileserver:{{ .Values.fileserver.service.port }}"
- name: CLEARML_API_ACCESS_KEY
valueFrom:
secretKeyRef:
name: clearml-conf
key: apiserver_key
- name: CLEARML_API_SECRET_KEY
valueFrom:
secretKeyRef:
name: clearml-conf
key: apiserver_secret
tolerations:
{{- toYaml .Values.agentk8sglue.podTemplate.tolerations | nindent 8 }}
nodeSelector:
{{- toYaml .Values.agentk8sglue.podTemplate.nodeSelector | nindent 8 }}
{{- end }}

View File

@@ -0,0 +1,62 @@
{{- if .Values.agentk8sglue.enabled }}
apiVersion: apps/v1
kind: Deployment
metadata:
name: "{{ include "clearml.fullname" . }}-k8sagent"
labels:
app: k8sagent
spec:
replicas: 1
selector:
matchLabels:
app: k8sagent
template:
metadata:
labels:
app: k8sagent
spec:
containers:
- name: k8s-glue-container
image: "{{ .Values.agentk8sglue.image.repository }}:{{ .Values.agentk8sglue.image.tag }}"
imagePullPolicy: Always
command: ["/bin/bash", "-c", "export PATH=$PATH:$HOME/bin; source /root/.bashrc && /root/entrypoint.sh"]
volumeMounts:
- name: k8sagent-pod-template
mountPath: /root/template
env:
- name: CLEARML_API_HOST
value: "http://{{ include "clearml.fullname" . }}-apiserver:{{ .Values.apiserver.service.port }}"
- name: CLEARML_WEB_HOST
value: "http://{{ include "clearml.fullname" . }}-webserver"
- name: CLEARML_FILES_HOST
value: "http://{{ include "clearml.fullname" . }}-fileserver:{{ .Values.fileserver.service.port }}"
- name: K8S_GLUE_MAX_PODS
value: "{{.Values.agentk8sglue.maxPods}}"
- name: K8S_GLUE_QUEUE
value: "{{.Values.agentk8sglue.queue}}"
- name: K8S_GLUE_EXTRA_ARGS
value: "--template-yaml /root/template/template.yaml"
- name: CLEARML_API_ACCESS_KEY
valueFrom:
secretKeyRef:
name: clearml-conf
key: apiserver_key
- name: CLEARML_API_SECRET_KEY
valueFrom:
secretKeyRef:
name: clearml-conf
key: apiserver_secret
- name: CLEARML_WORKER_ID
value: "{{.Values.agentk8sglue.id}}"
- name: CLEARML_AGENT_UPDATE_REPO
value: ""
- name: FORCE_CLEARML_AGENT_REPO
value: ""
- name: CLEARML_DOCKER_IMAGE
value: "{{.Values.agentk8sglue.defaultDockerImage}}"
volumes:
- name: k8sagent-pod-template
configMap:
name: k8sagent-pod-template
{{- end }}

View File

@@ -11,10 +11,13 @@ metadata:
name: {{ include "clearml.fullname" . }}-api
labels:
{{- include "clearml.labels" . | nindent 4 }}
{{- with .Values.ingress.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- $annotations := .Values.ingress.annotations }}
{{- if .Values.ingress.api.annotations }}
{{- $annotations = mergeOverwrite $annotations .Values.ingress.api.annotations }}
{{- end }}
annotations:
{{- toYaml $annotations | nindent 4 }}
spec:
{{- if .Values.ingress.api.tlsSecretName }}
tls:

View File

@@ -11,10 +11,12 @@ metadata:
name: {{ include "clearml.fullname" . }}-app
labels:
{{- include "clearml.labels" . | nindent 4 }}
{{- with .Values.ingress.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- $annotations := .Values.ingress.annotations }}
{{- if .Values.ingress.app.annotations }}
{{- $annotations = mergeOverwrite $annotations .Values.ingress.app.annotations }}
{{- end }}
annotations:
{{- toYaml $annotations | nindent 4 }}
spec:
{{- if .Values.ingress.app.tlsSecretName }}
tls:

View File

@@ -11,10 +11,12 @@ metadata:
name: {{ include "clearml.fullname" . }}-files
labels:
{{- include "clearml.labels" . | nindent 4 }}
{{- with .Values.ingress.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- $annotations := .Values.ingress.annotations }}
{{- if .Values.ingress.files.annotations }}
{{- $annotations = mergeOverwrite $annotations .Values.ingress.files.annotations }}
{{- end }}
annotations:
{{- toYaml $annotations | nindent 4 }}
spec:
{{- if .Values.ingress.files.tlsSecretName }}
tls:

View File

@@ -0,0 +1,25 @@
{{- if .Values.agentk8sglue.enabled }}
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: k8sagent-pods-access
rules:
- apiGroups:
- ""
resources:
- pods
verbs: ["get", "list", "watch", "create", "patch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: k8sagent-pods-access
subjects:
- kind: ServiceAccount
name: default
namespace: {{ .Release.namespace }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: k8sagent-pods-access
{{- end }}

View File

@@ -7,12 +7,15 @@ ingress:
app:
hostName: "app.clearml.127-0-0-1.nip.io"
tlsSecretName: ""
annotations: {}
api:
hostName: "api.clearml.127-0-0-1.nip.io"
tlsSecretName: ""
annotations: {}
files:
hostName: "files.clearml.127-0-0-1.nip.io"
tlsSecretName: ""
annotations: {}
secret:
# -- Set for http_session field
@@ -291,6 +294,30 @@ agentGroups:
affinity: {}
# This agent will spawn queued experiments in new pods, a good use case is to combine this with
# GPU autoscaling nodes.
# https://github.com/allegroai/clearml-agent/tree/master/docker/k8s-glue
agentk8sglue:
enabled: false
image:
repository: "allegroai/clearml-agent-k8s"
tag: "aws-latest-1.21"
maxPods: 10
defaultDockerImage: nvidia/cuda:11.3.1-cudnn8-runtime-ubuntu18.04 # default docker image that is spawned as new pod
queue: aws-instances # create this queue manually in the UI first for it to work
id: k8s-agent
podTemplate:
resources: {}
# limits:
# nvidia.com/gpu: 1
tolerations: {}
# - key: "nvidia.com/gpu"
# operator: Exists
# effect: "NoSchedule"
nodeSelector: {}
# fleet: gpu-nodes
externalServices:
# -- Existing ElasticSearch Hostname to use if elasticsearch.enabled is false
elasticsearchHost: ""