Compare commits

..

10 Commits

Author SHA1 Message Date
Niels ten Boom
cd7f22f7d8 feat: Add k8s glue agent deployment (#49) 2022-01-18 23:27:12 +01:00
Shaun Howell
078e394e24 update ingress templates to accept per-service annotation overrides (#48)
* update ingress yamls to accept annotation overrides

* bump version to 3.3.1

* update readme via helm-docs
2022-01-18 18:06:01 +01:00
Valeriano Manassero
70b07c637a Update Elasticsearch (#47)
* update elasticsearch

* update elasticsearch reference

* bump up chart version
2022-01-05 08:26:57 +01:00
Valeriano Manassero
7b8e40c626 Agent foreground mode (#46)
* use foreground to push output on console

* bump up version
2021-12-13 09:04:02 +01:00
Valeriano Manassero
d8117eeb0d add k8s 1.22.1 to ci procedure (#44)
After some tests I found 1.22.1 doesn't have ulimit issue so I can include it into the ci process
2021-12-09 11:47:13 +01:00
Valeriano Manassero
4c09ae2c92 Fix env typo (#39)
* typo fix

* bump up version
2021-12-09 11:39:04 +01:00
Valeriano Manassero
478eecd5f2 remove k8s 1.22 from ci (#43)
It looks 1.22 k8s image from kind has a very low ulimit preventing elastic search from installing, removing it waiting for a fix.
2021-12-09 11:30:15 +01:00
Valeriano Manassero
43f4c44219 test one single kind cluster at time to avoid pressure fails (#42) 2021-12-09 11:00:57 +01:00
Valeriano Manassero
b83c8cd0e8 indentation fix (#41) 2021-12-09 10:32:42 +01:00
Valeriano Manassero
97f219228d update kind k8s versions (#40) 2021-12-09 10:31:17 +01:00
15 changed files with 193 additions and 27 deletions

View File

@@ -21,8 +21,9 @@ jobs:
strategy:
matrix:
k8s:
- v1.20.7
- v1.21.1
- v1.21.2
- v1.22.1
- v1.23.0
steps:
- name: Checkout
uses: actions/checkout@v1

View File

@@ -7,6 +7,6 @@ dependencies:
version: 10.3.4
- name: elasticsearch
repository: https://helm.elastic.co
version: 7.10.1
digest: sha256:aefd3992b2ab085161e4cca35c6f73dd33f8d19272a9405b5ee4e8c2a0e79bba
generated: "2021-01-05T14:26:33.629164+01:00"
version: 7.16.2
digest: sha256:ac733cb02d50e8398c1d2832988333896f1c7b765c19a0f1eea5e9b24bdb8207
generated: "2022-01-05T07:52:34.913745+01:00"

View File

@@ -2,7 +2,7 @@ apiVersion: v2
name: clearml
description: MLOps platform
type: application
version: "3.2.0"
version: "3.4.0"
appVersion: "1.1.1"
home: https://clear.ml
icon: https://raw.githubusercontent.com/allegroai/clearml/master/docs/clearml-logo.svg
@@ -18,14 +18,14 @@ keywords:
- mlops
dependencies:
- name: redis
version: "~10.9.0"
version: "10.9.0"
repository: "https://charts.bitnami.com/bitnami"
condition: redis.enabled
- name: mongodb
version: "~10.3.2"
version: "10.3.4"
repository: "https://charts.bitnami.com/bitnami"
condition: mongodb.enabled
- name: elasticsearch
version: "~7.10.1"
version: "7.16.2"
repository: "https://helm.elastic.co"
condition: elasticsearch.enabled

View File

@@ -1,6 +1,6 @@
# ClearML Ecosystem for Kubernetes
![Version: 3.2.0](https://img.shields.io/badge/Version-3.2.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.1.1](https://img.shields.io/badge/AppVersion-1.1.1-informational?style=flat-square)
![Version: 3.4.0](https://img.shields.io/badge/Version-3.4.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.1.1](https://img.shields.io/badge/AppVersion-1.1.1-informational?style=flat-square)
MLOps platform
@@ -107,9 +107,9 @@ For detailed instructions, see the [Optional Configuration](https://github.com/a
| Repository | Name | Version |
|------------|------|---------|
| https://charts.bitnami.com/bitnami | mongodb | ~10.3.2 |
| https://charts.bitnami.com/bitnami | redis | ~10.9.0 |
| https://helm.elastic.co | elasticsearch | ~7.10.1 |
| https://charts.bitnami.com/bitnami | mongodb | 10.3.4 |
| https://charts.bitnami.com/bitnami | redis | 10.9.0 |
| https://helm.elastic.co | elasticsearch | 7.16.2 |
## Values
@@ -163,6 +163,15 @@ For detailed instructions, see the [Optional Configuration](https://github.com/a
| agentGroups.agent-group-gpu.replicaCount | int | `0` | |
| agentGroups.agent-group-gpu.tolerations | list | `[]` | |
| agentGroups.agent-group-gpu.updateStrategy | string | `"Recreate"` | |
| agentk8sglue.defaultDockerImage | string | `"nvidia/cuda:11.3.1-cudnn8-runtime-ubuntu18.04"` | |
| agentk8sglue.enabled | bool | `false` | |
| agentk8sglue.id | string | `"k8s-agent"` | |
| agentk8sglue.image.repository | string | `"allegroai/clearml-agent-k8s"` | |
| agentk8sglue.image.tag | string | `"aws-latest-1.21"` | |
| agentk8sglue.maxPods | int | `10` | |
| agentk8sglue.podTemplate.nodeSelector | object | `{}` | |
| agentk8sglue.podTemplate.tolerations | object | `{}` | |
| agentk8sglue.queue | string | `"aws-instances"` | |
| agentservices.affinity | object | `{}` | |
| agentservices.agentVersion | string | `""` | |
| agentservices.awsAccessKeyId | string | `nil` | |
@@ -191,7 +200,7 @@ For detailed instructions, see the [Optional Configuration](https://github.com/a
| agentservices.tolerations | list | `[]` | |
| apiserver.additionalConfigs | object | `{}` | |
| apiserver.affinity | object | `{}` | |
| apiserver.authCoockiesMaxAge | int | `864000` | Amount of seconds the authorization cookie will last in user browser |
| apiserver.authCookiesMaxAge | int | `864000` | Amount of seconds the authorization cookie will last in user browser |
| apiserver.configDir | string | `"/opt/clearml/config"` | |
| apiserver.extraEnvs | list | `[]` | |
| apiserver.image.pullPolicy | string | `"IfNotPresent"` | |
@@ -268,11 +277,14 @@ For detailed instructions, see the [Optional Configuration](https://github.com/a
| fileserver.storage.data.size | string | `"50Gi"` | |
| fileserver.tolerations | list | `[]` | |
| ingress.annotations | object | `{}` | |
| ingress.api.annotations | object | `{}` | |
| ingress.api.hostName | string | `"api.clearml.127-0-0-1.nip.io"` | |
| ingress.api.tlsSecretName | string | `""` | |
| ingress.app.annotations | object | `{}` | |
| ingress.app.hostName | string | `"app.clearml.127-0-0-1.nip.io"` | |
| ingress.app.tlsSecretName | string | `""` | |
| ingress.enabled | bool | `false` | |
| ingress.files.annotations | object | `{}` | |
| ingress.files.hostName | string | `"files.clearml.127-0-0-1.nip.io"` | |
| ingress.files.tlsSecretName | string | `""` | |
| ingress.name | string | `"clearml-server-ingress"` | |

Binary file not shown.

View File

@@ -0,0 +1,35 @@
{{- if .Values.agentk8sglue.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
name: k8sagent-pod-template
data:
template.yaml: |
apiVersion: v1
metadata:
namespace: {{ .Release.namespace }}
spec:
containers:
- env:
- name: CLEARML_API_HOST
value: "http://{{ include "clearml.fullname" . }}-apiserver:{{ .Values.clearml.apiserver.service.port }}"
- name: CLEARML_WEB_HOST
value: "http://{{ include "clearml.fullname" . }}-webserver"
- name: CLEARML_FILES_HOST
value: "http://{{ include "clearml.fullname" . }}-fileserver:{{ .Values.clearml.fileserver.service.port }}"
- name: CLEARML_API_ACCESS_KEY
valueFrom:
secretKeyRef:
name: clearml-conf
key: apiserver_key
- name: CLEARML_API_SECRET_KEY
valueFrom:
secretKeyRef:
name: clearml-conf
key: apiserver_secret
tolerations:
{{- toYaml .Values.agentk8sglue.podTemplate.tolerations | nindent 8 }}
nodeSelector:
{{- toYaml .Values.agentk8sglue.podTemplate.nodeSelector | nindent 8 }}
{{- end }}

View File

@@ -94,7 +94,7 @@ spec:
apt-get install -y curl python3-pip git;
python3 -m pip install -U pip ;
python3 -m pip install clearml-agent{{ .agentVersion}} ;
CLEARML_AGENT_K8S_HOST_MOUNT=/root/.clearml:/root/.clearml clearml-agent daemon --queue {{ .queues}}"
CLEARML_AGENT_K8S_HOST_MOUNT=/root/.clearml:/root/.clearml clearml-agent daemon --foreground --queue {{ .queues}}"
{{ if .clearmlConfig }}
volumeMounts:
- name: agent-clearml-conf-volume

View File

@@ -0,0 +1,62 @@
{{- if .Values.agentk8sglue.enabled }}
apiVersion: apps/v1
kind: Deployment
metadata:
name: "{{ include "clearml.fullname" . }}-k8sagent"
labels:
app: k8sagent
spec:
replicas: 1
selector:
matchLabels:
app: k8sagent
template:
metadata:
labels:
app: k8sagent
spec:
containers:
- name: k8s-glue-container
image: "{{ .Values.agentk8sglue.image.repository }}:{{ .Values.agentk8sglue.image.tag }}"
imagePullPolicy: Always
command: ["/bin/bash", "-c", "export PATH=$PATH:$HOME/bin; source /root/.bashrc && /root/entrypoint.sh"]
volumeMounts:
- name: k8sagent-pod-template
mountPath: /root/template
env:
- name: CLEARML_API_HOST
value: "http://{{ include "clearml.fullname" . }}-apiserver:{{ .Values.clearml.apiserver.service.port }}"
- name: CLEARML_WEB_HOST
value: "http://{{ include "clearml.fullname" . }}-webserver"
- name: CLEARML_FILES_HOST
value: "http://{{ include "clearml.fullname" . }}-fileserver:{{ .Values.clearml.fileserver.service.port }}"
- name: K8S_GLUE_MAX_PODS
value: "{{.Values.agentk8sglue.maxPods}}"
- name: K8S_GLUE_QUEUE
value: "{{.Values.agentk8sglue.queue}}"
- name: K8S_GLUE_EXTRA_ARGS
value: "--template-yaml /root/template/template.yaml"
- name: CLEARML_API_ACCESS_KEY
valueFrom:
secretKeyRef:
name: clearml-conf
key: apiserver_key
- name: CLEARML_API_SECRET_KEY
valueFrom:
secretKeyRef:
name: clearml-conf
key: apiserver_secret
- name: CLEARML_WORKER_ID
value: "{{.Values.agentk8sglue.id}}"
- name: CLEARML_AGENT_UPDATE_REPO
value: ""
- name: FORCE_CLEARML_AGENT_REPO
value: ""
- name: CLEARML_DOCKER_IMAGE
value: "{{.Values.agentk8sglue.defaultDockerImage}}"
volumes:
- name: k8sagent-pod-template
configMap:
name: k8sagent-pod-template
{{- end }}

View File

@@ -71,7 +71,7 @@ spec:
- name: CLEARML_SERVER_DEPLOYMENT_TYPE
value: "helm-cloud"
- name: CLEARML__APISERVER__AUTH__COOKIES__MAX_AGE
value: "{{ .Values.apiserver.authCoockiesMaxAge }}"
value: "{{ .Values.apiserver.authCookiesMaxAge }}"
- name: CLEARML_CONFIG_DIR
value: /opt/clearml/config
- name: CLEARML__APISERVER__DEFAULT_COMPANY

View File

@@ -11,10 +11,13 @@ metadata:
name: {{ include "clearml.fullname" . }}-api
labels:
{{- include "clearml.labels" . | nindent 4 }}
{{- with .Values.ingress.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- $annotations := .Values.ingress.annotations }}
{{- if .Values.ingress.api.annotations }}
{{- $annotations = mergeOverwrite $annotations .Values.ingress.api.annotations }}
{{- end }}
annotations:
{{- toYaml $annotations | nindent 4 }}
spec:
{{- if .Values.ingress.api.tlsSecretName }}
tls:

View File

@@ -11,10 +11,12 @@ metadata:
name: {{ include "clearml.fullname" . }}-app
labels:
{{- include "clearml.labels" . | nindent 4 }}
{{- with .Values.ingress.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- $annotations := .Values.ingress.annotations }}
{{- if .Values.ingress.app.annotations }}
{{- $annotations = mergeOverwrite $annotations .Values.ingress.app.annotations }}
{{- end }}
annotations:
{{- toYaml $annotations | nindent 4 }}
spec:
{{- if .Values.ingress.app.tlsSecretName }}
tls:

View File

@@ -11,10 +11,12 @@ metadata:
name: {{ include "clearml.fullname" . }}-files
labels:
{{- include "clearml.labels" . | nindent 4 }}
{{- with .Values.ingress.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- $annotations := .Values.ingress.annotations }}
{{- if .Values.ingress.files.annotations }}
{{- $annotations = mergeOverwrite $annotations .Values.ingress.files.annotations }}
{{- end }}
annotations:
{{- toYaml $annotations | nindent 4 }}
spec:
{{- if .Values.ingress.files.tlsSecretName }}
tls:

View File

@@ -0,0 +1,25 @@
{{- if .Values.agentk8sglue.enabled }}
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: k8sagent-pods-access
rules:
- apiGroups:
- ""
resources:
- pods
verbs: ["get", "list", "watch", "create", "patch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: k8sagent-pods-access
subjects:
- kind: ServiceAccount
name: default
namespace: {{ .Release.namespace }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: k8sagent-pods-access
{{- end }}

View File

@@ -7,12 +7,15 @@ ingress:
app:
hostName: "app.clearml.127-0-0-1.nip.io"
tlsSecretName: ""
annotations: {}
api:
hostName: "api.clearml.127-0-0-1.nip.io"
tlsSecretName: ""
annotations: {}
files:
hostName: "files.clearml.127-0-0-1.nip.io"
tlsSecretName: ""
annotations: {}
secret:
# -- Set for http_session field
@@ -38,7 +41,7 @@ apiserver:
configDir: /opt/clearml/config
# -- Amount of seconds the authorization cookie will last in user browser
authCoockiesMaxAge: 864000
authCookiesMaxAge: 864000
service:
# -- This will set to service's spec.type field
@@ -291,6 +294,27 @@ agentGroups:
affinity: {}
# This agent will spawn queued experiments in new pods, a good use case is to combine this with
# GPU autoscaling nodes.
# https://github.com/allegroai/clearml-agent/tree/master/docker/k8s-glue
agentk8sglue:
enabled: false
image:
repository: "allegroai/clearml-agent-k8s"
tag: "aws-latest-1.21"
maxPods: 10
defaultDockerImage: nvidia/cuda:11.3.1-cudnn8-runtime-ubuntu18.04 # default docker image that is spawned as new pod
queue: aws-instances # create this queue manually in the UI first for it to work
id: k8s-agent
podTemplate:
tolerations: {}
# - key: "nvidia.com/gpu"
# operator: Exists
# effect: "NoSchedule"
nodeSelector: {}
# fleet: gpu-nodes
externalServices:
# -- Existing ElasticSearch Hostname to use if elasticsearch.enabled is false
elasticsearchHost: ""
@@ -337,7 +361,7 @@ mongodb: # configuration from https://github.com/bitnami/charts/blob/master/bit
port: 27017
portName: mongo-service
elasticsearch: # configuration from https://github.com/elastic/helm-charts/blob/7.10/elasticsearch/values.yaml
elasticsearch: # configuration from https://github.com/elastic/helm-charts/blob/7.16/elasticsearch/values.yaml
enabled: true
httpPort: 9200
roles: