This commit is contained in:
Niels ten Boom 2022-09-13 14:11:27 +02:00
parent 19a6785a03
commit 2c88ecc70e
No known key found for this signature in database
GPG Key ID: BA06100106A9088F
5 changed files with 15 additions and 32 deletions

View File

@ -2,7 +2,7 @@ apiVersion: v2
name: clearml-agent
description: MLOps platform
type: application
version: "1.3.0"
version: "2.0.0"
appVersion: "1.24"
kubeVersion: ">= 1.19.0-0 < 1.25.0-0"
home: https://clear.ml

View File

@ -1,6 +1,6 @@
# clearml-agent
![Version: 1.3.0](https://img.shields.io/badge/Version-1.3.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.24](https://img.shields.io/badge/AppVersion-1.24-informational?style=flat-square)
![Version: 2.0.0](https://img.shields.io/badge/Version-2.0.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.24](https://img.shields.io/badge/AppVersion-1.24-informational?style=flat-square)
MLOps platform
@ -25,7 +25,7 @@ Kubernetes: `>= 1.19.0-0 < 1.25.0-0`
| Key | Type | Default | Description |
|-----|------|---------|-------------|
| agentk8sglue | object | `{"apiServerUrlReference":"https://api.clear.ml","clearmlcheckCertificate":true,"defaultContainerImage":"ubuntu:18.04","extraEnvs":[],"fileServerUrlReference":"https://files.clear.ml","id":"k8s-agent","image":{"repository":"allegroai/clearml-agent-k8s-base","tag":"1.24-18"},"maxPods":10,"podTemplate":{"env":[],"nodeSelector":{},"resources":{},"tolerations":[],"volumes":[]},"queue":"default","replicaCount":1,"serviceAccountName":"default","webServerUrlReference":"https://app.clear.ml"}` | This agent will spawn queued experiments in new pods, a good use case is to combine this with GPU autoscaling nodes. https://github.com/allegroai/clearml-agent/tree/master/docker/k8s-glue |
| agentk8sglue | object | `{"apiServerUrlReference":"https://api.clear.ml","clearmlcheckCertificate":true,"defaultContainerImage":"ubuntu:18.04","extraEnvs":[],"fileServerUrlReference":"https://files.clear.ml","id":"k8s-agent","image":{"repository":"allegroai/clearml-agent-k8s-base","tag":"1.24-18"},"maxPods":10,"podTemplate":{"env":[],"nodeSelector":{},"resources":{},"tolerations":[],"volumeMounts":[],"volumes":[]},"queue":"default","replicaCount":1,"serviceAccountName":"default","webServerUrlReference":"https://app.clear.ml"}` | This agent will spawn queued experiments in new pods, a good use case is to combine this with GPU autoscaling nodes. https://github.com/allegroai/clearml-agent/tree/master/docker/k8s-glue |
| agentk8sglue.apiServerUrlReference | string | `"https://api.clear.ml"` | Reference to Api server url |
| agentk8sglue.clearmlcheckCertificate | bool | `true` | Check certificates validity for evefry UrlReference below. |
| agentk8sglue.defaultContainerImage | string | `"ubuntu:18.04"` | default container image for ClearML Task pod |
@ -34,11 +34,12 @@ Kubernetes: `>= 1.19.0-0 < 1.25.0-0`
| agentk8sglue.id | string | `"k8s-agent"` | ClearML worker ID (must be unique across the entire ClearMLenvironment) |
| agentk8sglue.image | object | `{"repository":"allegroai/clearml-agent-k8s-base","tag":"1.24-18"}` | Glue Agent image configuration |
| agentk8sglue.maxPods | int | `10` | maximum concurrent consume ClearML Task pod |
| agentk8sglue.podTemplate | object | `{"env":[],"nodeSelector":{},"resources":{},"tolerations":[],"volumes":[]}` | template for pods spawned to consume ClearML Task |
| agentk8sglue.podTemplate | object | `{"env":[],"nodeSelector":{},"resources":{},"tolerations":[],"volumeMounts":[],"volumes":[]}` | template for pods spawned to consume ClearML Task |
| agentk8sglue.podTemplate.env | list | `[]` | environment variables for pods spawned to consume ClearML Task (example in values.yaml comments) |
| agentk8sglue.podTemplate.nodeSelector | object | `{}` | nodeSelector setup for pods spawned to consume ClearML Task (example in values.yaml comments) |
| agentk8sglue.podTemplate.resources | object | `{}` | resources declaration for pods spawned to consume ClearML Task (example in values.yaml comments) |
| agentk8sglue.podTemplate.tolerations | list | `[]` | tolerations setup for pods spawned to consume ClearML Task (example in values.yaml comments) |
| agentk8sglue.podTemplate.volumeMounts | list | `[]` | volumeMounts definition for pods spawned to consume ClearML Task (example in values.yaml comments) |
| agentk8sglue.podTemplate.volumes | list | `[]` | volumes definition for pods spawned to consume ClearML Task (example in values.yaml comments) |
| agentk8sglue.queue | string | `"default"` | ClearML queue this agent will consume |
| agentk8sglue.replicaCount | int | `1` | Glue Agent number of pods |

View File

@ -17,21 +17,18 @@ data:
{{- end }}
{{- end }}
serviceAccountName: {{ .Values.agentk8sglue.serviceAccountName }}
{{- with .Values.agentk8sglue.podTemplate.volumes }}
volumes:
{{- range .Values.agentk8sglue.podTemplate.volumes }}
- name: {{ .name }}
persistentVolumeClaim:
claimName: {{ .name }}
{{- toYaml . | nindent 8 }}
{{- end }}
containers:
- resources:
{{- toYaml .Values.agentk8sglue.podTemplate.resources | nindent 10 }}
ports:
- containerPort: 10022
{{- with .Values.agentk8sglue.podTemplate.volumeMounts }}
volumeMounts:
{{- range .Values.agentk8sglue.podTemplate.volumes }}
- mountPath: {{ .path }}
name: {{ .name }}
{{- toYaml . | nindent 10 }}
{{- end }}
env:
- name: CLEARML_API_HOST

View File

@ -24,26 +24,6 @@ spec:
- name: {{ include "agentk8sglue.referenceName" . }}-clearml-agent-registry-key
{{- end }}
{{- end }}
initContainers:
- name: init-k8s-glue
image: "{{ .Values.agentk8sglue.image.repository }}:{{ .Values.agentk8sglue.image.tag }}"
command:
- /bin/sh
- -c
- >
set -x;
while [ $(curl {{ if not .Values.agentk8sglue.clearmlcheckCertificate }}--insecure{{ end }} -sw '%{http_code}' "{{.Values.agentk8sglue.apiServerUrlReference}}/debug.ping" -o /dev/null) -ne 200 ] ; do
echo "waiting for apiserver" ;
sleep 5 ;
done;
while [[ $(curl {{ if not .Values.agentk8sglue.clearmlcheckCertificate }}--insecure{{ end }} -sw '%{http_code}' "{{.Values.agentk8sglue.fileServerUrlReference}}/" -o /dev/null) =~ 403|405 ]] ; do
echo "waiting for fileserver" ;
sleep 5 ;
done;
while [ $(curl {{ if not .Values.agentk8sglue.clearmlcheckCertificate }}--insecure{{ end }} -sw '%{http_code}' "{{.Values.agentk8sglue.webServerUrlReference}}/" -o /dev/null) -ne 200 ] ; do
echo "waiting for webserver" ;
sleep 5 ;
done
containers:
- name: k8s-glue
image: "{{ .Values.agentk8sglue.image.repository }}:{{ .Values.agentk8sglue.image.tag }}"

View File

@ -71,7 +71,12 @@ agentk8sglue:
# -- volumes definition for pods spawned to consume ClearML Task (example in values.yaml comments)
volumes: []
# - name: "yourvolume"
# path: "/yourpath"
# persistentVolumeClaim:
# claimName: "yourvolume"
# -- volumeMounts definition for pods spawned to consume ClearML Task (example in values.yaml comments)
volumeMounts: []
# - name: "yourvolume"
# mountPath: "/yourpath"
# -- environment variables for pods spawned to consume ClearML Task (example in values.yaml comments)
env: []
# # to setup access to private repo, setup secret with git credentials: