Improvements k8sagent (#54)

This commit is contained in:
Niels ten Boom 2022-03-01 17:48:33 +01:00 committed by GitHub
parent 018348bc1d
commit fa3739b643
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 39 additions and 8 deletions

View File

@ -2,7 +2,7 @@ apiVersion: v2
name: clearml
description: MLOps platform
type: application
version: "3.5.1"
version: "3.6.0"
appVersion: "1.2.0"
home: https://clear.ml
icon: https://raw.githubusercontent.com/allegroai/clearml/master/docs/clearml-logo.svg

View File

@ -1,6 +1,6 @@
# ClearML Ecosystem for Kubernetes
![Version: 3.5.1](https://img.shields.io/badge/Version-3.5.1-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.2.0](https://img.shields.io/badge/AppVersion-1.2.0-informational?style=flat-square)
![Version: 3.6.0](https://img.shields.io/badge/Version-3.6.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.2.0](https://img.shields.io/badge/AppVersion-1.2.0-informational?style=flat-square)
MLOps platform
@ -163,16 +163,19 @@ For detailed instructions, see the [Optional Configuration](https://github.com/a
| agentGroups.agent-group-gpu.replicaCount | int | `0` | |
| agentGroups.agent-group-gpu.tolerations | list | `[]` | |
| agentGroups.agent-group-gpu.updateStrategy | string | `"Recreate"` | |
| agentk8sglue.defaultDockerImage | string | `"nvidia/cuda:11.3.1-cudnn8-runtime-ubuntu18.04"` | |
| agentk8sglue.defaultDockerImage | string | `"nvidia/cuda:11.3.1-cudnn8-runtime-ubuntu20.04"` | |
| agentk8sglue.enabled | bool | `false` | |
| agentk8sglue.id | string | `"k8s-agent"` | |
| agentk8sglue.image.repository | string | `"allegroai/clearml-agent-k8s"` | |
| agentk8sglue.image.tag | string | `"aws-latest-1.21"` | |
| agentk8sglue.maxPods | int | `10` | |
| agentk8sglue.podTemplate.env | list | `[]` | |
| agentk8sglue.podTemplate.nodeSelector | object | `{}` | |
| agentk8sglue.podTemplate.resources | object | `{}` | |
| agentk8sglue.podTemplate.tolerations | object | `{}` | |
| agentk8sglue.podTemplate.tolerations | list | `[]` | |
| agentk8sglue.podTemplate.volumes | list | `[]` | |
| agentk8sglue.queue | string | `"aws-instances"` | |
| agentk8sglue.serviceAccountName | string | `"default"` | |
| agentservices.affinity | object | `{}` | |
| agentservices.agentVersion | string | `""` | |
| agentservices.awsAccessKeyId | string | `nil` | |

View File

@ -9,9 +9,23 @@ data:
metadata:
  {{- /* Helm exposes the release namespace as .Release.Namespace; the lower-case
         spelling is not a field of the Release object and resolves to no value. */}}
  namespace: {{ .Release.Namespace }}
spec:
serviceAccountName: {{ .Values.agentk8sglue.serviceAccountName }}
volumes:
  {{- /* One pod volume per podTemplate.volumes entry; the entry's name doubles as
         the PersistentVolumeClaim claim name. */}}
  {{- range $vol := .Values.agentk8sglue.podTemplate.volumes }}
  - name: {{ $vol.name }}
    persistentVolumeClaim:
      claimName: {{ $vol.name }}
  {{- end }}
containers:
- resources:
{{- toYaml .Values.agentk8sglue.podTemplate.resources | nindent 10 }}
ports:
- containerPort: 10022
volumeMounts:
  {{- /* Mount every podTemplate.volumes entry at its configured path, referring
         to the volume by the entry's name. */}}
  {{- range $vol := .Values.agentk8sglue.podTemplate.volumes }}
  - mountPath: {{ $vol.path }}
    name: {{ $vol.name }}
  {{- end }}
env:
- name: CLEARML_API_HOST
value: "http://{{ include "clearml.fullname" . }}-apiserver:{{ .Values.apiserver.service.port }}"
@ -29,6 +43,7 @@ data:
secretKeyRef:
name: clearml-conf
key: apiserver_secret
{{- /* Append user-supplied env entries only when the list is non-empty: toYaml of
       the default empty list would emit a literal "[]" after the env items above
       and corrupt the rendered pod template. */}}
{{- with .Values.agentk8sglue.podTemplate.env }}
{{- toYaml . | nindent 8 }}
{{- end }}
tolerations:
{{- toYaml .Values.agentk8sglue.podTemplate.tolerations | nindent 8 }}
nodeSelector:

View File

@ -8,7 +8,7 @@ rules:
- ""
resources:
- pods
verbs: ["get", "list", "watch", "create", "patch"]
verbs: ["get", "list", "watch", "create", "patch", "delete"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding

View File

@ -302,15 +302,28 @@ agentk8sglue:
image:
repository: "allegroai/clearml-agent-k8s"
tag: "aws-latest-1.21"
serviceAccountName: default
maxPods: 10
defaultDockerImage: nvidia/cuda:11.3.1-cudnn8-runtime-ubuntu18.04 # default docker image that is spawned as new pod
queue: aws-instances # create this queue manually in the UI first for it to work
defaultDockerImage: nvidia/cuda:11.3.1-cudnn8-runtime-ubuntu20.04 # default docker image that is spawned as new pod
queue: aws-instances # create this queue manually in the UI first for it to work
id: k8s-agent
podTemplate:
# Extra volumes for the pod template. For each entry, `name` is used both as the
# pod volume name and as the PersistentVolumeClaim claimName, and `path` is the
# mountPath inside the container.
# NOTE(review): the claim presumably has to exist already — confirm.
volumes: []
# - name: "yourvolume"
# path: "/yourpath"
# Extra environment variables for the pod template container. The list is
# rendered with toYaml right after the template's own env entries
# (e.g. CLEARML_API_HOST), so each item must be a full env entry (name plus
# value or valueFrom).
env: []
# # to setup access to private repo, setup secret with git credentials:
# - name: CLEARML_AGENT_GIT_USER
# value: mygitusername
# - name: CLEARML_AGENT_GIT_PASS
# valueFrom:
# secretKeyRef:
# name: git-password
# key: git-password
resources: {}
# limits:
# nvidia.com/gpu: 1
tolerations: {}
tolerations: []
# - key: "nvidia.com/gpu"
# operator: Exists
# effect: "NoSchedule"