feat: Add k8s glue agent deployment (#49)

2025-04-17 01:31:13 +00:00 · 2022-01-18 23:27:12 +01:00 · 2022-01-18 23:27:12 +01:00 · cd7f22f7d8
commit cd7f22f7d8
parent 078e394e24
6 changed files with 154 additions and 2 deletions
--- a/charts/clearml/Chart.yaml
+++ b/charts/clearml/Chart.yaml
@ -2,7 +2,7 @@ apiVersion: v2
 name: clearml
 description: MLOps platform
 type: application
-version: "3.3.1"
+version: "3.4.0"
 appVersion: "1.1.1"
 home: https://clear.ml
 icon: https://raw.githubusercontent.com/allegroai/clearml/master/docs/clearml-logo.svg
--- a/charts/clearml/README.md
+++ b/charts/clearml/README.md
@ -1,6 +1,6 @@
 # ClearML Ecosystem for Kubernetes
-![Version: 3.3.1](https://img.shields.io/badge/Version-3.3.1-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.1.1](https://img.shields.io/badge/AppVersion-1.1.1-informational?style=flat-square)
+![Version: 3.4.0](https://img.shields.io/badge/Version-3.4.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.1.1](https://img.shields.io/badge/AppVersion-1.1.1-informational?style=flat-square)
 MLOps platform
@ -163,6 +163,15 @@ For detailed instructions, see the [Optional Configuration](https://github.com/a
 | agentGroups.agent-group-gpu.replicaCount | int | `0` |  |
 | agentGroups.agent-group-gpu.tolerations | list | `[]` |  |
 | agentGroups.agent-group-gpu.updateStrategy | string | `"Recreate"` |  |
 | agentk8sglue.defaultDockerImage | string | `"nvidia/cuda:11.3.1-cudnn8-runtime-ubuntu18.04"` |  |
 | agentk8sglue.enabled | bool | `false` |  |
 | agentk8sglue.id | string | `"k8s-agent"` |  |
 | agentk8sglue.image.repository | string | `"allegroai/clearml-agent-k8s"` |  |
 | agentk8sglue.image.tag | string | `"aws-latest-1.21"` |  |
 | agentk8sglue.maxPods | int | `10` |  |
 | agentk8sglue.podTemplate.nodeSelector | object | `{}` |  |
 | agentk8sglue.podTemplate.tolerations | object | `{}` |  |
 | agentk8sglue.queue | string | `"aws-instances"` |  |
 | agentservices.affinity | object | `{}` |  |
 | agentservices.agentVersion | string | `""` |  |
 | agentservices.awsAccessKeyId | string | `nil` |  |
--- a/charts/clearml/templates/configmap-agentk8s-template.yaml
+++ b/charts/clearml/templates/configmap-agentk8s-template.yaml
@ -0,0 +1,35 @@
 {{- if .Values.agentk8sglue.enabled }}
 # ConfigMap holding the pod template file (template.yaml) that the k8s glue
 # agent Deployment mounts and passes via `--template-yaml`.
 # The template carries the ClearML server endpoints and API credentials as
 # container env entries, plus optional scheduling constraints taken from
 # `agentk8sglue.podTemplate`.
 apiVersion: v1
 kind: ConfigMap
 metadata:
  name: k8sagent-pod-template
 data:
  template.yaml: |
    apiVersion: v1
    metadata:
      {{- /* Helm built-in objects are capitalised: .Release.Namespace (lower-case `namespace` renders empty). */}}
      namespace: {{ .Release.Namespace | quote }}
    spec:
      containers:
      - env:
        - name: CLEARML_API_HOST
          value: "http://{{ include "clearml.fullname" . }}-apiserver:{{ .Values.clearml.apiserver.service.port }}"
        - name: CLEARML_WEB_HOST
          value: "http://{{ include "clearml.fullname" . }}-webserver"
        - name: CLEARML_FILES_HOST
          value: "http://{{ include "clearml.fullname" . }}-fileserver:{{ .Values.clearml.fileserver.service.port }}"
        - name: CLEARML_API_ACCESS_KEY
          valueFrom:
            secretKeyRef:
              name: clearml-conf
              key: apiserver_key
        - name: CLEARML_API_SECRET_KEY
          valueFrom:
            secretKeyRef:
              name: clearml-conf
              key: apiserver_secret
      {{- /* Emit scheduling keys only when configured: an empty value would otherwise render `tolerations: {}`, a map where PodSpec requires a list. */}}
      {{- with .Values.agentk8sglue.podTemplate.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.agentk8sglue.podTemplate.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
 {{- end }}
--- a/charts/clearml/templates/deployment-agentk8s.yaml
+++ b/charts/clearml/templates/deployment-agentk8s.yaml
@ -0,0 +1,62 @@
 {{- if .Values.agentk8sglue.enabled }}
 # Single-replica Deployment running the ClearML k8s glue agent.
 # The agent container is configured through environment variables and reads
 # its pod template from the `k8sagent-pod-template` ConfigMap mounted at
 # /root/template.
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  name: "{{ include "clearml.fullname" . }}-k8sagent"
  labels:
    app: k8sagent
 spec:
  replicas: 1
  selector:
    matchLabels:
      app: k8sagent
  template:
    metadata:
      labels:
        app: k8sagent
    spec:
      containers:
      - name: k8s-glue-container
        image: "{{ .Values.agentk8sglue.image.repository }}:{{ .Values.agentk8sglue.image.tag }}"
        imagePullPolicy: Always
        # Extend PATH and source root's bashrc before starting the entrypoint.
        command:
          - "/bin/bash"
          - "-c"
          - "export PATH=$PATH:$HOME/bin; source /root/.bashrc && /root/entrypoint.sh"
        volumeMounts:
          - name: k8sagent-pod-template
            mountPath: /root/template
        env:
          # ClearML server endpoints, addressed by in-cluster service name.
          - name: CLEARML_API_HOST
            value: "http://{{ include "clearml.fullname" . }}-apiserver:{{ .Values.clearml.apiserver.service.port }}"
          - name: CLEARML_WEB_HOST
            value: "http://{{ include "clearml.fullname" . }}-webserver"
          - name: CLEARML_FILES_HOST
            value: "http://{{ include "clearml.fullname" . }}-fileserver:{{ .Values.clearml.fileserver.service.port }}"
          # Glue settings taken from the `agentk8sglue` values.
          - name: K8S_GLUE_MAX_PODS
            value: "{{ .Values.agentk8sglue.maxPods }}"
          - name: K8S_GLUE_QUEUE
            value: "{{ .Values.agentk8sglue.queue }}"
          # Points the agent at the template file from the mounted ConfigMap.
          - name: K8S_GLUE_EXTRA_ARGS
            value: "--template-yaml /root/template/template.yaml"
          # API credentials are read from the `clearml-conf` Secret.
          - name: CLEARML_API_ACCESS_KEY
            valueFrom:
              secretKeyRef:
                name: clearml-conf
                key: apiserver_key
          - name: CLEARML_API_SECRET_KEY
            valueFrom:
              secretKeyRef:
                name: clearml-conf
                key: apiserver_secret
          - name: CLEARML_WORKER_ID
            value: "{{ .Values.agentk8sglue.id }}"
          # Explicitly set to empty strings.
          - name: CLEARML_AGENT_UPDATE_REPO
            value: ""
          - name: FORCE_CLEARML_AGENT_REPO
            value: ""
          - name: CLEARML_DOCKER_IMAGE
            value: "{{ .Values.agentk8sglue.defaultDockerImage }}"
      volumes:
        - name: k8sagent-pod-template
          configMap:
            name: k8sagent-pod-template
 {{- end }}
--- a/charts/clearml/templates/rbac-agentk8s.yaml
+++ b/charts/clearml/templates/rbac-agentk8s.yaml
@ -0,0 +1,25 @@
 {{- if .Values.agentk8sglue.enabled }}
 # Role granting read/create/patch access to pods, and a RoleBinding that
 # assigns it to the `default` ServiceAccount of the release namespace.
 apiVersion: rbac.authorization.k8s.io/v1
 kind: Role
 metadata:
  name: k8sagent-pods-access
 rules:
  - apiGroups:
      - ""
    resources:
      - pods
    verbs: ["get", "list", "watch", "create", "patch"]
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: RoleBinding
 metadata:
  name: k8sagent-pods-access
 subjects:
  - kind: ServiceAccount
    name: default
    # Helm's built-in is `.Release.Namespace` (capital N); the lower-case
    # spelling renders empty and leaves the subject without a namespace.
    # Quoted so an all-digit namespace name is still a YAML string.
    namespace: {{ .Release.Namespace | quote }}
 roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: k8sagent-pods-access
 {{- end }}
--- a/charts/clearml/values.yaml
+++ b/charts/clearml/values.yaml
@ -294,6 +294,27 @@ agentGroups:
    affinity: {}
 # This agent spawns queued experiments in new pods; a good use case is to
 # combine it with GPU autoscaling nodes.
 # https://github.com/allegroai/clearml-agent/tree/master/docker/k8s-glue
 agentk8sglue:
  # Set to true to deploy the k8s glue agent, its pod-template ConfigMap and RBAC.
  enabled: false
  image:
    repository: "allegroai/clearml-agent-k8s"
    tag: "aws-latest-1.21"
  # Passed to the agent as K8S_GLUE_MAX_PODS.
  maxPods: 10
  defaultDockerImage: nvidia/cuda:11.3.1-cudnn8-runtime-ubuntu18.04  # default docker image that is spawned as new pod
  queue: aws-instances  # create this queue manually in the UI first for it to work
  # Passed to the agent as CLEARML_WORKER_ID.
  id: k8s-agent
  # Scheduling constraints rendered into the pod template.
  podTemplate:
    # Tolerations are a list in the Kubernetes PodSpec, so the empty default
    # is `[]` rather than `{}`.
    tolerations: []
      # - key: "nvidia.com/gpu"
      #   operator: Exists
      #   effect: "NoSchedule"
    nodeSelector: {}
      # fleet: gpu-nodes
 externalServices:
  # -- Existing ElasticSearch Hostname to use if elasticsearch.enabled is false
  elasticsearchHost: ""