diff --git a/README.md b/README.md
index ec3f75c..1fa4538 100644
--- a/README.md
+++ b/README.md
@@ -60,6 +60,8 @@ It is a zero configuration fire-and-forget execution agent, providing a full ML/
 ### Kubernetes Integration (Optional)
 We think Kubernetes is awesome, but it should be a choice.
 We designed `clearml-agent` so you can run bare-metal or inside a pod with any mix that fits your environment.
+
+Find Dockerfiles in the [docker](./docker) directory and a Helm chart in https://github.com/allegroai/clearml-helm-charts
 #### Benefits of integrating existing K8s with ClearML-Agent
 - ClearML-Agent adds the missing scheduling capabilities to K8s
 - Allowing for more flexible automation from code
diff --git a/docker/k8s-glue/glue-build/Dockerfile.alpine b/docker/k8s-glue/glue-build/Dockerfile.alpine
new file mode 100644
index 0000000..186a976
--- /dev/null
+++ b/docker/k8s-glue/glue-build/Dockerfile.alpine
@@ -0,0 +1,81 @@
+ARG TAG=3.7.12-alpine3.15
+
+# Build stage: has the compiler toolchain and installs the Python packages under /install,
+# so that only the installed packages are copied into the runtime stage below.
+FROM python:${TAG} as build
+
+RUN apk add --no-cache \
+    gcc \
+    musl-dev \
+    libffi-dev
+
+# The version specifier is quoted because RUN uses the shell form: an unquoted ">=2.9"
+# is parsed as a redirection to a file named "=2.9" and the version constraint is lost.
+RUN python3 \
+    -m pip \
+    install \
+    --prefix=/install \
+    --no-cache-dir \
+    -U \
+    clearml-agent \
+    "cryptography>=2.9"
+
+# Runtime stage: the packages installed by the build stage, kubectl, bash and the glue script.
+FROM python:${TAG} as target
+
+WORKDIR /app
+
+ARG KUBECTL_VERSION=1.22.4
+
+# Not sure about these ENV vars
+# ENV LC_ALL=en_US.UTF-8
+# ENV LANG=en_US.UTF-8
+# ENV LANGUAGE=en_US.UTF-8
+# ENV PYTHONIOENCODING=UTF-8
+
+COPY --from=build /install /usr/local
+
+ADD https://storage.googleapis.com/kubernetes-release/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl /usr/bin/
+
+RUN chmod +x /usr/bin/kubectl
+
+RUN apk add --no-cache \
+    bash
+
+COPY k8s_glue_example.py .
+
+# AWS CLI (not installed by this Dockerfile), see:
+# https://github.com/kyleknap/aws-cli/blob/source-proposal/proposals/source-install.md#alpine-linux
+# https://github.com/aws/aws-cli/issues/4685
+# https://github.com/aws/aws-cli/pull/6352
+
+# https://github.com/GoogleCloudPlatform/cloud-sdk-docker/blob/master/alpine/Dockerfile
+
+# GCP stage: the runtime stage plus the Google Cloud SDK.
+FROM target as gcp
+
+ARG CLOUD_SDK_VERSION=371.0.0
+ENV CLOUD_SDK_VERSION=$CLOUD_SDK_VERSION
+ENV PATH /google-cloud-sdk/bin:$PATH
+
+WORKDIR /
+
+RUN apk --no-cache add \
+    curl \
+    python3 \
+    py3-crcmod \
+    py3-openssl \
+    bash \
+    libc6-compat \
+    openssh-client \
+    git \
+    gnupg \
+    && curl -O https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-${CLOUD_SDK_VERSION}-linux-x86_64.tar.gz && \
+    tar xzf google-cloud-sdk-${CLOUD_SDK_VERSION}-linux-x86_64.tar.gz && \
+    rm google-cloud-sdk-${CLOUD_SDK_VERSION}-linux-x86_64.tar.gz && \
+    gcloud config set core/disable_usage_reporting true && \
+    gcloud config set component_manager/disable_update_check true && \
+    gcloud config set metrics/environment github_docker_image && \
+    gcloud --version
+
+WORKDIR /app
diff --git a/docker/k8s-glue/glue-build/Dockerfile.bullseye b/docker/k8s-glue/glue-build/Dockerfile.bullseye
new file mode 100644
index 0000000..9725838
--- /dev/null
+++ b/docker/k8s-glue/glue-build/Dockerfile.bullseye
@@ -0,0 +1,88 @@
+ARG TAG=3.7.12-slim-bullseye
+
+# Base stage: clearml-agent, kubectl and the glue script, which CMD starts by default.
+FROM python:${TAG} as target
+
+ARG KUBECTL_VERSION=1.22.4
+
+WORKDIR /app
+
+# The version specifier is quoted because RUN uses the shell form: an unquoted ">=2.9"
+# is parsed as a redirection to a file named "=2.9" and the version constraint is lost.
+RUN python3 \
+    -m pip \
+    install \
+    --no-cache-dir \
+    -U \
+    clearml-agent \
+    "cryptography>=2.9"
+
+# Not sure about these ENV vars
+# ENV LC_ALL=en_US.UTF-8
+# ENV LANG=en_US.UTF-8
+# ENV LANGUAGE=en_US.UTF-8
+# ENV PYTHONIOENCODING=UTF-8
+
+ADD https://storage.googleapis.com/kubernetes-release/release/v${KUBECTL_VERSION}/bin/linux/amd64/kubectl /usr/bin/
+
+RUN chmod +x /usr/bin/kubectl
+
+COPY k8s_glue_example.py .
+
+CMD ["python3", "k8s_glue_example.py"]
+
+# AWS stage: the base stage plus the AWS CLI and aws-iam-authenticator.
+FROM target as aws
+
+# https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html
+# https://docs.aws.amazon.com/eks/latest/userguide/install-aws-iam-authenticator.html
+
+RUN apt-get update -qqy && \
+    apt-get install -qqy \
+        unzip && \
+    rm -rf /var/lib/apt/lists/*
+
+ADD https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip awscliv2.zip
+ADD https://amazon-eks.s3.us-west-2.amazonaws.com/1.21.2/2021-07-05/bin/linux/amd64/aws-iam-authenticator /usr/local/bin/aws-iam-authenticator
+
+RUN unzip awscliv2.zip && \
+    ./aws/install && \
+    rm -r awscliv2.zip aws/ && \
+    chmod +x /usr/local/bin/aws-iam-authenticator && \
+    aws --version && \
+    aws-iam-authenticator version
+
+# https://github.com/GoogleCloudPlatform/cloud-sdk-docker/blob/master/debian_slim/Dockerfile
+
+# GCP stage: the base stage plus the Google Cloud SDK installed from Google's apt repository.
+FROM target as gcp
+
+ARG CLOUD_SDK_VERSION=371.0.0
+ENV CLOUD_SDK_VERSION=$CLOUD_SDK_VERSION
+
+ENV PATH "$PATH:/opt/google-cloud-sdk/bin/"
+
+ARG INSTALL_COMPONENTS
+RUN mkdir -p /usr/share/man/man1/
+# The apt lists are removed only after the last apt-get install so they do not stay in the layer.
+RUN apt-get update -qqy && \
+    apt-get install -qqy \
+        curl \
+        gcc \
+        python3-dev \
+        python3-pip \
+        apt-transport-https \
+        lsb-release \
+        openssh-client \
+        git \
+        gnupg && \
+    pip3 install -U crcmod && \
+    export CLOUD_SDK_REPO="cloud-sdk-$(lsb_release -c -s)" && \
+    echo "deb https://packages.cloud.google.com/apt $CLOUD_SDK_REPO main" > /etc/apt/sources.list.d/google-cloud-sdk.list && \
+    curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add - && \
+    apt-get update && apt-get install -y google-cloud-sdk=${CLOUD_SDK_VERSION}-0 $INSTALL_COMPONENTS && \
+    rm -rf /var/lib/apt/lists/* && \
+    gcloud config set core/disable_usage_reporting true && \
+    gcloud config set component_manager/disable_update_check true && \
+    gcloud config set metrics/environment github_docker_image && \
+    gcloud --version
diff --git a/docker/k8s-glue/glue-build/k8s_glue_example.py b/docker/k8s-glue/glue-build/k8s_glue_example.py
new file
mode 100644
index 0000000..dc69c37
--- /dev/null
+++ b/docker/k8s-glue/glue-build/k8s_glue_example.py
@@ -0,0 +1,106 @@
+"""
+This example assumes you have preconfigured services with selectors in the form of
+ "ai.allegro.agent.serial=pod-" and a targetPort of 10022.
+The K8sIntegration component will label each pod accordingly.
+"""
+from argparse import ArgumentParser
+
+from clearml_agent.glue.k8s import K8sIntegration
+
+
+def parse_args(argv=None):
+    """
+    Build the command line parser of the glue service and parse the arguments.
+
+    --ports-mode and --max-pods are mutually exclusive.
+
+    :param argv: argument strings to parse; None (default) parses the process command line
+    :return: namespace holding the parsed arguments
+    """
+    parser = ArgumentParser()
+    # ports-mode and max-pods are registered on this group, so argparse rejects passing both
+    group = parser.add_mutually_exclusive_group()
+
+    parser.add_argument(
+        "--queue", type=str, help="Queue to pull tasks from"
+    )
+    group.add_argument(
+        "--ports-mode", action='store_true', default=False,
+        help="Ports-Mode will add a label to the pod which can be used as service, in order to expose ports. "
+             "Should not be used with max-pods"
+    )
+    parser.add_argument(
+        "--num-of-services", type=int, default=20,
+        help="Specify the number of k8s services to be used. Use only with ports-mode."
+    )
+    parser.add_argument(
+        "--base-port", type=int,
+        help="Used in conjunction with ports-mode, specifies the base port exposed by the services. "
+             "For pod #X, the port will be <base-port>+X. Note that pod number is calculated based on base-pod-num, "
+             "e.g. if base-port=20000 and base-pod-num=3, the port for the first pod will be 20003"
+    )
+    parser.add_argument(
+        "--base-pod-num", type=int, default=1,
+        help="Used in conjunction with ports-mode and base-port, specifies the base pod number to be used by the "
+             "service (default: %(default)s)"
+    )
+    parser.add_argument(
+        "--gateway-address", type=str, default=None,
+        help="Used in conjunction with ports-mode, specify the external address of the k8s ingress / ELB"
+    )
+    parser.add_argument(
+        "--pod-clearml-conf", type=str,
+        help="Configuration file to be used by the pod itself (if not provided, current configuration is used)"
+    )
+    parser.add_argument(
+        "--overrides-yaml", type=str,
+        help="YAML file containing pod overrides to be used when launching a new pod"
+    )
+    parser.add_argument(
+        "--template-yaml", type=str,
+        help="YAML file containing pod template. If provided pod will be scheduled with kubectl apply "
+             "and overrides are ignored, otherwise it will be scheduled with kubectl run"
+    )
+    parser.add_argument(
+        "--ssh-server-port", type=int, default=0,
+        help="If non-zero, every pod will also start an SSH server on the selected port (default: zero, not active)"
+    )
+    parser.add_argument(
+        "--namespace", type=str,
+        help="Specify the namespace in which pods will be created (default: %(default)s)", default="clearml"
+    )
+    group.add_argument(
+        "--max-pods", type=int,
+        help="Limit the maximum number of pods that this service can run at the same time. "
+             "Should not be used with ports-mode"
+    )
+    return parser.parse_args(argv)
+
+
+def main():
+    """Parse the command line, create the K8sIntegration object and run its daemon on the requested queue."""
+    args = parse_args()
+
+    user_props_cb = None
+    if args.ports_mode and args.base_port:
+        def k8s_user_props_cb(pod_number=0):
+            """Return the properties of pod `pod_number`: its port and, if set, the gateway address."""
+            user_prop = {"k8s-pod-port": args.base_port + pod_number}
+            if args.gateway_address:
+                user_prop["k8s-gateway-address"] = args.gateway_address
+            return user_prop
+        user_props_cb = k8s_user_props_cb
+
+    k8s = K8sIntegration(
+        ports_mode=args.ports_mode, num_of_services=args.num_of_services, base_pod_num=args.base_pod_num,
+        user_props_cb=user_props_cb, overrides_yaml=args.overrides_yaml, clearml_conf_file=args.pod_clearml_conf,
+        # the SSH server init script is only passed when a non-zero port was requested
+        template_yaml=args.template_yaml, extra_bash_init_script=K8sIntegration.get_ssh_server_bash(
+            ssh_port_number=args.ssh_server_port) if args.ssh_server_port else None,
+        namespace=args.namespace, max_pods_limit=args.max_pods or None,
+    )
+    k8s.k8s_daemon(args.queue)
+
+
+if __name__ == "__main__":
+    main()