From 6dd0445ed31572dbe007d783836da4556e1e2c54 Mon Sep 17 00:00:00 2001 From: revital <revital@allegro.ai> Date: Mon, 24 Mar 2025 07:47:42 +0200 Subject: [PATCH 1/2] Add Simple k8s deployment --- .../enterprise_deploy/k8s.md | 448 ++++++++++++++++++ sidebars.js | 6 +- 2 files changed, 452 insertions(+), 2 deletions(-) create mode 100644 docs/deploying_clearml/enterprise_deploy/k8s.md diff --git a/docs/deploying_clearml/enterprise_deploy/k8s.md b/docs/deploying_clearml/enterprise_deploy/k8s.md new file mode 100644 index 00000000..dacd53ca --- /dev/null +++ b/docs/deploying_clearml/enterprise_deploy/k8s.md @@ -0,0 +1,448 @@ +--- +title: Kubernetes +--- + + +This guide provides step-by-step instructions for installing the ClearML Enterprise setup in a Kubernetes cluster. + + +## Prerequisites + + +* A Kubernetes cluster +* An ingress controller (e.g. `nginx-ingress`) and the ability to create LoadBalancer services (e.g. MetalLB) if needed + to expose ClearML +* Credentials for ClearML Enterprise GitHub Helm chart repository +* Credentials for ClearML Enterprise DockerHub repository +* URL for downloading the ClearML Enterprise applications configuration + + +## Control Plane Installation + + +The following steps cover installing the control plane (server and required charts) and will +require some or all of the tokens/deliverables mentioned above. 
+ + +### Requirements + + +* Add the ClearML Enterprise repository: + + + ``` + helm repo add clearml-enterprise https://raw.githubusercontent.com/clearml/clearml-enterprise-helm-charts/gh-pages --username <clearmlenterprise_GitHub_TOKEN> --password <clearmlenterprise_GitHub_TOKEN> + ``` + + +* Update the repository locally: + + + ``` + helm repo update + ``` + + +### Install ClearML Enterprise Chart + + +#### Configuration + + +The Helm Chart must be installed with an `overrides.yaml` overriding values as follows: + + +:::note +In the following configuration, replace `<BASE_DOMAIN>` with a valid domain +that will have records pointing to the cluster’s ingress controller (see ingress details in the values below). +::: + + +``` +imageCredentials: + password: "<allegroaienterprise_DockerHub_TOKEN>" + + +clearml: + cookieDomain: "<BASE_DOMAIN>" + # Set values for improved security + apiserverKey: "" + apiserverSecret: "" + fileserverKey: "" + fileserverSecret: "" + secureAuthTokenSecret: "" + testUserKey: "" + testUserSecret: "" + + +apiserver: + ingress: + enabled: true + hostName: "api.<BASE_DOMAIN>" + service: + type: ClusterIP + extraEnvs: + - name: CLEARML__services__organization__features__user_management_advanced + value: "true" + - name: CLEARML__services__auth__ui_features_per_role__user__show_datasets + value: "false" + - name: CLEARML__services__auth__ui_features_per_role__user__show_orchestration + value: "false" + - name: CLEARML__services__workers__resource_usages__supervisor_company + value: "<SUPERVISOR_TENANT_ID>" + - name: CLEARML__secure__credentials__supervisor__role + value: "system" + - name: CLEARML__secure__credentials__supervisor__allow_login + value: "true" + - name: CLEARML__secure__credentials__supervisor__user_key + value: "<SUPERVISOR_USER_KEY>" + - name: CLEARML__secure__credentials__supervisor__user_secret + value: "<SUPERVISOR_USER_SECRET>" + - name: CLEARML__secure__credentials__supervisor__sec_groups + value: "[\"users\", \"admins\", 
\"queue_admins\"]" + - name: CLEARML__secure__credentials__supervisor__email + value: "\"<SUPERVISOR_USER_EMAIL>\"" + - name: CLEARML__apiserver__company__unique_names + value: "true" + + +fileserver: + ingress: + enabled: true + hostName: "file.<BASE_DOMAIN>" + service: + type: ClusterIP + + +webserver: + ingress: + enabled: true + hostName: "app.<BASE_DOMAIN>" + service: + type: ClusterIP + + +clearmlApplications: + enabled: true +``` + + +The credentials specified in `<SUPERVISOR_USER_KEY>` and `<SUPERVISOR_USER_SECRET>` can be used to login as the +supervisor user from the ClearML Web UI accessible using the URL `app.<BASE_DOMAIN>`. + + +Note that the `<SUPERVISOR_USER_EMAIL>` value must be explicitly quoted. To do so, put `\"` around the quoted value. +For example `"\"email@example.com\""`. + + +#### Additional Configuration Options +##### Fixed Users (Simple Login) + + +Enable static login with username and password in `overrides.yaml`. + + +This is an optional step in case SSO (Identity provider) configuration will not be performed. + + +``` +apiserver: + additionalConfigs: + apiserver.conf: | + auth { + fixed_users { + enabled: true + pass_hashed: false + users: [ + { + username: "my_user" + password: "my_password" + name: "My User" + admin: true + }, + ] + } + } +``` + + +##### SSO (Identity Provider) + + +The following examples (Auth0 and Keycloak) show how to configure an identity provider on the ClearML server. + + +Add the following values configuring `extraEnvs` for `apiserver` in the `clearml-enterprise` values `override.yaml` file. + + +Substitute all `<PLACEHOLDER>`s with the correct value for your configuration. 
+ + +##### Auth0 Identity Provider + + +``` +apiserver: + extraEnvs: + - name: CLEARML__secure__login__sso__oauth_client__auth0__client_id + value: "<SSO_CLIENT_ID>" + - name: CLEARML__secure__login__sso__oauth_client__auth0__client_secret + value: "<SSO_CLIENT_SECRET>" + - name: CLEARML__services__login__sso__oauth_client__auth0__base_url + value: "<SSO_CLIENT_URL>" + - name: CLEARML__services__login__sso__oauth_client__auth0__authorize_url + value: "<SSO_CLIENT_AUTHORIZE_URL>" + - name: CLEARML__services__login__sso__oauth_client__auth0__access_token_url + value: "<SSO_CLIENT_ACCESS_TOKEN_URL>" + - name: CLEARML__services__login__sso__oauth_client__auth0__audience + value: "<SSO_CLIENT_AUDIENCE>" +``` + + +##### Keycloak Identity Provider + + +``` +apiserver: + extraEnvs: + - name: CLEARML__secure__login__sso__oauth_client__keycloak__client_id + value: "<KC_CLIENT_ID>" + - name: CLEARML__secure__login__sso__oauth_client__keycloak__client_secret + value: "<KC_SECRET_ID>" + - name: CLEARML__services__login__sso__oauth_client__keycloak__base_url + value: "<KC_URL>/realms/<REALM_NAME>/" + - name: CLEARML__services__login__sso__oauth_client__keycloak__authorize_url + value: "<KC_URL>/realms/<REALM_NAME>/protocol/openid-connect/auth" + - name: CLEARML__services__login__sso__oauth_client__keycloak__access_token_url + value: "<KC_URL>/realms/<REALM_NAME>/protocol/openid-connect/token" + - name: CLEARML__services__login__sso__oauth_client__keycloak__idp_logout + value: "true" + + +``` + + +#### Installing the Chart + + +``` +helm install -n clearml \ + clearml \ + clearml-enterprise/clearml-enterprise \ + --create-namespace \ + -f overrides.yaml +``` + + +### Install ClearML Agent Chart + + +#### Configuration + + +To configure the agent you will need to choose a Redis password and use that when setting up Redis as well +(see [Shared Redis installation](multi_tenant_k8s.md#shared-redis-installation)). 
+ + +The Helm Chart must be installed with `overrides.yaml`: + + +``` +imageCredentials: + password: "<CLEARML_DOCKERHUB_TOKEN>" +clearml: + agentk8sglueKey: "<ACCESS_KEY>" + agentk8sglueSecret: "<SECRET_KEY>" +agentk8sglue: + apiServerUrlReference: "https://api.<BASE_DOMAIN>" + fileServerUrlReference: "https://file.<BASE_DOMAIN>" + webServerUrlReference: "https://app.<BASE_DOMAIN>" + defaultContainerImage: "python:3.9" +``` + + +#### Installing the Chart + + +``` +helm install -n <WORKLOAD_NAMESPACE> \ + clearml-agent \ + clearml-enterprise/clearml-enterprise-agent \ + --create-namespace \ + -f overrides.yaml +``` + + +To create a queue using the API: + + +``` +curl $APISERVER_URL/queues.create \ +-H "Content-Type: application/json" \ +-H "X-Clearml-Impersonate-As:<USER_ID>" \ +-u $APISERVER_KEY:$APISERVER_SECRET \ +-d '{"name":"default"}' +``` + + +## ClearML AI Application Gateway Installation + + +### Configuring Chart + + +The Helm Chart must be installed with `overrides.yaml`: + + +``` +imageCredentials: + password: "<DOCKERHUB_TOKEN>" +clearml: + apiServerKey: "" + apiServerSecret: "" + apiServerUrlReference: "https://api." + authCookieName: "" +ingress: + enabled: true + hostName: "task-router.dev" +tcpSession: + routerAddress: "<NODE_IP OR EXTERNAL_NAME>" + portRange: + start: <START_PORT> + end: <END_PORT> +``` + + +**Configuration options:** + + +* **`clearml.apiServerUrlReference`:** URL usually starting with `https://api.` +* **`clearml.apiServerKey`:** ClearML server API key +* **`clearml.apiServerSecret`:** ClearML server secret key +* **`ingress.hostName`:** URL of the router previously configured for the load balancer, starting with `https://` +* **`clearml.sslVerify`:** Enable or disable SSL certificate validation for calls to the API server +* **`clearml.authCookieName`:** Value of the `value_prefix` key starting with `allegro_token` in the `envoy.yaml` file of the ClearML server installation. 
+* **`tcpSession.routerAddress`:** Router external address. Can be the IP of the host machine or a load balancer hostname, depending on the network configuration +* **`tcpSession.portRange.start`:** Start port for the TCP Session feature +* **`tcpSession.portRange.end`:** End port for the TCP Session feature + + +### Installing the Chart + + +``` +helm install -n <WORKLOAD_NAMESPACE> \ + clearml-ttr \ + clearml-enterprise/clearml-enterprise-task-traffic-router \ + --create-namespace \ + -f overrides.yaml +``` + + + + +## Applications Installation + + +To install the ClearML Applications on the newly installed ClearML Enterprise control-plane, download the applications +package using the URL provided by the ClearML staff. + + + + +### Download and Extract + + +``` +wget -O apps.zip "<ClearML enterprise applications configuration download url>" +unzip apps.zip +``` + + +### Adjust Application Docker Images Location (Air-Gapped Systems) + + +ClearML applications use pre-built docker images provided by ClearML on the ClearML DockerHub +repository. If you are using an air-gapped system, these images must be available as part of your internal docker +registry, and the correct docker images location must be specified before installing the applications. + + +Use the following script to adjust the application packages accordingly before installing the applications: + + +``` +python convert_image_registry.py \ + --apps-dir /path/to/apps/ \ + --repo local_registry/clearml-apps +``` + + +The script will change the application zip files to point to the new registry, and will output the list of containers +that need to be copied to the local registry. 
For example: + + +``` +make sure allegroai/clearml-apps:hpo-1.10.0-1062 was added to local_registry/clearml-apps +``` + + +### Install Applications + + +Use the `upload_apps.py` script to upload the application packages to the ClearML server: + + +``` +python upload_apps.py \ + --host $APISERVER_ADDRESS \ + --user $APISERVER_USER --password $APISERVER_PASSWORD \ + --dir apps -ml +``` + + +## Configuring Shared Memory for Large Model Deployment + + +Deploying large models may fail due to shared memory size limitations. This issue commonly arises when the allocated +`/dev/shm` space is insufficient, and produces errors such as the following: + + +``` +> 3d3e22c3066f:168:168 [0] misc/shmutils.cc:72 NCCL WARN Error: failed to extend /dev/shm/nccl-UbzKZ9 to 9637892 bytes +> 3d3e22c3066f:168:168 [0] misc/shmutils.cc:113 NCCL WARN Error while creating shared memory segment /dev/shm/nccl-UbzKZ9 (size 9637888) +> 3d3e22c3066f:168:168 [0] NCCL INFO transport/shm.cc:114 -> 2 +> 3d3e22c3066f:168:168 [0] NCCL INFO transport.cc:33 -> 2 +> 3d3e22c3066f:168:168 [0] NCCL INFO transport.cc:113 -> 2 +> 3d3e22c3066f:168:168 [0] NCCL INFO init.cc:1263 -> 2 +> 3d3e22c3066f:168:168 [0] NCCL INFO init.cc:1548 -> 2 +> 3d3e22c3066f:168:168 [0] NCCL INFO init.cc:1799 -> 2 +``` + + +To configure a sufficient SHM size, use the following configuration in the agent `overrides.yaml`. + + +Replace `<SIZE>` with the desired memory allocation in GiB, based on your model requirements. + + +This example configures a specific queue, but you can include this setting in the `basePodTemplate` if you need to +apply it to all tasks. 
+ + +``` +agentk8sglue: + queues: + GPUshm: + templateOverrides: + env: + - name: VLLM_SKIP_P2P_CHECK + value: "1" + volumeMounts: + - name: dshm + mountPath: /dev/shm + volumes: + - name: dshm + emptyDir: + medium: Memory + sizeLimit: <SIZE>Gi +``` diff --git a/sidebars.js b/sidebars.js index 0674d6a0..1a6b15e1 100644 --- a/sidebars.js +++ b/sidebars.js @@ -327,9 +327,10 @@ module.exports = { { 'Open Source': [ - 'release_notes/sdk/open_source/ver_1_17', + 'release_notes/sdk/open_source/ver_1_18', { 'Older Versions': [ + 'release_notes/sdk/open_source/ver_1_17', 'release_notes/sdk/open_source/ver_1_16', 'release_notes/sdk/open_source/ver_1_15', 'release_notes/sdk/open_source/ver_1_14', 'release_notes/sdk/open_source/ver_1_13', 'release_notes/sdk/open_source/ver_1_12', 'release_notes/sdk/open_source/ver_1_11', @@ -639,11 +640,12 @@ module.exports = { { 'Enterprise Server': { 'Deployment Options': [ + 'deploying_clearml/enterprise_deploy/k8s', 'deploying_clearml/enterprise_deploy/multi_tenant_k8s', 'deploying_clearml/enterprise_deploy/vpc_aws', 'deploying_clearml/enterprise_deploy/on_prem_ubuntu', ], - 'Maintenance': [ + 'Maintenance and Migration': [ 'deploying_clearml/enterprise_deploy/import_projects', 'deploying_clearml/enterprise_deploy/change_artifact_links', 'deploying_clearml/enterprise_deploy/delete_tenant', From 3fabb27803ca03d126ce25605a13d3a53eb3a973 Mon Sep 17 00:00:00 2001 From: pollfly <75068813+pollfly@users.noreply.github.com> Date: Wed, 26 Mar 2025 17:33:35 +0200 Subject: [PATCH 2/2] edits --- docs/deploying_clearml/enterprise_deploy/k8s.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/deploying_clearml/enterprise_deploy/k8s.md b/docs/deploying_clearml/enterprise_deploy/k8s.md index dacd53ca..3a217321 100644 --- a/docs/deploying_clearml/enterprise_deploy/k8s.md +++ b/docs/deploying_clearml/enterprise_deploy/k8s.md @@ -60,7 +60,7 @@ that will have records pointing to the cluster’s ingress controller (see ingre ``` 
imageCredentials: - password: "<allegroaienterprise_DockerHub_TOKEN>" + password: "<clearml_enterprise_DockerHub_TOKEN>" clearml: