mirror of
https://github.com/clearml/clearml-docs
synced 2025-04-15 13:14:20 +00:00
Add Simple k8s deployment
This commit is contained in:
parent
001ae8e5ed
commit
6dd0445ed3
448
docs/deploying_clearml/enterprise_deploy/k8s.md
Normal file
448
docs/deploying_clearml/enterprise_deploy/k8s.md
Normal file
@ -0,0 +1,448 @@
|
|||||||
|
---
|
||||||
|
title: Kubernetes
|
||||||
|
---
|
||||||
|
|
||||||
|
|
||||||
|
This guide provides step-by-step instructions for installing the ClearML Enterprise setup in a Kubernetes cluster.
|
||||||
|
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
|
||||||
|
* A Kubernetes cluster
|
||||||
|
* An ingress controller (e.g. `nginx-ingress`) and the ability to create LoadBalancer services (e.g. MetalLB) if needed
|
||||||
|
to expose ClearML
|
||||||
|
* Credentials for ClearML Enterprise GitHub Helm chart repository
|
||||||
|
* Credentials for ClearML Enterprise DockerHub repository
|
||||||
|
* URL for downloading the ClearML Enterprise applications configuration
|
||||||
|
|
||||||
|
|
||||||
|
## Control Plane Installation
|
||||||
|
|
||||||
|
|
||||||
|
The following steps cover installing the control plane (server and required charts) and will
|
||||||
|
require some or all of the tokens/deliverables mentioned above.
|
||||||
|
|
||||||
|
|
||||||
|
### Requirements
|
||||||
|
|
||||||
|
|
||||||
|
* Add the ClearML Enterprise repository:
|
||||||
|
|
||||||
|
|
||||||
|
```
|
||||||
|
helm repo add clearml-enterprise https://raw.githubusercontent.com/clearml/clearml-enterprise-helm-charts/gh-pages --username <clearmlenterprise_GitHub_TOKEN> --password <clearmlenterprise_GitHub_TOKEN>
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
* Update the repository locally:
|
||||||
|
|
||||||
|
|
||||||
|
```
|
||||||
|
helm repo update
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
### Install ClearML Enterprise Chart
|
||||||
|
|
||||||
|
|
||||||
|
#### Configuration
|
||||||
|
|
||||||
|
|
||||||
|
The Helm Chart must be installed with an `overrides.yaml` overriding values as follows:
|
||||||
|
|
||||||
|
|
||||||
|
:::note
|
||||||
|
In the following configuration, replace `<BASE_DOMAIN>` with a valid domain
|
||||||
|
that will have records pointing to the cluster’s ingress controller (see ingress details in the values below).
|
||||||
|
:::
|
||||||
|
|
||||||
|
|
||||||
|
```
|
||||||
|
imageCredentials:
|
||||||
|
password: "<allegroaienterprise_DockerHub_TOKEN>"
|
||||||
|
|
||||||
|
|
||||||
|
clearml:
|
||||||
|
cookieDomain: "<BASE_DOMAIN>"
|
||||||
|
# Set values for improved security
|
||||||
|
apiserverKey: ""
|
||||||
|
apiserverSecret: ""
|
||||||
|
fileserverKey: ""
|
||||||
|
fileserverSecret: ""
|
||||||
|
secureAuthTokenSecret: ""
|
||||||
|
testUserKey: ""
|
||||||
|
testUserSecret: ""
|
||||||
|
|
||||||
|
|
||||||
|
apiserver:
|
||||||
|
ingress:
|
||||||
|
enabled: true
|
||||||
|
hostName: "api.<BASE_DOMAIN>"
|
||||||
|
service:
|
||||||
|
type: ClusterIP
|
||||||
|
extraEnvs:
|
||||||
|
- name: CLEARML__services__organization__features__user_management_advanced
|
||||||
|
value: "true"
|
||||||
|
- name: CLEARML__services__auth__ui_features_per_role__user__show_datasets
|
||||||
|
value: "false"
|
||||||
|
- name: CLEARML__services__auth__ui_features_per_role__user__show_orchestration
|
||||||
|
value: "false"
|
||||||
|
- name: CLEARML__services__workers__resource_usages__supervisor_company
|
||||||
|
value: "<SUPERVISOR_TENANT_ID>"
|
||||||
|
- name: CLEARML__secure__credentials__supervisor__role
|
||||||
|
value: "system"
|
||||||
|
- name: CLEARML__secure__credentials__supervisor__allow_login
|
||||||
|
value: "true"
|
||||||
|
- name: CLEARML__secure__credentials__supervisor__user_key
|
||||||
|
value: "<SUPERVISOR_USER_KEY>"
|
||||||
|
- name: CLEARML__secure__credentials__supervisor__user_secret
|
||||||
|
value: "<SUPERVISOR_USER_SECRET>"
|
||||||
|
- name: CLEARML__secure__credentials__supervisor__sec_groups
|
||||||
|
value: "[\"users\", \"admins\", \"queue_admins\"]"
|
||||||
|
- name: CLEARML__secure__credentials__supervisor__email
|
||||||
|
value: "\"<SUPERVISOR_USER_EMAIL>\""
|
||||||
|
- name: CLEARML__apiserver__company__unique_names
|
||||||
|
value: "true"
|
||||||
|
|
||||||
|
|
||||||
|
fileserver:
|
||||||
|
ingress:
|
||||||
|
enabled: true
|
||||||
|
hostName: "file.<BASE_DOMAIN>"
|
||||||
|
service:
|
||||||
|
type: ClusterIP
|
||||||
|
|
||||||
|
|
||||||
|
webserver:
|
||||||
|
ingress:
|
||||||
|
enabled: true
|
||||||
|
hostName: "app.<BASE_DOMAIN>"
|
||||||
|
service:
|
||||||
|
type: ClusterIP
|
||||||
|
|
||||||
|
|
||||||
|
clearmlApplications:
|
||||||
|
enabled: true
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
The credentials specified in `<SUPERVISOR_USER_KEY>` and `<SUPERVISOR_USER_SECRET>` can be used to login as the
|
||||||
|
supervisor user from the ClearML Web UI accessible using the URL `app.<BASE_DOMAIN>`.
|
||||||
|
|
||||||
|
|
||||||
|
Note that the `<SUPERVISOR_USER_EMAIL>` value must be explicitly quoted. To do so, put `\"` around the quoted value.
|
||||||
|
For example `"\"email@example.com\""`.
|
||||||
|
|
||||||
|
|
||||||
|
#### Additional Configuration Options
|
||||||
|
##### Fixed Users (Simple Login)
|
||||||
|
|
||||||
|
|
||||||
|
Enable static login with username and password in `overrides.yaml`.
|
||||||
|
|
||||||
|
|
||||||
|
This is an optional step in case SSO (Identity provider) configuration will not be performed.
|
||||||
|
|
||||||
|
|
||||||
|
```
|
||||||
|
apiserver:
|
||||||
|
additionalConfigs:
|
||||||
|
apiserver.conf: |
|
||||||
|
auth {
|
||||||
|
fixed_users {
|
||||||
|
enabled: true
|
||||||
|
pass_hashed: false
|
||||||
|
users: [
|
||||||
|
{
|
||||||
|
username: "my_user"
|
||||||
|
password: "my_password"
|
||||||
|
name: "My User"
|
||||||
|
admin: true
|
||||||
|
},
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
##### SSO (Identity Provider)
|
||||||
|
|
||||||
|
|
||||||
|
The following examples (Auth0 and Keycloak) show how to configure an identity provider on the ClearML server.
|
||||||
|
|
||||||
|
|
||||||
|
Add the following values configuring `extraEnvs` for `apiserver` in the `clearml-enterprise` values `override.yaml` file.
|
||||||
|
|
||||||
|
|
||||||
|
Substitute all `<PLACEHOLDER>`s with the correct value for your configuration.
|
||||||
|
|
||||||
|
|
||||||
|
##### Auth0 Identity Provider
|
||||||
|
|
||||||
|
|
||||||
|
```
|
||||||
|
apiserver:
|
||||||
|
extraEnvs:
|
||||||
|
- name: CLEARML__secure__login__sso__oauth_client__auth0__client_id
|
||||||
|
value: "<SSO_CLIENT_ID>"
|
||||||
|
- name: CLEARML__secure__login__sso__oauth_client__auth0__client_secret
|
||||||
|
value: "<SSO_CLIENT_SECRET>"
|
||||||
|
- name: CLEARML__services__login__sso__oauth_client__auth0__base_url
|
||||||
|
value: "<SSO_CLIENT_URL>"
|
||||||
|
- name: CLEARML__services__login__sso__oauth_client__auth0__authorize_url
|
||||||
|
value: "<SSO_CLIENT_AUTHORIZE_URL>"
|
||||||
|
- name: CLEARML__services__login__sso__oauth_client__auth0__access_token_url
|
||||||
|
value: "<SSO_CLIENT_ACCESS_TOKEN_URL>"
|
||||||
|
- name: CLEARML__services__login__sso__oauth_client__auth0__audience
|
||||||
|
value: "<SSO_CLIENT_AUDIENCE>"
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
##### Keycloak Identity Provider
|
||||||
|
|
||||||
|
|
||||||
|
```
|
||||||
|
apiserver:
|
||||||
|
extraEnvs:
|
||||||
|
- name: CLEARML__secure__login__sso__oauth_client__keycloak__client_id
|
||||||
|
value: "<KC_CLIENT_ID>"
|
||||||
|
- name: CLEARML__secure__login__sso__oauth_client__keycloak__client_secret
|
||||||
|
value: "<KC_SECRET_ID>"
|
||||||
|
- name: CLEARML__services__login__sso__oauth_client__keycloak__base_url
|
||||||
|
value: "<KC_URL>/realms/<REALM_NAME>/"
|
||||||
|
- name: CLEARML__services__login__sso__oauth_client__keycloak__authorize_url
|
||||||
|
value: "<KC_URL>/realms/<REALM_NAME>/protocol/openid-connect/auth"
|
||||||
|
- name: CLEARML__services__login__sso__oauth_client__keycloak__access_token_url
|
||||||
|
value: "<KC_URL>/realms/<REALM_NAME>/protocol/openid-connect/token"
|
||||||
|
- name: CLEARML__services__login__sso__oauth_client__keycloak__idp_logout
|
||||||
|
value: "true"
|
||||||
|
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
#### Installing the Chart
|
||||||
|
|
||||||
|
|
||||||
|
```
|
||||||
|
helm install -n clearml \
|
||||||
|
clearml \
|
||||||
|
clearml-enterprise/clearml-enterprise \
|
||||||
|
--create-namespace \
|
||||||
|
-f overrides.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
### Install ClearML Agent Chart
|
||||||
|
|
||||||
|
|
||||||
|
#### Configuration
|
||||||
|
|
||||||
|
|
||||||
|
To configure the agent you will need to choose a Redis password and use that when setting up Redis as well
|
||||||
|
(see [Shared Redis installation](multi_tenant_k8s.md#shared-redis-installation)).
|
||||||
|
|
||||||
|
|
||||||
|
The Helm Chart must be installed with `overrides.yaml`:
|
||||||
|
|
||||||
|
|
||||||
|
```
|
||||||
|
imageCredentials:
|
||||||
|
password: "<CLEARML_DOCKERHUB_TOKEN>"
|
||||||
|
clearml:
|
||||||
|
agentk8sglueKey: "<ACCESS_KEY>"
|
||||||
|
agentk8sglueSecret: "<SECRET_KEY>"
|
||||||
|
agentk8sglue:
|
||||||
|
apiServerUrlReference: "https://api.<BASE_DOMAIN>"
|
||||||
|
fileServerUrlReference: "https://files.<BASE_DOMAIN>"
|
||||||
|
webServerUrlReference: "https://app.<BASE_DOMAIN>"
|
||||||
|
defaultContainerImage: "python:3.9"
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
#### Installing the Chart
|
||||||
|
|
||||||
|
|
||||||
|
```
|
||||||
|
helm install -n <WORKLOAD_NAMESPACE> \
|
||||||
|
clearml-agent \
|
||||||
|
clearml-enterprise/clearml-enterprise-agent \
|
||||||
|
--create-namespace \
|
||||||
|
-f overrides.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
To create a queue by API:
|
||||||
|
|
||||||
|
|
||||||
|
```
|
||||||
|
curl $APISERVER_URL/queues.create \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-H "X-Clearml-Impersonate-As:<USER_ID>" \
|
||||||
|
-u $APISERVER_KEY:$APISERVER_SECRET \
|
||||||
|
-d '{"name":"default"}'
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## ClearML AI Application Gateway Installation
|
||||||
|
|
||||||
|
|
||||||
|
### Configuring Chart
|
||||||
|
|
||||||
|
|
||||||
|
The Helm Chart must be installed with `overrides.yaml`:
|
||||||
|
|
||||||
|
|
||||||
|
```
|
||||||
|
imageCredentials:
|
||||||
|
password: "<DOCKERHUB_TOKEN>"
|
||||||
|
clearml:
|
||||||
|
apiServerKey: ""
|
||||||
|
apiServerSecret: ""
|
||||||
|
apiServerUrlReference: "https://api."
|
||||||
|
authCookieName: ""
|
||||||
|
ingress:
|
||||||
|
enabled: true
|
||||||
|
hostName: "task-router.dev"
|
||||||
|
tcpSession:
|
||||||
|
routerAddress: "<NODE_IP OR EXTERNAL_NAME>"
|
||||||
|
portRange:
|
||||||
|
start: <START_PORT>
|
||||||
|
end: <END_PORT>
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
**Configuration options:**
|
||||||
|
|
||||||
|
|
||||||
|
* **`clearml.apiServerUrlReference`:** URL usually starting with `https://api.`
|
||||||
|
* **`clearml.apiServerKey`:** ClearML server API key
|
||||||
|
* **`clearml.apiServerSecret`:** ClearML server secret key
|
||||||
|
* **`ingress.hostName`:** URL of the router we configured previously for load balancer starting with `https://`
|
||||||
|
* **`clearml.sslVerify`:** Enable or disable SSL certificate validation on apiserver calls check
|
||||||
|
* **`clearml.authCookieName`:** Value from `value_prefix` key starting with `allegro_token` in `envoy.yaml` file in ClearML server installation.
|
||||||
|
* **`tcpSession.routerAddress`**: Router external address can be an IP or the host machine or a load balancer hostname, depends on the network configuration
|
||||||
|
* **`tcpSession.portRange.start`**: Start port for the TCP Session feature
|
||||||
|
* **`tcpSession.portRange.end`**: End port for the TCP Session feature
|
||||||
|
|
||||||
|
|
||||||
|
### Installing the Chart
|
||||||
|
|
||||||
|
|
||||||
|
```
|
||||||
|
helm install -n <WORKLOAD_NAMESPACE> \
|
||||||
|
clearml-ttr \
|
||||||
|
clearml-enterprise/clearml-enterprise-task-traffic-router \
|
||||||
|
--create-namespace \
|
||||||
|
-f overrides.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Applications Installation
|
||||||
|
|
||||||
|
|
||||||
|
To install the ClearML Applications on the newly installed ClearML Enterprise control-plane, download the applications
|
||||||
|
package using the URL provided by the ClearML staff.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
### Download and Extract
|
||||||
|
|
||||||
|
|
||||||
|
```
|
||||||
|
wget -O apps.zip "<ClearML enterprise applications configuration download url>"
|
||||||
|
unzip apps.zip
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
### Adjust Application Docker Images Location (Air-Gapped Systems)
|
||||||
|
|
||||||
|
|
||||||
|
ClearML applications use pre-built docker images provided by ClearML on the ClearML DockerHub
|
||||||
|
repository. If you are using an air-gapped system, these images must be available as part of your internal docker
|
||||||
|
registry, and the correct docker images location must be specified before installing the applications.
|
||||||
|
|
||||||
|
|
||||||
|
Use the following script to adjust the applications packages accordingly before installing the applications:
|
||||||
|
|
||||||
|
|
||||||
|
```
|
||||||
|
python convert_image_registry.py \
|
||||||
|
--apps-dir /path/to/apps/ \
|
||||||
|
--repo local_registry/clearml-apps
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
The script will change the application zip files to point to the new registry, and will output the list of containers
|
||||||
|
that need to be copied to the local registry. For example:
|
||||||
|
|
||||||
|
|
||||||
|
```
|
||||||
|
make sure allegroai/clearml-apps:hpo-1.10.0-1062 was added to local_registry/clearml-apps
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
### Install Applications
|
||||||
|
|
||||||
|
|
||||||
|
Use the `upload_apps.py` script to upload the application packages to the ClearML server:
|
||||||
|
|
||||||
|
|
||||||
|
```
|
||||||
|
python upload_apps.py \
|
||||||
|
--host $APISERVER_ADDRESS \
|
||||||
|
--user $APISERVER_USER --password $APISERVER_PASSWORD \
|
||||||
|
--dir apps -ml
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Configuring Shared Memory for Large Model Deployment
|
||||||
|
|
||||||
|
|
||||||
|
Deploying large models may fail due to shared memory size limitations. This issue commonly arises when the allocated
|
||||||
|
`/dev/shm` space is insufficient.:
|
||||||
|
|
||||||
|
|
||||||
|
```
|
||||||
|
> 3d3e22c3066f:168:168 [0] misc/shmutils.cc:72 NCCL WARN Error: failed to extend /dev/shm/nccl-UbzKZ9 to 9637892 bytes
|
||||||
|
> 3d3e22c3066f:168:168 [0] misc/shmutils.cc:113 NCCL WARN Error while creating shared memory segment /dev/shm/nccl-UbzKZ9 (size 9637888)
|
||||||
|
> 3d3e22c3066f:168:168 [0] NCCL INFO transport/shm.cc:114 -> 2
|
||||||
|
> 3d3e22c3066f:168:168 [0] NCCL INFO transport.cc:33 -> 2
|
||||||
|
> 3d3e22c3066f:168:168 [0] NCCL INFO transport.cc:113 -> 2
|
||||||
|
> 3d3e22c3066f:168:168 [0] NCCL INFO init.cc:1263 -> 2
|
||||||
|
> 3d3e22c3066f:168:168 [0] NCCL INFO init.cc:1548 -> 2
|
||||||
|
> 3d3e22c3066f:168:168 [0] NCCL INFO init.cc:1799 -> 2
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
To configure a proper SHM size you can use the following configuration in the agent `overrides.yaml`.
|
||||||
|
|
||||||
|
|
||||||
|
Replace `<SIZE>` with the desired memory allocation in GiB, based on your model requirements.
|
||||||
|
|
||||||
|
|
||||||
|
This example configures a specific queue, but you can include this setting in the `basePodTemplate` if you need to
|
||||||
|
apply it to all tasks.
|
||||||
|
|
||||||
|
|
||||||
|
```
|
||||||
|
agentk8sglue:
|
||||||
|
queues:
|
||||||
|
GPUshm:
|
||||||
|
templateOverrides:
|
||||||
|
env:
|
||||||
|
- name: VLLM_SKIP_P2P_CHECK
|
||||||
|
value: "1"
|
||||||
|
volumeMounts:
|
||||||
|
- name: dshm
|
||||||
|
mountPath: /dev/shm
|
||||||
|
volumes:
|
||||||
|
- name: dshm
|
||||||
|
emptyDir:
|
||||||
|
medium: Memory
|
||||||
|
sizeLimit: <SIZE>Gi
|
||||||
|
```
|
@ -327,9 +327,10 @@ module.exports = {
|
|||||||
{
|
{
|
||||||
'Open Source':
|
'Open Source':
|
||||||
[
|
[
|
||||||
'release_notes/sdk/open_source/ver_1_17',
|
'release_notes/sdk/open_source/ver_1_18',
|
||||||
{
|
{
|
||||||
'Older Versions': [
|
'Older Versions': [
|
||||||
|
'release_notes/sdk/open_source/ver_1_17',
|
||||||
'release_notes/sdk/open_source/ver_1_16', 'release_notes/sdk/open_source/ver_1_15',
|
'release_notes/sdk/open_source/ver_1_16', 'release_notes/sdk/open_source/ver_1_15',
|
||||||
'release_notes/sdk/open_source/ver_1_14', 'release_notes/sdk/open_source/ver_1_13',
|
'release_notes/sdk/open_source/ver_1_14', 'release_notes/sdk/open_source/ver_1_13',
|
||||||
'release_notes/sdk/open_source/ver_1_12', 'release_notes/sdk/open_source/ver_1_11',
|
'release_notes/sdk/open_source/ver_1_12', 'release_notes/sdk/open_source/ver_1_11',
|
||||||
@ -639,11 +640,12 @@ module.exports = {
|
|||||||
{
|
{
|
||||||
'Enterprise Server': {
|
'Enterprise Server': {
|
||||||
'Deployment Options': [
|
'Deployment Options': [
|
||||||
|
'deploying_clearml/enterprise_deploy/k8s',
|
||||||
'deploying_clearml/enterprise_deploy/multi_tenant_k8s',
|
'deploying_clearml/enterprise_deploy/multi_tenant_k8s',
|
||||||
'deploying_clearml/enterprise_deploy/vpc_aws',
|
'deploying_clearml/enterprise_deploy/vpc_aws',
|
||||||
'deploying_clearml/enterprise_deploy/on_prem_ubuntu',
|
'deploying_clearml/enterprise_deploy/on_prem_ubuntu',
|
||||||
],
|
],
|
||||||
'Maintenance': [
|
'Maintenance and Migration': [
|
||||||
'deploying_clearml/enterprise_deploy/import_projects',
|
'deploying_clearml/enterprise_deploy/import_projects',
|
||||||
'deploying_clearml/enterprise_deploy/change_artifact_links',
|
'deploying_clearml/enterprise_deploy/change_artifact_links',
|
||||||
'deploying_clearml/enterprise_deploy/delete_tenant',
|
'deploying_clearml/enterprise_deploy/delete_tenant',
|
||||||
|
Loading…
Reference in New Issue
Block a user