From 60be54d54be808b36fa1812f8e9ff100e2c9051d Mon Sep 17 00:00:00 2001
From: revital
Date: Sun, 18 May 2025 13:48:11 +0300
Subject: [PATCH] Small edits

---
 docs/deploying_clearml/clearml_server_gcp.md  |   2 +-
 .../appgw_install_compose.md                  |   2 +-
 .../dynamic_edit_task_pod_template.md         |   3 +
 .../extra_configs/self_signed_certificates.md |   8 +-
 .../enterprise_deploy/fractional_gpus/cdmo.md |  40 ++---
 .../enterprise_deploy/fractional_gpus/cfgi.md | 152 +++++++++---------
 6 files changed, 105 insertions(+), 102 deletions(-)

diff --git a/docs/deploying_clearml/clearml_server_gcp.md b/docs/deploying_clearml/clearml_server_gcp.md
index 029f1b2e..be50e080 100644
--- a/docs/deploying_clearml/clearml_server_gcp.md
+++ b/docs/deploying_clearml/clearml_server_gcp.md
@@ -125,7 +125,7 @@ If the data and the configuration need to be restored:
 
 The following section contains a list of Custom Image URLs (exported in different formats) for each released ClearML Server version.
 
-### Latest Version - v1.13.1
+### Latest Version - v1.13.0
 
 - [https://storage.googleapis.com/allegro-files/clearml-server/clearml-server.tar.gz](https://storage.googleapis.com/allegro-files/clearml-server/clearml-server.tar.gz)
 
diff --git a/docs/deploying_clearml/enterprise_deploy/appgw_install_compose.md b/docs/deploying_clearml/enterprise_deploy/appgw_install_compose.md
index 23908a0e..0f9a119e 100644
--- a/docs/deploying_clearml/enterprise_deploy/appgw_install_compose.md
+++ b/docs/deploying_clearml/enterprise_deploy/appgw_install_compose.md
@@ -8,7 +8,7 @@ The Application Gateway is available under the ClearML Enterprise plan.
 
 The AI Application Gateway enables external HTTP(S) or direct TCP access to ClearML tasks and applications running
 on nodes. The gateway is configured with an endpoint or external address, making these services accessible from the user's
-machine, outside the workload’ network.
+machine, outside the workload's network.
 
 This guide describes how to install and run the ClearML AI Application Gateway using docker-compose for environments where you manage both the ClearML Server and the workload nodes.
diff --git a/docs/deploying_clearml/enterprise_deploy/extra_configs/dynamic_edit_task_pod_template.md b/docs/deploying_clearml/enterprise_deploy/extra_configs/dynamic_edit_task_pod_template.md
index 911ff411..c7e651e5 100644
--- a/docs/deploying_clearml/enterprise_deploy/extra_configs/dynamic_edit_task_pod_template.md
+++ b/docs/deploying_clearml/enterprise_deploy/extra_configs/dynamic_edit_task_pod_template.md
@@ -84,6 +84,8 @@ agentk8sglue:
   # -- Custom Bash script for the Task Pods run by the Glue Agent
   containerCustomBashScript: ""
 ```
+:::
+
 
 ## Examples
 
@@ -246,6 +248,7 @@ agentk8sglue:
         - mountPath: "/tmp/task/"
           name: task-pvc
 ```
+:::
 
 ### Example: Required Role
 
diff --git a/docs/deploying_clearml/enterprise_deploy/extra_configs/self_signed_certificates.md b/docs/deploying_clearml/enterprise_deploy/extra_configs/self_signed_certificates.md
index 741f5636..e529e20e 100644
--- a/docs/deploying_clearml/enterprise_deploy/extra_configs/self_signed_certificates.md
+++ b/docs/deploying_clearml/enterprise_deploy/extra_configs/self_signed_certificates.md
@@ -29,7 +29,7 @@ You have two configuration options:
 
 - [**Append**](#append-extra-certificates-to-the-existing-ca-certificatescrt) extra certificates to the existing `ca-certificates.crt`
 
-### Replace Entire `ca-certificates.crt` File
+### Replace Entire ca-certificates.crt File
 
 To replace the whole CA bundle, provide a concatenated list of all trusted CA certificates in `pem` format as they are
 stored in a standard `ca-certificates.crt`.
 
@@ -51,7 +51,7 @@ customCertificates:
       ...
 ```
 
-### Append Extra Certificates to the Existing `ca-certificates.crt`
+### Append Extra Certificates to the Existing ca-certificates.crt
 
 You can add certificates to the existing CA bundle. Each certificate must have a unique `alias`.
 
@@ -104,7 +104,7 @@ You have two configuration options:
 
 - [**Append**](#append-extra-certificates-to-the-existing-ca-certificatescrt-1) extra certificates to the existing `ca-certificates.crt`
 
-### Replace Entire `ca-certificates.crt` File
+### Replace Entire ca-certificates.crt File
 
 To replace the whole CA bundle, provide a concatenated list of all trusted CA certificates in `pem` format as they are
 stored in a standard `ca-certificates.crt`.
 
@@ -127,7 +127,7 @@ customCertificates:
       ...
 ```
 
-### Append Extra Certificates to the Existing `ca-certificates.crt`
+### Append Extra Certificates to the Existing ca-certificates.crt
 
 You can add certificates to the existing CA bundle. Each certificate must have a unique `alias`.
 
diff --git a/docs/deploying_clearml/enterprise_deploy/fractional_gpus/cdmo.md b/docs/deploying_clearml/enterprise_deploy/fractional_gpus/cdmo.md
index 138792fe..0cbf1d32 100644
--- a/docs/deploying_clearml/enterprise_deploy/fractional_gpus/cdmo.md
+++ b/docs/deploying_clearml/enterprise_deploy/fractional_gpus/cdmo.md
@@ -52,37 +52,37 @@ The ClearML Dynamic MIG Operator (CDMO) enables dynamic MIG (Multi-Instance GPU
 
 ### Installing CDMO
 
-1. Create a `cdmo-values.override.yaml` file with the following content:
-
-```yaml
-imageCredentials:
-  password: ""
-```
+1. Create a `cdmo-values.override.yaml` file with the following content:
+
+   ```yaml
+   imageCredentials:
+     password: ""
+   ```
 
 1. Install the CDMO Helm Chart using the previous override file:
 
-```bash
-helm install -n cdmo cdmo clearml-enterprise/clearml-dynamic-mig-operator --create-namespace -f cdmo-values.override.yaml
-```
+   ```bash
+   helm install -n cdmo cdmo clearml-enterprise/clearml-dynamic-mig-operator --create-namespace -f cdmo-values.override.yaml
+   ```
 
 1. Enable NVIDIA MIG support on your cluster by running the following command on all nodes with a MIG-supported GPU (run it for each GPU `<GPU_ID>` on the host):
 
-```bash
-nvidia-smi -i <GPU_ID> -mig 1
-```
+   ```bash
+   nvidia-smi -i <GPU_ID> -mig 1
+   ```
 
-:::note Notes
-* A node reboot may be required if the command output indicates so.
-
-* For convenience, this command can be run from within the `nvidia-device-plugin-daemonset` pod running on the related node.
-:::
+   :::note Notes
+   * A node reboot may be required if the command output indicates so.
+
+   * For convenience, this command can be run from within the `nvidia-device-plugin-daemonset` pod running on the related node.
+   :::
 
 1. Label all MIG-enabled GPU nodes `<NODE_NAME>` from the previous step:
 
-```bash
-kubectl label nodes <NODE_NAME> "cdmo.clear.ml/gpu-partitioning=mig"
-```
+   ```bash
+   kubectl label nodes <NODE_NAME> "cdmo.clear.ml/gpu-partitioning=mig"
+   ```
 
 ## Disabling MIGs
 
diff --git a/docs/deploying_clearml/enterprise_deploy/fractional_gpus/cfgi.md b/docs/deploying_clearml/enterprise_deploy/fractional_gpus/cfgi.md
index 749c02d6..13ba2e85 100644
--- a/docs/deploying_clearml/enterprise_deploy/fractional_gpus/cfgi.md
+++ b/docs/deploying_clearml/enterprise_deploy/fractional_gpus/cfgi.md
@@ -33,75 +33,75 @@ helm repo update
 
 1. Create a Docker Registry secret named `clearml-dockerhub-access` in the `gpu-operator` namespace. Make sure to replace `<CLEARML_DOCKERHUB_TOKEN>` with your token.
 
-```bash
-kubectl create secret -n gpu-operator docker-registry clearml-dockerhub-access \
-  --docker-server=docker.io \
-  --docker-username=allegroaienterprise \
-  --docker-password="<CLEARML_DOCKERHUB_TOKEN>" \
-  --docker-email=""
-```
+   ```bash
+   kubectl create secret -n gpu-operator docker-registry clearml-dockerhub-access \
+     --docker-server=docker.io \
+     --docker-username=allegroaienterprise \
+     --docker-password="<CLEARML_DOCKERHUB_TOKEN>" \
+     --docker-email=""
+   ```
 
 1. Create a `gpu-operator.override.yaml` file as follows:
    * Set `devicePlugin.repository` to `docker.io/clearml`
    * Configure `devicePlugin.config.data.renamed-resources.sharing.timeSlicing.resources` for each GPU index on the host
   * Use `nvidia.com/gpu-<GPU_INDEX>` format for the `rename` field, and set `replicas` to `8`.
 
-```yaml
-gfd:
-  imagePullSecrets:
-  - "clearml-dockerhub-access"
-toolkit:
-  env:
-  - name: ACCEPT_NVIDIA_VISIBLE_DEVICES_ENVVAR_WHEN_UNPRIVILEGED
-    value: "false"
-  - name: ACCEPT_NVIDIA_VISIBLE_DEVICES_AS_VOLUME_MOUNTS
-    value: "true"
-devicePlugin:
-  repository: docker.io/clearml
-  image: k8s-device-plugin
-  version: v0.17.1-gpu-card-selection
-  imagePullPolicy: Always
-  imagePullSecrets:
-  - "clearml-dockerhub-access"
-  env:
-  - name: PASS_DEVICE_SPECS
-    value: "true"
-  - name: FAIL_ON_INIT_ERROR
-    value: "true"
-  - name: DEVICE_LIST_STRATEGY # Use volume-mounts
-    value: volume-mounts
-  - name: DEVICE_ID_STRATEGY
-    value: uuid
-  - name: NVIDIA_VISIBLE_DEVICES
-    value: all
-  - name: NVIDIA_DRIVER_CAPABILITIES
-    value: all
-  config:
-    name: device-plugin-config
-    create: true
-    default: "renamed-resources"
-    data:
-      renamed-resources: |-
-        version: v1
-        flags:
-          migStrategy: none
-        sharing:
-          timeSlicing:
-            renameByDefault: false
-            failRequestsGreaterThanOne: false
-            # Edit the following configuration as needed, adding as many GPU indices as there are GPU cards installed on the host.
-            resources:
-            - name: nvidia.com/gpu
-              rename: nvidia.com/gpu-0
-              devices:
-              - "0"
-              replicas: 8
-            - name: nvidia.com/gpu
-              rename: nvidia.com/gpu-1
-              devices:
-              - "1"
-              replicas: 8
-```
+   ```yaml
+   gfd:
+     imagePullSecrets:
+     - "clearml-dockerhub-access"
+   toolkit:
+     env:
+     - name: ACCEPT_NVIDIA_VISIBLE_DEVICES_ENVVAR_WHEN_UNPRIVILEGED
+       value: "false"
+     - name: ACCEPT_NVIDIA_VISIBLE_DEVICES_AS_VOLUME_MOUNTS
+       value: "true"
+   devicePlugin:
+     repository: docker.io/clearml
+     image: k8s-device-plugin
+     version: v0.17.1-gpu-card-selection
+     imagePullPolicy: Always
+     imagePullSecrets:
+     - "clearml-dockerhub-access"
+     env:
+     - name: PASS_DEVICE_SPECS
+       value: "true"
+     - name: FAIL_ON_INIT_ERROR
+       value: "true"
+     - name: DEVICE_LIST_STRATEGY # Use volume-mounts
+       value: volume-mounts
+     - name: DEVICE_ID_STRATEGY
+       value: uuid
+     - name: NVIDIA_VISIBLE_DEVICES
+       value: all
+     - name: NVIDIA_DRIVER_CAPABILITIES
+       value: all
+     config:
+       name: device-plugin-config
+       create: true
+       default: "renamed-resources"
+       data:
+         renamed-resources: |-
+           version: v1
+           flags:
+             migStrategy: none
+           sharing:
+             timeSlicing:
+               renameByDefault: false
+               failRequestsGreaterThanOne: false
+               # Edit the following configuration as needed, adding as many GPU indices as there are GPU cards installed on the host.
+               resources:
+               - name: nvidia.com/gpu
+                 rename: nvidia.com/gpu-0
+                 devices:
+                 - "0"
+                 replicas: 8
+               - name: nvidia.com/gpu
+                 rename: nvidia.com/gpu-1
+                 devices:
+                 - "1"
+                 replicas: 8
+   ```
 
 #### For CFGI version < 1.3.0 (Legacy)
 
@@ -150,22 +150,22 @@
 
 1. Install the NVIDIA `gpu-operator` using the previously created `gpu-operator.override.yaml` file:
 
-```bash
-helm install -n gpu-operator gpu-operator nvidia/gpu-operator --create-namespace -f gpu-operator.override.yaml
-```
+   ```bash
+   helm install -n gpu-operator gpu-operator nvidia/gpu-operator --create-namespace -f gpu-operator.override.yaml
+   ```
 
 1. Create a `cfgi-values.override.yaml` file with the following content:
 
-```yaml
-imageCredentials:
-  password: ""
-```
+   ```yaml
+   imageCredentials:
+     password: ""
+   ```
 
 1. Install the CFGI Helm Chart using the previous override file:
 
-```bash
-helm install -n cfgi cfgi clearml-enterprise/clearml-fractional-gpu-injector --create-namespace -f cfgi-values.override.yaml
-```
+   ```bash
+   helm install -n cfgi cfgi clearml-enterprise/clearml-fractional-gpu-injector --create-namespace -f cfgi-values.override.yaml
+   ```
 
 ## Usage
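+
+As a quick sanity check after both charts are installed, you can verify that the renamed, time-sliced resources are advertised by your GPU nodes. The following is a minimal sketch: `<NODE_NAME>` is a placeholder for one of your GPU nodes, and the exact `nvidia.com/gpu-<GPU_INDEX>` resource names depend on your `renamed-resources` configuration above.
+
+```bash
+# Each renamed card should appear as its own allocatable resource,
+# reporting 8 slices when configured with `replicas: 8`.
+kubectl get node <NODE_NAME> -o jsonpath='{.status.allocatable}'
+```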