From 15a76d263982cb1dec8c90d01848008209216ce8 Mon Sep 17 00:00:00 2001 From: vishalkadam47 Date: Thu, 17 Oct 2024 15:45:27 +0530 Subject: [PATCH 01/13] feat: Add server-level GPU support for Docker Swarm deployments and API endpoint for setup --- apps/dokploy/server/api/trpc.ts | 8 ++++++++ packages/server/src/constants/index.ts | 3 +++ packages/server/src/utils/gpu-setup.ts | 9 +++++++++ 3 files changed, 20 insertions(+) create mode 100644 packages/server/src/utils/gpu-setup.ts diff --git a/apps/dokploy/server/api/trpc.ts b/apps/dokploy/server/api/trpc.ts index d37315c3..8aec99ec 100644 --- a/apps/dokploy/server/api/trpc.ts +++ b/apps/dokploy/server/api/trpc.ts @@ -21,6 +21,7 @@ import { import type { Session, User } from "lucia"; import superjson from "superjson"; import { ZodError } from "zod"; +import { setupGPUSupport } from '@dokploy/server/src/utils/gpu-setup'; /** * 1. CONTEXT @@ -208,3 +209,10 @@ export const adminProcedure = t.procedure.use(({ ctx, next }) => { }, }); }); + +const appRouter = t.router({ + setupGPU: t.procedure.mutation(async () => { + await setupGPUSupport(); + return { success: true }; + }), + }); \ No newline at end of file diff --git a/packages/server/src/constants/index.ts b/packages/server/src/constants/index.ts index f2f1a4d8..fd89a53d 100644 --- a/packages/server/src/constants/index.ts +++ b/packages/server/src/constants/index.ts @@ -37,3 +37,6 @@ export const paths = (isServer = false) => { REGISTRY_PATH: `${BASE_PATH}/registry`, }; }; + +export const GPU_ENABLED = process.env.GPU_ENABLED === 'true'; +export const GPU_RESOURCE_NAME = 'DOCKER_RESOURCE_GPU'; \ No newline at end of file diff --git a/packages/server/src/utils/gpu-setup.ts b/packages/server/src/utils/gpu-setup.ts new file mode 100644 index 00000000..459c3395 --- /dev/null +++ b/packages/server/src/utils/gpu-setup.ts @@ -0,0 +1,9 @@ +import { docker } from '../constants'; + +export async function setupGPUSupport() { + await docker.swarmUpdate({ + TaskDefaults: { + GenericResources: [{ DiscreteResourceSpec: { Kind: 'gpu', Value: 1 } }] + } + }); +} \ No newline at end of file From e52a0fc9d4785a881c974ab6d0d7ec35967f214e Mon Sep 17 00:00:00 2001 From: vishalkadam47 Date: Fri, 18 Oct 2024 04:55:37 +0530 Subject: [PATCH 02/13] feat: Added Blender template --- apps/dokploy/public/templates/blender.svg | 153 ++++++++++++++++++ .../templates/blender/docker-compose.yml | 37 +++++ apps/dokploy/templates/blender/index.ts | 34 ++++ apps/dokploy/templates/templates.ts | 14 ++ 4 files changed, 238 insertions(+) create mode 100644 apps/dokploy/public/templates/blender.svg create mode 100644 apps/dokploy/templates/blender/docker-compose.yml create mode 100644 apps/dokploy/templates/blender/index.ts diff --git a/apps/dokploy/public/templates/blender.svg b/apps/dokploy/public/templates/blender.svg new file mode 100644 index 00000000..e59079f5 --- /dev/null +++ b/apps/dokploy/public/templates/blender.svg @@ -0,0 +1,153 @@ + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/apps/dokploy/templates/blender/docker-compose.yml b/apps/dokploy/templates/blender/docker-compose.yml new file mode 100644 index 00000000..bc3de4b7 --- /dev/null +++ b/apps/dokploy/templates/blender/docker-compose.yml @@ -0,0 +1,37 @@ +version: "3.8" + +services: + blender: + image: lscr.io/linuxserver/blender:latest + privileged: true + container_name: blender + security_opt: + - seccomp:unconfined #optional + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: + - compute + - video + - graphics + - utility + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=all + - PUID=1000 + - PGID=1000 + - TZ=Etc/UTC + - SUBFOLDER=/ #optional + volumes: + - blender:/config + ports: + - 3000:3000 + - 3001:3001 + restart: unless-stopped + shm_size: 1gb + +volumes: + blender: null diff --git a/apps/dokploy/templates/blender/index.ts b/apps/dokploy/templates/blender/index.ts new file mode 100644 index 00000000..088e6fcc --- /dev/null +++ b/apps/dokploy/templates/blender/index.ts @@ -0,0 +1,34 @@ +import { + generateHash, + generateRandomDomain, + type Template, + type Schema, + type DomainSchema, +} from "../utils"; + +export function generate(schema: Schema): Template { + const mainServiceHash = generateHash(schema.projectName); + const mainDomain = generateRandomDomain(schema); + + const domains: DomainSchema[] = [ + { + host: mainDomain, + port: 3000, + serviceName: "blender", + }, + ]; + + const envs = [ + `PUID=1000`, + `PGID=1000`, + `TZ=Etc/UTC`, + `SUBFOLDER=/`, + `NVIDIA_VISIBLE_DEVICES=all`, + `NVIDIA_DRIVER_CAPABILITIES=all`, + ]; + + return { + envs, + domains, + }; +} diff --git a/apps/dokploy/templates/templates.ts b/apps/dokploy/templates/templates.ts index afe9d1b6..e5acb390 100644 --- a/apps/dokploy/templates/templates.ts +++ b/apps/dokploy/templates/templates.ts @@ -512,4 +512,18 @@ export const templates: TemplateData[] = [ tags: ["self-hosted", "email", "webmail"], load: () => import("./roundcube/index").then((m) => m.generate), }, + { + id: "blender", + name: "Blender", + version: "latest", + description: "Blender is a free and open-source 3D creation suite. It supports the entire 3D pipeline—modeling, rigging, animation, simulation, rendering, compositing and motion tracking, video editing and 2D animation pipeline.", + logo: "blender.svg", + links: { + github: "https://github.com/linuxserver/docker-blender", + website: "https://www.blender.org/", + docs: "https://docs.blender.org/", + }, + tags: ["3d", "rendering", "animation"], + load: () => import("./blender/index").then((m) => m.generate), + }, ]; From 5a440d934d5ac1d5aae8940a3fb507ae761dacbb Mon Sep 17 00:00:00 2001 From: vishalkadam47 Date: Fri, 25 Oct 2024 02:32:50 +0530 Subject: [PATCH 03/13] fix: Remove privileged mode and seccomp option, update runtime to nvidia --- apps/dokploy/templates/blender/docker-compose.yml | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/apps/dokploy/templates/blender/docker-compose.yml b/apps/dokploy/templates/blender/docker-compose.yml index bc3de4b7..90fa8da8 100644 --- a/apps/dokploy/templates/blender/docker-compose.yml +++ b/apps/dokploy/templates/blender/docker-compose.yml @@ -3,10 +3,8 @@ version: "3.8" services: blender: image: lscr.io/linuxserver/blender:latest - privileged: true container_name: blender - security_opt: - - seccomp:unconfined #optional + runtime: nvidia deploy: resources: reservations: @@ -14,10 +12,7 @@ services: - driver: nvidia count: all capabilities: - - compute - - video - - graphics - - utility + - gpu environment: - NVIDIA_VISIBLE_DEVICES=all - NVIDIA_DRIVER_CAPABILITIES=all @@ -25,13 +20,8 @@ services: - PGID=1000 - TZ=Etc/UTC - SUBFOLDER=/ #optional - volumes: - - blender:/config ports: - 3000:3000 - 3001:3001 restart: unless-stopped shm_size: 1gb - -volumes: - blender: null From 3e467959c9232b332dbbaaafae66d8c0cd76097b Mon Sep 17 00:00:00 2001 From: vishalkadam47 Date: Sun, 27 Oct 2024 22:00:08 +0530 Subject: [PATCH 04/13] refactor: Update docker-compose.yml to remove port mapping and remove GPU constants from index.ts --- apps/dokploy/templates/blender/docker-compose.yml | 4 ++-- packages/server/src/constants/index.ts | 5 +---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/apps/dokploy/templates/blender/docker-compose.yml b/apps/dokploy/templates/blender/docker-compose.yml index 90fa8da8..da769c6b 100644 --- a/apps/dokploy/templates/blender/docker-compose.yml +++ b/apps/dokploy/templates/blender/docker-compose.yml @@ -21,7 +21,7 @@ services: - TZ=Etc/UTC - SUBFOLDER=/ #optional ports: - - 3000:3000 - - 3001:3001 + - 3000 + - 3001 restart: unless-stopped shm_size: 1gb diff --git a/packages/server/src/constants/index.ts b/packages/server/src/constants/index.ts index fd89a53d..be2a72de 100644 --- a/packages/server/src/constants/index.ts +++ b/packages/server/src/constants/index.ts @@ -36,7 +36,4 @@ export const paths = (isServer = false) => { MONITORING_PATH: `${BASE_PATH}/monitoring`, REGISTRY_PATH: `${BASE_PATH}/registry`, }; -}; - -export const GPU_ENABLED = process.env.GPU_ENABLED === 'true'; -export const GPU_RESOURCE_NAME = 'DOCKER_RESOURCE_GPU'; \ No newline at end of file +}; \ No newline at end of file From 1b6d8d803b34482ab56c692034ace63a4fb15d80 Mon Sep 17 00:00:00 2001 From: vishalkadam47 Date: Sat, 2 Nov 2024 15:15:58 +0530 Subject: [PATCH 05/13] feat: Added GPU support feature for Remote Server with setup and status checks, including API endpoints and utility functions --- .../settings/servers/setup-server.tsx | 12 +- apps/dokploy/server/api/routers/settings.ts | 60 ++++ apps/dokploy/server/api/trpc.ts | 9 - apps/dokploy/templates/blender/index.ts | 52 ++-- apps/dokploy/templates/templates.ts | 3 +- packages/server/src/constants/index.ts | 2 +- packages/server/src/index.ts | 1 + packages/server/src/utils/gpu-setup.ts | 268 +++++++++++++++++- 8 files changed, 361 insertions(+), 46 deletions(-) diff --git a/apps/dokploy/components/dashboard/settings/servers/setup-server.tsx b/apps/dokploy/components/dashboard/settings/servers/setup-server.tsx index 8bfcf4da..119d4d29 100644 --- a/apps/dokploy/components/dashboard/settings/servers/setup-server.tsx +++ b/apps/dokploy/components/dashboard/settings/servers/setup-server.tsx @@ -32,6 +32,7 @@ import Link from "next/link"; import { useState } from "react"; import { toast } from "sonner"; import { ShowDeployment } from "../../application/deployments/show-deployment"; +import { GPUSupport } from "./gpu-support"; interface Props { serverId: string; @@ -89,9 +90,10 @@ export const SetupServer = ({ serverId }: Props) => { ) : (
- + SSH Keys Deployments + GPU Setup {
+ +
+ +
+
)} diff --git a/apps/dokploy/server/api/routers/settings.ts b/apps/dokploy/server/api/routers/settings.ts index e1e63579..4a000889 100644 --- a/apps/dokploy/server/api/routers/settings.ts +++ b/apps/dokploy/server/api/routers/settings.ts @@ -52,6 +52,10 @@ import { writeMainConfig, writeTraefikConfigInPath, } from "@dokploy/server"; +import { + checkGPUStatus, + setupGPUSupport, +} from "@dokploy/server/src/utils/gpu-setup"; import { generateOpenApiDocument } from "@dokploy/trpc-openapi"; import { TRPCError } from "@trpc/server"; import { sql } from "drizzle-orm"; @@ -650,6 +654,62 @@ export const settingsRouter = createTRPCRouter({ } return { status: "not_cloud" }; }), + setupGPU: adminProcedure + .input( + z.object({ + serverId: z.string(), + }), + ) + .mutation(async ({ input }) => { + try { + if (IS_CLOUD) { + return { success: true }; + } + + if (!input.serverId) { + throw new TRPCError({ + code: "BAD_REQUEST", + message: "Server ID is required", + }); + } + + await setupGPUSupport(input.serverId); + return { success: true }; + } catch (error) { + throw new TRPCError({ + code: "INTERNAL_SERVER_ERROR", + message: + error instanceof Error + ? error.message + : "Failed to enable GPU support", + cause: error, + }); + } + }), + checkGPUStatus: adminProcedure + .input( + z.object({ + serverId: z.string().optional(), + }), + ) + .query(async ({ input }) => { + if (IS_CLOUD) { + return { + driverInstalled: false, + driverVersion: undefined, + gpuModel: undefined, + runtimeInstalled: false, + runtimeConfigured: false, + cudaSupport: undefined, + cudaVersion: undefined, + memoryInfo: undefined, + availableGPUs: 0, + swarmEnabled: false, + gpuResources: 0, + }; + } + return await checkGPUStatus(input.serverId); + }), }); // { // "Parallelism": 1, diff --git a/apps/dokploy/server/api/trpc.ts b/apps/dokploy/server/api/trpc.ts index 8aec99ec..db4f7adf 100644 --- a/apps/dokploy/server/api/trpc.ts +++ b/apps/dokploy/server/api/trpc.ts @@ -21,8 +21,6 @@ import { import type { Session, User } from "lucia"; import superjson from "superjson"; import { ZodError } from "zod"; -import { setupGPUSupport } from '@dokploy/server/src/utils/gpu-setup'; - /** * 1. CONTEXT * @@ -209,10 +207,3 @@ export const adminProcedure = t.procedure.use(({ ctx, next }) => { }, }); }); - -const appRouter = t.router({ - setupGPU: t.procedure.mutation(async () => { - await setupGPUSupport(); - return { success: true }; - }), - }); \ No newline at end of file diff --git a/apps/dokploy/templates/blender/index.ts b/apps/dokploy/templates/blender/index.ts index 088e6fcc..baf243e0 100644 --- a/apps/dokploy/templates/blender/index.ts +++ b/apps/dokploy/templates/blender/index.ts @@ -1,34 +1,34 @@ import { - generateHash, - generateRandomDomain, - type Template, - type Schema, - type DomainSchema, + type DomainSchema, + type Schema, + type Template, + generateHash, + generateRandomDomain, } from "../utils"; export function generate(schema: Schema): Template { - const mainServiceHash = generateHash(schema.projectName); - const mainDomain = generateRandomDomain(schema); + const mainServiceHash = generateHash(schema.projectName); + const mainDomain = generateRandomDomain(schema); - const domains: DomainSchema[] = [ - { - host: mainDomain, - port: 3000, - serviceName: "blender", - }, - ]; + const domains: DomainSchema[] = [ + { + host: mainDomain, + port: 3000, + serviceName: "blender", + }, + ]; - const envs = [ - `PUID=1000`, - `PGID=1000`, - `TZ=Etc/UTC`, - `SUBFOLDER=/`, - `NVIDIA_VISIBLE_DEVICES=all`, - `NVIDIA_DRIVER_CAPABILITIES=all`, - ]; + const envs = [ + `PUID=1000`, + `PGID=1000`, + `TZ=Etc/UTC`, + `SUBFOLDER=/`, + `NVIDIA_VISIBLE_DEVICES=all`, + `NVIDIA_DRIVER_CAPABILITIES=all`, + ]; - return { - envs, - domains, - }; + return { + envs, + domains, + }; } diff --git a/apps/dokploy/templates/templates.ts b/apps/dokploy/templates/templates.ts index 40d493e5..115a1ecf 100644 --- a/apps/dokploy/templates/templates.ts +++ b/apps/dokploy/templates/templates.ts @@ -516,7 +516,8 @@ export const templates: TemplateData[] = [ id: "blender", name: "Blender", version: "latest", - description: "Blender is a free and open-source 3D creation suite. It supports the entire 3D pipeline—modeling, rigging, animation, simulation, rendering, compositing and motion tracking, video editing and 2D animation pipeline.", + description: + "Blender is a free and open-source 3D creation suite. It supports the entire 3D pipeline—modeling, rigging, animation, simulation, rendering, compositing and motion tracking, video editing and 2D animation pipeline.", logo: "blender.svg", links: { github: "https://github.com/linuxserver/docker-blender", diff --git a/packages/server/src/constants/index.ts b/packages/server/src/constants/index.ts index be2a72de..f2f1a4d8 100644 --- a/packages/server/src/constants/index.ts +++ b/packages/server/src/constants/index.ts @@ -36,4 +36,4 @@ export const paths = (isServer = false) => { MONITORING_PATH: `${BASE_PATH}/monitoring`, REGISTRY_PATH: `${BASE_PATH}/registry`, }; -}; \ No newline at end of file +}; diff --git a/packages/server/src/index.ts b/packages/server/src/index.ts index 06f2bc87..90daec2d 100644 --- a/packages/server/src/index.ts +++ b/packages/server/src/index.ts @@ -118,3 +118,4 @@ export * from "./monitoring/utilts"; export * from "./db/validations/domain"; export * from "./db/validations/index"; +export * from "./utils/gpu-setup"; diff --git a/packages/server/src/utils/gpu-setup.ts b/packages/server/src/utils/gpu-setup.ts index 459c3395..71f3bf0f 100644 --- a/packages/server/src/utils/gpu-setup.ts +++ b/packages/server/src/utils/gpu-setup.ts @@ -1,9 +1,261 @@ -import { docker } from '../constants'; +import { docker } from "../constants"; +import { execAsync } from "../utils/process/execAsync"; +import { execAsyncRemote } from "../utils/process/execAsync"; +import { getRemoteDocker } from "./servers/remote-docker"; -export async function setupGPUSupport() { - await docker.swarmUpdate({ - TaskDefaults: { - GenericResources: [{ DiscreteResourceSpec: { Kind: 'gpu', Value: 1 } }] - } - }); -} \ No newline at end of file +interface GPUInfo { + driverInstalled: boolean; + driverVersion?: string; + gpuModel?: string; + runtimeInstalled: boolean; + runtimeConfigured: boolean; + cudaSupport: boolean; + cudaVersion?: string; + memoryInfo?: string; + availableGPUs: number; + swarmEnabled: boolean; + gpuResources: number; +} + +interface DiscreteResourceSpec { + Kind: string; + Value: number; +} + +interface NamedGenericResource { + NamedResourceSpec?: { Kind: string; Value: string }; + DiscreteResourceSpec?: DiscreteResourceSpec; +} + +export async function checkGPUStatus(serverId?: string): Promise { + try { + // Check NVIDIA Driver + let driverInstalled = false; + let driverVersion: string | undefined; + let availableGPUs = 0; + + try { + const driverCommand = + "nvidia-smi --query-gpu=driver_version --format=csv,noheader"; + const { stdout: nvidiaSmi } = serverId + ? await execAsyncRemote(serverId, driverCommand) + : await execAsync(driverCommand); + + driverVersion = nvidiaSmi.trim(); + if (driverVersion) { + driverInstalled = true; + const countCommand = + "nvidia-smi --query-gpu=gpu_name --format=csv,noheader | wc -l"; + const { stdout: gpuCount } = serverId + ? await execAsyncRemote(serverId, countCommand) + : await execAsync(countCommand); + + availableGPUs = Number.parseInt(gpuCount.trim(), 10); + } + } catch (error) { + console.debug("GPU driver check:", error); + } + + // Check Runtime Configuration + let runtimeInstalled = false; + let runtimeConfigured = false; + try { + const runtimeCommand = 'docker info --format "{{json .Runtimes}}"'; + const { stdout: runtimeInfo } = serverId + ? await execAsyncRemote(serverId, runtimeCommand) + : await execAsync(runtimeCommand); + + const runtimes = JSON.parse(runtimeInfo); + runtimeInstalled = "nvidia" in runtimes; + + // Check if it's the default runtime + const defaultCommand = 'docker info --format "{{.DefaultRuntime}}"'; + const { stdout: defaultRuntime } = serverId + ? await execAsyncRemote(serverId, defaultCommand) + : await execAsync(defaultCommand); + + runtimeConfigured = defaultRuntime.trim() === "nvidia"; + } catch (error) { + console.debug("Runtime check:", error); + } + + // Check Swarm GPU Resources + let swarmEnabled = false; + let gpuResources = 0; + + try { + // Check node resources directly from inspect + const nodeCommand = + "docker node inspect self --format '{{json .Description.Resources.GenericResources}}'"; + const { stdout: resources } = serverId + ? await execAsyncRemote(serverId, nodeCommand) + : await execAsync(nodeCommand); + + if (resources && resources !== "null") { + const genericResources = JSON.parse(resources); + for (const resource of genericResources) { + if ( + resource.DiscreteResourceSpec && + (resource.DiscreteResourceSpec.Kind === "GPU" || + resource.DiscreteResourceSpec.Kind === "gpu") + ) { + gpuResources = resource.DiscreteResourceSpec.Value; + swarmEnabled = true; + break; + } + } + } + } catch (error) { + console.debug("Swarm resource check:", error); + } + + // Get GPU Model and Memory Info + const gpuInfoCommand = + "nvidia-smi --query-gpu=gpu_name,memory.total --format=csv,noheader"; + const { stdout: gpuInfo } = serverId + ? await execAsyncRemote(serverId, gpuInfoCommand) + : await execAsync(gpuInfoCommand); + + const [gpuModel, memoryTotal] = gpuInfo.split(",").map((s) => s.trim()); + + // Check CUDA Support + const cudaCommand = 'nvidia-smi -q | grep "CUDA Version"'; + const { stdout: cudaInfo } = serverId + ? await execAsyncRemote(serverId, cudaCommand) + : await execAsync(cudaCommand); + + const cudaMatch = cudaInfo.match(/CUDA Version\s*:\s*([\d\.]+)/); + const cudaVersion = cudaMatch ? cudaMatch[1] : undefined; + const cudaSupport = !!cudaVersion; + + return { + driverInstalled, + driverVersion, + runtimeInstalled, + runtimeConfigured, + availableGPUs, + swarmEnabled, + gpuResources, + gpuModel, + memoryInfo: memoryTotal, + cudaSupport, + cudaVersion, + }; + } catch (error) { + console.error("Error in checkGPUStatus:", error); + return { + driverInstalled: false, + driverVersion: undefined, + runtimeInstalled: false, + runtimeConfigured: false, + cudaSupport: false, + cudaVersion: undefined, + gpuModel: undefined, + memoryInfo: undefined, + availableGPUs: 0, + swarmEnabled: false, + gpuResources: 0, + }; + } +} + +export async function setupGPUSupport(serverId?: string): Promise { + try { + // 1. Check current GPU status first + const initialStatus = await checkGPUStatus(serverId); + + // If GPU is already configured, just verify and return quickly + if ( + initialStatus.swarmEnabled && + initialStatus.runtimeConfigured && + initialStatus.driverInstalled + ) { + console.log("GPU already configured, skipping setup"); + return; + } + + // 2. Verify GPU prerequisites + if (!initialStatus.driverInstalled || !initialStatus.runtimeInstalled) { + throw new Error( + "NVIDIA drivers or runtime not installed. Please install them first.", + ); + } + + // Get the node ID + const nodeIdCommand = 'docker info --format "{{.Swarm.NodeID}}"'; + const { stdout: nodeId } = serverId + ? await execAsyncRemote(serverId, nodeIdCommand) + : await execAsync(nodeIdCommand); + + if (!nodeId.trim()) { + throw new Error("Setup Server before enabling GPU support"); + } + + // 3. Configure NVIDIA runtime in daemon.json + const daemonConfig = { + runtimes: { + nvidia: { + path: "nvidia-container-runtime", + runtimeArgs: [], + }, + }, + "default-runtime": "nvidia", + "node-generic-resources": [`GPU=${initialStatus.availableGPUs}`], + }; + + const setupCommands = [ + "sudo -n true", + `echo '${JSON.stringify(daemonConfig, null, 2)}' | sudo tee /etc/docker/daemon.json`, + "sudo mkdir -p /etc/nvidia-container-runtime", + 'echo "swarm-resource = \\"DOCKER_RESOURCE_GPU\\"" | sudo tee -a /etc/nvidia-container-runtime/config.toml', + "sudo systemctl daemon-reload", + "sudo systemctl restart docker", + ].join(" && "); + + if (serverId) { + await execAsyncRemote(serverId, setupCommands); + } else { + await execAsync(setupCommands); + } + + // 4. Reduced wait time for Docker restart + await new Promise((resolve) => setTimeout(resolve, 10000)); + + // 5. Add GPU label to the node + const labelCommand = `docker node update --label-add gpu=true ${nodeId.trim()}`; + if (serverId) { + await execAsyncRemote(serverId, labelCommand); + } else { + await execAsync(labelCommand); + } + + // 6. Quick final verification + await new Promise((resolve) => setTimeout(resolve, 5000)); + const finalStatus = await checkGPUStatus(serverId); + + if (!finalStatus.swarmEnabled) { + const diagnosticCommands = [ + `docker node inspect ${nodeId.trim()}`, + 'nvidia-smi -a | grep "GPU UUID"', + "cat /etc/docker/daemon.json", + "cat /etc/nvidia-container-runtime/config.toml", + ].join(" && "); + + const { stdout: diagnostics } = serverId + ? await execAsyncRemote(serverId, diagnosticCommands) + : await execAsync(diagnosticCommands); + + console.error("Diagnostic Information:", diagnostics); + throw new Error("GPU support not detected in swarm after setup"); + } + + console.log("GPU setup completed successfully:", { + availableGPUs: initialStatus.availableGPUs, + driverVersion: initialStatus.driverVersion, + nodeId: nodeId.trim(), + }); + } catch (error) { + console.error("GPU Setup Error:", error); + throw error; + } +} From ed7150fac10e0ea7645cd3f0a72b0c1406f33f6c Mon Sep 17 00:00:00 2001 From: vishalkadam47 Date: Sun, 3 Nov 2024 04:16:51 +0530 Subject: [PATCH 06/13] fix: Remove unused imports and interfaces from gpu-setup.ts --- packages/server/src/utils/gpu-setup.ts | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/packages/server/src/utils/gpu-setup.ts b/packages/server/src/utils/gpu-setup.ts index 71f3bf0f..f1936bf6 100644 --- a/packages/server/src/utils/gpu-setup.ts +++ b/packages/server/src/utils/gpu-setup.ts @@ -1,7 +1,5 @@ -import { docker } from "../constants"; import { execAsync } from "../utils/process/execAsync"; import { execAsyncRemote } from "../utils/process/execAsync"; -import { getRemoteDocker } from "./servers/remote-docker"; interface GPUInfo { driverInstalled: boolean; @@ -17,16 +15,6 @@ interface GPUInfo { gpuResources: number; } -interface DiscreteResourceSpec { - Kind: string; - Value: number; -} - -interface NamedGenericResource { - NamedResourceSpec?: { Kind: string; Value: string }; - DiscreteResourceSpec?: DiscreteResourceSpec; -} - export async function checkGPUStatus(serverId?: string): Promise { try { // Check NVIDIA Driver From 7306d8c5139f4c41c2b2334c0a3c8d2432c44c88 Mon Sep 17 00:00:00 2001 From: vishalkadam47 Date: Sun, 3 Nov 2024 21:34:03 +0530 Subject: [PATCH 07/13] feat: Add GPU configuration and Update import path for gpu-setup functions --- .../settings/servers/gpu-support.tsx | 219 ++++++++++++++++++ apps/dokploy/server/api/routers/settings.ts | 2 +- 2 files changed, 220 insertions(+), 1 deletion(-) create mode 100644 apps/dokploy/components/dashboard/settings/servers/gpu-support.tsx diff --git a/apps/dokploy/components/dashboard/settings/servers/gpu-support.tsx b/apps/dokploy/components/dashboard/settings/servers/gpu-support.tsx new file mode 100644 index 00000000..a0ef8d80 --- /dev/null +++ b/apps/dokploy/components/dashboard/settings/servers/gpu-support.tsx @@ -0,0 +1,219 @@ +import { Button } from '@/components/ui/button'; +import { useState } from 'react'; +import { api } from '@/utils/api'; +import { toast } from 'sonner'; +import { TRPCClientError } from '@trpc/client'; +import { Card, CardContent, CardDescription, CardHeader, CardTitle } from '@/components/ui/card'; +import { DialogAction } from '@/components/shared/dialog-action'; +import { AlertBlock } from '@/components/shared/alert-block'; +import { Cpu, CheckCircle2, XCircle, Loader2 } from 'lucide-react'; + +interface GPUSupportProps { + serverId?: string; +} + +export function GPUSupport({ serverId }: GPUSupportProps) { + const [isLoading, setIsLoading] = useState(false); + const utils = api.useContext(); + + const { data: gpuStatus, isLoading: isChecking } = api.settings.checkGPUStatus.useQuery( + { serverId }, + { + enabled: !!serverId, + refetchInterval: 5000 + } + ); + +const setupGPU = api.settings.setupGPU.useMutation({ + onMutate: () => { + setIsLoading(true); + }, + onSuccess: async () => { + toast.success('GPU support enabled successfully'); + setIsLoading(false); + + await Promise.all([ + utils.settings.checkGPUStatus.invalidate({ serverId }), + utils.server.invalidate() + ]); + }, + onError: (error) => { + if (error instanceof TRPCClientError) { + const errorMessage = error.message; + if (errorMessage.includes('permission denied')) { + toast.error('Permission denied. Please ensure proper sudo access.'); + } else if (errorMessage.includes('Failed to configure GPU')) { + toast.error('GPU configuration failed. Please check system requirements.'); + } else { + toast.error(errorMessage); + } + } else { + toast.error('Failed to enable GPU support. Please check server logs.'); + } + + setIsLoading(false); + } +}); + + const handleEnableGPU = async () => { + if (!serverId) { + toast.error('No server selected'); + return; + } + + try { + await setupGPU.mutateAsync({ serverId }); + } catch (error) { + // Error handling is done in mutation's onError + } + }; + + return ( + +
+ + +
+
+
+ + GPU Configuration +
+ Configure and monitor GPU support +
+ + + +
+
+ + + +
System Requirements:
+
    +
  • NVIDIA drivers must be installed on the host system
  • +
  • NVIDIA Container Runtime is required for GPU support
  • +
  • Compatible GPU hardware must be present
  • +
+
+ + {isChecking ? ( +
+ + Checking GPU status... +
+ ) : ( +
+ {/* Prerequisites Section */} +
+

Prerequisites

+

Shows all software checks and available hardware

+
+ + + + + + +
+
+ + {/* Configuration Status */} +
+

Docker Swarm GPU Status

+

Shows the configuration state that changes with the Enable GPU

+
+ + +
+
+
+ )} +
+
+
+
+ ); +} + +interface StatusRowProps { + label: string; + isEnabled?: boolean; + description?: string; + value?: string | number; + showIcon?: boolean; +} + +function StatusRow({ label, isEnabled, description, value, showIcon = true }: StatusRowProps) { + return ( +
+ {label} +
+ {showIcon ? ( + <> + {isEnabled ? ( + + ) : ( + + )} + + {description || (isEnabled ? 'Installed' : 'Not Installed')} + + + ) : ( + {value} + )} +
+
+ ); +} \ No newline at end of file diff --git a/apps/dokploy/server/api/routers/settings.ts b/apps/dokploy/server/api/routers/settings.ts index 13f67126..94167a2e 100644 --- a/apps/dokploy/server/api/routers/settings.ts +++ b/apps/dokploy/server/api/routers/settings.ts @@ -55,7 +55,7 @@ import { import { checkGPUStatus, setupGPUSupport, -} from "@dokploy/server/src/utils/gpu-setup"; +} from "@dokploy/server"; import { generateOpenApiDocument } from "@dokploy/trpc-openapi"; import { TRPCError } from "@trpc/server"; import { sql } from "drizzle-orm"; From b53da82204eb5a9f180d78d1f7f52356357868e5 Mon Sep 17 00:00:00 2001 From: vishalkadam47 Date: Tue, 5 Nov 2024 12:07:35 +0530 Subject: [PATCH 08/13] refactor: gpu support component and related api routers; update template environment variables --- .../settings/servers/gpu-support.tsx | 443 ++++++++++-------- apps/dokploy/server/api/routers/settings.ts | 5 +- apps/dokploy/templates/blender/index.ts | 12 +- 3 files changed, 249 insertions(+), 211 deletions(-) diff --git a/apps/dokploy/components/dashboard/settings/servers/gpu-support.tsx b/apps/dokploy/components/dashboard/settings/servers/gpu-support.tsx index a0ef8d80..ae931a3a 100644 --- a/apps/dokploy/components/dashboard/settings/servers/gpu-support.tsx +++ b/apps/dokploy/components/dashboard/settings/servers/gpu-support.tsx @@ -1,219 +1,260 @@ -import { Button } from '@/components/ui/button'; -import { useState } from 'react'; -import { api } from '@/utils/api'; -import { toast } from 'sonner'; -import { TRPCClientError } from '@trpc/client'; -import { Card, CardContent, CardDescription, CardHeader, CardTitle } from '@/components/ui/card'; -import { DialogAction } from '@/components/shared/dialog-action'; -import { AlertBlock } from '@/components/shared/alert-block'; -import { Cpu, CheckCircle2, XCircle, Loader2 } from 'lucide-react'; +import { AlertBlock } from "@/components/shared/alert-block"; +import { DialogAction } from "@/components/shared/dialog-action"; +import { Button } from "@/components/ui/button"; +import { + Card, + CardContent, + CardDescription, + CardHeader, + CardTitle, +} from "@/components/ui/card"; +import { api } from "@/utils/api"; +import { TRPCClientError } from "@trpc/client"; +import { CheckCircle2, Cpu, Loader2, XCircle } from "lucide-react"; +import { useState } from "react"; +import { toast } from "sonner"; interface GPUSupportProps { - serverId?: string; + serverId?: string; } export function GPUSupport({ serverId }: GPUSupportProps) { - const [isLoading, setIsLoading] = useState(false); - const utils = api.useContext(); + const [isLoading, setIsLoading] = useState(false); + const utils = api.useContext(); - const { data: gpuStatus, isLoading: isChecking } = api.settings.checkGPUStatus.useQuery( - { serverId }, - { - enabled: !!serverId, - refetchInterval: 5000 - } - ); + const { data: gpuStatus, isLoading: isChecking } = + api.settings.checkGPUStatus.useQuery( + { serverId }, + { + enabled: !!serverId, + refetchInterval: 5000, + }, + ); -const setupGPU = api.settings.setupGPU.useMutation({ - onMutate: () => { - setIsLoading(true); - }, - onSuccess: async () => { - toast.success('GPU support enabled successfully'); - setIsLoading(false); - - await Promise.all([ - utils.settings.checkGPUStatus.invalidate({ serverId }), - utils.server.invalidate() - ]); - }, - onError: (error) => { - if (error instanceof TRPCClientError) { - const errorMessage = error.message; - if (errorMessage.includes('permission denied')) { - toast.error('Permission denied. Please ensure proper sudo access.'); - } else if (errorMessage.includes('Failed to configure GPU')) { - toast.error('GPU configuration failed. Please check system requirements.'); - } else { - toast.error(errorMessage); - } - } else { - toast.error('Failed to enable GPU support. Please check server logs.'); - } - - setIsLoading(false); - } -}); + const setupGPU = api.settings.setupGPU.useMutation({ + onMutate: () => { + setIsLoading(true); + }, + onSuccess: async () => { + toast.success("GPU support enabled successfully"); + setIsLoading(false); - const handleEnableGPU = async () => { - if (!serverId) { - toast.error('No server selected'); - return; - } + await Promise.all([ + utils.settings.checkGPUStatus.invalidate({ serverId }), + utils.server.invalidate(), + ]); + }, + onError: (error) => { + if (error instanceof TRPCClientError) { + const errorMessage = error.message; + if (errorMessage.includes("permission denied")) { + toast.error("Permission denied. Please ensure proper sudo access."); + } else if (errorMessage.includes("Failed to configure GPU")) { + toast.error( + "GPU configuration failed. Please check system requirements.", + ); + } else { + toast.error(errorMessage); + } + } else { + toast.error("Failed to enable GPU support. Please check server logs."); + } - try { - await setupGPU.mutateAsync({ serverId }); - } catch (error) { - // Error handling is done in mutation's onError - } - }; + setIsLoading(false); + }, + }); - return ( - -
- - -
-
-
- - GPU Configuration -
- Configure and monitor GPU support -
- - - -
-
+ const handleEnableGPU = async () => { + if (!serverId) { + toast.error("No server selected"); + return; + } - - -
System Requirements:
-
    -
  • NVIDIA drivers must be installed on the host system
  • -
  • NVIDIA Container Runtime is required for GPU support
  • -
  • Compatible GPU hardware must be present
  • -
-
+ try { + await setupGPU.mutateAsync({ serverId }); + } catch (error) { + // Error handling is done in mutation's onError + } + }; - {isChecking ? ( -
- - Checking GPU status... -
- ) : ( -
- {/* Prerequisites Section */} -
-

Prerequisites

-

Shows all software checks and available hardware

-
- - - - - - -
-
+ return ( + +
+ + +
+
+
+ + GPU Configuration +
+ + Configure and monitor GPU support + +
+ + + +
+
- {/* Configuration Status */} -
-

Docker Swarm GPU Status

-

Shows the configuration state that changes with the Enable GPU

-
- - -
-
-
- )} -
- -
-
- ); + + +
System Requirements:
+
    +
  • NVIDIA drivers must be installed on the host system
  • +
  • NVIDIA Container Runtime is required for GPU support
  • +
  • Compatible GPU hardware must be present
  • +
+
+ + {isChecking ? ( +
+ + Checking GPU status... +
+ ) : ( +
+ {/* Prerequisites Section */} +
+

Prerequisites

+

+ Shows all software checks and available hardware +

+
+ + + + + + +
+
+ + {/* Configuration Status */} +
+

+ Docker Swarm GPU Status +

+

+ Shows the configuration state that changes with the Enable + GPU +

+
+ + +
+
+
+ )} +
+
+
+
+ ); } interface StatusRowProps { - label: string; - isEnabled?: boolean; - description?: string; - value?: string | number; - showIcon?: boolean; + label: string; + isEnabled?: boolean; + description?: string; + value?: string | number; + showIcon?: boolean; } -function StatusRow({ label, isEnabled, description, value, showIcon = true }: StatusRowProps) { - return ( -
- {label} -
- {showIcon ? ( - <> - {isEnabled ? ( - - ) : ( - - )} - - {description || (isEnabled ? 'Installed' : 'Not Installed')} - - - ) : ( - {value} - )} -
-
- ); -} \ No newline at end of file +function StatusRow({ + label, + isEnabled, + description, + value, + showIcon = true, +}: StatusRowProps) { + return ( +
+ {label} +
+ {showIcon ? ( + <> + {isEnabled ? ( + + ) : ( + + )} + + {description || (isEnabled ? "Installed" : "Not Installed")} + + + ) : ( + {value} + )} +
+
+ ); +} diff --git a/apps/dokploy/server/api/routers/settings.ts b/apps/dokploy/server/api/routers/settings.ts index 46529282..608a5028 100644 --- a/apps/dokploy/server/api/routers/settings.ts +++ b/apps/dokploy/server/api/routers/settings.ts @@ -52,10 +52,7 @@ import { writeMainConfig, writeTraefikConfigInPath, } from "@dokploy/server"; -import { - checkGPUStatus, - setupGPUSupport, -} from "@dokploy/server"; +import { checkGPUStatus, setupGPUSupport } from "@dokploy/server"; import { generateOpenApiDocument } from "@dokploy/trpc-openapi"; import { TRPCError } from "@trpc/server"; import { sql } from "drizzle-orm"; diff --git a/apps/dokploy/templates/blender/index.ts b/apps/dokploy/templates/blender/index.ts index baf243e0..84e52755 100644 --- a/apps/dokploy/templates/blender/index.ts +++ b/apps/dokploy/templates/blender/index.ts @@ -19,12 +19,12 @@ export function generate(schema: Schema): Template { ]; const envs = [ - `PUID=1000`, - `PGID=1000`, - `TZ=Etc/UTC`, - `SUBFOLDER=/`, - `NVIDIA_VISIBLE_DEVICES=all`, - `NVIDIA_DRIVER_CAPABILITIES=all`, + "PUID=1000", + "PGID=1000", + "TZ=Etc/UTC", + "SUBFOLDER=/", + "NVIDIA_VISIBLE_DEVICES=all", + "NVIDIA_DRIVER_CAPABILITIES=all", ]; return { From 2e6d9c34c0bdc61d9c4d8fb759f2ea9fddcbe654 Mon Sep 17 00:00:00 2001 From: vishalkadam47 Date: Thu, 7 Nov 2024 02:52:41 +0530 Subject: [PATCH 09/13] feat: add dokploy server gpu setup --- .../servers/actions/show-dokploy-actions.tsx | 2 + .../settings/servers/gpu-support-modal.tsx | 36 +++++++++++++++ .../settings/servers/gpu-support.tsx | 26 ++++++----- apps/dokploy/server/api/routers/settings.ts | 37 +++++++-------- packages/server/src/utils/gpu-setup.ts | 45 +++++++++++++++---- 5 files changed, 104 insertions(+), 42 deletions(-) create mode 100644 apps/dokploy/components/dashboard/settings/servers/gpu-support-modal.tsx diff --git a/apps/dokploy/components/dashboard/settings/servers/actions/show-dokploy-actions.tsx b/apps/dokploy/components/dashboard/settings/servers/actions/show-dokploy-actions.tsx index 49f6772b..9b12af84 100644 --- a/apps/dokploy/components/dashboard/settings/servers/actions/show-dokploy-actions.tsx +++ b/apps/dokploy/components/dashboard/settings/servers/actions/show-dokploy-actions.tsx @@ -13,6 +13,7 @@ import { import { api } from "@/utils/api"; import { toast } from "sonner"; import { ShowModalLogs } from "../../web-server/show-modal-logs"; +import { GPUSupportModal } from "../gpu-support-modal"; export const ShowDokployActions = () => { const { mutateAsync: reloadServer, isLoading } = @@ -45,6 +46,7 @@ export const ShowDokployActions = () => { Watch logs + diff --git a/apps/dokploy/components/dashboard/settings/servers/gpu-support-modal.tsx b/apps/dokploy/components/dashboard/settings/servers/gpu-support-modal.tsx new file mode 100644 index 00000000..9cf858cd --- /dev/null +++ b/apps/dokploy/components/dashboard/settings/servers/gpu-support-modal.tsx @@ -0,0 +1,36 @@ +import { + Dialog, + DialogContent, + DialogHeader, + DialogTitle, + DialogTrigger, +} from "@/components/ui/dialog"; +import { DropdownMenuItem } from "@/components/ui/dropdown-menu"; +import { useState } from "react"; +import { GPUSupport } from "./gpu-support"; + +export const GPUSupportModal = () => { + const [isOpen, setIsOpen] = useState(false); + + return ( + + + e.preventDefault()} + > + GPU Setup + + + + + + Dokploy Server GPU Setup + + + + + + + ); +}; diff --git a/apps/dokploy/components/dashboard/settings/servers/gpu-support.tsx b/apps/dokploy/components/dashboard/settings/servers/gpu-support.tsx index ae931a3a..d0c178c4 100644 --- a/apps/dokploy/components/dashboard/settings/servers/gpu-support.tsx +++ b/apps/dokploy/components/dashboard/settings/servers/gpu-support.tsx @@ -26,7 +26,7 @@ export function GPUSupport({ serverId }: GPUSupportProps) { api.settings.checkGPUStatus.useQuery( { serverId }, { - enabled: !!serverId, + enabled: serverId !== undefined, refetchInterval: 5000, }, ); @@ -38,17 +38,20 @@ export function GPUSupport({ serverId }: GPUSupportProps) { onSuccess: async () => { toast.success("GPU support enabled successfully"); setIsLoading(false); - - await Promise.all([ - utils.settings.checkGPUStatus.invalidate({ serverId }), - utils.server.invalidate(), - ]); + await utils.settings.checkGPUStatus.invalidate({ serverId }); }, onError: (error) => { if (error instanceof TRPCClientError) { const errorMessage = error.message; - if (errorMessage.includes("permission denied")) { - toast.error("Permission denied. Please ensure proper sudo access."); + if ( + errorMessage.includes( + "Permission denied. Please ensure proper sudo access.", + ) || + errorMessage.includes("sudo access required") + ) { + toast.error( + "Administrator privileges required. Please enter your password when prompted.", + ); } else if (errorMessage.includes("Failed to configure GPU")) { toast.error( "GPU configuration failed. Please check system requirements.", @@ -59,13 +62,12 @@ export function GPUSupport({ serverId }: GPUSupportProps) { } else { toast.error("Failed to enable GPU support. Please check server logs."); } - setIsLoading(false); }, }); const handleEnableGPU = async () => { - if (!serverId) { + if (serverId === undefined) { toast.error("No server selected"); return; } @@ -99,7 +101,7 @@ export function GPUSupport({ serverId }: GPUSupportProps) { > + + - + @@ -117,9 +120,17 @@ export function GPUSupport({ serverId }: GPUSupportProps) {
System Requirements:
    -
  • NVIDIA drivers must be installed on the host system
  • -
  • NVIDIA Container Runtime is required for GPU support
  • -
  • Compatible GPU hardware must be present
  • +
  • NVIDIA GPU hardware must be physically installed
  • +
  • + NVIDIA drivers must be installed and running (check with + nvidia-smi) +
  • +
  • + NVIDIA Container Runtime must be installed + (nvidia-container-runtime) +
  • +
  • User must have sudo/administrative privileges
  • +
  • System must support CUDA for GPU acceleration
diff --git a/apps/dokploy/server/api/routers/settings.ts b/apps/dokploy/server/api/routers/settings.ts index a700c82a..56b6431c 100644 --- a/apps/dokploy/server/api/routers/settings.ts +++ b/apps/dokploy/server/api/routers/settings.ts @@ -665,8 +665,8 @@ export const settingsRouter = createTRPCRouter({ }), ) .mutation(async ({ input }) => { - if (IS_CLOUD) { - throw new Error("GPU setup is not available in cloud mode"); + if (IS_CLOUD && !input.serverId) { + throw new Error("Select a server to enable the GPU Setup"); } try { @@ -684,7 +684,7 @@ export const settingsRouter = createTRPCRouter({ }), ) .query(async ({ input }) => { - if (IS_CLOUD) { + if (IS_CLOUD && !input.serverId) { return { driverInstalled: false, driverVersion: undefined, diff --git a/packages/server/src/utils/gpu-setup.ts b/packages/server/src/utils/gpu-setup.ts index ecdb3e2b..bb366762 100644 --- a/packages/server/src/utils/gpu-setup.ts +++ b/packages/server/src/utils/gpu-setup.ts @@ -18,117 +18,21 @@ interface GPUInfo { export async function checkGPUStatus(serverId?: string): Promise { try { - // Check NVIDIA Driver - let driverInstalled = false; - let driverVersion: string | undefined; - let availableGPUs = 0; - - try { - const driverCommand = - "nvidia-smi --query-gpu=driver_version --format=csv,noheader"; - const { stdout: nvidiaSmi } = serverId - ? await execAsyncRemote(serverId, driverCommand) - : await execAsync(driverCommand); - - driverVersion = nvidiaSmi.trim(); - if (driverVersion) { - driverInstalled = true; - const countCommand = - "nvidia-smi --query-gpu=gpu_name --format=csv,noheader | wc -l"; - const { stdout: gpuCount } = serverId - ? await execAsyncRemote(serverId, countCommand) - : await execAsync(countCommand); - - availableGPUs = Number.parseInt(gpuCount.trim(), 10); - } - } catch (error) { - console.debug("GPU driver check:", error); - } - - // Check Runtime Configuration - let runtimeInstalled = false; - let runtimeConfigured = false; - try { - const runtimeCommand = 'docker info --format "{{json .Runtimes}}"'; - const { stdout: runtimeInfo } = serverId - ? await execAsyncRemote(serverId, runtimeCommand) - : await execAsync(runtimeCommand); - - const runtimes = JSON.parse(runtimeInfo); - runtimeInstalled = "nvidia" in runtimes; - - // Check if it's the default runtime - const defaultCommand = 'docker info --format "{{.DefaultRuntime}}"'; - const { stdout: defaultRuntime } = serverId - ? await execAsyncRemote(serverId, defaultCommand) - : await execAsync(defaultCommand); - - runtimeConfigured = defaultRuntime.trim() === "nvidia"; - } catch (error) { - console.debug("Runtime check:", error); - } - - // Check Swarm GPU Resources - let swarmEnabled = false; - let gpuResources = 0; - - try { - // Check node resources directly from inspect - const nodeCommand = - "docker node inspect self --format '{{json .Description.Resources.GenericResources}}'"; - const { stdout: resources } = serverId - ? await execAsyncRemote(serverId, nodeCommand) - : await execAsync(nodeCommand); - - if (resources && resources !== "null") { - const genericResources = JSON.parse(resources); - for (const resource of genericResources) { - if ( - resource.DiscreteResourceSpec && - (resource.DiscreteResourceSpec.Kind === "GPU" || - resource.DiscreteResourceSpec.Kind === "gpu") - ) { - gpuResources = resource.DiscreteResourceSpec.Value; - swarmEnabled = true; - break; - } - } - } - } catch (error) { - console.debug("Swarm resource check:", error); - } - - // Get GPU Model and Memory Info - const gpuInfoCommand = - "nvidia-smi --query-gpu=gpu_name,memory.total --format=csv,noheader"; - const { stdout: gpuInfo } = serverId - ? await execAsyncRemote(serverId, gpuInfoCommand) - : await execAsync(gpuInfoCommand); - - const [gpuModel, memoryTotal] = gpuInfo.split(",").map((s) => s.trim()); - - // Check CUDA Support - const cudaCommand = 'nvidia-smi -q | grep "CUDA Version"'; - const { stdout: cudaInfo } = serverId - ? await execAsyncRemote(serverId, cudaCommand) - : await execAsync(cudaCommand); - - const cudaMatch = cudaInfo.match(/CUDA Version\s*:\s*([\d\.]+)/); - const cudaVersion = cudaMatch ? cudaMatch[1] : undefined; - const cudaSupport = !!cudaVersion; + const [driverInfo, runtimeInfo, swarmInfo, gpuInfo, cudaInfo] = + await Promise.all([ + checkGpuDriver(serverId), + checkRuntime(serverId), + checkSwarmResources(serverId), + checkGpuInfo(serverId), + checkCudaSupport(serverId), + ]); return { - driverInstalled, - driverVersion, - runtimeInstalled, - runtimeConfigured, - availableGPUs, - swarmEnabled, - gpuResources, - gpuModel, - memoryInfo: memoryTotal, - cudaSupport, - cudaVersion, + ...driverInfo, + ...runtimeInfo, + ...swarmInfo, + ...gpuInfo, + ...cudaInfo, }; } catch (error) { console.error("Error in checkGPUStatus:", error); @@ -148,118 +52,167 @@ export async function checkGPUStatus(serverId?: string): Promise { } } +const checkGpuDriver = async (serverId?: string) => { + let driverVersion: string | undefined; + let driverInstalled = false; + let availableGPUs = 0; + + try { + const driverCommand = + "nvidia-smi --query-gpu=driver_version --format=csv,noheader"; + const { stdout: nvidiaSmi } = serverId + ? await execAsyncRemote(serverId, driverCommand) + : await execAsync(driverCommand); + + driverVersion = nvidiaSmi.trim(); + if (driverVersion) { + driverInstalled = true; + const countCommand = + "nvidia-smi --query-gpu=gpu_name --format=csv,noheader | wc -l"; + const { stdout: gpuCount } = serverId + ? await execAsyncRemote(serverId, countCommand) + : await execAsync(countCommand); + + availableGPUs = Number.parseInt(gpuCount.trim(), 10); + } + } catch (error) { + console.debug("GPU driver check:", error); + } + + return { driverVersion, driverInstalled, availableGPUs }; +}; + +const checkRuntime = async (serverId?: string) => { + let runtimeInstalled = false; + let runtimeConfigured = false; + + try { + const runtimeCommand = 'docker info --format "{{json .Runtimes}}"'; + const { stdout: runtimeInfo } = serverId + ? await execAsyncRemote(serverId, runtimeCommand) + : await execAsync(runtimeCommand); + + const runtimes = JSON.parse(runtimeInfo); + runtimeInstalled = "nvidia" in runtimes; + + const defaultCommand = 'docker info --format "{{.DefaultRuntime}}"'; + const { stdout: defaultRuntime } = serverId + ? await execAsyncRemote(serverId, defaultCommand) + : await execAsync(defaultCommand); + + runtimeConfigured = defaultRuntime.trim() === "nvidia"; + } catch (error) { + console.debug("Runtime check:", error); + } + + return { runtimeInstalled, runtimeConfigured }; +}; + +const checkSwarmResources = async (serverId?: string) => { + let swarmEnabled = false; + let gpuResources = 0; + + try { + const nodeCommand = + "docker node inspect self --format '{{json .Description.Resources.GenericResources}}'"; + const { stdout: resources } = serverId + ? await execAsyncRemote(serverId, nodeCommand) + : await execAsync(nodeCommand); + + if (resources && resources !== "null") { + const genericResources = JSON.parse(resources); + for (const resource of genericResources) { + if ( + resource.DiscreteResourceSpec && + (resource.DiscreteResourceSpec.Kind === "GPU" || + resource.DiscreteResourceSpec.Kind === "gpu") + ) { + gpuResources = resource.DiscreteResourceSpec.Value; + swarmEnabled = true; + break; + } + } + } + } catch (error) { + console.debug("Swarm resource check:", error); + } + + return { swarmEnabled, gpuResources }; +}; + +const checkGpuInfo = async (serverId?: string) => { + let gpuModel: string | undefined; + let memoryInfo: string | undefined; + + try { + const gpuInfoCommand = + "nvidia-smi --query-gpu=gpu_name,memory.total --format=csv,noheader"; + const { stdout: gpuInfo } = serverId + ? await execAsyncRemote(serverId, gpuInfoCommand) + : await execAsync(gpuInfoCommand); + + [gpuModel, memoryInfo] = gpuInfo.split(",").map((s) => s.trim()); + } catch (error) { + console.debug("GPU info check:", error); + } + + return { gpuModel, memoryInfo }; +}; + +const checkCudaSupport = async (serverId?: string) => { + let cudaVersion: string | undefined; + let cudaSupport = false; + + try { + const cudaCommand = 'nvidia-smi -q | grep "CUDA Version"'; + const { stdout: cudaInfo } = serverId + ? await execAsyncRemote(serverId, cudaCommand) + : await execAsync(cudaCommand); + + const cudaMatch = cudaInfo.match(/CUDA Version\s*:\s*([\d\.]+)/); + cudaVersion = cudaMatch ? cudaMatch[1] : undefined; + cudaSupport = !!cudaVersion; + } catch (error) { + console.debug("CUDA support check:", error); + } + + return { cudaVersion, cudaSupport }; +}; + export async function setupGPUSupport(serverId?: string): Promise { try { - // 1. Check current GPU status first + // 1. Initial status check and validation const initialStatus = await checkGPUStatus(serverId); + const shouldContinue = await validatePrerequisites(initialStatus); + if (!shouldContinue) return; - // If GPU is already configured, just verify and return quickly - if ( - initialStatus.swarmEnabled && - initialStatus.runtimeConfigured && - initialStatus.driverInstalled - ) { - console.log("GPU already configured, skipping setup"); - return; - } + // 2. Get node ID + const nodeId = await getNodeId(serverId); - // 2. Verify GPU prerequisites - if (!initialStatus.driverInstalled || !initialStatus.runtimeInstalled) { - throw new Error( - "NVIDIA drivers or runtime not installed. Please install them first.", - ); - } + // 3. Create daemon configuration + const daemonConfig = createDaemonConfig(initialStatus.availableGPUs); - // Get the node ID - const nodeIdCommand = 'docker info --format "{{.Swarm.NodeID}}"'; - const { stdout: nodeId } = serverId - ? await execAsyncRemote(serverId, nodeIdCommand) - : await execAsync(nodeIdCommand); - - if (!nodeId.trim()) { - throw new Error("Setup Server before enabling GPU support"); - } - - // 3. Configure NVIDIA runtime in daemon.json - const daemonConfig = { - runtimes: { - nvidia: { - path: "nvidia-container-runtime", - runtimeArgs: [], - }, - }, - "default-runtime": "nvidia", - "node-generic-resources": [`GPU=${initialStatus.availableGPUs}`], - }; - - // Different commands for local and remote setup + // 4. Setup server based on environment if (serverId) { - // Remote server setup (using sudo) - const setupCommands = [ - "sudo -n true", - `echo '${JSON.stringify(daemonConfig, null, 2)}' | sudo tee /etc/docker/daemon.json`, - "sudo mkdir -p /etc/nvidia-container-runtime", - 'echo "swarm-resource = \\"DOCKER_RESOURCE_GPU\\"" | sudo tee -a /etc/nvidia-container-runtime/config.toml', - "sudo systemctl daemon-reload", - "sudo systemctl restart docker", - ].join(" && "); - - await execAsyncRemote(serverId, setupCommands); + await setupRemoteServer(serverId, daemonConfig); } else { - // Local server setup (using pkexec for GUI password prompt) - const configFile = `/tmp/docker-daemon-${Date.now()}.json`; - await fs.writeFile(configFile, JSON.stringify(daemonConfig, null, 2)); - - const setupCommands = [ - // Use pkexec for GUI password prompt - `pkexec sh -c ' - cp ${configFile} /etc/docker/daemon.json && - mkdir -p /etc/nvidia-container-runtime && - echo "swarm-resource = \\"DOCKER_RESOURCE_GPU\\"" >> /etc/nvidia-container-runtime/config.toml && - systemctl daemon-reload && - systemctl restart docker - '`, - `rm ${configFile}`, // Clean up temp file - ].join(" && "); - - await execAsync(setupCommands); + await setupLocalServer(daemonConfig); } - // 4. Reduced wait time for Docker restart - await new Promise((resolve) => setTimeout(resolve, 10000)); + // 5. Wait for Docker restart + await sleep(10000); - // 5. Add GPU label to the node - const labelCommand = `docker node update --label-add gpu=true ${nodeId.trim()}`; - if (serverId) { - await execAsyncRemote(serverId, labelCommand); - } else { - await execAsync(labelCommand); - } + // 6. Add GPU label + await addGpuLabel(nodeId, serverId); - // 6. Quick final verification - await new Promise((resolve) => setTimeout(resolve, 5000)); - const finalStatus = await checkGPUStatus(serverId); - - if (!finalStatus.swarmEnabled) { - const diagnosticCommands = [ - `docker node inspect ${nodeId.trim()}`, - 'nvidia-smi -a | grep "GPU UUID"', - "cat /etc/docker/daemon.json", - "cat /etc/nvidia-container-runtime/config.toml", - ].join(" && "); - - const { stdout: diagnostics } = serverId - ? await execAsyncRemote(serverId, diagnosticCommands) - : await execAsync(diagnosticCommands); - - console.error("Diagnostic Information:", diagnostics); - throw new Error("GPU support not detected in swarm after setup"); - } + // 7. Final verification + await sleep(5000); + const finalStatus = await verifySetup(nodeId, serverId); console.log("GPU setup completed successfully:", { availableGPUs: initialStatus.availableGPUs, driverVersion: initialStatus.driverVersion, - nodeId: nodeId.trim(), + nodeId, }); } catch (error) { console.error("GPU Setup Error:", error); @@ -274,3 +227,113 @@ export async function setupGPUSupport(serverId?: string): Promise { throw error; } } + +const validatePrerequisites = async (initialStatus: GPUInfo) => { + if (!initialStatus.driverInstalled) { + throw new Error( + "NVIDIA drivers not installed. Please install appropriate NVIDIA drivers first.", + ); + } + + if (!initialStatus.runtimeInstalled) { + throw new Error( + "NVIDIA Container Runtime not installed. Please install nvidia-container-runtime first.", + ); + } + + if (initialStatus.swarmEnabled && initialStatus.runtimeConfigured) { + console.log("GPU already configured, skipping setup"); + return false; + } + + return true; +}; + +const getNodeId = async (serverId?: string) => { + const nodeIdCommand = 'docker info --format "{{.Swarm.NodeID}}"'; + const { stdout: nodeId } = serverId + ? await execAsyncRemote(serverId, nodeIdCommand) + : await execAsync(nodeIdCommand); + + const trimmedNodeId = nodeId.trim(); + if (!trimmedNodeId) { + throw new Error("Setup Server before enabling GPU support"); + } + + return trimmedNodeId; +}; + +const createDaemonConfig = (availableGPUs: number) => ({ + runtimes: { + nvidia: { + path: "nvidia-container-runtime", + runtimeArgs: [], + }, + }, + "default-runtime": "nvidia", + "node-generic-resources": [`GPU=${availableGPUs}`], +}); + +const setupRemoteServer = async (serverId: string, daemonConfig: any) => { + const setupCommands = [ + "sudo -n true", + `echo '${JSON.stringify(daemonConfig, null, 2)}' | sudo tee /etc/docker/daemon.json`, + "sudo mkdir -p /etc/nvidia-container-runtime", + 'echo "swarm-resource = \\"DOCKER_RESOURCE_GPU\\"" | sudo tee -a /etc/nvidia-container-runtime/config.toml', + "sudo systemctl daemon-reload", + "sudo systemctl restart docker", + ].join(" && "); + + await execAsyncRemote(serverId, setupCommands); +}; + +const setupLocalServer = async (daemonConfig: any) => { + const configFile = `/tmp/docker-daemon-${Date.now()}.json`; + await fs.writeFile(configFile, JSON.stringify(daemonConfig, null, 2)); + + const setupCommands = [ + `pkexec sh -c ' + cp ${configFile} /etc/docker/daemon.json && + mkdir -p /etc/nvidia-container-runtime && + echo "swarm-resource = \\"DOCKER_RESOURCE_GPU\\"" >> /etc/nvidia-container-runtime/config.toml && + systemctl daemon-reload && + systemctl restart docker + '`, + `rm ${configFile}`, + ].join(" && "); + + await execAsync(setupCommands); +}; + +const addGpuLabel = async (nodeId: string, serverId?: string) => { + const labelCommand = `docker node update --label-add gpu=true ${nodeId}`; + if (serverId) { + await execAsyncRemote(serverId, labelCommand); + } else { + await execAsync(labelCommand); + } +}; + +const verifySetup = async (nodeId: string, serverId?: string) => { + const finalStatus = await checkGPUStatus(serverId); + + if (!finalStatus.swarmEnabled) { + const diagnosticCommands = [ + `docker node inspect ${nodeId}`, + 'nvidia-smi -a | grep "GPU UUID"', + "cat /etc/docker/daemon.json", + "cat /etc/nvidia-container-runtime/config.toml", + ].join(" && "); + + const { stdout: diagnostics } = serverId + ? await execAsyncRemote(serverId, diagnosticCommands) + : await execAsync(diagnosticCommands); + + console.error("Diagnostic Information:", diagnostics); + throw new Error("GPU support not detected in swarm after setup"); + } + + return finalStatus; +}; + +const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)); From b817b4b6ee52be17d3e2a141a64893276acf19a7 Mon Sep 17 00:00:00 2001 From: vishalkadam47 Date: Mon, 11 Nov 2024 23:18:24 +0530 Subject: [PATCH 11/13] refactor: gpu support and docker setup improvements - Add gpu status refresh with useEffect - Update docker-compose.yml configuration - Modify gpu setup scripts - Improve gpu support checks --- .../settings/servers/gpu-support.tsx | 15 +++++-- .../templates/blender/docker-compose.yml | 1 - packages/server/src/utils/gpu-setup.ts | 42 ++++++++++++++----- 3 files changed, 43 insertions(+), 15 deletions(-) diff --git a/apps/dokploy/components/dashboard/settings/servers/gpu-support.tsx b/apps/dokploy/components/dashboard/settings/servers/gpu-support.tsx index e89a9b66..b398fe74 100644 --- a/apps/dokploy/components/dashboard/settings/servers/gpu-support.tsx +++ b/apps/dokploy/components/dashboard/settings/servers/gpu-support.tsx @@ -11,7 +11,7 @@ import { import { api } from "@/utils/api"; import { TRPCClientError } from "@trpc/client"; import { CheckCircle2, Cpu, Loader2, RefreshCw, XCircle } from "lucide-react"; -import { useState } from "react"; +import { useEffect, useState } from "react"; import { toast } from "sonner"; interface GPUSupportProps { @@ -54,9 +54,18 @@ export function GPUSupport({ serverId }: GPUSupportProps) { const handleRefresh = async () => { setIsRefreshing(true); - await refetch(); - setIsRefreshing(false); + try { + await utils.settings.checkGPUStatus.invalidate({ serverId }); + await refetch(); + } catch (error) { + toast.error("Failed to refresh GPU status"); + } finally { + setIsRefreshing(false); + } }; + useEffect(() => { + handleRefresh(); + }, []); const handleEnableGPU = async () => { if (serverId === undefined) { diff --git a/apps/dokploy/templates/blender/docker-compose.yml b/apps/dokploy/templates/blender/docker-compose.yml index da769c6b..893f3dee 100644 --- a/apps/dokploy/templates/blender/docker-compose.yml +++ b/apps/dokploy/templates/blender/docker-compose.yml @@ -3,7 +3,6 @@ version: "3.8" services: blender: image: lscr.io/linuxserver/blender:latest - container_name: blender runtime: nvidia deploy: resources: diff --git a/packages/server/src/utils/gpu-setup.ts b/packages/server/src/utils/gpu-setup.ts index bb366762..12d46dc1 100644 --- a/packages/server/src/utils/gpu-setup.ts +++ b/packages/server/src/utils/gpu-setup.ts @@ -87,20 +87,38 @@ const checkRuntime = async (serverId?: string) => { let runtimeConfigured = false; try { - const runtimeCommand = 'docker info --format "{{json .Runtimes}}"'; - const { stdout: runtimeInfo } = serverId - ? await execAsyncRemote(serverId, runtimeCommand) - : await execAsync(runtimeCommand); + // First check: Is nvidia-container-runtime installed? + const checkBinaryCommand = "command -v nvidia-container-runtime"; + try { + const { stdout } = serverId + ? await execAsyncRemote(serverId, checkBinaryCommand) + : await execAsync(checkBinaryCommand); + runtimeInstalled = !!stdout.trim(); + } catch (error) { + console.debug("Runtime binary check:", error); + } - const runtimes = JSON.parse(runtimeInfo); - runtimeInstalled = "nvidia" in runtimes; + // Second check: Is it configured in Docker? + try { + const runtimeCommand = 'docker info --format "{{json .Runtimes}}"'; + const { stdout: runtimeInfo } = serverId + ? await execAsyncRemote(serverId, runtimeCommand) + : await execAsync(runtimeCommand); - const defaultCommand = 'docker info --format "{{.DefaultRuntime}}"'; - const { stdout: defaultRuntime } = serverId - ? await execAsyncRemote(serverId, defaultCommand) - : await execAsync(defaultCommand); + const defaultCommand = 'docker info --format "{{.DefaultRuntime}}"'; + const { stdout: defaultRuntime } = serverId + ? await execAsyncRemote(serverId, defaultCommand) + : await execAsync(defaultCommand); - runtimeConfigured = defaultRuntime.trim() === "nvidia"; + const runtimes = JSON.parse(runtimeInfo); + const hasNvidiaRuntime = "nvidia" in runtimes; + const isDefaultRuntime = defaultRuntime.trim() === "nvidia"; + + // Only set runtimeConfigured if both conditions are met + runtimeConfigured = hasNvidiaRuntime && isDefaultRuntime; + } catch (error) { + console.debug("Runtime configuration check:", error); + } } catch (error) { console.debug("Runtime check:", error); } @@ -279,6 +297,7 @@ const setupRemoteServer = async (serverId: string, daemonConfig: any) => { "sudo -n true", `echo '${JSON.stringify(daemonConfig, null, 2)}' | sudo tee /etc/docker/daemon.json`, "sudo mkdir -p /etc/nvidia-container-runtime", + 'sudo sed -i "/swarm-resource/d" /etc/nvidia-container-runtime/config.toml', 'echo "swarm-resource = \\"DOCKER_RESOURCE_GPU\\"" | sudo tee -a /etc/nvidia-container-runtime/config.toml', "sudo systemctl daemon-reload", "sudo systemctl restart docker", @@ -295,6 +314,7 @@ const setupLocalServer = async (daemonConfig: any) => { `pkexec sh -c ' cp ${configFile} /etc/docker/daemon.json && mkdir -p /etc/nvidia-container-runtime && + sed -i "/swarm-resource/d" /etc/nvidia-container-runtime/config.toml && echo "swarm-resource = \\"DOCKER_RESOURCE_GPU\\"" >> /etc/nvidia-container-runtime/config.toml && systemctl daemon-reload && systemctl restart docker From 6961ee1fc0a6e0e808f21eb3bef84435e5082858 Mon Sep 17 00:00:00 2001 From: vishalkadam47 Date: Thu, 14 Nov 2024 04:00:05 +0530 Subject: [PATCH 12/13] refactor: removed console logs and error handling --- apps/dokploy/server/api/routers/settings.ts | 5 +---- packages/server/src/utils/gpu-setup.ts | 10 +--------- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/apps/dokploy/server/api/routers/settings.ts b/apps/dokploy/server/api/routers/settings.ts index 56b6431c..7c777b17 100644 --- a/apps/dokploy/server/api/routers/settings.ts +++ b/apps/dokploy/server/api/routers/settings.ts @@ -701,11 +701,8 @@ export const settingsRouter = createTRPCRouter({ } try { - const status = await checkGPUStatus(input.serverId || ""); - console.log("GPU Status Check Result:", status); - return status; + return await checkGPUStatus(input.serverId || ""); } catch (error) { - console.error("GPU Status Check Error:", error); throw new Error("Failed to check GPU status"); } }), diff --git a/packages/server/src/utils/gpu-setup.ts b/packages/server/src/utils/gpu-setup.ts index 12d46dc1..d508cb44 100644 --- a/packages/server/src/utils/gpu-setup.ts +++ b/packages/server/src/utils/gpu-setup.ts @@ -225,15 +225,8 @@ export async function setupGPUSupport(serverId?: string): Promise { // 7. Final verification await sleep(5000); - const finalStatus = await verifySetup(nodeId, serverId); - - console.log("GPU setup completed successfully:", { - availableGPUs: initialStatus.availableGPUs, - driverVersion: initialStatus.driverVersion, - nodeId, - }); + await verifySetup(nodeId, serverId); } catch (error) { - console.error("GPU Setup Error:", error); if ( error instanceof Error && error.message.includes("password is required") @@ -260,7 +253,6 @@ const validatePrerequisites = async (initialStatus: GPUInfo) => { } if (initialStatus.swarmEnabled && initialStatus.runtimeConfigured) { - console.log("GPU already configured, skipping setup"); return false; } From 3eef4aa016379e3385531abb70282e9ddbd8b515 Mon Sep 17 00:00:00 2001 From: vishalkadam47 Date: Thu, 14 Nov 2024 17:41:32 +0530 Subject: [PATCH 13/13] refactor: removed sleep function and updated import --- packages/server/src/utils/gpu-setup.ts | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/packages/server/src/utils/gpu-setup.ts b/packages/server/src/utils/gpu-setup.ts index d508cb44..ce60adf1 100644 --- a/packages/server/src/utils/gpu-setup.ts +++ b/packages/server/src/utils/gpu-setup.ts @@ -1,5 +1,5 @@ import * as fs from "node:fs/promises"; -import { execAsync } from "../utils/process/execAsync"; +import { execAsync, sleep } from "../utils/process/execAsync"; import { execAsyncRemote } from "../utils/process/execAsync"; interface GPUInfo { @@ -347,5 +347,3 @@ const verifySetup = async (nodeId: string, serverId?: string) => { return finalStatus; }; - -const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));