diff --git a/apps/dokploy/components/dashboard/settings/servers/setup-server.tsx b/apps/dokploy/components/dashboard/settings/servers/setup-server.tsx index 8bfcf4da..119d4d29 100644 --- a/apps/dokploy/components/dashboard/settings/servers/setup-server.tsx +++ b/apps/dokploy/components/dashboard/settings/servers/setup-server.tsx @@ -32,6 +32,7 @@ import Link from "next/link"; import { useState } from "react"; import { toast } from "sonner"; import { ShowDeployment } from "../../application/deployments/show-deployment"; +import { GPUSupport } from "./gpu-support"; interface Props { serverId: string; @@ -89,9 +90,10 @@ export const SetupServer = ({ serverId }: Props) => { ) : (
- + SSH Keys Deployments + GPU Setup {
+ +
+ +
+
)} diff --git a/apps/dokploy/server/api/routers/settings.ts b/apps/dokploy/server/api/routers/settings.ts index e1e63579..4a000889 100644 --- a/apps/dokploy/server/api/routers/settings.ts +++ b/apps/dokploy/server/api/routers/settings.ts @@ -52,6 +52,10 @@ import { writeMainConfig, writeTraefikConfigInPath, } from "@dokploy/server"; +import { + checkGPUStatus, + setupGPUSupport, +} from "@dokploy/server/src/utils/gpu-setup"; import { generateOpenApiDocument } from "@dokploy/trpc-openapi"; import { TRPCError } from "@trpc/server"; import { sql } from "drizzle-orm"; @@ -650,6 +654,62 @@ export const settingsRouter = createTRPCRouter({ } return { status: "not_cloud" }; }), + setupGPU: adminProcedure + .input( + z.object({ + serverId: z.string(), + }), + ) + .mutation(async ({ input }) => { + try { + if (IS_CLOUD) { + return { success: true }; + } + + if (!input.serverId) { + throw new TRPCError({ + code: "BAD_REQUEST", + message: "Server ID is required", + }); + } + + await setupGPUSupport(input.serverId); + return { success: true }; + } catch (error) { + throw new TRPCError({ + code: "INTERNAL_SERVER_ERROR", + message: + error instanceof Error + ? error.message + : "Failed to enable GPU support", + cause: error, + }); + } + }), + checkGPUStatus: adminProcedure + .input( + z.object({ + serverId: z.string().optional(), + }), + ) + .query(async ({ input }) => { + if (IS_CLOUD) { + return { + driverInstalled: false, + driverVersion: undefined, + gpuModel: undefined, + runtimeInstalled: false, + runtimeConfigured: false, + cudaSupport: undefined, + cudaVersion: undefined, + memoryInfo: undefined, + availableGPUs: 0, + swarmEnabled: false, + gpuResources: 0, + }; + } + return await checkGPUStatus(input.serverId); + }), }); // { // "Parallelism": 1, diff --git a/apps/dokploy/server/api/trpc.ts b/apps/dokploy/server/api/trpc.ts index 8aec99ec..db4f7adf 100644 --- a/apps/dokploy/server/api/trpc.ts +++ b/apps/dokploy/server/api/trpc.ts @@ -21,8 +21,6 @@ import { import type { Session, User } from "lucia"; import superjson from "superjson"; import { ZodError } from "zod"; -import { setupGPUSupport } from '@dokploy/server/src/utils/gpu-setup'; - /** * 1. CONTEXT * @@ -209,10 +207,3 @@ export const adminProcedure = t.procedure.use(({ ctx, next }) => { }, }); }); - -const appRouter = t.router({ - setupGPU: t.procedure.mutation(async () => { - await setupGPUSupport(); - return { success: true }; - }), - }); \ No newline at end of file diff --git a/apps/dokploy/templates/blender/index.ts b/apps/dokploy/templates/blender/index.ts index 088e6fcc..baf243e0 100644 --- a/apps/dokploy/templates/blender/index.ts +++ b/apps/dokploy/templates/blender/index.ts @@ -1,34 +1,34 @@ import { - generateHash, - generateRandomDomain, - type Template, - type Schema, - type DomainSchema, + type DomainSchema, + type Schema, + type Template, + generateHash, + generateRandomDomain, } from "../utils"; export function generate(schema: Schema): Template { - const mainServiceHash = generateHash(schema.projectName); - const mainDomain = generateRandomDomain(schema); + const mainServiceHash = generateHash(schema.projectName); + const mainDomain = generateRandomDomain(schema); - const domains: DomainSchema[] = [ - { - host: mainDomain, - port: 3000, - serviceName: "blender", - }, - ]; + const domains: DomainSchema[] = [ + { + host: mainDomain, + port: 3000, + serviceName: "blender", + }, + ]; - const envs = [ - `PUID=1000`, - `PGID=1000`, - `TZ=Etc/UTC`, - `SUBFOLDER=/`, - `NVIDIA_VISIBLE_DEVICES=all`, - `NVIDIA_DRIVER_CAPABILITIES=all`, - ]; + const envs = [ + `PUID=1000`, + `PGID=1000`, + `TZ=Etc/UTC`, + `SUBFOLDER=/`, + `NVIDIA_VISIBLE_DEVICES=all`, + `NVIDIA_DRIVER_CAPABILITIES=all`, + ]; - return { - envs, - domains, - }; + return { + envs, + domains, + }; } diff --git a/apps/dokploy/templates/templates.ts b/apps/dokploy/templates/templates.ts index 40d493e5..115a1ecf 100644 --- a/apps/dokploy/templates/templates.ts +++ b/apps/dokploy/templates/templates.ts @@ -516,7 +516,8 @@ export const templates: TemplateData[] = [ id: "blender", name: "Blender", version: "latest", - description: "Blender is a free and open-source 3D creation suite. It supports the entire 3D pipeline—modeling, rigging, animation, simulation, rendering, compositing and motion tracking, video editing and 2D animation pipeline.", + description: + "Blender is a free and open-source 3D creation suite. It supports the entire 3D pipeline—modeling, rigging, animation, simulation, rendering, compositing and motion tracking, video editing and 2D animation pipeline.", logo: "blender.svg", links: { github: "https://github.com/linuxserver/docker-blender", diff --git a/packages/server/src/constants/index.ts b/packages/server/src/constants/index.ts index be2a72de..f2f1a4d8 100644 --- a/packages/server/src/constants/index.ts +++ b/packages/server/src/constants/index.ts @@ -36,4 +36,4 @@ export const paths = (isServer = false) => { MONITORING_PATH: `${BASE_PATH}/monitoring`, REGISTRY_PATH: `${BASE_PATH}/registry`, }; -}; \ No newline at end of file +}; diff --git a/packages/server/src/index.ts b/packages/server/src/index.ts index 06f2bc87..90daec2d 100644 --- a/packages/server/src/index.ts +++ b/packages/server/src/index.ts @@ -118,3 +118,4 @@ export * from "./monitoring/utilts"; export * from "./db/validations/domain"; export * from "./db/validations/index"; +export * from "./utils/gpu-setup"; diff --git a/packages/server/src/utils/gpu-setup.ts b/packages/server/src/utils/gpu-setup.ts index 459c3395..71f3bf0f 100644 --- a/packages/server/src/utils/gpu-setup.ts +++ b/packages/server/src/utils/gpu-setup.ts @@ -1,9 +1,261 @@ -import { docker } from '../constants'; +import { docker } from "../constants"; +import { execAsync } from "../utils/process/execAsync"; +import { execAsyncRemote } from "../utils/process/execAsync"; +import { getRemoteDocker } from "./servers/remote-docker"; -export async function setupGPUSupport() { - await docker.swarmUpdate({ - TaskDefaults: { - GenericResources: [{ DiscreteResourceSpec: { Kind: 'gpu', Value: 1 } }] - } - }); -} \ No newline at end of file +interface GPUInfo { + driverInstalled: boolean; + driverVersion?: string; + gpuModel?: string; + runtimeInstalled: boolean; + runtimeConfigured: boolean; + cudaSupport: boolean; + cudaVersion?: string; + memoryInfo?: string; + availableGPUs: number; + swarmEnabled: boolean; + gpuResources: number; +} + +interface DiscreteResourceSpec { + Kind: string; + Value: number; +} + +interface NamedGenericResource { + NamedResourceSpec?: { Kind: string; Value: string }; + DiscreteResourceSpec?: DiscreteResourceSpec; +} + +export async function checkGPUStatus(serverId?: string): Promise { + try { + // Check NVIDIA Driver + let driverInstalled = false; + let driverVersion: string | undefined; + let availableGPUs = 0; + + try { + const driverCommand = + "nvidia-smi --query-gpu=driver_version --format=csv,noheader"; + const { stdout: nvidiaSmi } = serverId + ? await execAsyncRemote(serverId, driverCommand) + : await execAsync(driverCommand); + + driverVersion = nvidiaSmi.trim(); + if (driverVersion) { + driverInstalled = true; + const countCommand = + "nvidia-smi --query-gpu=gpu_name --format=csv,noheader | wc -l"; + const { stdout: gpuCount } = serverId + ? await execAsyncRemote(serverId, countCommand) + : await execAsync(countCommand); + + availableGPUs = Number.parseInt(gpuCount.trim(), 10); + } + } catch (error) { + console.debug("GPU driver check:", error); + } + + // Check Runtime Configuration + let runtimeInstalled = false; + let runtimeConfigured = false; + try { + const runtimeCommand = 'docker info --format "{{json .Runtimes}}"'; + const { stdout: runtimeInfo } = serverId + ? await execAsyncRemote(serverId, runtimeCommand) + : await execAsync(runtimeCommand); + + const runtimes = JSON.parse(runtimeInfo); + runtimeInstalled = "nvidia" in runtimes; + + // Check if it's the default runtime + const defaultCommand = 'docker info --format "{{.DefaultRuntime}}"'; + const { stdout: defaultRuntime } = serverId + ? await execAsyncRemote(serverId, defaultCommand) + : await execAsync(defaultCommand); + + runtimeConfigured = defaultRuntime.trim() === "nvidia"; + } catch (error) { + console.debug("Runtime check:", error); + } + + // Check Swarm GPU Resources + let swarmEnabled = false; + let gpuResources = 0; + + try { + // Check node resources directly from inspect + const nodeCommand = + "docker node inspect self --format '{{json .Description.Resources.GenericResources}}'"; + const { stdout: resources } = serverId + ? await execAsyncRemote(serverId, nodeCommand) + : await execAsync(nodeCommand); + + if (resources && resources !== "null") { + const genericResources = JSON.parse(resources); + for (const resource of genericResources) { + if ( + resource.DiscreteResourceSpec && + (resource.DiscreteResourceSpec.Kind === "GPU" || + resource.DiscreteResourceSpec.Kind === "gpu") + ) { + gpuResources = resource.DiscreteResourceSpec.Value; + swarmEnabled = true; + break; + } + } + } + } catch (error) { + console.debug("Swarm resource check:", error); + } + + // Get GPU Model and Memory Info + const gpuInfoCommand = + "nvidia-smi --query-gpu=gpu_name,memory.total --format=csv,noheader"; + const { stdout: gpuInfo } = serverId + ? await execAsyncRemote(serverId, gpuInfoCommand) + : await execAsync(gpuInfoCommand); + + const [gpuModel, memoryTotal] = gpuInfo.split(",").map((s) => s.trim()); + + // Check CUDA Support + const cudaCommand = 'nvidia-smi -q | grep "CUDA Version"'; + const { stdout: cudaInfo } = serverId + ? await execAsyncRemote(serverId, cudaCommand) + : await execAsync(cudaCommand); + + const cudaMatch = cudaInfo.match(/CUDA Version\s*:\s*([\d\.]+)/); + const cudaVersion = cudaMatch ? cudaMatch[1] : undefined; + const cudaSupport = !!cudaVersion; + + return { + driverInstalled, + driverVersion, + runtimeInstalled, + runtimeConfigured, + availableGPUs, + swarmEnabled, + gpuResources, + gpuModel, + memoryInfo: memoryTotal, + cudaSupport, + cudaVersion, + }; + } catch (error) { + console.error("Error in checkGPUStatus:", error); + return { + driverInstalled: false, + driverVersion: undefined, + runtimeInstalled: false, + runtimeConfigured: false, + cudaSupport: false, + cudaVersion: undefined, + gpuModel: undefined, + memoryInfo: undefined, + availableGPUs: 0, + swarmEnabled: false, + gpuResources: 0, + }; + } +} + +export async function setupGPUSupport(serverId?: string): Promise { + try { + // 1. Check current GPU status first + const initialStatus = await checkGPUStatus(serverId); + + // If GPU is already configured, just verify and return quickly + if ( + initialStatus.swarmEnabled && + initialStatus.runtimeConfigured && + initialStatus.driverInstalled + ) { + console.log("GPU already configured, skipping setup"); + return; + } + + // 2. Verify GPU prerequisites + if (!initialStatus.driverInstalled || !initialStatus.runtimeInstalled) { + throw new Error( + "NVIDIA drivers or runtime not installed. Please install them first.", + ); + } + + // Get the node ID + const nodeIdCommand = 'docker info --format "{{.Swarm.NodeID}}"'; + const { stdout: nodeId } = serverId + ? await execAsyncRemote(serverId, nodeIdCommand) + : await execAsync(nodeIdCommand); + + if (!nodeId.trim()) { + throw new Error("Setup Server before enabling GPU support"); + } + + // 3. Configure NVIDIA runtime in daemon.json + const daemonConfig = { + runtimes: { + nvidia: { + path: "nvidia-container-runtime", + runtimeArgs: [], + }, + }, + "default-runtime": "nvidia", + "node-generic-resources": [`GPU=${initialStatus.availableGPUs}`], + }; + + const setupCommands = [ + "sudo -n true", + `echo '${JSON.stringify(daemonConfig, null, 2)}' | sudo tee /etc/docker/daemon.json`, + "sudo mkdir -p /etc/nvidia-container-runtime", + 'echo "swarm-resource = \\"DOCKER_RESOURCE_GPU\\"" | sudo tee -a /etc/nvidia-container-runtime/config.toml', + "sudo systemctl daemon-reload", + "sudo systemctl restart docker", + ].join(" && "); + + if (serverId) { + await execAsyncRemote(serverId, setupCommands); + } else { + await execAsync(setupCommands); + } + + // 4. Reduced wait time for Docker restart + await new Promise((resolve) => setTimeout(resolve, 10000)); + + // 5. Add GPU label to the node + const labelCommand = `docker node update --label-add gpu=true ${nodeId.trim()}`; + if (serverId) { + await execAsyncRemote(serverId, labelCommand); + } else { + await execAsync(labelCommand); + } + + // 6. Quick final verification + await new Promise((resolve) => setTimeout(resolve, 5000)); + const finalStatus = await checkGPUStatus(serverId); + + if (!finalStatus.swarmEnabled) { + const diagnosticCommands = [ + `docker node inspect ${nodeId.trim()}`, + 'nvidia-smi -a | grep "GPU UUID"', + "cat /etc/docker/daemon.json", + "cat /etc/nvidia-container-runtime/config.toml", + ].join(" && "); + + const { stdout: diagnostics } = serverId + ? await execAsyncRemote(serverId, diagnosticCommands) + : await execAsync(diagnosticCommands); + + console.error("Diagnostic Information:", diagnostics); + throw new Error("GPU support not detected in swarm after setup"); + } + + console.log("GPU setup completed successfully:", { + availableGPUs: initialStatus.availableGPUs, + driverVersion: initialStatus.driverVersion, + nodeId: nodeId.trim(), + }); + } catch (error) { + console.error("GPU Setup Error:", error); + throw error; + } +}