Mirror of https://github.com/Dokploy/dokploy, synced 2025-06-26 18:27:59 +00:00
feat: Added GPU support feature for Remote Server with setup and status checks, including API endpoints and utility functions
This commit is contained in:
parent 3e467959c9
commit 1b6d8d803b
@@ -32,6 +32,7 @@ import Link from "next/link";
 import { useState } from "react";
 import { toast } from "sonner";
 import { ShowDeployment } from "../../application/deployments/show-deployment";
+import { GPUSupport } from "./gpu-support";

 interface Props {
   serverId: string;
@@ -89,9 +90,10 @@ export const SetupServer = ({ serverId }: Props) => {
       ) : (
         <div id="hook-form-add-gitlab" className="grid w-full gap-1">
           <Tabs defaultValue="ssh-keys">
-            <TabsList className="grid grid-cols-2 w-[400px]">
+            <TabsList className="grid grid-cols-3 w-[400px]">
               <TabsTrigger value="ssh-keys">SSH Keys</TabsTrigger>
               <TabsTrigger value="deployments">Deployments</TabsTrigger>
+              <TabsTrigger value="gpu-setup">GPU Setup</TabsTrigger>
             </TabsList>
             <TabsContent
               value="ssh-keys"
@@ -291,6 +293,14 @@ export const SetupServer = ({ serverId }: Props) => {
               </div>
             </CardContent>
           </TabsContent>
+          <TabsContent
+            value="gpu-setup"
+            className="outline-none ring-0 focus-visible:ring-0 focus-visible:ring-offset-0"
+          >
+            <div className="flex flex-col gap-2 text-sm text-muted-foreground pt-3">
+              <GPUSupport serverId={serverId} />
+            </div>
+          </TabsContent>
         </Tabs>
       </div>
     )}
@@ -52,6 +52,10 @@ import {
   writeMainConfig,
   writeTraefikConfigInPath,
 } from "@dokploy/server";
+import {
+  checkGPUStatus,
+  setupGPUSupport,
+} from "@dokploy/server/src/utils/gpu-setup";
 import { generateOpenApiDocument } from "@dokploy/trpc-openapi";
 import { TRPCError } from "@trpc/server";
 import { sql } from "drizzle-orm";
@@ -650,6 +654,62 @@ export const settingsRouter = createTRPCRouter({
       }
       return { status: "not_cloud" };
     }),
+  setupGPU: adminProcedure
+    .input(
+      z.object({
+        serverId: z.string(),
+      }),
+    )
+    .mutation(async ({ input }) => {
+      try {
+        if (IS_CLOUD) {
+          return { success: true };
+        }
+
+        if (!input.serverId) {
+          throw new TRPCError({
+            code: "BAD_REQUEST",
+            message: "Server ID is required",
+          });
+        }
+
+        await setupGPUSupport(input.serverId);
+        return { success: true };
+      } catch (error) {
+        throw new TRPCError({
+          code: "INTERNAL_SERVER_ERROR",
+          message:
+            error instanceof Error
+              ? error.message
+              : "Failed to enable GPU support",
+          cause: error,
+        });
+      }
+    }),
+  checkGPUStatus: adminProcedure
+    .input(
+      z.object({
+        serverId: z.string().optional(),
+      }),
+    )
+    .query(async ({ input }) => {
+      if (IS_CLOUD) {
+        return {
+          driverInstalled: false,
+          driverVersion: undefined,
+          gpuModel: undefined,
+          runtimeInstalled: false,
+          runtimeConfigured: false,
+          cudaSupport: undefined,
+          cudaVersion: undefined,
+          memoryInfo: undefined,
+          availableGPUs: 0,
+          swarmEnabled: false,
+          gpuResources: 0,
+        };
+      }
+      return await checkGPUStatus(input.serverId);
+    }),
 });
 // {
 //   "Parallelism": 1,
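
The two procedures above only define the server side; as a rough sketch, a dashboard component could call them through the project's tRPC React client along these lines (the `api` client name, the component wiring, and the option choices are illustrative assumptions, not part of this commit):

  // Hypothetical client-side usage of the new settings endpoints.
  const { data: gpuStatus, refetch } = api.settings.checkGPUStatus.useQuery(
    { serverId },
    { enabled: !!serverId },
  );
  const { mutateAsync: setupGPU, isLoading } = api.settings.setupGPU.useMutation({
    onSuccess: async () => {
      toast.success("GPU support enabled");
      await refetch(); // re-read driver/runtime/swarm status after setup
    },
    onError: (error) => toast.error(error.message),
  });
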
@@ -21,8 +21,6 @@ import {
 import type { Session, User } from "lucia";
 import superjson from "superjson";
 import { ZodError } from "zod";
-import { setupGPUSupport } from '@dokploy/server/src/utils/gpu-setup';
-
 /**
  * 1. CONTEXT
  *
@@ -209,10 +207,3 @@ export const adminProcedure = t.procedure.use(({ ctx, next }) => {
     },
   });
 });
-
-const appRouter = t.router({
-  setupGPU: t.procedure.mutation(async () => {
-    await setupGPUSupport();
-    return { success: true };
-  }),
-});
@@ -1,34 +1,34 @@
 import {
-  generateHash,
-  generateRandomDomain,
-  type Template,
-  type Schema,
-  type DomainSchema,
+  type DomainSchema,
+  type Schema,
+  type Template,
+  generateHash,
+  generateRandomDomain,
 } from "../utils";

 export function generate(schema: Schema): Template {
-  const mainServiceHash = generateHash(schema.projectName);
-  const mainDomain = generateRandomDomain(schema);
+  const mainServiceHash = generateHash(schema.projectName);
+  const mainDomain = generateRandomDomain(schema);

-  const domains: DomainSchema[] = [
-    {
-      host: mainDomain,
-      port: 3000,
-      serviceName: "blender",
-    },
-  ];
+  const domains: DomainSchema[] = [
+    {
+      host: mainDomain,
+      port: 3000,
+      serviceName: "blender",
+    },
+  ];

-  const envs = [
-    `PUID=1000`,
-    `PGID=1000`,
-    `TZ=Etc/UTC`,
-    `SUBFOLDER=/`,
-    `NVIDIA_VISIBLE_DEVICES=all`,
-    `NVIDIA_DRIVER_CAPABILITIES=all`,
-  ];
+  const envs = [
+    `PUID=1000`,
+    `PGID=1000`,
+    `TZ=Etc/UTC`,
+    `SUBFOLDER=/`,
+    `NVIDIA_VISIBLE_DEVICES=all`,
+    `NVIDIA_DRIVER_CAPABILITIES=all`,
+  ];

-  return {
-    envs,
-    domains,
-  };
+  return {
+    envs,
+    domains,
+  };
 }
@@ -516,7 +516,8 @@ export const templates: TemplateData[] = [
     id: "blender",
     name: "Blender",
     version: "latest",
-    description: "Blender is a free and open-source 3D creation suite. It supports the entire 3D pipeline—modeling, rigging, animation, simulation, rendering, compositing and motion tracking, video editing and 2D animation pipeline.",
+    description:
+      "Blender is a free and open-source 3D creation suite. It supports the entire 3D pipeline—modeling, rigging, animation, simulation, rendering, compositing and motion tracking, video editing and 2D animation pipeline.",
     logo: "blender.svg",
     links: {
       github: "https://github.com/linuxserver/docker-blender",
@@ -36,4 +36,4 @@ export const paths = (isServer = false) => {
   MONITORING_PATH: `${BASE_PATH}/monitoring`,
   REGISTRY_PATH: `${BASE_PATH}/registry`,
-  };
+  };
 };
@@ -118,3 +118,4 @@ export * from "./monitoring/utilts";

 export * from "./db/validations/domain";
 export * from "./db/validations/index";
+export * from "./utils/gpu-setup";
@@ -1,9 +1,261 @@
-import { docker } from '../constants';
+import { docker } from "../constants";
+import { execAsync } from "../utils/process/execAsync";
+import { execAsyncRemote } from "../utils/process/execAsync";
+import { getRemoteDocker } from "./servers/remote-docker";

-export async function setupGPUSupport() {
-  await docker.swarmUpdate({
-    TaskDefaults: {
-      GenericResources: [{ DiscreteResourceSpec: { Kind: 'gpu', Value: 1 } }]
-    }
-  });
-}
+interface GPUInfo {
+  driverInstalled: boolean;
+  driverVersion?: string;
+  gpuModel?: string;
+  runtimeInstalled: boolean;
+  runtimeConfigured: boolean;
+  cudaSupport: boolean;
+  cudaVersion?: string;
+  memoryInfo?: string;
+  availableGPUs: number;
+  swarmEnabled: boolean;
+  gpuResources: number;
+}
+
+interface DiscreteResourceSpec {
+  Kind: string;
+  Value: number;
+}
+
+interface NamedGenericResource {
+  NamedResourceSpec?: { Kind: string; Value: string };
+  DiscreteResourceSpec?: DiscreteResourceSpec;
+}
+
+export async function checkGPUStatus(serverId?: string): Promise<GPUInfo> {
+  try {
+    // Check NVIDIA Driver
+    let driverInstalled = false;
+    let driverVersion: string | undefined;
+    let availableGPUs = 0;
+
+    try {
+      const driverCommand =
+        "nvidia-smi --query-gpu=driver_version --format=csv,noheader";
+      const { stdout: nvidiaSmi } = serverId
+        ? await execAsyncRemote(serverId, driverCommand)
+        : await execAsync(driverCommand);
+
+      driverVersion = nvidiaSmi.trim();
+      if (driverVersion) {
+        driverInstalled = true;
+        const countCommand =
+          "nvidia-smi --query-gpu=gpu_name --format=csv,noheader | wc -l";
+        const { stdout: gpuCount } = serverId
+          ? await execAsyncRemote(serverId, countCommand)
+          : await execAsync(countCommand);
+
+        availableGPUs = Number.parseInt(gpuCount.trim(), 10);
+      }
+    } catch (error) {
+      console.debug("GPU driver check:", error);
+    }
+
+    // Check Runtime Configuration
+    let runtimeInstalled = false;
+    let runtimeConfigured = false;
+    try {
+      const runtimeCommand = 'docker info --format "{{json .Runtimes}}"';
+      const { stdout: runtimeInfo } = serverId
+        ? await execAsyncRemote(serverId, runtimeCommand)
+        : await execAsync(runtimeCommand);
+
+      const runtimes = JSON.parse(runtimeInfo);
+      runtimeInstalled = "nvidia" in runtimes;
+
+      // Check if it's the default runtime
+      const defaultCommand = 'docker info --format "{{.DefaultRuntime}}"';
+      const { stdout: defaultRuntime } = serverId
+        ? await execAsyncRemote(serverId, defaultCommand)
+        : await execAsync(defaultCommand);
+
+      runtimeConfigured = defaultRuntime.trim() === "nvidia";
+    } catch (error) {
+      console.debug("Runtime check:", error);
+    }
+
+    // Check Swarm GPU Resources
+    let swarmEnabled = false;
+    let gpuResources = 0;
+
+    try {
+      // Check node resources directly from inspect
+      const nodeCommand =
+        "docker node inspect self --format '{{json .Description.Resources.GenericResources}}'";
+      const { stdout: resources } = serverId
+        ? await execAsyncRemote(serverId, nodeCommand)
+        : await execAsync(nodeCommand);
+
+      if (resources && resources !== "null") {
+        const genericResources = JSON.parse(resources);
+        for (const resource of genericResources) {
+          if (
+            resource.DiscreteResourceSpec &&
+            (resource.DiscreteResourceSpec.Kind === "GPU" ||
+              resource.DiscreteResourceSpec.Kind === "gpu")
+          ) {
+            gpuResources = resource.DiscreteResourceSpec.Value;
+            swarmEnabled = true;
+            break;
+          }
+        }
+      }
+    } catch (error) {
+      console.debug("Swarm resource check:", error);
+    }
+
+    // Get GPU Model and Memory Info
+    const gpuInfoCommand =
+      "nvidia-smi --query-gpu=gpu_name,memory.total --format=csv,noheader";
+    const { stdout: gpuInfo } = serverId
+      ? await execAsyncRemote(serverId, gpuInfoCommand)
+      : await execAsync(gpuInfoCommand);
+
+    const [gpuModel, memoryTotal] = gpuInfo.split(",").map((s) => s.trim());
+
+    // Check CUDA Support
+    const cudaCommand = 'nvidia-smi -q | grep "CUDA Version"';
+    const { stdout: cudaInfo } = serverId
+      ? await execAsyncRemote(serverId, cudaCommand)
+      : await execAsync(cudaCommand);
+
+    const cudaMatch = cudaInfo.match(/CUDA Version\s*:\s*([\d\.]+)/);
+    const cudaVersion = cudaMatch ? cudaMatch[1] : undefined;
+    const cudaSupport = !!cudaVersion;
+
+    return {
+      driverInstalled,
+      driverVersion,
+      runtimeInstalled,
+      runtimeConfigured,
+      availableGPUs,
+      swarmEnabled,
+      gpuResources,
+      gpuModel,
+      memoryInfo: memoryTotal,
+      cudaSupport,
+      cudaVersion,
+    };
+  } catch (error) {
+    console.error("Error in checkGPUStatus:", error);
+    return {
+      driverInstalled: false,
+      driverVersion: undefined,
+      runtimeInstalled: false,
+      runtimeConfigured: false,
+      cudaSupport: false,
+      cudaVersion: undefined,
+      gpuModel: undefined,
+      memoryInfo: undefined,
+      availableGPUs: 0,
+      swarmEnabled: false,
+      gpuResources: 0,
+    };
+  }
+}
+
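
The checkGPUStatus helper above only reports raw facts about the node; deciding whether it is actually ready for GPU workloads is left to the caller. A minimal sketch of such a gate (the helper name is hypothetical and not part of this commit):

  // Hypothetical helper: a node is GPU-ready when the driver and runtime are
  // installed, the NVIDIA runtime is the default, and Swarm advertises GPU resources.
  const isGPUReady = (info: GPUInfo): boolean =>
    info.driverInstalled &&
    info.runtimeInstalled &&
    info.runtimeConfigured &&
    info.swarmEnabled &&
    info.gpuResources > 0;
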
+export async function setupGPUSupport(serverId?: string): Promise<void> {
+  try {
+    // 1. Check current GPU status first
+    const initialStatus = await checkGPUStatus(serverId);
+
+    // If GPU is already configured, just verify and return quickly
+    if (
+      initialStatus.swarmEnabled &&
+      initialStatus.runtimeConfigured &&
+      initialStatus.driverInstalled
+    ) {
+      console.log("GPU already configured, skipping setup");
+      return;
+    }
+
+    // 2. Verify GPU prerequisites
+    if (!initialStatus.driverInstalled || !initialStatus.runtimeInstalled) {
+      throw new Error(
+        "NVIDIA drivers or runtime not installed. Please install them first.",
+      );
+    }
+
+    // Get the node ID
+    const nodeIdCommand = 'docker info --format "{{.Swarm.NodeID}}"';
+    const { stdout: nodeId } = serverId
+      ? await execAsyncRemote(serverId, nodeIdCommand)
+      : await execAsync(nodeIdCommand);
+
+    if (!nodeId.trim()) {
+      throw new Error("Setup Server before enabling GPU support");
+    }
+
+    // 3. Configure NVIDIA runtime in daemon.json
+    const daemonConfig = {
+      runtimes: {
+        nvidia: {
+          path: "nvidia-container-runtime",
+          runtimeArgs: [],
+        },
+      },
+      "default-runtime": "nvidia",
+      "node-generic-resources": [`GPU=${initialStatus.availableGPUs}`],
+    };
+
+    const setupCommands = [
+      "sudo -n true",
+      `echo '${JSON.stringify(daemonConfig, null, 2)}' | sudo tee /etc/docker/daemon.json`,
+      "sudo mkdir -p /etc/nvidia-container-runtime",
+      'echo "swarm-resource = \\"DOCKER_RESOURCE_GPU\\"" | sudo tee -a /etc/nvidia-container-runtime/config.toml',
+      "sudo systemctl daemon-reload",
+      "sudo systemctl restart docker",
+    ].join(" && ");
+
+    if (serverId) {
+      await execAsyncRemote(serverId, setupCommands);
+    } else {
+      await execAsync(setupCommands);
+    }
+
+    // 4. Reduced wait time for Docker restart
+    await new Promise((resolve) => setTimeout(resolve, 10000));
+
+    // 5. Add GPU label to the node
+    const labelCommand = `docker node update --label-add gpu=true ${nodeId.trim()}`;
+    if (serverId) {
+      await execAsyncRemote(serverId, labelCommand);
+    } else {
+      await execAsync(labelCommand);
+    }
+
+    // 6. Quick final verification
+    await new Promise((resolve) => setTimeout(resolve, 5000));
+    const finalStatus = await checkGPUStatus(serverId);
+
+    if (!finalStatus.swarmEnabled) {
+      const diagnosticCommands = [
+        `docker node inspect ${nodeId.trim()}`,
+        'nvidia-smi -a | grep "GPU UUID"',
+        "cat /etc/docker/daemon.json",
+        "cat /etc/nvidia-container-runtime/config.toml",
+      ].join(" && ");
+
+      const { stdout: diagnostics } = serverId
+        ? await execAsyncRemote(serverId, diagnosticCommands)
+        : await execAsync(diagnosticCommands);
+
+      console.error("Diagnostic Information:", diagnostics);
+      throw new Error("GPU support not detected in swarm after setup");
+    }
+
+    console.log("GPU setup completed successfully:", {
+      availableGPUs: initialStatus.availableGPUs,
+      driverVersion: initialStatus.driverVersion,
+      nodeId: nodeId.trim(),
+    });
+  } catch (error) {
+    console.error("GPU Setup Error:", error);
+    throw error;
+  }
+}
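
After setupGPUSupport completes, the node advertises a discrete "GPU" generic resource and carries a gpu=true label, so Swarm services can reserve GPUs. A rough sketch of what such a reservation could look like through the `docker` client imported above (the service spec, image, and helper name are illustrative assumptions and not part of this commit):

  // Hypothetical smoke test: schedule a one-shot service that reserves one GPU.
  const runGPUSmokeTest = async () =>
    await docker.createService({
      Name: "gpu-smoke-test",
      TaskTemplate: {
        ContainerSpec: {
          Image: "nvidia/cuda:12.2.0-base-ubuntu22.04",
          Command: ["nvidia-smi"],
        },
        Resources: {
          Reservations: {
            // Matches the resource advertised via "node-generic-resources" above.
            GenericResources: [
              { DiscreteResourceSpec: { Kind: "GPU", Value: 1 } },
            ],
          },
        },
        RestartPolicy: { Condition: "none" },
        Placement: { Constraints: ["node.labels.gpu==true"] },
      },
    });
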