Mirror of https://github.com/Dokploy/dokploy, synced 2025-06-26 18:27:59 +00:00
feat: Added GPU support feature for Remote Server with setup and status checks, including API endpoints and utility functions
This commit is contained in:
parent 3e467959c9
commit 1b6d8d803b
@@ -32,6 +32,7 @@ import Link from "next/link";
 import { useState } from "react";
 import { toast } from "sonner";
 import { ShowDeployment } from "../../application/deployments/show-deployment";
+import { GPUSupport } from "./gpu-support";

 interface Props {
   serverId: string;
@@ -89,9 +90,10 @@ export const SetupServer = ({ serverId }: Props) => {
       ) : (
         <div id="hook-form-add-gitlab" className="grid w-full gap-1">
           <Tabs defaultValue="ssh-keys">
-            <TabsList className="grid grid-cols-2 w-[400px]">
+            <TabsList className="grid grid-cols-3 w-[400px]">
               <TabsTrigger value="ssh-keys">SSH Keys</TabsTrigger>
               <TabsTrigger value="deployments">Deployments</TabsTrigger>
+              <TabsTrigger value="gpu-setup">GPU Setup</TabsTrigger>
             </TabsList>
             <TabsContent
               value="ssh-keys"
@@ -291,6 +293,14 @@ export const SetupServer = ({ serverId }: Props) => {
               </div>
             </CardContent>
           </TabsContent>
+          <TabsContent
+            value="gpu-setup"
+            className="outline-none ring-0 focus-visible:ring-0 focus-visible:ring-offset-0"
+          >
+            <div className="flex flex-col gap-2 text-sm text-muted-foreground pt-3">
+              <GPUSupport serverId={serverId} />
+            </div>
+          </TabsContent>
         </Tabs>
       </div>
     )}
@@ -52,6 +52,10 @@ import {
   writeMainConfig,
   writeTraefikConfigInPath,
 } from "@dokploy/server";
+import {
+  checkGPUStatus,
+  setupGPUSupport,
+} from "@dokploy/server/src/utils/gpu-setup";
 import { generateOpenApiDocument } from "@dokploy/trpc-openapi";
 import { TRPCError } from "@trpc/server";
 import { sql } from "drizzle-orm";
@@ -650,6 +654,62 @@ export const settingsRouter = createTRPCRouter({
       }
       return { status: "not_cloud" };
     }),
+  setupGPU: adminProcedure
+    .input(
+      z.object({
+        serverId: z.string(),
+      }),
+    )
+    .mutation(async ({ input }) => {
+      try {
+        if (IS_CLOUD) {
+          return { success: true };
+        }
+
+        if (!input.serverId) {
+          throw new TRPCError({
+            code: "BAD_REQUEST",
+            message: "Server ID is required",
+          });
+        }
+
+        await setupGPUSupport(input.serverId);
+        return { success: true };
+      } catch (error) {
+        throw new TRPCError({
+          code: "INTERNAL_SERVER_ERROR",
+          message:
+            error instanceof Error
+              ? error.message
+              : "Failed to enable GPU support",
+          cause: error,
+        });
+      }
+    }),
+  checkGPUStatus: adminProcedure
+    .input(
+      z.object({
+        serverId: z.string().optional(),
+      }),
+    )
+    .query(async ({ input }) => {
+      if (IS_CLOUD) {
+        return {
+          driverInstalled: false,
+          driverVersion: undefined,
+          gpuModel: undefined,
+          runtimeInstalled: false,
+          runtimeConfigured: false,
+          cudaSupport: undefined,
+          cudaVersion: undefined,
+          memoryInfo: undefined,
+          availableGPUs: 0,
+          swarmEnabled: false,
+          gpuResources: 0,
+        };
+      }
+      return await checkGPUStatus(input.serverId);
+    }),
 });
 // {
 //   "Parallelism": 1,
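
The two procedures above only define the server side; as a rough sketch, a dashboard component could call them through the project's tRPC React client along these lines (the `api` client name, the component wiring, and the option choices are illustrative assumptions, not part of this commit):

  // Hypothetical client-side usage of the new settings endpoints.
  const { data: gpuStatus, refetch } = api.settings.checkGPUStatus.useQuery(
    { serverId },
    { enabled: !!serverId },
  );
  const { mutateAsync: setupGPU, isLoading } = api.settings.setupGPU.useMutation({
    onSuccess: async () => {
      toast.success("GPU support enabled");
      await refetch(); // re-read driver/runtime/swarm status after setup
    },
    onError: (error) => toast.error(error.message),
  });
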
@@ -21,8 +21,6 @@ import {
 import type { Session, User } from "lucia";
 import superjson from "superjson";
 import { ZodError } from "zod";
-import { setupGPUSupport } from '@dokploy/server/src/utils/gpu-setup';
-
 /**
  * 1. CONTEXT
  *
@@ -209,10 +207,3 @@ export const adminProcedure = t.procedure.use(({ ctx, next }) => {
     },
   });
 });
-
-const appRouter = t.router({
-  setupGPU: t.procedure.mutation(async () => {
-    await setupGPUSupport();
-    return { success: true };
-  }),
-});
@@ -1,34 +1,34 @@
 import {
-  generateHash,
-  generateRandomDomain,
-  type Template,
-  type Schema,
-  type DomainSchema,
+  type DomainSchema,
+  type Schema,
+  type Template,
+  generateHash,
+  generateRandomDomain,
 } from "../utils";

 export function generate(schema: Schema): Template {
-  const mainServiceHash = generateHash(schema.projectName);
-  const mainDomain = generateRandomDomain(schema);
+  const mainServiceHash = generateHash(schema.projectName);
+  const mainDomain = generateRandomDomain(schema);

-  const domains: DomainSchema[] = [
-    {
-      host: mainDomain,
-      port: 3000,
-      serviceName: "blender",
-    },
-  ];
+  const domains: DomainSchema[] = [
+    {
+      host: mainDomain,
+      port: 3000,
+      serviceName: "blender",
+    },
+  ];

-  const envs = [
-    `PUID=1000`,
-    `PGID=1000`,
-    `TZ=Etc/UTC`,
-    `SUBFOLDER=/`,
-    `NVIDIA_VISIBLE_DEVICES=all`,
-    `NVIDIA_DRIVER_CAPABILITIES=all`,
-  ];
+  const envs = [
+    `PUID=1000`,
+    `PGID=1000`,
+    `TZ=Etc/UTC`,
+    `SUBFOLDER=/`,
+    `NVIDIA_VISIBLE_DEVICES=all`,
+    `NVIDIA_DRIVER_CAPABILITIES=all`,
+  ];

-  return {
-    envs,
-    domains,
-  };
+  return {
+    envs,
+    domains,
+  };
 }
@@ -516,7 +516,8 @@ export const templates: TemplateData[] = [
     id: "blender",
     name: "Blender",
     version: "latest",
-    description: "Blender is a free and open-source 3D creation suite. It supports the entire 3D pipeline—modeling, rigging, animation, simulation, rendering, compositing and motion tracking, video editing and 2D animation pipeline.",
+    description:
+      "Blender is a free and open-source 3D creation suite. It supports the entire 3D pipeline—modeling, rigging, animation, simulation, rendering, compositing and motion tracking, video editing and 2D animation pipeline.",
     logo: "blender.svg",
     links: {
       github: "https://github.com/linuxserver/docker-blender",
@@ -36,4 +36,4 @@ export const paths = (isServer = false) => {
   MONITORING_PATH: `${BASE_PATH}/monitoring`,
   REGISTRY_PATH: `${BASE_PATH}/registry`,
-  };
+  };
 };
@@ -118,3 +118,4 @@ export * from "./monitoring/utilts";

 export * from "./db/validations/domain";
 export * from "./db/validations/index";
+export * from "./utils/gpu-setup";
@@ -1,9 +1,261 @@
-import { docker } from '../constants';
+import { docker } from "../constants";
+import { execAsync } from "../utils/process/execAsync";
+import { execAsyncRemote } from "../utils/process/execAsync";
+import { getRemoteDocker } from "./servers/remote-docker";

-export async function setupGPUSupport() {
-  await docker.swarmUpdate({
-    TaskDefaults: {
-      GenericResources: [{ DiscreteResourceSpec: { Kind: 'gpu', Value: 1 } }]
-    }
-  });
-}
+interface GPUInfo {
+  driverInstalled: boolean;
+  driverVersion?: string;
+  gpuModel?: string;
+  runtimeInstalled: boolean;
+  runtimeConfigured: boolean;
+  cudaSupport: boolean;
+  cudaVersion?: string;
+  memoryInfo?: string;
+  availableGPUs: number;
+  swarmEnabled: boolean;
+  gpuResources: number;
+}
+
+interface DiscreteResourceSpec {
+  Kind: string;
+  Value: number;
+}
+
+interface NamedGenericResource {
+  NamedResourceSpec?: { Kind: string; Value: string };
+  DiscreteResourceSpec?: DiscreteResourceSpec;
+}
+
+export async function checkGPUStatus(serverId?: string): Promise<GPUInfo> {
+  try {
+    // Check NVIDIA Driver
+    let driverInstalled = false;
+    let driverVersion: string | undefined;
+    let availableGPUs = 0;
+
+    try {
+      const driverCommand =
+        "nvidia-smi --query-gpu=driver_version --format=csv,noheader";
+      const { stdout: nvidiaSmi } = serverId
+        ? await execAsyncRemote(serverId, driverCommand)
+        : await execAsync(driverCommand);
+
+      driverVersion = nvidiaSmi.trim();
+      if (driverVersion) {
+        driverInstalled = true;
+        const countCommand =
+          "nvidia-smi --query-gpu=gpu_name --format=csv,noheader | wc -l";
+        const { stdout: gpuCount } = serverId
+          ? await execAsyncRemote(serverId, countCommand)
+          : await execAsync(countCommand);
+
+        availableGPUs = Number.parseInt(gpuCount.trim(), 10);
+      }
+    } catch (error) {
+      console.debug("GPU driver check:", error);
+    }
+
+    // Check Runtime Configuration
+    let runtimeInstalled = false;
+    let runtimeConfigured = false;
+    try {
+      const runtimeCommand = 'docker info --format "{{json .Runtimes}}"';
+      const { stdout: runtimeInfo } = serverId
+        ? await execAsyncRemote(serverId, runtimeCommand)
+        : await execAsync(runtimeCommand);
+
+      const runtimes = JSON.parse(runtimeInfo);
+      runtimeInstalled = "nvidia" in runtimes;
+
+      // Check if it's the default runtime
+      const defaultCommand = 'docker info --format "{{.DefaultRuntime}}"';
+      const { stdout: defaultRuntime } = serverId
+        ? await execAsyncRemote(serverId, defaultCommand)
+        : await execAsync(defaultCommand);
+
+      runtimeConfigured = defaultRuntime.trim() === "nvidia";
+    } catch (error) {
+      console.debug("Runtime check:", error);
+    }
+
+    // Check Swarm GPU Resources
+    let swarmEnabled = false;
+    let gpuResources = 0;
+
+    try {
+      // Check node resources directly from inspect
+      const nodeCommand =
+        "docker node inspect self --format '{{json .Description.Resources.GenericResources}}'";
+      const { stdout: resources } = serverId
+        ? await execAsyncRemote(serverId, nodeCommand)
+        : await execAsync(nodeCommand);
+
+      if (resources && resources !== "null") {
+        const genericResources = JSON.parse(resources);
+        for (const resource of genericResources) {
+          if (
+            resource.DiscreteResourceSpec &&
+            (resource.DiscreteResourceSpec.Kind === "GPU" ||
+              resource.DiscreteResourceSpec.Kind === "gpu")
+          ) {
+            gpuResources = resource.DiscreteResourceSpec.Value;
+            swarmEnabled = true;
+            break;
+          }
+        }
+      }
+    } catch (error) {
+      console.debug("Swarm resource check:", error);
+    }
+
+    // Get GPU Model and Memory Info
+    const gpuInfoCommand =
+      "nvidia-smi --query-gpu=gpu_name,memory.total --format=csv,noheader";
+    const { stdout: gpuInfo } = serverId
+      ? await execAsyncRemote(serverId, gpuInfoCommand)
+      : await execAsync(gpuInfoCommand);
+
+    const [gpuModel, memoryTotal] = gpuInfo.split(",").map((s) => s.trim());
+
+    // Check CUDA Support
+    const cudaCommand = 'nvidia-smi -q | grep "CUDA Version"';
+    const { stdout: cudaInfo } = serverId
+      ? await execAsyncRemote(serverId, cudaCommand)
+      : await execAsync(cudaCommand);
+
+    const cudaMatch = cudaInfo.match(/CUDA Version\s*:\s*([\d\.]+)/);
+    const cudaVersion = cudaMatch ? cudaMatch[1] : undefined;
+    const cudaSupport = !!cudaVersion;
+
+    return {
+      driverInstalled,
+      driverVersion,
+      runtimeInstalled,
+      runtimeConfigured,
+      availableGPUs,
+      swarmEnabled,
+      gpuResources,
+      gpuModel,
+      memoryInfo: memoryTotal,
+      cudaSupport,
+      cudaVersion,
+    };
+  } catch (error) {
+    console.error("Error in checkGPUStatus:", error);
+    return {
+      driverInstalled: false,
+      driverVersion: undefined,
+      runtimeInstalled: false,
+      runtimeConfigured: false,
+      cudaSupport: false,
+      cudaVersion: undefined,
+      gpuModel: undefined,
+      memoryInfo: undefined,
+      availableGPUs: 0,
+      swarmEnabled: false,
+      gpuResources: 0,
+    };
+  }
+}
+
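
The checkGPUStatus helper above only reports raw facts about the node; deciding whether it is actually ready for GPU workloads is left to the caller. A minimal sketch of such a gate (the helper name is hypothetical and not part of this commit):

  // Hypothetical helper: a node is GPU-ready when the driver and runtime are
  // installed, the NVIDIA runtime is the default, and Swarm advertises GPU resources.
  const isGPUReady = (info: GPUInfo): boolean =>
    info.driverInstalled &&
    info.runtimeInstalled &&
    info.runtimeConfigured &&
    info.swarmEnabled &&
    info.gpuResources > 0;
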
+export async function setupGPUSupport(serverId?: string): Promise<void> {
+  try {
+    // 1. Check current GPU status first
+    const initialStatus = await checkGPUStatus(serverId);
+
+    // If GPU is already configured, just verify and return quickly
+    if (
+      initialStatus.swarmEnabled &&
+      initialStatus.runtimeConfigured &&
+      initialStatus.driverInstalled
+    ) {
+      console.log("GPU already configured, skipping setup");
+      return;
+    }
+
+    // 2. Verify GPU prerequisites
+    if (!initialStatus.driverInstalled || !initialStatus.runtimeInstalled) {
+      throw new Error(
+        "NVIDIA drivers or runtime not installed. Please install them first.",
+      );
+    }
+
+    // Get the node ID
+    const nodeIdCommand = 'docker info --format "{{.Swarm.NodeID}}"';
+    const { stdout: nodeId } = serverId
+      ? await execAsyncRemote(serverId, nodeIdCommand)
+      : await execAsync(nodeIdCommand);
+
+    if (!nodeId.trim()) {
+      throw new Error("Setup Server before enabling GPU support");
+    }
+
+    // 3. Configure NVIDIA runtime in daemon.json
+    const daemonConfig = {
+      runtimes: {
+        nvidia: {
+          path: "nvidia-container-runtime",
+          runtimeArgs: [],
+        },
+      },
+      "default-runtime": "nvidia",
+      "node-generic-resources": [`GPU=${initialStatus.availableGPUs}`],
+    };
+
+    const setupCommands = [
+      "sudo -n true",
+      `echo '${JSON.stringify(daemonConfig, null, 2)}' | sudo tee /etc/docker/daemon.json`,
+      "sudo mkdir -p /etc/nvidia-container-runtime",
+      'echo "swarm-resource = \\"DOCKER_RESOURCE_GPU\\"" | sudo tee -a /etc/nvidia-container-runtime/config.toml',
+      "sudo systemctl daemon-reload",
+      "sudo systemctl restart docker",
+    ].join(" && ");
+
+    if (serverId) {
+      await execAsyncRemote(serverId, setupCommands);
+    } else {
+      await execAsync(setupCommands);
+    }
+
+    // 4. Reduced wait time for Docker restart
+    await new Promise((resolve) => setTimeout(resolve, 10000));
+
+    // 5. Add GPU label to the node
+    const labelCommand = `docker node update --label-add gpu=true ${nodeId.trim()}`;
+    if (serverId) {
+      await execAsyncRemote(serverId, labelCommand);
+    } else {
+      await execAsync(labelCommand);
+    }
+
+    // 6. Quick final verification
+    await new Promise((resolve) => setTimeout(resolve, 5000));
+    const finalStatus = await checkGPUStatus(serverId);
+
+    if (!finalStatus.swarmEnabled) {
+      const diagnosticCommands = [
+        `docker node inspect ${nodeId.trim()}`,
+        'nvidia-smi -a | grep "GPU UUID"',
+        "cat /etc/docker/daemon.json",
+        "cat /etc/nvidia-container-runtime/config.toml",
+      ].join(" && ");
+
+      const { stdout: diagnostics } = serverId
+        ? await execAsyncRemote(serverId, diagnosticCommands)
+        : await execAsync(diagnosticCommands);
+
+      console.error("Diagnostic Information:", diagnostics);
+      throw new Error("GPU support not detected in swarm after setup");
+    }
+
+    console.log("GPU setup completed successfully:", {
+      availableGPUs: initialStatus.availableGPUs,
+      driverVersion: initialStatus.driverVersion,
+      nodeId: nodeId.trim(),
+    });
+  } catch (error) {
+    console.error("GPU Setup Error:", error);
+    throw error;
+  }
+}
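
After setupGPUSupport completes, the node advertises a discrete "GPU" generic resource and carries a gpu=true label, so Swarm services can reserve GPUs. A rough sketch of what such a reservation could look like through the `docker` client imported above (the service spec, image, and helper name are illustrative assumptions and not part of this commit):

  // Hypothetical smoke test: schedule a one-shot service that reserves one GPU.
  const runGPUSmokeTest = async () =>
    await docker.createService({
      Name: "gpu-smoke-test",
      TaskTemplate: {
        ContainerSpec: {
          Image: "nvidia/cuda:12.2.0-base-ubuntu22.04",
          Command: ["nvidia-smi"],
        },
        Resources: {
          Reservations: {
            // Matches the resource advertised via "node-generic-resources" above.
            GenericResources: [
              { DiscreteResourceSpec: { Kind: "GPU", Value: 1 } },
            ],
          },
        },
        RestartPolicy: { Condition: "none" },
        Placement: { Constraints: ["node.labels.gpu==true"] },
      },
    });
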