feat: add GPU support for remote servers, including setup and status checks, API endpoints, and utility functions

This commit is contained in:
vishalkadam47 2024-11-02 15:15:58 +05:30
parent 3e467959c9
commit 1b6d8d803b
8 changed files with 361 additions and 46 deletions

View File

@ -32,6 +32,7 @@ import Link from "next/link";
import { useState } from "react";
import { toast } from "sonner";
import { ShowDeployment } from "../../application/deployments/show-deployment";
import { GPUSupport } from "./gpu-support";
interface Props {
serverId: string;
@ -89,9 +90,10 @@ export const SetupServer = ({ serverId }: Props) => {
) : (
<div id="hook-form-add-gitlab" className="grid w-full gap-1">
<Tabs defaultValue="ssh-keys">
<TabsList className="grid grid-cols-2 w-[400px]">
<TabsList className="grid grid-cols-3 w-[400px]">
<TabsTrigger value="ssh-keys">SSH Keys</TabsTrigger>
<TabsTrigger value="deployments">Deployments</TabsTrigger>
<TabsTrigger value="gpu-setup">GPU Setup</TabsTrigger>
</TabsList>
<TabsContent
value="ssh-keys"
@ -291,6 +293,14 @@ export const SetupServer = ({ serverId }: Props) => {
</div>
</CardContent>
</TabsContent>
<TabsContent
value="gpu-setup"
className="outline-none ring-0 focus-visible:ring-0 focus-visible:ring-offset-0"
>
<div className="flex flex-col gap-2 text-sm text-muted-foreground pt-3">
<GPUSupport serverId={serverId} />
</div>
</TabsContent>
</Tabs>
</div>
)}

View File

@ -52,6 +52,10 @@ import {
writeMainConfig,
writeTraefikConfigInPath,
} from "@dokploy/server";
import {
checkGPUStatus,
setupGPUSupport,
} from "@dokploy/server/src/utils/gpu-setup";
import { generateOpenApiDocument } from "@dokploy/trpc-openapi";
import { TRPCError } from "@trpc/server";
import { sql } from "drizzle-orm";
@ -650,6 +654,62 @@ export const settingsRouter = createTRPCRouter({
}
return { status: "not_cloud" };
}),
/**
 * Enables GPU support on the given server.
 *
 * On cloud deployments (`IS_CLOUD`) this is a no-op that reports success.
 * Input validation errors are raised as BAD_REQUEST; failures coming out of
 * `setupGPUSupport` are surfaced as INTERNAL_SERVER_ERROR with the original
 * error attached as `cause`.
 */
setupGPU: adminProcedure
	.input(
		z.object({
			serverId: z.string(),
		}),
	)
	.mutation(async ({ input }) => {
		if (IS_CLOUD) {
			return { success: true };
		}

		// Validate outside the try block so the BAD_REQUEST code is not
		// re-wrapped as INTERNAL_SERVER_ERROR by the catch below.
		if (!input.serverId) {
			throw new TRPCError({
				code: "BAD_REQUEST",
				message: "Server ID is required",
			});
		}

		try {
			await setupGPUSupport(input.serverId);
			return { success: true };
		} catch (error) {
			// Preserve the code of errors that are already tRPC errors.
			if (error instanceof TRPCError) {
				throw error;
			}
			throw new TRPCError({
				code: "INTERNAL_SERVER_ERROR",
				message:
					error instanceof Error
						? error.message
						: "Failed to enable GPU support",
				cause: error,
			});
		}
	}),
/**
 * Reports the GPU readiness of a server (or of the local host when no
 * `serverId` is supplied).
 *
 * On cloud deployments (`IS_CLOUD`) no probing happens and an "nothing
 * available" status is returned. The stub uses the same field types as the
 * status produced by the `checkGPUStatus` utility (`cudaSupport` is a
 * boolean there), so clients see one consistent shape.
 */
checkGPUStatus: adminProcedure
	.input(
		z.object({
			serverId: z.string().optional(),
		}),
	)
	.query(async ({ input }) => {
		if (IS_CLOUD) {
			return {
				driverInstalled: false,
				driverVersion: undefined,
				gpuModel: undefined,
				runtimeInstalled: false,
				runtimeConfigured: false,
				cudaSupport: false,
				cudaVersion: undefined,
				memoryInfo: undefined,
				availableGPUs: 0,
				swarmEnabled: false,
				gpuResources: 0,
			};
		}
		return await checkGPUStatus(input.serverId);
	}),
});
// {
// "Parallelism": 1,

View File

@ -21,8 +21,6 @@ import {
import type { Session, User } from "lucia";
import superjson from "superjson";
import { ZodError } from "zod";
import { setupGPUSupport } from '@dokploy/server/src/utils/gpu-setup';
/**
* 1. CONTEXT
*
@ -209,10 +207,3 @@ export const adminProcedure = t.procedure.use(({ ctx, next }) => {
},
});
});
/**
 * Router with a single `setupGPU` mutation that calls `setupGPUSupport()`
 * with no arguments and returns `{ success: true }`.
 *
 * NOTE(review): the mutation is built on the base `t.procedure`, not on
 * `adminProcedure` defined just above — presumably it carries no admin check;
 * confirm. The surrounding hunk header suggests these lines are the removed
 * side of the diff — confirm they are gone from the final file.
 */
const appRouter = t.router({
setupGPU: t.procedure.mutation(async () => {
await setupGPUSupport();
return { success: true };
}),
});

View File

@ -1,34 +1,34 @@
import {
generateHash,
generateRandomDomain,
type Template,
type Schema,
type DomainSchema,
type DomainSchema,
type Schema,
type Template,
generateHash,
generateRandomDomain,
} from "../utils";
/**
 * Builds the Blender template: one generated domain routed to the `blender`
 * service on port 3000, plus the environment variables for the container
 * (user/group ids, timezone, subfolder and NVIDIA device visibility).
 *
 * Each statement is declared exactly once; the listing previously repeated
 * every declaration, which redeclared the same `const` bindings.
 *
 * @param schema - Template schema; its `projectName` is hashed and the schema
 *   itself is used to generate the random domain.
 * @returns The environment variables and domain definitions for the template.
 */
export function generate(schema: Schema): Template {
	// NOTE(review): mainServiceHash is computed but not used below — confirm
	// whether it can be dropped.
	const mainServiceHash = generateHash(schema.projectName);
	const mainDomain = generateRandomDomain(schema);

	const domains: DomainSchema[] = [
		{
			host: mainDomain,
			port: 3000,
			serviceName: "blender",
		},
	];

	const envs = [
		`PUID=1000`,
		`PGID=1000`,
		`TZ=Etc/UTC`,
		`SUBFOLDER=/`,
		`NVIDIA_VISIBLE_DEVICES=all`,
		`NVIDIA_DRIVER_CAPABILITIES=all`,
	];

	return {
		envs,
		domains,
	};
}

View File

@ -516,7 +516,8 @@ export const templates: TemplateData[] = [
id: "blender",
name: "Blender",
version: "latest",
description: "Blender is a free and open-source 3D creation suite. It supports the entire 3D pipeline—modeling, rigging, animation, simulation, rendering, compositing and motion tracking, video editing and 2D animation pipeline.",
description:
"Blender is a free and open-source 3D creation suite. It supports the entire 3D pipeline—modeling, rigging, animation, simulation, rendering, compositing and motion tracking, video editing and 2D animation pipeline.",
logo: "blender.svg",
links: {
github: "https://github.com/linuxserver/docker-blender",

View File

@ -36,4 +36,4 @@ export const paths = (isServer = false) => {
MONITORING_PATH: `${BASE_PATH}/monitoring`,
REGISTRY_PATH: `${BASE_PATH}/registry`,
};
};
};

View File

@ -118,3 +118,4 @@ export * from "./monitoring/utilts";
export * from "./db/validations/domain";
export * from "./db/validations/index";
export * from "./utils/gpu-setup";

View File

@ -1,9 +1,261 @@
import { docker } from '../constants';
import { docker } from "../constants";
import { execAsync } from "../utils/process/execAsync";
import { execAsyncRemote } from "../utils/process/execAsync";
import { getRemoteDocker } from "./servers/remote-docker";
/**
 * Calls `docker.swarmUpdate` with task defaults that contain a single
 * discrete generic resource of kind `gpu` with value 1.
 *
 * NOTE(review): a second `setupGPUSupport(serverId?)` is declared further
 * down in this listing; this zero-argument variant looks like the pre-change
 * version from the diff — confirm only one of them remains in the file.
 */
export async function setupGPUSupport() {
await docker.swarmUpdate({
TaskDefaults: {
GenericResources: [{ DiscreteResourceSpec: { Kind: 'gpu', Value: 1 } }]
}
});
}
/** Snapshot of a host's GPU readiness as assembled by `checkGPUStatus`. */
interface GPUInfo {
/** True when the `nvidia-smi` driver-version query produced output. */
driverInstalled: boolean;
/** Driver version text reported by `nvidia-smi`, when available. */
driverVersion?: string;
/** GPU name reported by `nvidia-smi`, when available. */
gpuModel?: string;
/** True when `docker info` lists an `nvidia` runtime. */
runtimeInstalled: boolean;
/** True when Docker's default runtime is `nvidia`. */
runtimeConfigured: boolean;
/** True when a CUDA version could be parsed from `nvidia-smi -q`. */
cudaSupport: boolean;
/** CUDA version parsed from `nvidia-smi -q`, when available. */
cudaVersion?: string;
/** Total GPU memory text reported by `nvidia-smi`, when available. */
memoryInfo?: string;
/** Number of GPUs counted from `nvidia-smi` output. */
availableGPUs: number;
/** True when the swarm node advertises a discrete GPU generic resource. */
swarmEnabled: boolean;
/** Value of the node's discrete GPU generic resource (0 when absent). */
gpuResources: number;
}
/**
 * A generic resource expressed as a kind plus a numeric amount.
 * NOTE(review): presumably mirrors the `DiscreteResourceSpec` objects in
 * `docker node inspect` output — confirm against the Docker API.
 */
interface DiscreteResourceSpec {
/** Resource kind, e.g. the `GPU` / `gpu` kinds matched in `checkGPUStatus`. */
Kind: string;
/** Amount of the resource. */
Value: number;
}
/**
 * One entry of a generic-resources list: either a named resource (kind plus
 * string value) or a discrete resource (kind plus numeric amount).
 * NOTE(review): presumably describes the items of
 * `.Description.Resources.GenericResources` from `docker node inspect` —
 * confirm against the Docker API.
 */
interface NamedGenericResource {
/** Named form of the resource, when present. */
NamedResourceSpec?: { Kind: string; Value: string };
/** Discrete (counted) form of the resource, when present. */
DiscreteResourceSpec?: DiscreteResourceSpec;
}
/**
 * Probes a host for NVIDIA GPU readiness: driver, Docker runtime, swarm
 * generic resources, GPU model/memory and CUDA version.
 *
 * Every probe is independent and best-effort: a failing probe is logged at
 * debug level and leaves its fields at their "not available" defaults, but
 * never discards what the other probes found. The function does not throw.
 *
 * On multi-GPU hosts `nvidia-smi` prints one line per GPU; the driver
 * version, model and memory are taken from the first line and
 * `availableGPUs` is the number of lines.
 *
 * @param serverId - Remote server to probe; when omitted the commands run
 *   on the local host.
 * @returns The collected GPU status.
 */
export async function checkGPUStatus(serverId?: string): Promise<GPUInfo> {
	// Runs a shell command remotely when a serverId is given, locally otherwise.
	const run = async (command: string): Promise<string> => {
		const { stdout } = serverId
			? await execAsyncRemote(serverId, command)
			: await execAsync(command);
		return stdout;
	};

	const nonEmptyLines = (output: string): string[] =>
		output
			.split("\n")
			.map((line) => line.trim())
			.filter((line) => line.length > 0);

	const status: GPUInfo = {
		driverInstalled: false,
		driverVersion: undefined,
		runtimeInstalled: false,
		runtimeConfigured: false,
		cudaSupport: false,
		cudaVersion: undefined,
		gpuModel: undefined,
		memoryInfo: undefined,
		availableGPUs: 0,
		swarmEnabled: false,
		gpuResources: 0,
	};

	// Check NVIDIA driver. The query prints one line per GPU, so the line
	// count doubles as the GPU count (no NaN from parsing `wc -l` output).
	try {
		const driverLines = nonEmptyLines(
			await run("nvidia-smi --query-gpu=driver_version --format=csv,noheader"),
		);
		if (driverLines.length > 0) {
			status.driverInstalled = true;
			status.driverVersion = driverLines[0];
			status.availableGPUs = driverLines.length;
		}
	} catch (error) {
		console.debug("GPU driver check:", error);
	}

	// Check runtime configuration.
	try {
		const runtimes: unknown = JSON.parse(
			await run('docker info --format "{{json .Runtimes}}"'),
		);
		status.runtimeInstalled =
			typeof runtimes === "object" && runtimes !== null && "nvidia" in runtimes;

		// Check if it's the default runtime.
		const defaultRuntime = await run(
			'docker info --format "{{.DefaultRuntime}}"',
		);
		status.runtimeConfigured = defaultRuntime.trim() === "nvidia";
	} catch (error) {
		console.debug("Runtime check:", error);
	}

	// Check swarm GPU resources advertised by this node.
	try {
		const rawResources = (
			await run(
				"docker node inspect self --format '{{json .Description.Resources.GenericResources}}'",
			)
		).trim();
		// stdout carries a trailing newline and may be the literal `null`.
		const parsed: unknown = rawResources ? JSON.parse(rawResources) : null;
		if (Array.isArray(parsed)) {
			const gpuSpec = (parsed as NamedGenericResource[])
				.map((resource) => resource?.DiscreteResourceSpec)
				.find((spec) => spec?.Kind === "GPU" || spec?.Kind === "gpu");
			if (gpuSpec) {
				status.gpuResources = gpuSpec.Value;
				status.swarmEnabled = true;
			}
		}
	} catch (error) {
		console.debug("Swarm resource check:", error);
	}

	// The remaining probes need nvidia-smi; skip them when the driver probe
	// already showed it is unavailable.
	if (status.driverInstalled) {
		// GPU model and memory (first GPU only).
		try {
			const [firstGpu] = nonEmptyLines(
				await run(
					"nvidia-smi --query-gpu=gpu_name,memory.total --format=csv,noheader",
				),
			);
			if (firstGpu) {
				const [gpuModel, memoryTotal] = firstGpu
					.split(",")
					.map((part) => part.trim());
				status.gpuModel = gpuModel || undefined;
				status.memoryInfo = memoryTotal || undefined;
			}
		} catch (error) {
			console.debug("GPU model check:", error);
		}

		// CUDA support. `grep` exits non-zero when nothing matches, which makes
		// the command reject; that simply means "no CUDA version found".
		try {
			const cudaInfo = await run('nvidia-smi -q | grep "CUDA Version"');
			const cudaMatch = cudaInfo.match(/CUDA Version\s*:\s*([\d.]+)/);
			status.cudaVersion = cudaMatch ? cudaMatch[1] : undefined;
			status.cudaSupport = !!status.cudaVersion;
		} catch (error) {
			console.debug("CUDA check:", error);
		}
	}

	return status;
}
/**
 * Configures Docker on a host so that swarm services can request NVIDIA GPUs.
 *
 * Steps:
 *  1. Return early when driver, default runtime and swarm resource are
 *     already in place.
 *  2. Require the NVIDIA driver and container runtime, at least one GPU, and
 *     a swarm node id.
 *  3. Merge the NVIDIA runtime settings into `/etc/docker/daemon.json`
 *     (existing settings are kept and the previous file is backed up to
 *     `daemon.json.bak`), enable `swarm-resource` in the NVIDIA runtime
 *     config exactly once, and restart Docker.
 *  4. Label the node with `gpu=true` and verify the swarm now advertises
 *     the GPU resource.
 *
 * @param serverId - Remote server to configure; when omitted the commands
 *   run on the local host.
 * @throws Error when prerequisites are missing, a command fails, or the GPU
 *   resource is still not visible in the swarm after setup.
 */
export async function setupGPUSupport(serverId?: string): Promise<void> {
	const DAEMON_CONFIG_PATH = "/etc/docker/daemon.json";
	const NVIDIA_CONFIG_DIR = "/etc/nvidia-container-runtime";
	const NVIDIA_CONFIG_PATH = `${NVIDIA_CONFIG_DIR}/config.toml`;
	const SWARM_RESOURCE_LINE = 'swarm-resource = "DOCKER_RESOURCE_GPU"';

	// Runs a shell command remotely when a serverId is given, locally otherwise.
	const run = async (command: string): Promise<string> => {
		const { stdout } = serverId
			? await execAsyncRemote(serverId, command)
			: await execAsync(command);
		return stdout;
	};

	const sleep = (ms: number): Promise<void> =>
		new Promise((resolve) => setTimeout(resolve, ms));

	const isRecord = (value: unknown): value is Record<string, unknown> =>
		typeof value === "object" && value !== null && !Array.isArray(value);

	// Wraps text in single quotes for the shell, escaping embedded quotes.
	const shellQuote = (text: string): string =>
		`'${text.replace(/'/g, `'\\''`)}'`;

	try {
		// 1. Check current GPU status first
		const initialStatus = await checkGPUStatus(serverId);

		// If GPU is already configured, there is nothing to do
		if (
			initialStatus.swarmEnabled &&
			initialStatus.runtimeConfigured &&
			initialStatus.driverInstalled
		) {
			console.log("GPU already configured, skipping setup");
			return;
		}

		// 2. Verify GPU prerequisites
		if (!initialStatus.driverInstalled || !initialStatus.runtimeInstalled) {
			throw new Error(
				"NVIDIA drivers or runtime not installed. Please install them first.",
			);
		}

		// Also rejects NaN, which would otherwise be written as "GPU=NaN".
		if (!(initialStatus.availableGPUs > 0)) {
			throw new Error("No NVIDIA GPUs detected on this server.");
		}

		// Get the node ID
		const nodeId = (
			await run('docker info --format "{{.Swarm.NodeID}}"')
		).trim();
		if (!nodeId) {
			throw new Error("Setup Server before enabling GPU support");
		}

		// 3. Configure NVIDIA runtime in daemon.json, keeping existing settings
		let existingConfig: Record<string, unknown> = {};
		try {
			const rawConfig = (
				await run(`cat ${DAEMON_CONFIG_PATH} 2>/dev/null || true`)
			).trim();
			if (rawConfig) {
				const parsedConfig: unknown = JSON.parse(rawConfig);
				if (isRecord(parsedConfig)) {
					existingConfig = parsedConfig;
				}
			}
		} catch (error) {
			// Unreadable or invalid file: fall back to writing a fresh config.
			// The previous file is still preserved as daemon.json.bak below.
			console.warn(
				`Could not read existing ${DAEMON_CONFIG_PATH}, writing a new one:`,
				error,
			);
		}

		const existingRuntimes = isRecord(existingConfig.runtimes)
			? existingConfig.runtimes
			: {};
		const existingResources = existingConfig["node-generic-resources"];
		// Keep non-GPU generic resources; the GPU entry is replaced below.
		const otherResources = Array.isArray(existingResources)
			? existingResources.filter(
					(entry): entry is string =>
						typeof entry === "string" && !/^gpu=/i.test(entry),
				)
			: [];

		const daemonConfig = {
			...existingConfig,
			runtimes: {
				...existingRuntimes,
				nvidia: {
					path: "nvidia-container-runtime",
					runtimeArgs: [],
				},
			},
			"default-runtime": "nvidia",
			"node-generic-resources": [
				...otherResources,
				`GPU=${initialStatus.availableGPUs}`,
			],
		};

		const setupCommands = [
			// Fail fast when passwordless sudo is not available.
			"sudo -n true",
			// Back up the current daemon.json before replacing it.
			`(test ! -f ${DAEMON_CONFIG_PATH} || sudo cp -p ${DAEMON_CONFIG_PATH} ${DAEMON_CONFIG_PATH}.bak)`,
			// printf avoids echo's shell-dependent backslash handling.
			`printf '%s\\n' ${shellQuote(JSON.stringify(daemonConfig, null, 2))} | sudo tee ${DAEMON_CONFIG_PATH} > /dev/null`,
			`sudo mkdir -p ${NVIDIA_CONFIG_DIR}`,
			`sudo touch ${NVIDIA_CONFIG_PATH}`,
			// Uncomment a commented-out swarm-resource entry if there is one...
			`sudo sed -i 's/^#[[:space:]]*swarm-resource[[:space:]]*=.*/${SWARM_RESOURCE_LINE}/' ${NVIDIA_CONFIG_PATH}`,
			// ...and append the entry only when none exists, so that repeated
			// runs do not produce duplicate keys.
			`(sudo grep -q '^swarm-resource' ${NVIDIA_CONFIG_PATH} || echo '${SWARM_RESOURCE_LINE}' | sudo tee -a ${NVIDIA_CONFIG_PATH} > /dev/null)`,
			"sudo systemctl daemon-reload",
			"sudo systemctl restart docker",
		].join(" && ");

		await run(setupCommands);

		// 4. Give Docker time to come back up after the restart
		await sleep(10000);

		// 5. Add GPU label to the node
		await run(`docker node update --label-add gpu=true ${nodeId}`);

		// 6. Final verification
		await sleep(5000);
		const finalStatus = await checkGPUStatus(serverId);

		if (!finalStatus.swarmEnabled) {
			// Diagnostics are best-effort: commands are chained with ";" and
			// failures are caught so they cannot mask the real error below.
			try {
				const diagnostics = await run(
					[
						`docker node inspect ${nodeId}`,
						'nvidia-smi -a | grep "GPU UUID"',
						`cat ${DAEMON_CONFIG_PATH}`,
						`cat ${NVIDIA_CONFIG_PATH}`,
					].join("; "),
				);
				console.error("Diagnostic Information:", diagnostics);
			} catch (diagnosticError) {
				console.error("Failed to collect diagnostics:", diagnosticError);
			}

			throw new Error("GPU support not detected in swarm after setup");
		}

		console.log("GPU setup completed successfully:", {
			availableGPUs: initialStatus.availableGPUs,
			driverVersion: initialStatus.driverVersion,
			nodeId,
		});
	} catch (error) {
		console.error("GPU Setup Error:", error);
		throw error;
	}
}