mirror of
https://github.com/Dokploy/dokploy
synced 2025-06-26 18:27:59 +00:00
350 lines
9.7 KiB
TypeScript
350 lines
9.7 KiB
TypeScript
import * as fs from "node:fs/promises";
|
|
import { execAsync, sleep } from "../utils/process/execAsync";
|
|
import { execAsyncRemote } from "../utils/process/execAsync";
|
|
|
|
interface GPUInfo {
|
|
driverInstalled: boolean;
|
|
driverVersion?: string;
|
|
gpuModel?: string;
|
|
runtimeInstalled: boolean;
|
|
runtimeConfigured: boolean;
|
|
cudaSupport: boolean;
|
|
cudaVersion?: string;
|
|
memoryInfo?: string;
|
|
availableGPUs: number;
|
|
swarmEnabled: boolean;
|
|
gpuResources: number;
|
|
}
|
|
|
|
export async function checkGPUStatus(serverId?: string): Promise<GPUInfo> {
|
|
try {
|
|
const [driverInfo, runtimeInfo, swarmInfo, gpuInfo, cudaInfo] =
|
|
await Promise.all([
|
|
checkGpuDriver(serverId),
|
|
checkRuntime(serverId),
|
|
checkSwarmResources(serverId),
|
|
checkGpuInfo(serverId),
|
|
checkCudaSupport(serverId),
|
|
]);
|
|
|
|
return {
|
|
...driverInfo,
|
|
...runtimeInfo,
|
|
...swarmInfo,
|
|
...gpuInfo,
|
|
...cudaInfo,
|
|
};
|
|
} catch (error) {
|
|
console.error("Error in checkGPUStatus:", error);
|
|
return {
|
|
driverInstalled: false,
|
|
driverVersion: undefined,
|
|
runtimeInstalled: false,
|
|
runtimeConfigured: false,
|
|
cudaSupport: false,
|
|
cudaVersion: undefined,
|
|
gpuModel: undefined,
|
|
memoryInfo: undefined,
|
|
availableGPUs: 0,
|
|
swarmEnabled: false,
|
|
gpuResources: 0,
|
|
};
|
|
}
|
|
}
|
|
|
|
const checkGpuDriver = async (serverId?: string) => {
|
|
let driverVersion: string | undefined;
|
|
let driverInstalled = false;
|
|
let availableGPUs = 0;
|
|
|
|
try {
|
|
const driverCommand =
|
|
"nvidia-smi --query-gpu=driver_version --format=csv,noheader";
|
|
const { stdout: nvidiaSmi } = serverId
|
|
? await execAsyncRemote(serverId, driverCommand)
|
|
: await execAsync(driverCommand);
|
|
|
|
driverVersion = nvidiaSmi.trim();
|
|
if (driverVersion) {
|
|
driverInstalled = true;
|
|
const countCommand =
|
|
"nvidia-smi --query-gpu=gpu_name --format=csv,noheader | wc -l";
|
|
const { stdout: gpuCount } = serverId
|
|
? await execAsyncRemote(serverId, countCommand)
|
|
: await execAsync(countCommand);
|
|
|
|
availableGPUs = Number.parseInt(gpuCount.trim(), 10);
|
|
}
|
|
} catch (error) {
|
|
console.debug("GPU driver check:", error);
|
|
}
|
|
|
|
return { driverVersion, driverInstalled, availableGPUs };
|
|
};
|
|
|
|
const checkRuntime = async (serverId?: string) => {
|
|
let runtimeInstalled = false;
|
|
let runtimeConfigured = false;
|
|
|
|
try {
|
|
// First check: Is nvidia-container-runtime installed?
|
|
const checkBinaryCommand = "command -v nvidia-container-runtime";
|
|
try {
|
|
const { stdout } = serverId
|
|
? await execAsyncRemote(serverId, checkBinaryCommand)
|
|
: await execAsync(checkBinaryCommand);
|
|
runtimeInstalled = !!stdout.trim();
|
|
} catch (error) {
|
|
console.debug("Runtime binary check:", error);
|
|
}
|
|
|
|
// Second check: Is it configured in Docker?
|
|
try {
|
|
const runtimeCommand = 'docker info --format "{{json .Runtimes}}"';
|
|
const { stdout: runtimeInfo } = serverId
|
|
? await execAsyncRemote(serverId, runtimeCommand)
|
|
: await execAsync(runtimeCommand);
|
|
|
|
const defaultCommand = 'docker info --format "{{.DefaultRuntime}}"';
|
|
const { stdout: defaultRuntime } = serverId
|
|
? await execAsyncRemote(serverId, defaultCommand)
|
|
: await execAsync(defaultCommand);
|
|
|
|
const runtimes = JSON.parse(runtimeInfo);
|
|
const hasNvidiaRuntime = "nvidia" in runtimes;
|
|
const isDefaultRuntime = defaultRuntime.trim() === "nvidia";
|
|
|
|
// Only set runtimeConfigured if both conditions are met
|
|
runtimeConfigured = hasNvidiaRuntime && isDefaultRuntime;
|
|
} catch (error) {
|
|
console.debug("Runtime configuration check:", error);
|
|
}
|
|
} catch (error) {
|
|
console.debug("Runtime check:", error);
|
|
}
|
|
|
|
return { runtimeInstalled, runtimeConfigured };
|
|
};
|
|
|
|
const checkSwarmResources = async (serverId?: string) => {
|
|
let swarmEnabled = false;
|
|
let gpuResources = 0;
|
|
|
|
try {
|
|
const nodeCommand =
|
|
"docker node inspect self --format '{{json .Description.Resources.GenericResources}}'";
|
|
const { stdout: resources } = serverId
|
|
? await execAsyncRemote(serverId, nodeCommand)
|
|
: await execAsync(nodeCommand);
|
|
|
|
if (resources && resources !== "null") {
|
|
const genericResources = JSON.parse(resources);
|
|
for (const resource of genericResources) {
|
|
if (
|
|
resource.DiscreteResourceSpec &&
|
|
(resource.DiscreteResourceSpec.Kind === "GPU" ||
|
|
resource.DiscreteResourceSpec.Kind === "gpu")
|
|
) {
|
|
gpuResources = resource.DiscreteResourceSpec.Value;
|
|
swarmEnabled = true;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
} catch (error) {
|
|
console.debug("Swarm resource check:", error);
|
|
}
|
|
|
|
return { swarmEnabled, gpuResources };
|
|
};
|
|
|
|
const checkGpuInfo = async (serverId?: string) => {
|
|
let gpuModel: string | undefined;
|
|
let memoryInfo: string | undefined;
|
|
|
|
try {
|
|
const gpuInfoCommand =
|
|
"nvidia-smi --query-gpu=gpu_name,memory.total --format=csv,noheader";
|
|
const { stdout: gpuInfo } = serverId
|
|
? await execAsyncRemote(serverId, gpuInfoCommand)
|
|
: await execAsync(gpuInfoCommand);
|
|
|
|
[gpuModel, memoryInfo] = gpuInfo.split(",").map((s) => s.trim());
|
|
} catch (error) {
|
|
console.debug("GPU info check:", error);
|
|
}
|
|
|
|
return { gpuModel, memoryInfo };
|
|
};
|
|
|
|
const checkCudaSupport = async (serverId?: string) => {
|
|
let cudaVersion: string | undefined;
|
|
let cudaSupport = false;
|
|
|
|
try {
|
|
const cudaCommand = 'nvidia-smi -q | grep "CUDA Version"';
|
|
const { stdout: cudaInfo } = serverId
|
|
? await execAsyncRemote(serverId, cudaCommand)
|
|
: await execAsync(cudaCommand);
|
|
|
|
const cudaMatch = cudaInfo.match(/CUDA Version\s*:\s*([\d\.]+)/);
|
|
cudaVersion = cudaMatch ? cudaMatch[1] : undefined;
|
|
cudaSupport = !!cudaVersion;
|
|
} catch (error) {
|
|
console.debug("CUDA support check:", error);
|
|
}
|
|
|
|
return { cudaVersion, cudaSupport };
|
|
};
|
|
|
|
export async function setupGPUSupport(serverId?: string): Promise<void> {
|
|
try {
|
|
// 1. Initial status check and validation
|
|
const initialStatus = await checkGPUStatus(serverId);
|
|
const shouldContinue = await validatePrerequisites(initialStatus);
|
|
if (!shouldContinue) return;
|
|
|
|
// 2. Get node ID
|
|
const nodeId = await getNodeId(serverId);
|
|
|
|
// 3. Create daemon configuration
|
|
const daemonConfig = createDaemonConfig(initialStatus.availableGPUs);
|
|
|
|
// 4. Setup server based on environment
|
|
if (serverId) {
|
|
await setupRemoteServer(serverId, daemonConfig);
|
|
} else {
|
|
await setupLocalServer(daemonConfig);
|
|
}
|
|
|
|
// 5. Wait for Docker restart
|
|
await sleep(10000);
|
|
|
|
// 6. Add GPU label
|
|
await addGpuLabel(nodeId, serverId);
|
|
|
|
// 7. Final verification
|
|
await sleep(5000);
|
|
await verifySetup(nodeId, serverId);
|
|
} catch (error) {
|
|
if (
|
|
error instanceof Error &&
|
|
error.message.includes("password is required")
|
|
) {
|
|
throw new Error(
|
|
"Sudo access required. Please run with appropriate permissions.",
|
|
);
|
|
}
|
|
throw error;
|
|
}
|
|
}
|
|
|
|
const validatePrerequisites = async (initialStatus: GPUInfo) => {
|
|
if (!initialStatus.driverInstalled) {
|
|
throw new Error(
|
|
"NVIDIA drivers not installed. Please install appropriate NVIDIA drivers first.",
|
|
);
|
|
}
|
|
|
|
if (!initialStatus.runtimeInstalled) {
|
|
throw new Error(
|
|
"NVIDIA Container Runtime not installed. Please install nvidia-container-runtime first.",
|
|
);
|
|
}
|
|
|
|
if (initialStatus.swarmEnabled && initialStatus.runtimeConfigured) {
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
};
|
|
|
|
const getNodeId = async (serverId?: string) => {
|
|
const nodeIdCommand = 'docker info --format "{{.Swarm.NodeID}}"';
|
|
const { stdout: nodeId } = serverId
|
|
? await execAsyncRemote(serverId, nodeIdCommand)
|
|
: await execAsync(nodeIdCommand);
|
|
|
|
const trimmedNodeId = nodeId.trim();
|
|
if (!trimmedNodeId) {
|
|
throw new Error("Setup Server before enabling GPU support");
|
|
}
|
|
|
|
return trimmedNodeId;
|
|
};
|
|
|
|
const createDaemonConfig = (availableGPUs: number) => ({
|
|
runtimes: {
|
|
nvidia: {
|
|
path: "nvidia-container-runtime",
|
|
runtimeArgs: [],
|
|
},
|
|
},
|
|
"default-runtime": "nvidia",
|
|
"node-generic-resources": [`GPU=${availableGPUs}`],
|
|
});
|
|
|
|
const setupRemoteServer = async (serverId: string, daemonConfig: any) => {
|
|
const setupCommands = [
|
|
"sudo -n true",
|
|
`echo '${JSON.stringify(daemonConfig, null, 2)}' | sudo tee /etc/docker/daemon.json`,
|
|
"sudo mkdir -p /etc/nvidia-container-runtime",
|
|
'sudo sed -i "/swarm-resource/d" /etc/nvidia-container-runtime/config.toml',
|
|
'echo "swarm-resource = \\"DOCKER_RESOURCE_GPU\\"" | sudo tee -a /etc/nvidia-container-runtime/config.toml',
|
|
"sudo systemctl daemon-reload",
|
|
"sudo systemctl restart docker",
|
|
].join(" && ");
|
|
|
|
await execAsyncRemote(serverId, setupCommands);
|
|
};
|
|
|
|
const setupLocalServer = async (daemonConfig: any) => {
|
|
const configFile = `/tmp/docker-daemon-${Date.now()}.json`;
|
|
await fs.writeFile(configFile, JSON.stringify(daemonConfig, null, 2));
|
|
|
|
const setupCommands = [
|
|
`pkexec sh -c '
|
|
cp ${configFile} /etc/docker/daemon.json &&
|
|
mkdir -p /etc/nvidia-container-runtime &&
|
|
sed -i "/swarm-resource/d" /etc/nvidia-container-runtime/config.toml &&
|
|
echo "swarm-resource = \\"DOCKER_RESOURCE_GPU\\"" >> /etc/nvidia-container-runtime/config.toml &&
|
|
systemctl daemon-reload &&
|
|
systemctl restart docker
|
|
'`,
|
|
`rm ${configFile}`,
|
|
].join(" && ");
|
|
|
|
await execAsync(setupCommands);
|
|
};
|
|
|
|
const addGpuLabel = async (nodeId: string, serverId?: string) => {
|
|
const labelCommand = `docker node update --label-add gpu=true ${nodeId}`;
|
|
if (serverId) {
|
|
await execAsyncRemote(serverId, labelCommand);
|
|
} else {
|
|
await execAsync(labelCommand);
|
|
}
|
|
};
|
|
|
|
const verifySetup = async (nodeId: string, serverId?: string) => {
|
|
const finalStatus = await checkGPUStatus(serverId);
|
|
|
|
if (!finalStatus.swarmEnabled) {
|
|
const diagnosticCommands = [
|
|
`docker node inspect ${nodeId}`,
|
|
'nvidia-smi -a | grep "GPU UUID"',
|
|
"cat /etc/docker/daemon.json",
|
|
"cat /etc/nvidia-container-runtime/config.toml",
|
|
].join(" && ");
|
|
|
|
const { stdout: diagnostics } = serverId
|
|
? await execAsyncRemote(serverId, diagnosticCommands)
|
|
: await execAsync(diagnosticCommands);
|
|
|
|
console.error("Diagnostic Information:", diagnostics);
|
|
throw new Error("GPU support not detected in swarm after setup");
|
|
}
|
|
|
|
return finalStatus;
|
|
};
|