Mirror of https://github.com/Dokploy/dokploy (synced 2025-06-26 18:27:59 +00:00)

refactor: gpu setup and status checks, extract functions, and improve error handling

This commit is contained in:
parent c765d7d9eb
commit 66c4d8f118
@@ -10,7 +10,7 @@ import {
} from "@/components/ui/card";
import { api } from "@/utils/api";
import { TRPCClientError } from "@trpc/client";
import { CheckCircle2, Cpu, Loader2, XCircle } from "lucide-react";
import { CheckCircle2, Cpu, Loader2, RefreshCw, XCircle } from "lucide-react";
import { useState } from "react";
import { toast } from "sonner";

@@ -20,16 +20,19 @@ interface GPUSupportProps {

export function GPUSupport({ serverId }: GPUSupportProps) {
  const [isLoading, setIsLoading] = useState(false);
  const [isRefreshing, setIsRefreshing] = useState(false);
  const utils = api.useContext();

  const { data: gpuStatus, isLoading: isChecking } =
    api.settings.checkGPUStatus.useQuery(
      { serverId },
      {
        enabled: serverId !== undefined,
        refetchInterval: 5000,
      },
    );
  const {
    data: gpuStatus,
    isLoading: isChecking,
    refetch,
  } = api.settings.checkGPUStatus.useQuery(
    { serverId },
    {
      enabled: serverId !== undefined,
    },
  );

  const setupGPU = api.settings.setupGPU.useMutation({
    onMutate: () => {
@@ -41,31 +44,20 @@ export function GPUSupport({ serverId }: GPUSupportProps) {
      await utils.settings.checkGPUStatus.invalidate({ serverId });
    },
    onError: (error) => {
      if (error instanceof TRPCClientError) {
        const errorMessage = error.message;
        if (
          errorMessage.includes(
            "Permission denied. Please ensure proper sudo access.",
          ) ||
          errorMessage.includes("sudo access required")
        ) {
          toast.error(
            "Administrator privileges required. Please enter your password when prompted.",
          );
        } else if (errorMessage.includes("Failed to configure GPU")) {
          toast.error(
            "GPU configuration failed. Please check system requirements.",
          );
        } else {
          toast.error(errorMessage);
        }
      } else {
        toast.error("Failed to enable GPU support. Please check server logs.");
      }
      toast.error(
        error.message ||
          "Failed to enable GPU support. Please check server logs.",
      );
      setIsLoading(false);
    },
  });

  const handleRefresh = async () => {
    setIsRefreshing(true);
    await refetch();
    setIsRefreshing(false);
  };

  const handleEnableGPU = async () => {
    if (serverId === undefined) {
      toast.error("No server selected");
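Note: the new handleRefresh clears isRefreshing only after refetch() resolves. A minimal sketch of a more defensive variant (a hypothetical helper, not part of this commit) uses try/finally so the flag is reset even if the call rejects:

// Hypothetical helper: wraps any refetch-style callback so a loading flag
// is always cleared, even when the awaited action throws.
export async function withSpinner(
  setLoading: (value: boolean) => void,
  action: () => Promise<unknown>,
): Promise<void> {
  setLoading(true);
  try {
    await action();
  } finally {
    setLoading(false);
  }
}

// Usage sketch inside the component: onClick={() => withSpinner(setIsRefreshing, refetch)}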
@@ -94,22 +86,33 @@ export function GPUSupport({ serverId }: GPUSupportProps) {
            Configure and monitor GPU support
          </CardDescription>
        </div>
        <DialogAction
          title="Enable GPU Support?"
          description="This will enable GPU support for Docker Swarm on this server. Make sure you have the required hardware and drivers installed."
          onClick={handleEnableGPU}
        >
          <Button
            isLoading={isLoading}
            disabled={isLoading || serverId === undefined || isChecking}
        <div className="flex items-center gap-2">
          <DialogAction
            title="Enable GPU Support?"
            description="This will enable GPU support for Docker Swarm on this server. Make sure you have the required hardware and drivers installed."
            onClick={handleEnableGPU}
          >
            {isLoading
              ? "Enabling GPU..."
              : gpuStatus?.swarmEnabled
                ? "Reconfigure GPU"
                : "Enable GPU"}
            <Button
              isLoading={isLoading}
              disabled={isLoading || serverId === undefined || isChecking}
            >
              {isLoading
                ? "Enabling GPU..."
                : gpuStatus?.swarmEnabled
                  ? "Reconfigure GPU"
                  : "Enable GPU"}
            </Button>
          </DialogAction>
          <Button
            size="icon"
            onClick={handleRefresh}
            disabled={isChecking || isRefreshing}
          >
            <RefreshCw
              className={`h-5 w-5 ${isChecking || isRefreshing ? "animate-spin" : ""}`}
            />
          </Button>
        </DialogAction>
        </div>
      </div>
    </CardHeader>

@@ -117,9 +120,17 @@ export function GPUSupport({ serverId }: GPUSupportProps) {
        <AlertBlock type="info">
          <div className="font-medium mb-2">System Requirements:</div>
          <ul className="list-disc list-inside text-sm space-y-1">
            <li>NVIDIA drivers must be installed on the host system</li>
            <li>NVIDIA Container Runtime is required for GPU support</li>
            <li>Compatible GPU hardware must be present</li>
            <li>NVIDIA GPU hardware must be physically installed</li>
            <li>
              NVIDIA drivers must be installed and running (check with
              nvidia-smi)
            </li>
            <li>
              NVIDIA Container Runtime must be installed
              (nvidia-container-runtime)
            </li>
            <li>User must have sudo/administrative privileges</li>
            <li>System must support CUDA for GPU acceleration</li>
          </ul>
        </AlertBlock>

@@ -665,8 +665,8 @@ export const settingsRouter = createTRPCRouter({
      }),
    )
    .mutation(async ({ input }) => {
      if (IS_CLOUD) {
        throw new Error("GPU setup is not available in cloud mode");
      if (IS_CLOUD && !input.serverId) {
        throw new Error("Select a server to enable the GPU Setup");
      }

      try {
@@ -684,7 +684,7 @@ export const settingsRouter = createTRPCRouter({
      }),
    )
    .query(async ({ input }) => {
      if (IS_CLOUD) {
      if (IS_CLOUD && !input.serverId) {
        return {
          driverInstalled: false,
          driverVersion: undefined,
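Both procedures now share the same guard: in cloud mode a GPU operation is rejected only when no serverId accompanies the request, instead of being rejected outright. A self-contained sketch of that check, where IS_CLOUD and the input shape are stand-ins for the module's real values:

// Standalone sketch of the guard used by the setupGPU and checkGPUStatus procedures.
const IS_CLOUD = process.env.IS_CLOUD === "true";

interface GpuProcedureInput {
  serverId?: string;
}

function assertGpuAllowed(input: GpuProcedureInput): void {
  // In cloud mode the operation must target a specific remote server,
  // so it is only rejected when no serverId is supplied.
  if (IS_CLOUD && !input.serverId) {
    throw new Error("Select a server to enable the GPU Setup");
  }
}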
@@ -18,117 +18,21 @@ interface GPUInfo {

export async function checkGPUStatus(serverId?: string): Promise<GPUInfo> {
  try {
    // Check NVIDIA Driver
    let driverInstalled = false;
    let driverVersion: string | undefined;
    let availableGPUs = 0;

    try {
      const driverCommand =
        "nvidia-smi --query-gpu=driver_version --format=csv,noheader";
      const { stdout: nvidiaSmi } = serverId
        ? await execAsyncRemote(serverId, driverCommand)
        : await execAsync(driverCommand);

      driverVersion = nvidiaSmi.trim();
      if (driverVersion) {
        driverInstalled = true;
        const countCommand =
          "nvidia-smi --query-gpu=gpu_name --format=csv,noheader | wc -l";
        const { stdout: gpuCount } = serverId
          ? await execAsyncRemote(serverId, countCommand)
          : await execAsync(countCommand);

        availableGPUs = Number.parseInt(gpuCount.trim(), 10);
      }
    } catch (error) {
      console.debug("GPU driver check:", error);
    }

    // Check Runtime Configuration
    let runtimeInstalled = false;
    let runtimeConfigured = false;
    try {
      const runtimeCommand = 'docker info --format "{{json .Runtimes}}"';
      const { stdout: runtimeInfo } = serverId
        ? await execAsyncRemote(serverId, runtimeCommand)
        : await execAsync(runtimeCommand);

      const runtimes = JSON.parse(runtimeInfo);
      runtimeInstalled = "nvidia" in runtimes;

      // Check if it's the default runtime
      const defaultCommand = 'docker info --format "{{.DefaultRuntime}}"';
      const { stdout: defaultRuntime } = serverId
        ? await execAsyncRemote(serverId, defaultCommand)
        : await execAsync(defaultCommand);

      runtimeConfigured = defaultRuntime.trim() === "nvidia";
    } catch (error) {
      console.debug("Runtime check:", error);
    }

    // Check Swarm GPU Resources
    let swarmEnabled = false;
    let gpuResources = 0;

    try {
      // Check node resources directly from inspect
      const nodeCommand =
        "docker node inspect self --format '{{json .Description.Resources.GenericResources}}'";
      const { stdout: resources } = serverId
        ? await execAsyncRemote(serverId, nodeCommand)
        : await execAsync(nodeCommand);

      if (resources && resources !== "null") {
        const genericResources = JSON.parse(resources);
        for (const resource of genericResources) {
          if (
            resource.DiscreteResourceSpec &&
            (resource.DiscreteResourceSpec.Kind === "GPU" ||
              resource.DiscreteResourceSpec.Kind === "gpu")
          ) {
            gpuResources = resource.DiscreteResourceSpec.Value;
            swarmEnabled = true;
            break;
          }
        }
      }
    } catch (error) {
      console.debug("Swarm resource check:", error);
    }

    // Get GPU Model and Memory Info
    const gpuInfoCommand =
      "nvidia-smi --query-gpu=gpu_name,memory.total --format=csv,noheader";
    const { stdout: gpuInfo } = serverId
      ? await execAsyncRemote(serverId, gpuInfoCommand)
      : await execAsync(gpuInfoCommand);

    const [gpuModel, memoryTotal] = gpuInfo.split(",").map((s) => s.trim());

    // Check CUDA Support
    const cudaCommand = 'nvidia-smi -q | grep "CUDA Version"';
    const { stdout: cudaInfo } = serverId
      ? await execAsyncRemote(serverId, cudaCommand)
      : await execAsync(cudaCommand);

    const cudaMatch = cudaInfo.match(/CUDA Version\s*:\s*([\d\.]+)/);
    const cudaVersion = cudaMatch ? cudaMatch[1] : undefined;
    const cudaSupport = !!cudaVersion;
    const [driverInfo, runtimeInfo, swarmInfo, gpuInfo, cudaInfo] =
      await Promise.all([
        checkGpuDriver(serverId),
        checkRuntime(serverId),
        checkSwarmResources(serverId),
        checkGpuInfo(serverId),
        checkCudaSupport(serverId),
      ]);

    return {
      driverInstalled,
      driverVersion,
      runtimeInstalled,
      runtimeConfigured,
      availableGPUs,
      swarmEnabled,
      gpuResources,
      gpuModel,
      memoryInfo: memoryTotal,
      cudaSupport,
      cudaVersion,
      ...driverInfo,
      ...runtimeInfo,
      ...swarmInfo,
      ...gpuInfo,
      ...cudaInfo,
    };
  } catch (error) {
    console.error("Error in checkGPUStatus:", error);
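The rewritten checkGPUStatus fans out to five helpers with Promise.all and merges their partial results by object spread. A stripped-down sketch of that pattern with toy probe functions standing in for the real helpers:

// Toy version of the fan-out/merge used by checkGPUStatus above.
async function checkA(): Promise<{ a: number }> {
  return { a: 1 };
}

async function checkB(): Promise<{ b: string }> {
  return { b: "ok" };
}

async function collect() {
  // Promise.all runs both probes concurrently; in the real helpers each probe
  // catches its own errors and returns defaults, so one failing check does not
  // reject the whole status call.
  const [resultA, resultB] = await Promise.all([checkA(), checkB()]);

  // Spreading keeps the returned shape flat, matching the GPUInfo style.
  return { ...resultA, ...resultB };
}

collect().then((status) => console.log(status)); // { a: 1, b: "ok" }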
@@ -148,118 +52,167 @@ export async function checkGPUStatus(serverId?: string): Promise<GPUInfo> {
  }
}

const checkGpuDriver = async (serverId?: string) => {
  let driverVersion: string | undefined;
  let driverInstalled = false;
  let availableGPUs = 0;

  try {
    const driverCommand =
      "nvidia-smi --query-gpu=driver_version --format=csv,noheader";
    const { stdout: nvidiaSmi } = serverId
      ? await execAsyncRemote(serverId, driverCommand)
      : await execAsync(driverCommand);

    driverVersion = nvidiaSmi.trim();
    if (driverVersion) {
      driverInstalled = true;
      const countCommand =
        "nvidia-smi --query-gpu=gpu_name --format=csv,noheader | wc -l";
      const { stdout: gpuCount } = serverId
        ? await execAsyncRemote(serverId, countCommand)
        : await execAsync(countCommand);

      availableGPUs = Number.parseInt(gpuCount.trim(), 10);
    }
  } catch (error) {
    console.debug("GPU driver check:", error);
  }

  return { driverVersion, driverInstalled, availableGPUs };
};

const checkRuntime = async (serverId?: string) => {
  let runtimeInstalled = false;
  let runtimeConfigured = false;

  try {
    const runtimeCommand = 'docker info --format "{{json .Runtimes}}"';
    const { stdout: runtimeInfo } = serverId
      ? await execAsyncRemote(serverId, runtimeCommand)
      : await execAsync(runtimeCommand);

    const runtimes = JSON.parse(runtimeInfo);
    runtimeInstalled = "nvidia" in runtimes;

    const defaultCommand = 'docker info --format "{{.DefaultRuntime}}"';
    const { stdout: defaultRuntime } = serverId
      ? await execAsyncRemote(serverId, defaultCommand)
      : await execAsync(defaultCommand);

    runtimeConfigured = defaultRuntime.trim() === "nvidia";
  } catch (error) {
    console.debug("Runtime check:", error);
  }

  return { runtimeInstalled, runtimeConfigured };
};

const checkSwarmResources = async (serverId?: string) => {
  let swarmEnabled = false;
  let gpuResources = 0;

  try {
    const nodeCommand =
      "docker node inspect self --format '{{json .Description.Resources.GenericResources}}'";
    const { stdout: resources } = serverId
      ? await execAsyncRemote(serverId, nodeCommand)
      : await execAsync(nodeCommand);

    if (resources && resources !== "null") {
      const genericResources = JSON.parse(resources);
      for (const resource of genericResources) {
        if (
          resource.DiscreteResourceSpec &&
          (resource.DiscreteResourceSpec.Kind === "GPU" ||
            resource.DiscreteResourceSpec.Kind === "gpu")
        ) {
          gpuResources = resource.DiscreteResourceSpec.Value;
          swarmEnabled = true;
          break;
        }
      }
    }
  } catch (error) {
    console.debug("Swarm resource check:", error);
  }

  return { swarmEnabled, gpuResources };
};

const checkGpuInfo = async (serverId?: string) => {
  let gpuModel: string | undefined;
  let memoryInfo: string | undefined;

  try {
    const gpuInfoCommand =
      "nvidia-smi --query-gpu=gpu_name,memory.total --format=csv,noheader";
    const { stdout: gpuInfo } = serverId
      ? await execAsyncRemote(serverId, gpuInfoCommand)
      : await execAsync(gpuInfoCommand);

    [gpuModel, memoryInfo] = gpuInfo.split(",").map((s) => s.trim());
  } catch (error) {
    console.debug("GPU info check:", error);
  }

  return { gpuModel, memoryInfo };
};

const checkCudaSupport = async (serverId?: string) => {
  let cudaVersion: string | undefined;
  let cudaSupport = false;

  try {
    const cudaCommand = 'nvidia-smi -q | grep "CUDA Version"';
    const { stdout: cudaInfo } = serverId
      ? await execAsyncRemote(serverId, cudaCommand)
      : await execAsync(cudaCommand);

    const cudaMatch = cudaInfo.match(/CUDA Version\s*:\s*([\d\.]+)/);
    cudaVersion = cudaMatch ? cudaMatch[1] : undefined;
    cudaSupport = !!cudaVersion;
  } catch (error) {
    console.debug("CUDA support check:", error);
  }

  return { cudaVersion, cudaSupport };
};

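checkCudaSupport pulls the version out of the "CUDA Version" line that nvidia-smi -q prints. A self-contained sketch of the same parse against a hard-coded sample line (the version number is illustrative, not taken from the commit):

// Sample line in the shape grep would return.
const sampleCudaLine = "    CUDA Version                          : 12.2";

// Same regex as checkCudaSupport above: optional whitespace around the colon,
// then a dotted version number captured as group 1.
const cudaMatch = sampleCudaLine.match(/CUDA Version\s*:\s*([\d\.]+)/);
const cudaVersion = cudaMatch ? cudaMatch[1] : undefined;
const cudaSupport = !!cudaVersion;

console.log({ cudaVersion, cudaSupport }); // { cudaVersion: "12.2", cudaSupport: true }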
export async function setupGPUSupport(serverId?: string): Promise<void> {
  try {
    // 1. Check current GPU status first
    // 1. Initial status check and validation
    const initialStatus = await checkGPUStatus(serverId);
    const shouldContinue = await validatePrerequisites(initialStatus);
    if (!shouldContinue) return;

    // If GPU is already configured, just verify and return quickly
    if (
      initialStatus.swarmEnabled &&
      initialStatus.runtimeConfigured &&
      initialStatus.driverInstalled
    ) {
      console.log("GPU already configured, skipping setup");
      return;
    }
    // 2. Get node ID
    const nodeId = await getNodeId(serverId);

    // 2. Verify GPU prerequisites
    if (!initialStatus.driverInstalled || !initialStatus.runtimeInstalled) {
      throw new Error(
        "NVIDIA drivers or runtime not installed. Please install them first.",
      );
    }
    // 3. Create daemon configuration
    const daemonConfig = createDaemonConfig(initialStatus.availableGPUs);

    // Get the node ID
    const nodeIdCommand = 'docker info --format "{{.Swarm.NodeID}}"';
    const { stdout: nodeId } = serverId
      ? await execAsyncRemote(serverId, nodeIdCommand)
      : await execAsync(nodeIdCommand);

    if (!nodeId.trim()) {
      throw new Error("Setup Server before enabling GPU support");
    }

    // 3. Configure NVIDIA runtime in daemon.json
    const daemonConfig = {
      runtimes: {
        nvidia: {
          path: "nvidia-container-runtime",
          runtimeArgs: [],
        },
      },
      "default-runtime": "nvidia",
      "node-generic-resources": [`GPU=${initialStatus.availableGPUs}`],
    };

    // Different commands for local and remote setup
    // 4. Setup server based on environment
    if (serverId) {
      // Remote server setup (using sudo)
      const setupCommands = [
        "sudo -n true",
        `echo '${JSON.stringify(daemonConfig, null, 2)}' | sudo tee /etc/docker/daemon.json`,
        "sudo mkdir -p /etc/nvidia-container-runtime",
        'echo "swarm-resource = \\"DOCKER_RESOURCE_GPU\\"" | sudo tee -a /etc/nvidia-container-runtime/config.toml',
        "sudo systemctl daemon-reload",
        "sudo systemctl restart docker",
      ].join(" && ");

      await execAsyncRemote(serverId, setupCommands);
      await setupRemoteServer(serverId, daemonConfig);
    } else {
      // Local server setup (using pkexec for GUI password prompt)
      const configFile = `/tmp/docker-daemon-${Date.now()}.json`;
      await fs.writeFile(configFile, JSON.stringify(daemonConfig, null, 2));

      const setupCommands = [
        // Use pkexec for GUI password prompt
        `pkexec sh -c '
          cp ${configFile} /etc/docker/daemon.json &&
          mkdir -p /etc/nvidia-container-runtime &&
          echo "swarm-resource = \\"DOCKER_RESOURCE_GPU\\"" >> /etc/nvidia-container-runtime/config.toml &&
          systemctl daemon-reload &&
          systemctl restart docker
        '`,
        `rm ${configFile}`, // Clean up temp file
      ].join(" && ");

      await execAsync(setupCommands);
      await setupLocalServer(daemonConfig);
    }

    // 4. Reduced wait time for Docker restart
    await new Promise((resolve) => setTimeout(resolve, 10000));
    // 5. Wait for Docker restart
    await sleep(10000);

    // 5. Add GPU label to the node
    const labelCommand = `docker node update --label-add gpu=true ${nodeId.trim()}`;
    if (serverId) {
      await execAsyncRemote(serverId, labelCommand);
    } else {
      await execAsync(labelCommand);
    }
    // 6. Add GPU label
    await addGpuLabel(nodeId, serverId);

    // 6. Quick final verification
    await new Promise((resolve) => setTimeout(resolve, 5000));
    const finalStatus = await checkGPUStatus(serverId);

    if (!finalStatus.swarmEnabled) {
      const diagnosticCommands = [
        `docker node inspect ${nodeId.trim()}`,
        'nvidia-smi -a | grep "GPU UUID"',
        "cat /etc/docker/daemon.json",
        "cat /etc/nvidia-container-runtime/config.toml",
      ].join(" && ");

      const { stdout: diagnostics } = serverId
        ? await execAsyncRemote(serverId, diagnosticCommands)
        : await execAsync(diagnosticCommands);

      console.error("Diagnostic Information:", diagnostics);
      throw new Error("GPU support not detected in swarm after setup");
    }
    // 7. Final verification
    await sleep(5000);
    const finalStatus = await verifySetup(nodeId, serverId);

    console.log("GPU setup completed successfully:", {
      availableGPUs: initialStatus.availableGPUs,
      driverVersion: initialStatus.driverVersion,
      nodeId: nodeId.trim(),
      nodeId,
    });
  } catch (error) {
    console.error("GPU Setup Error:", error);
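The remote command chains above begin with "sudo -n true", so the whole "&&" sequence fails up front when passwordless sudo is unavailable, before daemon.json is touched; the mutation's onError branch then surfaces the permission message. A small sketch of that probe on its own, assuming the execAsyncRemote helper already imported in this module; hasPasswordlessSudo itself is hypothetical:

// Sketch: probe for non-interactive sudo before attempting any changes.
async function hasPasswordlessSudo(serverId: string): Promise<boolean> {
  try {
    // "sudo -n true" exits non-zero instead of prompting when a password
    // would be required, so a rejected exec means sudo access is missing.
    await execAsyncRemote(serverId, "sudo -n true");
    return true;
  } catch {
    return false;
  }
}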
@@ -274,3 +227,113 @@ export async function setupGPUSupport(serverId?: string): Promise<void> {
    throw error;
  }
}

const validatePrerequisites = async (initialStatus: GPUInfo) => {
  if (!initialStatus.driverInstalled) {
    throw new Error(
      "NVIDIA drivers not installed. Please install appropriate NVIDIA drivers first.",
    );
  }

  if (!initialStatus.runtimeInstalled) {
    throw new Error(
      "NVIDIA Container Runtime not installed. Please install nvidia-container-runtime first.",
    );
  }

  if (initialStatus.swarmEnabled && initialStatus.runtimeConfigured) {
    console.log("GPU already configured, skipping setup");
    return false;
  }

  return true;
};

const getNodeId = async (serverId?: string) => {
  const nodeIdCommand = 'docker info --format "{{.Swarm.NodeID}}"';
  const { stdout: nodeId } = serverId
    ? await execAsyncRemote(serverId, nodeIdCommand)
    : await execAsync(nodeIdCommand);

  const trimmedNodeId = nodeId.trim();
  if (!trimmedNodeId) {
    throw new Error("Setup Server before enabling GPU support");
  }

  return trimmedNodeId;
};

const createDaemonConfig = (availableGPUs: number) => ({
  runtimes: {
    nvidia: {
      path: "nvidia-container-runtime",
      runtimeArgs: [],
    },
  },
  "default-runtime": "nvidia",
  "node-generic-resources": [`GPU=${availableGPUs}`],
});

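For a host with, say, two GPUs, createDaemonConfig serializes to the daemon.json shown in the comment below (the GPU count is illustrative, not taken from the commit):

// Illustrative only: 2 is a stand-in GPU count.
const exampleConfig = createDaemonConfig(2);
console.log(JSON.stringify(exampleConfig, null, 2));
// Prints the file that setupRemoteServer / setupLocalServer write to /etc/docker/daemon.json:
// {
//   "runtimes": {
//     "nvidia": {
//       "path": "nvidia-container-runtime",
//       "runtimeArgs": []
//     }
//   },
//   "default-runtime": "nvidia",
//   "node-generic-resources": [
//     "GPU=2"
//   ]
// }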
const setupRemoteServer = async (serverId: string, daemonConfig: any) => {
  const setupCommands = [
    "sudo -n true",
    `echo '${JSON.stringify(daemonConfig, null, 2)}' | sudo tee /etc/docker/daemon.json`,
    "sudo mkdir -p /etc/nvidia-container-runtime",
    'echo "swarm-resource = \\"DOCKER_RESOURCE_GPU\\"" | sudo tee -a /etc/nvidia-container-runtime/config.toml',
    "sudo systemctl daemon-reload",
    "sudo systemctl restart docker",
  ].join(" && ");

  await execAsyncRemote(serverId, setupCommands);
};

const setupLocalServer = async (daemonConfig: any) => {
  const configFile = `/tmp/docker-daemon-${Date.now()}.json`;
  await fs.writeFile(configFile, JSON.stringify(daemonConfig, null, 2));

  const setupCommands = [
    `pkexec sh -c '
      cp ${configFile} /etc/docker/daemon.json &&
      mkdir -p /etc/nvidia-container-runtime &&
      echo "swarm-resource = \\"DOCKER_RESOURCE_GPU\\"" >> /etc/nvidia-container-runtime/config.toml &&
      systemctl daemon-reload &&
      systemctl restart docker
    '`,
    `rm ${configFile}`,
  ].join(" && ");

  await execAsync(setupCommands);
};

const addGpuLabel = async (nodeId: string, serverId?: string) => {
  const labelCommand = `docker node update --label-add gpu=true ${nodeId}`;
  if (serverId) {
    await execAsyncRemote(serverId, labelCommand);
  } else {
    await execAsync(labelCommand);
  }
};

const verifySetup = async (nodeId: string, serverId?: string) => {
  const finalStatus = await checkGPUStatus(serverId);

  if (!finalStatus.swarmEnabled) {
    const diagnosticCommands = [
      `docker node inspect ${nodeId}`,
      'nvidia-smi -a | grep "GPU UUID"',
      "cat /etc/docker/daemon.json",
      "cat /etc/nvidia-container-runtime/config.toml",
    ].join(" && ");

    const { stdout: diagnostics } = serverId
      ? await execAsyncRemote(serverId, diagnosticCommands)
      : await execAsync(diagnosticCommands);

    console.error("Diagnostic Information:", diagnostics);
    throw new Error("GPU support not detected in swarm after setup");
  }

  return finalStatus;
};

const sleep = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms));
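Taken together, the extracted helpers give setupGPUSupport a linear flow: validate prerequisites, resolve the node ID, write daemon.json, restart Docker, label the node, verify. A hypothetical caller sketch follows; the real setupGPU mutation body is not shown in this diff, and the messages here are placeholders:

// Hypothetical caller, not part of the commit.
async function enableGpuForServer(serverId?: string): Promise<string> {
  try {
    await setupGPUSupport(serverId);
    return "GPU support enabled";
  } catch (error) {
    // setupGPUSupport already logs diagnostics before rethrowing,
    // so callers only need to translate the error for the UI.
    throw new Error(
      error instanceof Error ? error.message : "Failed to enable GPU support",
    );
  }
}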