Add nvidia-cdi-refresh service
Some checks are pending
CI Pipeline / code-scanning (push) Waiting to run
CI Pipeline / variables (push) Waiting to run
CI Pipeline / golang (push) Waiting to run
CI Pipeline / image (push) Blocked by required conditions
CI Pipeline / e2e-test (push) Blocked by required conditions

Automatic regeneration of /var/run/cdi/nvidia.yaml
New units:
	•	nvidia-cdi-refresh.service – one-shot wrapper for
			nvidia-ctk cdi generate (adds sleep + required caps).
	•	nvidia-cdi-refresh.path   – fires on driver install/upgrade via
			modules.dep.bin changes.
	•	60-nvidia-cdi-refresh.rules – udev triggers for module add/remove, PCI
			bind/unbind/change, and MIG /dev/nvidia-caps* char-device events.
Packaging
	•	RPM %post reloads udev/systemd and enables the path unit on fresh
			installs.
	•	DEB postinst does the same (configure, skip on upgrade).

Result: CDI spec is always up to date

Signed-off-by: Carlos Eduardo Arango Gutierrez <eduardoa@nvidia.com>
This commit is contained in:
Carlos Eduardo Arango Gutierrez 2025-05-12 15:22:42 +02:00
parent adb5e6719d
commit 512272201c
No known key found for this signature in database
GPG Key ID: 42D9CB42F300A852
11 changed files with 143 additions and 1 deletions

View File

@ -0,0 +1,23 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[Unit]
Description=Trigger CDI refresh on NVIDIA driver install / uninstall events
[Path]
# depmod rewrites these exactly once per (un)install
PathChanged=/lib/modules/%v/modules.dep.bin
[Install]
WantedBy=multi-user.target

View File

@ -0,0 +1,26 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[Unit]
Description=Refresh NVIDIA CDI specification file
[Service]
Type=oneshot
# The 30-second delay ensures that dependent services or resources are fully initialized.
ExecStartPre=/bin/sleep 30
ExecStart=/usr/bin/nvidia-ctk cdi generate --output=/var/run/cdi/nvidia.yaml
CapabilityBoundingSet=CAP_SYS_MODULE CAP_SYS_ADMIN CAP_MKNOD
[Install]
WantedBy=multi-user.target

View File

@ -0,0 +1,21 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NVIDIA kernel-module events
ACTION=="add|remove", SUBSYSTEM=="module", KERNEL=="nvidia*", \
TAG+="systemd", ENV{SYSTEMD_WANTS}+="nvidia-cdi-refresh.service"
# First bind/unbind/change of a GPU PCI function to the NVIDIA driver
ACTION=="bind|unbind|change", SUBSYSTEM=="pci", DRIVER=="nvidia", \
TAG+="systemd", ENV{SYSTEMD_WANTS}+="nvidia-cdi-refresh.service"

View File

@ -55,6 +55,8 @@ RUN make PREFIX=${DIST_DIR} cmds
WORKDIR $DIST_DIR
COPY packaging/debian ./debian
COPY deployments/systemd/ .
COPY deployments/udev/ .
ARG LIBNVIDIA_CONTAINER_TOOLS_VERSION
ENV LIBNVIDIA_CONTAINER_TOOLS_VERSION ${LIBNVIDIA_CONTAINER_TOOLS_VERSION}

View File

@ -46,6 +46,8 @@ RUN make PREFIX=${DIST_DIR} cmds
WORKDIR $DIST_DIR/..
COPY packaging/rpm .
COPY deployments/systemd/ .
COPY deployments/udev/ .
ARG LIBNVIDIA_CONTAINER_TOOLS_VERSION
ENV LIBNVIDIA_CONTAINER_TOOLS_VERSION ${LIBNVIDIA_CONTAINER_TOOLS_VERSION}

View File

@ -71,6 +71,8 @@ RUN make PREFIX=${DIST_DIR} cmds
WORKDIR $DIST_DIR/..
COPY packaging/rpm .
COPY deployments/systemd/* ${DIST_DIR}/
COPY deployments/udev/* ${DIST_DIR}/
ARG LIBNVIDIA_CONTAINER_TOOLS_VERSION
ENV LIBNVIDIA_CONTAINER_TOOLS_VERSION ${LIBNVIDIA_CONTAINER_TOOLS_VERSION}

View File

@ -53,6 +53,8 @@ RUN make PREFIX=${DIST_DIR} cmds
WORKDIR $DIST_DIR
COPY packaging/debian ./debian
COPY deployments/systemd/ .
COPY deployments/udev/ .
ARG LIBNVIDIA_CONTAINER_TOOLS_VERSION
ENV LIBNVIDIA_CONTAINER_TOOLS_VERSION ${LIBNVIDIA_CONTAINER_TOOLS_VERSION}

View File

@ -29,3 +29,9 @@ Architecture: any
Depends: ${misc:Depends}, nvidia-container-toolkit-base (= @VERSION@)
Description: NVIDIA Container Toolkit Operator Extensions
Provides tools for using the NVIDIA Container Toolkit with the GPU Operator
Package: nvidia-container-toolkit-cdi-refresh
Architecture: any
Depends: ${misc:Depends}, nvidia-container-toolkit-base (= @VERSION@)
Description: NVIDIA CDI Refresh Service
Provides a service to refresh the NVIDIA CDI specification

View File

@ -0,0 +1,3 @@
60-nvidia-cdi-refresh.rules /etc/udev/rules.d/
nvidia-cdi-refresh.service /usr/lib/systemd/system/
nvidia-cdi-refresh.path /usr/lib/systemd/system/

View File

@ -0,0 +1,32 @@
#!/bin/sh
set -e
case "$1" in
configure)
if command -v udevadm >/dev/null 2>&1; then
udevadm control --reload || true
fi
if command -v systemctl >/dev/null 2>&1 \
&& systemctl --quiet is-system-running 2>/dev/null; then
systemctl daemon-reload || true
if [ -z "$2" ]; then # $2 empty → first install
systemctl enable --now nvidia-cdi-refresh.path || true
fi
fi
;;
abort-upgrade|abort-remove|abort-deconfigure)
# Nothing to do for these dpkg abort cases
;;
*)
echo "postinst called with unknown argument \`$1'" >&2
exit 1
;;
esac
exit 0

View File

@ -17,6 +17,9 @@ Source3: nvidia-container-runtime
Source4: nvidia-container-runtime.cdi
Source5: nvidia-container-runtime.legacy
Source6: nvidia-cdi-hook
Source7: nvidia-cdi-refresh.service
Source8: nvidia-cdi-refresh.path
Source9: 60-nvidia-cdi-refresh.rules
Obsoletes: nvidia-container-runtime <= 3.5.0-1, nvidia-container-runtime-hook <= 1.4.0-2
Provides: nvidia-container-runtime
@ -28,16 +31,22 @@ Requires: nvidia-container-toolkit-base == %{version}-%{release}
Provides tools and utilities to enable GPU support in containers.
%prep
cp %{SOURCE0} %{SOURCE1} %{SOURCE2} %{SOURCE3} %{SOURCE4} %{SOURCE5} %{SOURCE6} .
cp %{SOURCE0} %{SOURCE1} %{SOURCE2} %{SOURCE3} %{SOURCE4} %{SOURCE5} %{SOURCE6} %{SOURCE7} %{SOURCE8} %{SOURCE9} .
%install
mkdir -p %{buildroot}%{_bindir}
mkdir -p %{buildroot}/usr/lib/systemd/system
mkdir -p %{buildroot}/etc/udev/rules.d
install -m 755 -t %{buildroot}%{_bindir} nvidia-container-runtime-hook
install -m 755 -t %{buildroot}%{_bindir} nvidia-container-runtime
install -m 755 -t %{buildroot}%{_bindir} nvidia-container-runtime.cdi
install -m 755 -t %{buildroot}%{_bindir} nvidia-container-runtime.legacy
install -m 755 -t %{buildroot}%{_bindir} nvidia-ctk
install -m 755 -t %{buildroot}%{_bindir} nvidia-cdi-hook
install -m 644 -t %{buildroot}/usr/lib/systemd/system %{SOURCE7}
install -m 644 -t %{buildroot}/usr/lib/systemd/system %{SOURCE8}
install -m 644 -t %{buildroot}/etc/udev/rules.d %{SOURCE9}
%post
if [ $1 -gt 1 ]; then # only on package upgrade
@ -45,6 +54,17 @@ if [ $1 -gt 1 ]; then # only on package upgrade
cp -af %{_bindir}/nvidia-container-runtime-hook %{_localstatedir}/lib/rpm-state/nvidia-container-toolkit
fi
# Reload udev so the new rule is active immediately
/usr/bin/udevadm control --reload || :
# Reload systemd unit cache
/bin/systemctl daemon-reload || :
# On fresh install ($1 == 1) enable the path unit so it starts at boot
if [ "$1" -eq 1 ]; then
/bin/systemctl enable --now nvidia-cdi-refresh.path || :
fi
%posttrans
if [ ! -e %{_bindir}/nvidia-container-runtime-hook ]; then
# repairing lost file nvidia-container-runtime-hook
@ -64,6 +84,9 @@ fi
%files
%license LICENSE
%{_bindir}/nvidia-container-runtime-hook
%config(noreplace) %{_unitdir}/nvidia-cdi-refresh.service
%config(noreplace) %{_unitdir}/nvidia-cdi-refresh.path
%config %{_udevrulesdir}/60-nvidia-cdi-refresh.rules
%changelog
# As of 1.10.0-1 we generate the release information automatically