Merge pull request canonical#161 from jocado/cdi-with-classic
NVIDIA support - enable use on Classic systems
lucaskanashiro authored Jun 11, 2024
2 parents f6e8a7a + 9fbed5f commit 7fd884d
Showing 4 changed files with 66 additions and 15 deletions.
44 changes: 37 additions & 7 deletions README.md
@@ -56,19 +56,23 @@ Docker should function normally, with the following caveats:
* [Setup a secure private registry](registry-example.md)


## NVIDIA support on Ubuntu Core 22
## NVIDIA support

If the system is found to have an nvidia graphics card available, the nvidia container toolkit will be setup and configured to enable use of the local GPU from docker. This can be used to enable use of CUDA from a docker container, for instance.
If the system is found to have an nvidia graphics card available, and the host has the required nvidia libraries installed, the nvidia container toolkit will be set up and configured to enable use of the local GPU from docker. This can be used, for instance, to enable use of CUDA from a docker container.

To enable proper use of the GPU within docker, the nvidia runtime must be used. By default, the nvidia runtime will be configured to use [CDI](https://github.com/cncf-tags/container-device-interface) mode, and the appropriate nvidia CDI config will be automatically created for the system. You just need to specify the nvidia runtime when running a container.
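
One way to confirm that the CDI config has been generated is to check for the spec file written out during setup (a quick check, assuming the docker snap's default `$SNAP_DATA` location):

```shell
# The generated CDI spec describing the system's GPU(s)
ls -l /var/snap/docker/current/etc/cdi/nvidia.yaml
```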

### Ubuntu Core 22

The required nvidia libraries are available in the nvidia-core22 snap.

This requires connection of the graphics-core22 content interface provided by the nvidia-core22 snap, which should be automatically connected once installed.
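
If the interface does not connect automatically, it can be checked and connected manually along these lines (a sketch; it assumes the nvidia-core22 snap exposes a matching graphics-core22 slot):

```shell
# Check whether the content interface is connected
snap connections docker | grep graphics-core22

# Connect it manually if required
snap connect docker:graphics-core22 nvidia-core22:graphics-core22
```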

To enable proper use of the GPU within docker, the nvidia runtime must be used. By default, the nvidia runtime will be configured to use ([CDI](https://github.com/cncf-tags/container-device-interface)) mode, and a the appropriate nvidia CDI config will be automatically created for the system. You just need to specify the nvidia runtime when running a container.
### Ubuntu Server / Desktop

Example usage:
The required nvidia libraries are available in the nvidia container toolkit packages.

```shell
docker run --rm --runtime nvidia {cuda-container-image-name}
```
Instructions on how to install them can be found [here](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html).
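
For example, on a recent Ubuntu release the toolkit is typically installed from NVIDIA's apt repository (a minimal sketch; setting up the repository itself is covered in the linked guide):

```shell
# Install the NVIDIA Container Toolkit from the host package manager
# (assumes NVIDIA's apt repository has already been configured as per the guide above)
sudo apt-get update
sudo apt-get install -y nvidia-container-toolkit
```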

### Custom NVIDIA runtime config

@@ -96,6 +100,32 @@ Setting up the nvidia support should be automatic if the hardware is present, but y
snap set docker nvidia-support.disabled=true
```
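
To turn the support back on later, the option can be set to false or removed (standard snap configuration commands):

```shell
# Re-enable automatic nvidia support setup
snap set docker nvidia-support.disabled=false

# Or clear the option entirely
snap unset docker nvidia-support.disabled
```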

### Usage examples

Generic usage would look something like this:

```shell
docker run --rm --runtime nvidia --gpus all {cuda-container-image-name}
```

or

```shell
docker run --rm --runtime nvidia --env NVIDIA_VISIBLE_DEVICES=all {cuda-container-image-name}
```

If your container image already has the appropriate environment variables set, you may be able to just specify the nvidia runtime with no additional arguments.

Please refer to [this guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html) for more detail regarding the environment variables that can be used.
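
For instance, to expose only the first GPU and limit the driver capabilities made available to the container (a sketch; the variables your image actually needs may differ):

```shell
docker run --rm --runtime nvidia \
  --env NVIDIA_VISIBLE_DEVICES=0 \
  --env NVIDIA_DRIVER_CAPABILITIES=compute,utility \
  {cuda-container-image-name}
```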

*NOTE*: library paths and discovery are handled automatically, but binary paths are not, so if you wish to test using something like the `nvidia-smi` binary passed into the container from the host, you should either specify the full path or set the `PATH` environment variable.

e.g.

```shell
docker run --rm --runtime=nvidia --gpus all --env PATH="${PATH}:/var/lib/snapd/hostfs/usr/bin" ubuntu nvidia-smi
```

## Development

Developing the `docker` snap package is typically performed on a "classic" Ubuntu distribution. The instructions here are written for Ubuntu 16.04 "Xenial".
20 changes: 18 additions & 2 deletions nvidia/lib
@@ -9,6 +9,12 @@ ARCH_TRIPLET="${U_MACHINE}-${U_KERNEL,,}-${U_USERLAND,,}"

NVIDIA_SUPPORT_DISABLED="$(snapctl get nvidia-support.disabled)"

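# Detect classic systems where the host has the NVIDIA userspace driver installed, via the libcuda symlink on hostfs #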
if [ -L "/var/lib/snapd/hostfs/usr/lib/${ARCH_TRIPLET}/libcuda.so" ] ; then
NVIDIA_SUPPORT_CLASSIC="true"
else
NVIDIA_SUPPORT_CLASSIC="false"
fi

device_wait() {

COUNT=0
@@ -51,7 +57,16 @@ cdi_generate () {
CDI_DEVICE_NAME_STRATEGY="$(snapctl get nvidia-support.cdi.device-name-strategy)"
CDI_DEVICE_NAME_STRATEGY="${CDI_DEVICE_NAME_STRATEGY:-index}"

PATH="${PATH}:${SNAP}/graphics/bin" "${SNAP}/usr/bin/nvidia-ctk" cdi generate --nvidia-ctk-path "${SNAP}/usr/bin/nvidia-ctk" --library-search-path "${SNAP}/graphics/lib/${ARCH_TRIPLET}" --device-name-strategy "${CDI_DEVICE_NAME_STRATEGY}" --output "${SNAP_DATA}/etc/cdi/nvidia.yaml"
# Default CDI libs search path and shell path for install on core systems #
CDI_LIB_SEARCH_PATH="${SNAP}/graphics/lib/${ARCH_TRIPLET}"
CDI_CONFIG_SEARCH_PATH="${SNAP}/graphics/share"
CDI_PATH="${PATH}:${SNAP}/graphics/bin"
# Otherwise, if on classic and nvidia driver is installed, set hostfs for the CDI libs search path and shell path #
[ "${NVIDIA_SUPPORT_CLASSIC}" == "true" ] && CDI_LIB_SEARCH_PATH="/var/lib/snapd/hostfs/usr/lib/${ARCH_TRIPLET}"
[ "${NVIDIA_SUPPORT_CLASSIC}" == "true" ] && CDI_CONFIG_SEARCH_PATH="/var/lib/snapd/hostfs/usr/share"
[ "${NVIDIA_SUPPORT_CLASSIC}" == "true" ] && CDI_PATH="${PATH}:/var/lib/snapd/hostfs/usr/bin"

XDG_DATA_DIRS="${XDG_DATA_DIRS:-}:${CDI_CONFIG_SEARCH_PATH}" LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CDI_LIB_SEARCH_PATH}" PATH="${CDI_PATH}" "${SNAP}/usr/bin/nvidia-ctk" cdi generate --nvidia-ctk-path "${SNAP}/usr/bin/nvidia-ctk" --library-search-path "${CDI_LIB_SEARCH_PATH}" --device-name-strategy "${CDI_DEVICE_NAME_STRATEGY}" --output "${SNAP_DATA}/etc/cdi/nvidia.yaml"
}

# Create the nvidia runtime config, either snap default or custom #
@@ -63,8 +78,9 @@ nvidia_runtime_config () {
echo "${RUNTIME_CONFIG_OVERRIDE}" > "${SNAP_DATA}/etc/nvidia-container-runtime/config.toml"
# Default - opinionated, but most viable option for now #
else
# FIXME: CDI spec-dirs can be set as a list using `"${SNAP_DATA}/etc/cdi",/var/run/cdi`, once this is fixed: https://github.com/NVIDIA/nvidia-container-toolkit/issues/466
rm -f "${SNAP_DATA}/etc/nvidia-container-runtime/config.toml"
"${SNAP}/usr/bin/nvidia-ctk" config --in-place --set nvidia-container-runtime.mode=cdi
"${SNAP}/usr/bin/nvidia-ctk" config --in-place --set nvidia-container-runtime.mode=cdi --set nvidia-container-runtime.modes.cdi.spec-dirs="${SNAP_DATA}/etc/cdi" --config "${SNAP_DATA}/etc/nvidia-container-runtime/config.toml"
fi
}

10 changes: 9 additions & 1 deletion nvidia/nvidia-container-toolkit
@@ -7,8 +7,16 @@ set -eu
# Just exit if NVIDIA support is disabled #
[ "${NVIDIA_SUPPORT_DISABLED}" != "true" ] || exit 0

NVIDIA_ENABLE="false"

# Determine if we can enable NVIDIA support or not #
snapctl is-connected graphics-core22 && NVIDIA_ENABLE="true"
[ "${NVIDIA_SUPPORT_CLASSIC}" == "true" ] && NVIDIA_ENABLE="true"

# Ensure nvidia support is set up correctly, and only if the hardware is present and correct #
if snapctl is-connected graphics-core22 ; then
if [ "${NVIDIA_ENABLE}" == "true" ] ; then

[ "${NVIDIA_SUPPORT_CLASSIC}" == "true" ] && echo "Running on Classic system" || echo "Running on Ubuntu Core system"

# Connection hooks are run early - copy the config file from $SNAP into $SNAP_DATA if it doesn't exist
if [ ! -f "$SNAP_DATA/config/daemon.json" ]; then
7 changes: 2 additions & 5 deletions snap/snapcraft.yaml
Expand Up @@ -43,11 +43,6 @@ passthrough:
layout:
/etc/docker:
bind: $SNAP_DATA/etc/docker
# Container Device Interface (CDI) Support - https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#container-device-interface-cdi-support
/etc/cdi:
bind: $SNAP_DATA/etc/cdi
/etc/nvidia-container-runtime:
bind: $SNAP_DATA/etc/nvidia-container-runtime
/etc/gitconfig:
bind-file: $SNAP_DATA/etc/gitconfig
/usr/libexec/docker/cli-plugins:
@@ -60,6 +55,8 @@ environment:
LD_LIBRARY_PATH: $SNAP/graphics/lib/${CRAFT_ARCH_TRIPLET}:${SNAP}/lib/:${SNAP}/lib/${CRAFT_ARCH_TRIPLET}:${SNAP}/usr/lib/:${SNAP}/usr/lib/${CRAFT_ARCH_TRIPLET}
LIBGL_DRIVERS_PATH: $SNAP/graphics/lib/${CRAFT_ARCH_TRIPLET}/dri
LIBVA_DRIVERS_PATH: $SNAP/graphics/lib/${CRAFT_ARCH_TRIPLET}/dri
# nvidia-container-runtime can only set alternative config directory via XDG_CONFIG_HOME #
XDG_CONFIG_HOME: $SNAP_DATA/etc

plugs:
home:
