Commit
Merge pull request #73 from mikemckiernan/dev-24.06
Smaller items for 24.06
mikemckiernan committed Jul 31, 2024
2 parents e9a9ce9 + e31e3f1 commit 692c0ec
Showing 6 changed files with 182 additions and 39 deletions.
44 changes: 21 additions & 23 deletions gpu-operator/life-cycle-policy.rst
@@ -55,13 +55,13 @@ The product life cycle and versioning are subject to change in the future.
* - GPU Operator Version
- Status

* - 24.3.x
* - 24.6.x
- Generally Available

* - 23.9.x
* - 24.3.x
- Maintenance

* - 23.6.x and lower
* - 23.9.x and lower
- EOL


@@ -86,60 +86,58 @@ Refer to :ref:`Upgrading the NVIDIA GPU Operator` for more information.
- Version

* - NVIDIA GPU Operator
- v24.3.0
- v24.6.0

* - NVIDIA GPU Driver
- | `550.90.07 <https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-550-90-07/index.html>`_ (recommended),
| `550.54.15 <https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-550-54-15/index.html>`_ (default),
| `535.183.01 <https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-535-183-01/index.html>`_,
| `470.256.02 <https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-470-256-02/index.html>`_
- | `550.90.07 <https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-550-90-07/index.html>`_ (default),
| `535.183.06 <https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-535-183-06/index.html>`_
* - NVIDIA Driver Manager for K8s
- `v0.6.8 <https://ngc.nvidia.com/catalog/containers/nvidia:cloud-native:k8s-driver-manager>`__
* - NVIDIA Driver Manager for Kubernetes
- `v0.6.10 <https://ngc.nvidia.com/catalog/containers/nvidia:cloud-native:k8s-driver-manager>`__

* - NVIDIA Container Toolkit
- `1.15.0 <https://github.com/NVIDIA/nvidia-container-toolkit/releases>`__
- `1.16.1 <https://github.com/NVIDIA/nvidia-container-toolkit/releases>`__

* - NVIDIA Kubernetes Device Plugin
- `0.15.0 <https://github.com/NVIDIA/k8s-device-plugin/releases>`__
- `0.16.1 <https://github.com/NVIDIA/k8s-device-plugin/releases>`__

* - DCGM Exporter
- `3.3.5-3.4.1 <https://github.com/NVIDIA/gpu-monitoring-tools/releases>`__
- `3.3.7-3.5.0 <https://github.com/NVIDIA/dcgm-exporter/releases>`__

* - Node Feature Discovery
- v0.15.4
- v0.16.3

* - | NVIDIA GPU Feature Discovery
| for Kubernetes
- `0.15.0 <https://github.com/NVIDIA/gpu-feature-discovery/releases>`__
- `0.16.1 <https://github.com/NVIDIA/k8s-device-plugin/releases>`__

* - NVIDIA MIG Manager for Kubernetes
- `0.7.0 <https://github.com/NVIDIA/mig-parted/tree/main/deployments/gpu-operator>`__
- `0.8.0 <https://github.com/NVIDIA/mig-parted/tree/main/deployments/gpu-operator>`__

* - DCGM
- `3.3.5-1 <https://docs.nvidia.com/datacenter/dcgm/latest/release-notes/changelog.html>`__
- `3.3.7-1 <https://docs.nvidia.com/datacenter/dcgm/latest/release-notes/changelog.html>`__

* - Validator for NVIDIA GPU Operator
- v24.3.0
- v24.6.0

* - NVIDIA KubeVirt GPU Device Plugin
- `v1.2.7 <https://github.com/NVIDIA/kubevirt-gpu-device-plugin>`__
- `v1.2.9 <https://github.com/NVIDIA/kubevirt-gpu-device-plugin>`__

* - NVIDIA vGPU Device Manager
- v0.2.6
- `v0.2.7 <https://github.com/NVIDIA/vgpu-device-manager>`__

* - NVIDIA GDS Driver |gds|_
- `2.17.5 <https://github.com/NVIDIA/gds-nvidia-fs/releases>`__

* - NVIDIA Kata Manager for Kubernetes
- v0.2.0
- `v0.2.1 <https://github.com/NVIDIA/k8s-kata-manager>`__

* - | NVIDIA Confidential Computing
| Manager for Kubernetes
- v0.1.1

* - NVIDIA GDRCopy Driver
- `v2.4.1 <https://github.com/NVIDIA/gdrcopy/releases>`__
- `v2.4.1-1 <https://github.com/NVIDIA/gdrcopy/releases>`__

.. _gds-open-kernel:

Expand All @@ -151,6 +149,6 @@ Refer to :ref:`Upgrading the NVIDIA GPU Operator` for more information.

- Driver version could be different with NVIDIA vGPU, as it depends on the driver
version downloaded from the `NVIDIA vGPU Software Portal <https://nvid.nvidia.com/dashboard/#/dashboard>`_.
- The GPU Operator is supported on all active NVIDIA datacenter production drivers.
- The GPU Operator is supported on all active NVIDIA data center production drivers.
Refer to `Supported Drivers and CUDA Toolkit Versions <https://docs.nvidia.com/datacenter/tesla/drivers/index.html#cuda-drivers>`_
for more information.
8 changes: 6 additions & 2 deletions gpu-operator/platform-support.rst
@@ -80,9 +80,13 @@ The following NVIDIA data center GPUs are supported on x86 based platforms:
| NVIDIA HGX H100 | NVIDIA Hopper and |
| | NVSwitch |
+-------------------------+---------------------------+
| NVIDIA H200 | NVIDIA Hopper |
+-------------------------+---------------------------+
| | NVIDIA H100, | NVIDIA Hopper |
| | NVIDIA H100 NVL | |
+-------------------------+---------------------------+
| NVIDIA H20 | NVIDIA Hopper |
+-------------------------+---------------------------+
| | NVIDIA L40, | NVIDIA Ada |
| | NVIDIA L40S | |
+-------------------------+---------------------------+
@@ -472,7 +476,7 @@ Support for GPUDirect RDMA

Supported operating systems and NVIDIA GPU Drivers with GPUDirect RDMA.

- Ubuntu 20.04 and 22.04 LTS with Network Operator 24.1.1
- Ubuntu 20.04 and 22.04 LTS with Network Operator 24.4.0
- Red Hat OpenShift 4.12 and higher with Network Operator 23.10.0

For information about configuring GPUDirect RDMA, refer to :doc:`gpu-operator-rdma`.
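
A minimal Helm sketch for enabling GPUDirect RDMA support in the driver container, assuming the ``driver.rdma.enabled`` chart value and an installation in the ``gpu-operator`` namespace:

.. code-block:: console

   $ helm upgrade --install gpu-operator nvidia/gpu-operator \
       -n gpu-operator \
       --set driver.rdma.enabled=true
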
@@ -483,7 +487,7 @@ Support for GPUDirect Storage

Supported operating systems and NVIDIA GPU Drivers with GPUDirect Storage.

- Ubuntu 20.04 and 22.04 LTS with Network Operator 24.1.1
- Ubuntu 20.04 and 22.04 LTS with Network Operator 24.4.0
- Red Hat OpenShift Container Platform 4.12 and higher

.. note::
17 changes: 10 additions & 7 deletions gpu-operator/precompiled-drivers.rst
@@ -59,9 +59,9 @@ Limitations and Restrictions
refer to :external+ocp:doc:`gpu-operator-with-precompiled-drivers`.

* NVIDIA supports precompiled driver containers for the most recently released long-term
servicing branch (LTSB) driver branch, 525.
servicing branch (LTSB) driver branch.

* NVIDIA builds images for the ``generic`` kernel variant.
* NVIDIA builds images for the ``aws``, ``azure``, ``generic``, ``nvidia``, and ``oracle`` kernel variants.
If your hosts run a different kernel variant, you can build a precompiled driver image
and use your own container registry.
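
You can identify the kernel variant a host runs from the suffix of ``uname -r``; the output below is illustrative only:

.. code-block:: console

   $ uname -r
   5.15.0-1065-aws

A host that reports a suffix other than ``aws``, ``azure``, ``generic``, ``nvidia``, or ``oracle`` requires a custom-built precompiled driver image.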

@@ -157,7 +157,7 @@ Perform the following steps to enable support for precompiled driver containers:
clusterpolicy.nvidia.com/cluster-policy patched
#. (Optional) Confirm that the driver daemonset pods terminate:
#. Optional: Confirm that the driver daemon set pods terminate:

.. code-block:: console
@@ -191,10 +191,13 @@ Perform the following steps to disable support for precompiled driver containers

#. Disable support by modifying the cluster policy:

.. code-block:: console
.. code-block:: shell
$ kubectl patch clusterpolicies.nvidia.com/cluster-policy --type='json' \
-p='[{"op": "replace", "path": "/spec/driver/usePrecompiled", "value":false}]'
$ kubectl patch clusterpolicies.nvidia.com/cluster-policy --type='json' \
-p='[
{"op": "replace", "path": "/spec/driver/usePrecompiled", "value":false},
{"op": "replace", "path": "/spec/driver/version", "value":"550.90.07"},
]'
*Example Output*

@@ -327,7 +330,7 @@ you can perform the following steps to build and run a container image.
If you have not already installed the GPU Operator, in addition to the ``--set driver.usePrecompiled=true``
and ``--set driver.version=${DRIVER_BRANCH}`` arguments for Helm, also specify the ``--set driver.repository="$PRIVATE_REGISTRY"`` argument.

If the container registry is not public, you need to create an image pull secret in the GPU operator namespace
If the container registry is not public, you need to create an image pull secret in the GPU Operator namespace
and specify it in the ``--set driver.imagePullSecrets=<pull-secret>`` argument.
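
For example, a sketch of creating the pull secret and wiring it into the Helm command; the secret name ``private-registry-secret`` and the ``gpu-operator`` namespace are illustrative:

.. code-block:: console

   $ kubectl create secret docker-registry private-registry-secret \
       -n gpu-operator \
       --docker-server="$PRIVATE_REGISTRY" \
       --docker-username=<username> \
       --docker-password=<password>

   $ helm upgrade --install gpu-operator nvidia/gpu-operator \
       -n gpu-operator \
       --set driver.usePrecompiled=true \
       --set driver.version=${DRIVER_BRANCH} \
       --set driver.repository="$PRIVATE_REGISTRY" \
       --set driver.imagePullSecrets={private-registry-secret}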

If you already installed the GPU Operator, specify the private registry for the driver in the cluster policy:
140 changes: 139 additions & 1 deletion gpu-operator/release-notes.rst
@@ -25,7 +25,7 @@ Release Notes

This document describes the new features, improvements, fixed and known issues for the NVIDIA GPU Operator.

See the :ref:`GPU Operator Component Matrix` for a list of components included in each release.
See the :ref:`GPU Operator Component Matrix` for a list of software components and versions included in each release.

.. note::

@@ -39,14 +39,152 @@ See the :ref:`GPU Operator Component Matrix` for a list of components included in each release.
24.6.0
======

.. _v24.6.0-new-features:

New Features
------------

* Added support for the NVIDIA Data Center GPU Driver version 550.90.07.
Refer to the :ref:`GPU Operator Component Matrix`
on the platform support page.

* Added support for the following software component versions:

- NVIDIA Container Toolkit v1.16.1
- NVIDIA Driver Manager for Kubernetes v0.6.10
- NVIDIA Kubernetes Device Plugin v0.16.1
- NVIDIA DCGM Exporter v3.3.7-3.5.0
- NVIDIA DCGM v3.3.7-1
- Node Feature Discovery v0.16.3
- NVIDIA GPU Feature Discovery for Kubernetes v0.16.1
- NVIDIA MIG Manager for Kubernetes v0.8.0
- NVIDIA KubeVirt GPU Device Plugin v1.2.9
- NVIDIA vGPU Device Manager v0.2.7
- NVIDIA GDS Driver v2.17.5
- NVIDIA Kata Manager for Kubernetes v0.2.1
- NVIDIA GDRCopy Driver v2.4.1-1

* Added support for NVIDIA Network Operator v24.4.0.
Refer to :ref:`Support for GPUDirect RDMA` and :ref:`Support for GPUDirect Storage`.

* Added support for using the Operator with Container-Optimized OS on Google Kubernetes Engine (GKE).
The process uses the Google driver installer to manage the NVIDIA GPU Driver.
For Ubuntu on GKE, you can use the Google driver installer or continue to use the NVIDIA Driver Manager as with previous releases.
Refer to :doc:`google-gke` for more information.

* Added support for precompiled driver containers with Open Kernel module drivers.
Specify ``--set driver.useOpenKernelModules=true --set driver.usePrecompiled=true --set driver.version=<driver-branch>``
when you install or upgrade the Operator.
Support remains limited to Ubuntu 22.04.
Refer to :doc:`precompiled-drivers` for more information.

NVIDIA began publishing driver containers with this support on July 15, 2024.
The tags for the first containers with this support are as follows:

* <driver-branch>-5.15.0-116-generic-ubuntu22.04
* <driver-branch>-5.15.0-1060-nvidia-ubuntu22.04
* <driver-branch>-5.15.0-1063-oracle-ubuntu22.04
* <driver-branch>-5.15.0-1068-azure-ubuntu22.04
* <driver-branch>-5.15.0-1065-aws-ubuntu22.04

Precompiled driver containers built after July 15 include support for the Open Kernel module drivers.
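
  A minimal Helm sketch of the flags described above, assuming an installation named ``gpu-operator`` from the ``nvidia`` chart repository and the 550 driver branch:

  .. code-block:: console

     $ helm upgrade --install gpu-operator nvidia/gpu-operator \
         -n gpu-operator \
         --set driver.usePrecompiled=true \
         --set driver.useOpenKernelModules=true \
         --set driver.version=550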

* Added support for new MIG profiles.

* For H200 devices:

* ``1g.18gb``
* ``1g.18gb+me``
* ``1g.35gb``
* ``2g.35gb``
* ``3g.71gb``
* ``4g.71gb``
* ``7g.141gb``

* Added an ``all-balanced`` profile for H20 devices that creates the following GPU instances:

* ``1g.12gb`` :math:`\times` 2
* ``2g.24gb`` :math:`\times` 1
* ``3g.48gb`` :math:`\times` 1
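
  To apply a profile such as ``all-balanced`` to a node, the usual MIG Manager workflow is to set the ``nvidia.com/mig.config`` node label, for example:

  .. code-block:: console

     $ kubectl label nodes <node-name> nvidia.com/mig.config=all-balanced --overwrite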

* Added support for creating a config map with custom MIG profiles during installation or upgrade with Helm.
Refer to :ref:`Example: Custom MIG Configuration During Installation` for more information.
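
  As an illustration, a custom configuration for an H200 node might slice each GPU into seven ``1g.18gb`` instances; the profile counts, the ``custom-mig-config`` name, and the file name are assumptions, and ``migManager.config.name`` with a ``config.yaml`` key follows the convention documented in earlier releases:

  .. code-block:: yaml

     # custom-mig-config.yaml -- illustrative custom MIG configuration
     version: v1
     mig-configs:
       seven-1g-18gb:
         - devices: all
           mig-enabled: true
           mig-devices:
             "1g.18gb": 7

  .. code-block:: console

     $ kubectl create configmap custom-mig-config -n gpu-operator \
         --from-file=config.yaml=custom-mig-config.yaml
     $ helm upgrade --install gpu-operator nvidia/gpu-operator \
         -n gpu-operator \
         --set migManager.config.name=custom-mig-config

  Label the target node with ``nvidia.com/mig.config=seven-1g-18gb`` to apply the configuration, as in the previous example.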

.. _v24.6.0-fixed-issues:

Fixed Issues
------------

* Role-based access controls for the following components were reviewed and revised to use least-required privileges:

* GPU Operator
* Operator Validator
* MIG Manager
* GPU Driver Manager
* GPU Feature Discovery
* Kubernetes Device Plugin
* KubeVirt Device Plugin
* vGPU Host Manager

In previous releases, the permissions were broader than necessary.

* Fixed an issue with Node Feature Discovery (NFD).
When an NFD pod was deleted or restarted, all NFD node labels were removed from the node and GPU Operator operands were restarted.
The v0.16.2 release of NFD fixes the issue.
Refer to GitHub `issue #782 <https://github.com/NVIDIA/gpu-operator/issues/782>`__ for more details.

* Fixed an issue with NVIDIA vGPU Manager not working correctly on nodes with GPUs that require Open Kernel module drivers and GPU System Processor (GSP) firmware.
Refer to GitHub `issue #761 <https://github.com/NVIDIA/gpu-operator/issues/761>`__ for more details.

* DCGM is revised to use a cluster IP and a service with the internal traffic policy set to ``Local``.
In previous releases, DCGM was a host networked pod.
The ``dcgm.hostPort`` field of the NVIDIA cluster policy resource is now deprecated.

* Fixed an issue that prevented enabling GDRCopy and additional volume mounts with the NVIDIA Driver custom resource.
Previously, the driver daemon set did not update with the change and the Operator logs included an error message.
Refer to GitHub `issue #713 <https://github.com/NVIDIA/gpu-operator/issues/713>`__ for more details.

* Fixed an issue with deleting GPU Driver daemon sets that had misscheduled pods rather than zero pods.
Previously, if a node had an untolerated taint such as ``node.kubernetes.io/unreachable:NoSchedule``,
the Operator could repeatedly delete and recreate the driver daemon sets.
Refer to GitHub `issue #715 <https://github.com/NVIDIA/gpu-operator/issues/715>`__ for more details.

* Fixed an issue with reporting the correct GPU capacity and allocatable resources from the KubeVirt GPU Device Plugin.
Previously, if a GPU became unavailable, the reported GPU capacity and allocatable resources remained unchanged.
Refer to GitHub `issue #97 <https://github.com/NVIDIA/kubevirt-gpu-device-plugin/issues/97>`__ for more details.

.. _v24.6.0-known-limitations:

Known Limitations
------------------

* The ``1g.12gb`` MIG profile does not operate as expected on the NVIDIA GH200 GPU when the MIG configuration is set to ``all-balanced``.
* The GPU Driver container does not run on hosts that have a custom kernel with the SEV-SNP CPU feature
because of the missing ``kernel-headers`` package within the container.
With a custom kernel, NVIDIA recommends pre-installing the NVIDIA drivers on the host if you want to
run traditional container workloads with NVIDIA GPUs.
* If you cordon a node while the GPU driver upgrade process is already in progress,
the Operator uncordons the node and upgrades the driver on the node.
You can determine if an upgrade is in progress by checking the node label
``nvidia.com/gpu-driver-upgrade-state != upgrade-done``.
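
  For example, to list nodes whose driver upgrade has not finished (nodes without the label also match this selector):

  .. code-block:: console

     $ kubectl get nodes -l 'nvidia.com/gpu-driver-upgrade-state!=upgrade-done'
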
* NVIDIA vGPU is incompatible with KubeVirt v0.58.0, v0.58.1, and v0.59.0, as well
as OpenShift Virtualization 4.12.0---4.12.2.
* Using NVIDIA vGPU on bare metal nodes and NVSwitch is not supported.
* All worker nodes in the Kubernetes cluster must run the same operating system version to use the NVIDIA GPU Driver container.
Alternatively, if you pre-install the NVIDIA GPU Driver on the nodes, then you can run different operating systems.
The technical preview feature that provides :doc:`gpu-driver-configuration` is also an alternative.
* NVIDIA GPUDirect Storage (GDS) is not supported with secure boot enabled systems.
* The NVIDIA GPU Operator can only be used to deploy a single NVIDIA GPU Driver type and version.
The NVIDIA vGPU and Data Center GPU Driver cannot be used within the same cluster.
The technical preview feature that provides :doc:`gpu-driver-configuration` is an alternative.
* The ``nouveau`` driver must be blacklisted when using NVIDIA vGPU.
Otherwise the driver fails to initialize the GPU with the error ``Failed to enable MSI-X`` in the system journal logs.
Additionally, all GPU operator pods become stuck in the ``Init`` state.
* When using RHEL 8 with containerd as the runtime and SELinux is enabled (either in permissive or enforcing mode)
at the host level, containerd must also be configured for SELinux, such as setting the ``enable_selinux=true``
configuration option.
Additionally, network-restricted environments are not supported.
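
  A minimal sketch of the containerd setting, assuming the containerd v2 configuration format and the default ``/etc/containerd/config.toml`` path; restart containerd after editing the file:

  .. code-block:: toml

     # /etc/containerd/config.toml (fragment)
     version = 2

     [plugins."io.containerd.grpc.v1.cri"]
       enable_selinux = true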

.. _v24.3.0:

24.3.0
8 changes: 4 additions & 4 deletions gpu-operator/versions.json
@@ -1,7 +1,10 @@
{
"latest": "24.3.0",
"latest": "24.6.0",
"versions":
[
{
"version": "24.6.0"
},
{
"version": "24.3.0"
},
@@ -19,9 +22,6 @@
},
{
"version": "23.6.1"
},
{
"version": "23.6.0"
}
]
}
4 changes: 2 additions & 2 deletions repo.toml
@@ -12,7 +12,7 @@ author = "NVIDIA Corporation"

sphinx_conf_py_extra = """
myst_enable_extensions = [
"colon_fence",
"colon_fence", "dollarmath",
]
templates_path = ['${root}/templates']
extensions.extend([
@@ -140,7 +140,7 @@ output_format = "linkcheck"
docs_root = "${root}/gpu-operator"
project = "gpu-operator"
name = "NVIDIA GPU Operator"
version = "24.3.0"
version = "24.6.0"
copyright_start = 2020
sphinx_exclude_patterns = [
"life-cycle-policy.rst",
