ORT 1.19.2 Release: Cherry Pick Round 1 (#21861)

Approved cherry picks for ORT 1.19.2 release.

---------

Co-authored-by: Yi Zhang <[email protected]>
Co-authored-by: Edward Chen <[email protected]>
Co-authored-by: Ye Wang <[email protected]>
Co-authored-by: Your Name <[email protected]>
Co-authored-by: Tianlei Wu <[email protected]>
Co-authored-by: aciddelgado <[email protected]>
Co-authored-by: mindest <[email protected]>
Co-authored-by: Changming Sun <[email protected]>
microsoft · Aug 30, 2024 · ffceed9 · ffceed9
1 parent d651463
commit ffceed9
Show file tree

Hide file tree

Showing 79 changed files with 1,788 additions and 847 deletions.
diff --git a/VERSION_NUMBER b/VERSION_NUMBER
@@ -1 +1 @@
-1.19.1
+1.19.2
diff --git a/cmake/patches/abseil/absl_windows.patch b/cmake/patches/abseil/absl_windows.patch
@@ -74,6 +74,19 @@ index 2d85ac74..4875d668 100644
      # The decorated name was longer than the compiler limit
      "/wd4503",
      # forcing value to bool 'true' or 'false' (performance warning)
+diff --git a/absl/debugging/symbolize.cc b/absl/debugging/symbolize.cc
+index 638d3954..6b817075 100644
+--- a/absl/debugging/symbolize.cc
++++ b/absl/debugging/symbolize.cc
+@@ -14,7 +14,7 @@
+
+ #include "absl/debugging/symbolize.h"
+
+-#ifdef _WIN32
++#if defined(_WIN32) && !defined(NDEBUG)
+ #include <winapifamily.h>
+ #if !(WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP)) || \
+     WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
 diff --git a/absl/debugging/symbolize_win32.inc b/absl/debugging/symbolize_win32.inc
 index 53a099a1..34d210d6 100644
 --- a/absl/debugging/symbolize_win32.inc

diff --git a/cmake/patches/cutlass/cutlass_3.5.0.patch b/cmake/patches/cutlass/cutlass_3.5.0.patch
@@ -1,13 +1,64 @@
+diff --git a/examples/41_fused_multi_head_attention/kernel_forward.h b/examples/41_fused_multi_head_attention/kernel_forward.h
+index 4c80f549..34327633 100644
+--- a/examples/41_fused_multi_head_attention/kernel_forward.h
++++ b/examples/41_fused_multi_head_attention/kernel_forward.h
+@@ -221,6 +221,8 @@ struct AttentionKernel {
+     int32_t num_batches = 0;
+     int32_t num_heads = 0;
+
++    bool use_smooth_softmax = false;
++
+     // dropout
+     bool use_dropout = false;
+     unsigned long long dropout_batch_head_rng_offset = 0;
+@@ -897,7 +899,8 @@ struct AttentionKernel {
+           p.num_keys - iter_key_start,
+           iter_key_start == 0,
+           iteratorC_tile_offset,
+-          kSupportsBias ? 1.0f : p.scale);
++          kSupportsBias ? 1.0f : p.scale,
++          p.use_smooth_softmax);
+
+       // Output results to shared-memory
+       int warp_idx_mn_0 = my_warp_id %
+@@ -1166,7 +1169,8 @@ struct AttentionKernel {
+       int max_col,
+       bool is_first,
+       typename WarpIteratorC::TensorCoord const& tile_offset,
+-      float scaling) {
++      float scaling,
++      bool use_smooth_softmax) {
+     /* Iterates on the accumulator and corresponding position on result matrix
+
+     (1) Update `mi[r]` to the max value of the row `r`
+@@ -1257,7 +1261,7 @@ struct AttentionKernel {
+       accum_t mi_row, total_row;
+       LambdaIterator::iterateRows(
+           lane_offset,
+-          [&](int accum_m) { mi_row = mi[accum_m]; },
++          [&](int accum_m) { mi_row = mi[accum_m];},
+           [&](int accum_m, int accum_n, int idx) {
+             frag[idx] =
+                 (accum_n < max_col) ? exp2f(frag[idx] - mi_row) : accum_t(0.0);
+@@ -1294,7 +1298,7 @@ struct AttentionKernel {
+       for (int i = 0; i < MM0::MmaCore::WarpCount::kN; ++i) {
+         total_row += addition_storage[id + kQueriesPerBlock * i];
+       }
+-      s_prime[id] = total_row;
++      s_prime[id] = (use_smooth_softmax && (max_col <= kKeysPerBlock)) ? total_row + exp2f(-mi[id]) : total_row;
+     }
+   }
+
 diff --git a/include/cutlass/functional.h b/include/cutlass/functional.h
 index 964d2ff3..b366bc14 100644
 --- a/include/cutlass/functional.h
 +++ b/include/cutlass/functional.h
 @@ -39,6 +39,7 @@
  #include "cutlass/numeric_types.h"
- 
+
  #include <cuda_runtime.h>
 +#include <cuda_fp16.h>
- 
+
  #if defined(CUTLASS_ARCH_WMMA_ENABLED)
  #include <mma.h>
 @@ -230,8 +231,12 @@ struct inverse_square_root<half_t> {
@@ -19,7 +70,7 @@ index 964d2ff3..b366bc14 100644
      return reinterpret_cast<half_t const &>(result);
 +#else
 +    return half_t::convert((rsqrtf(half_t::convert(lhs))));
-+#endif    
++#endif
  #else
      return half_t(1.f / std::sqrt(half_t::convert(lhs)));
- #endif
+ #endif
diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md
@@ -2482,6 +2482,8 @@ This version of the operator has been available since version 1 of the 'com.micr
 <dd>Rotate using interleaved pattern. Default value is 0 (False).</dd>
 <dt><tt>scale</tt> : float</dt>
 <dd>Custom scale will be used if specified. Default value is 1/sqrt(head_size)</dd>
+<dt><tt>smooth_softmax</tt> : int</dt>
+<dd>Use a smooth factor in softmax.</dd>
 </dl>
 
 #### Inputs (7 - 9)
@@ -3022,6 +3024,8 @@ This version of the operator has been available since version 1 of the 'com.micr
 <dd>Number of top experts to select from expert pool</dd>
 <dt><tt>normalize_routing_weights</tt> : int</dt>
 <dd>Whether to normalize routing weights</dd>
+<dt><tt>use_sparse_mixer</tt> : int</dt>
+<dd>Whether to use sparse mixer</dd>
 </dl>
 
 #### Inputs (5 - 8)
@@ -4337,7 +4341,7 @@ This version of the operator has been available since version 1 of the 'com.micr
 
 ### <a name="com.microsoft.QMoE"></a><a name="com.microsoft.qmoe">**com.microsoft.QMoE**</a>
 
-  Int4 MoE
+  Quantized MoE
 
 #### Version
 
@@ -4348,10 +4352,14 @@ This version of the operator has been available since version 1 of the 'com.micr
 <dl>
 <dt><tt>activation_type</tt> : string</dt>
 <dd>Activation function to use. Choose from relu, gelu, silu and identity. Default is relu</dd>
+<dt><tt>expert_weight_bits</tt> : int</dt>
+<dd>Number of bits used in quantized weights. Default is 4 bits</dd>
 <dt><tt>k</tt> : int</dt>
 <dd>Number of top experts to select from expert pool</dd>
 <dt><tt>normalize_routing_weights</tt> : int</dt>
 <dd>Whether to normalize routing weights</dd>
+<dt><tt>use_sparse_mixer</tt> : int</dt>
+<dd>Whether to use sparse mixer</dd>
 </dl>
 
 #### Inputs (7 - 11)
@@ -4362,19 +4370,19 @@ This version of the operator has been available since version 1 of the 'com.micr
 <dt><tt>router_probs</tt> : T</dt>
 <dd>2D input tensor with shape (num_rows, num_experts)</dd>
 <dt><tt>fc1_experts_weights</tt> : T1</dt>
-<dd>3D input tensor with shape (num_experts, hidden_size, inter_size / 2)</dd>
+<dd>3D input tensor with shape (num_experts, hidden_size, inter_size) or (num_experts, hidden_size, inter_size / 2)</dd>
 <dt><tt>fc1_scales</tt> : T</dt>
 <dd>2D input tensor with shape (num_experts, inter_size)</dd>
 <dt><tt>fc1_experts_bias</tt> (optional) : T</dt>
 <dd>2D optional input tensor with shape (num_experts, inter_size)</dd>
 <dt><tt>fc2_experts_weights</tt> : T1</dt>
-<dd>3D input tensor with shape (num_experts, inter_size, hidden_size / 2)</dd>
+<dd>3D input tensor with shape (num_experts, inter_size, hidden_size) or (num_experts, inter_size, hidden_size / 2)</dd>
 <dt><tt>fc2_scales</tt> : T</dt>
 <dd>2D input tensor with shape (num_experts, hidden_size)</dd>
 <dt><tt>fc2_experts_bias</tt> (optional) : T</dt>
 <dd>2D optional input tensor with shape (num_experts, hidden_size)</dd>
 <dt><tt>fc3_experts_weights</tt> (optional) : T1</dt>
-<dd>3D optional input tensor with shape (num_experts, hidden_size, inter_size / 2)</dd>
+<dd>3D optional input tensor with shape (num_experts, hidden_size, inter_size) or (num_experts, hidden_size, inter_size / 2)</dd>
 <dt><tt>fc3_scales</tt> (optional) : T</dt>
 <dd>2D optional input tensor with shape (num_experts, inter_size)</dd>
 <dt><tt>fc3_experts_bias</tt> (optional) : T</dt>

diff --git a/docs/python/README.rst b/docs/python/README.rst
@@ -8,6 +8,11 @@ For more information on ONNX Runtime, please see `aka.ms/onnxruntime <https://ak
 Changes
 -------
 
+1.19.2
+^^^^^^
+
+Release Notes : https://github.com/Microsoft/onnxruntime/releases/tag/v1.19.2
+
 1.19.1
 ^^^^^^
 

diff --git a/include/onnxruntime/core/graph/graph_nodes.h b/include/onnxruntime/core/graph/graph_nodes.h
@@ -117,13 +117,14 @@ class ValidNodes {
       return (current_ != other.current_);
     }
 
-    void operator++() {
+    NodeIterator<TIterator>& operator++() {
       if (current_ < end_) {
         while (++current_ != end_) {
           if (*current_ != nullptr && (!apply_filter_ || (*filter_func_)((*current_)->Index()) == false))
             break;
         }
       }
+      return *this;
     }
 
     NodeIterator<TIterator> operator++(int) {

diff --git a/js/common/lib/version.ts b/js/common/lib/version.ts
@@ -4,4 +4,4 @@
 // This file is generated by /js/scripts/update-version.ts
 // Do not modify file content manually.
 
-export const version = '1.19.1';
+export const version = '1.19.2';
diff --git a/js/common/package-lock.json b/js/common/package-lock.json
diff --git a/js/common/package.json b/js/common/package.json
@@ -2,7 +2,7 @@
   "license": "MIT",
   "type": "module",
   "name": "onnxruntime-common",
-  "version": "1.19.1",
+  "version": "1.19.2",
   "repository": {
     "url": "https://github.com/Microsoft/onnxruntime.git",
     "type": "git"

diff --git a/js/node/lib/version.ts b/js/node/lib/version.ts
@@ -4,4 +4,4 @@
 // This file is generated by /js/scripts/update-version.ts
 // Do not modify file content manually.
 
-export const version = '1.19.1';
+export const version = '1.19.2';
diff --git a/js/node/package-lock.json b/js/node/package-lock.json
diff --git a/js/node/package.json b/js/node/package.json
@@ -13,7 +13,7 @@
       3
     ]
   },
-  "version": "1.19.1",
+  "version": "1.19.2",
   "dependencies": {
     "onnxruntime-common": "file:../common",
     "tar": "^7.0.1"

diff --git a/js/react_native/lib/version.ts b/js/react_native/lib/version.ts
@@ -4,4 +4,4 @@
 // This file is generated by /js/scripts/update-version.ts
 // Do not modify file content manually.
 
-export const version = '1.19.1';
+export const version = '1.19.2';
diff --git a/js/react_native/package.json b/js/react_native/package.json
@@ -36,7 +36,7 @@
     "registry": "https://registry.npmjs.org/"
   },
   "source": "lib/index",
-  "version": "1.19.1",
+  "version": "1.19.2",
   "main": "dist/commonjs/index",
   "homepage": "https://github.com/microsoft/onnxruntime/blob/main/js/react_native/README.md",
   "files": [

diff --git a/js/react_native/yarn.lock b/js/react_native/yarn.lock
@@ -5254,7 +5254,7 @@ onetime@^5.1.0, onetime@^5.1.2:
     mimic-fn "^2.1.0"
 
 "onnxruntime-common@file:../common":
-  version "1.19.1"
+  version "1.19.2"
 
 open@^6.2.0:
   version "6.4.0"

diff --git a/js/web/lib/version.ts b/js/web/lib/version.ts
@@ -4,4 +4,4 @@
 // This file is generated by /js/scripts/update-version.ts
 // Do not modify file content manually.
 
-export const version = '1.19.1';
+export const version = '1.19.2';
diff --git a/js/web/package-lock.json b/js/web/package-lock.json
diff --git a/js/web/package.json b/js/web/package.json
@@ -7,7 +7,7 @@
     "type": "git"
   },
   "author": "fs-eire",
-  "version": "1.19.1",
+  "version": "1.19.2",
   "jsdelivr": "dist/ort.min.js",
   "dependencies": {
     "flatbuffers": "^1.12.0",

diff --git a/onnxruntime/__init__.py b/onnxruntime/__init__.py
@@ -7,7 +7,7 @@
 For more information on ONNX Runtime, please see `aka.ms/onnxruntime <https://aka.ms/onnxruntime/>`_
 or the `Github project <https://github.com/microsoft/onnxruntime/>`_.
 """
-__version__ = "1.19.1"
+__version__ = "1.19.2"
 __author__ = "Microsoft"
 
 # we need to do device version validation (for example to check Cuda version for an onnxruntime-training package).

diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_common.h b/onnxruntime/contrib_ops/cpu/bert/attention_common.h
@@ -99,6 +99,7 @@ struct GroupQueryAttentionParameters {
   int sequence_length;          // sequence length of input query, key, value
   int seqlen_past_kv_cache;     // sequence length of past kv tensor
   int seqlen_present_kv_cache;  // sequence length of present kv tensor
+  int total_sequence_length;    // maximum total sequence length (past_sequence_length + sequence_length) among keys
   int hidden_size;
   int num_heads;
   int head_size;
@@ -113,6 +114,7 @@ struct GroupQueryAttentionParameters {
   bool is_prompt;  // determines if seqlens_k is past or kv sequence length tensor
   bool do_rotary;
   bool rotary_interleaved;
+  bool use_smooth_softmax;
   float scale;
   AttentionQkvFormat qkv_format;
   AttentionQkvFormat past_kv_format;