Skip to content

Commit

Permalink
ORT 1.19.2 Release: Cherry Pick Round 1 (#21861)
Browse files Browse the repository at this point in the history
Approved cherry picks for ORT 1.19.2 release.

---------

Co-authored-by: Yi Zhang <[email protected]>
Co-authored-by: Edward Chen <[email protected]>
Co-authored-by: Ye Wang <[email protected]>
Co-authored-by: Your Name <[email protected]>
Co-authored-by: Tianlei Wu <[email protected]>
Co-authored-by: aciddelgado <[email protected]>
Co-authored-by: mindest <[email protected]>
Co-authored-by: Changming Sun <[email protected]>
  • Loading branch information
9 people committed Aug 30, 2024
1 parent d651463 commit ffceed9
Show file tree
Hide file tree
Showing 79 changed files with 1,788 additions and 847 deletions.
2 changes: 1 addition & 1 deletion VERSION_NUMBER
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.19.1
1.19.2
13 changes: 13 additions & 0 deletions cmake/patches/abseil/absl_windows.patch
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,19 @@ index 2d85ac74..4875d668 100644
# The decorated name was longer than the compiler limit
"/wd4503",
# forcing value to bool 'true' or 'false' (performance warning)
diff --git a/absl/debugging/symbolize.cc b/absl/debugging/symbolize.cc
index 638d3954..6b817075 100644
--- a/absl/debugging/symbolize.cc
+++ b/absl/debugging/symbolize.cc
@@ -14,7 +14,7 @@

#include "absl/debugging/symbolize.h"

-#ifdef _WIN32
+#if defined(_WIN32) && !defined(NDEBUG)
#include <winapifamily.h>
#if !(WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP)) || \
WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
diff --git a/absl/debugging/symbolize_win32.inc b/absl/debugging/symbolize_win32.inc
index 53a099a1..34d210d6 100644
--- a/absl/debugging/symbolize_win32.inc
Expand Down
59 changes: 55 additions & 4 deletions cmake/patches/cutlass/cutlass_3.5.0.patch
Original file line number Diff line number Diff line change
@@ -1,13 +1,64 @@
diff --git a/examples/41_fused_multi_head_attention/kernel_forward.h b/examples/41_fused_multi_head_attention/kernel_forward.h
index 4c80f549..34327633 100644
--- a/examples/41_fused_multi_head_attention/kernel_forward.h
+++ b/examples/41_fused_multi_head_attention/kernel_forward.h
@@ -221,6 +221,8 @@ struct AttentionKernel {
int32_t num_batches = 0;
int32_t num_heads = 0;

+ bool use_smooth_softmax = false;
+
// dropout
bool use_dropout = false;
unsigned long long dropout_batch_head_rng_offset = 0;
@@ -897,7 +899,8 @@ struct AttentionKernel {
p.num_keys - iter_key_start,
iter_key_start == 0,
iteratorC_tile_offset,
- kSupportsBias ? 1.0f : p.scale);
+ kSupportsBias ? 1.0f : p.scale,
+ p.use_smooth_softmax);

// Output results to shared-memory
int warp_idx_mn_0 = my_warp_id %
@@ -1166,7 +1169,8 @@ struct AttentionKernel {
int max_col,
bool is_first,
typename WarpIteratorC::TensorCoord const& tile_offset,
- float scaling) {
+ float scaling,
+ bool use_smooth_softmax) {
/* Iterates on the accumulator and corresponding position on result matrix

(1) Update `mi[r]` to the max value of the row `r`
@@ -1257,7 +1261,7 @@ struct AttentionKernel {
accum_t mi_row, total_row;
LambdaIterator::iterateRows(
lane_offset,
- [&](int accum_m) { mi_row = mi[accum_m]; },
+ [&](int accum_m) { mi_row = mi[accum_m];},
[&](int accum_m, int accum_n, int idx) {
frag[idx] =
(accum_n < max_col) ? exp2f(frag[idx] - mi_row) : accum_t(0.0);
@@ -1294,7 +1298,7 @@ struct AttentionKernel {
for (int i = 0; i < MM0::MmaCore::WarpCount::kN; ++i) {
total_row += addition_storage[id + kQueriesPerBlock * i];
}
- s_prime[id] = total_row;
+ s_prime[id] = (use_smooth_softmax && (max_col <= kKeysPerBlock)) ? total_row + exp2f(-mi[id]) : total_row;
}
}

diff --git a/include/cutlass/functional.h b/include/cutlass/functional.h
index 964d2ff3..b366bc14 100644
--- a/include/cutlass/functional.h
+++ b/include/cutlass/functional.h
@@ -39,6 +39,7 @@
#include "cutlass/numeric_types.h"

#include <cuda_runtime.h>
+#include <cuda_fp16.h>

#if defined(CUTLASS_ARCH_WMMA_ENABLED)
#include <mma.h>
@@ -230,8 +231,12 @@ struct inverse_square_root<half_t> {
Expand All @@ -19,7 +70,7 @@ index 964d2ff3..b366bc14 100644
return reinterpret_cast<half_t const &>(result);
+#else
+ return half_t::convert((rsqrtf(half_t::convert(lhs))));
+#endif
+#endif
#else
return half_t(1.f / std::sqrt(half_t::convert(lhs)));
#endif
#endif
16 changes: 12 additions & 4 deletions docs/ContribOperators.md
Original file line number Diff line number Diff line change
Expand Up @@ -2482,6 +2482,8 @@ This version of the operator has been available since version 1 of the 'com.micr
<dd>Rotate using interleaved pattern. Default value is 0 (False).</dd>
<dt><tt>scale</tt> : float</dt>
<dd>Custom scale will be used if specified. Default value is 1/sqrt(head_size)</dd>
<dt><tt>smooth_softmax</tt> : int</dt>
<dd>Use a smooth factor in softmax.</dd>
</dl>

#### Inputs (7 - 9)
Expand Down Expand Up @@ -3022,6 +3024,8 @@ This version of the operator has been available since version 1 of the 'com.micr
<dd>Number of top experts to select from expert pool</dd>
<dt><tt>normalize_routing_weights</tt> : int</dt>
<dd>Whether to normalize routing weights</dd>
<dt><tt>use_sparse_mixer</tt> : int</dt>
<dd>Whether to use sparse mixer</dd>
</dl>

#### Inputs (5 - 8)
Expand Down Expand Up @@ -4337,7 +4341,7 @@ This version of the operator has been available since version 1 of the 'com.micr

### <a name="com.microsoft.QMoE"></a><a name="com.microsoft.qmoe">**com.microsoft.QMoE**</a>

Int4 MoE
Quantized MoE

#### Version

Expand All @@ -4348,10 +4352,14 @@ This version of the operator has been available since version 1 of the 'com.micr
<dl>
<dt><tt>activation_type</tt> : string</dt>
<dd>Activation function to use. Choose from relu, gelu, silu and identity. Default is relu</dd>
<dt><tt>expert_weight_bits</tt> : int</dt>
<dd>Number of bits used in quantized weights. Default is 4 bits</dd>
<dt><tt>k</tt> : int</dt>
<dd>Number of top experts to select from expert pool</dd>
<dt><tt>normalize_routing_weights</tt> : int</dt>
<dd>Whether to normalize routing weights</dd>
<dt><tt>use_sparse_mixer</tt> : int</dt>
<dd>Whether to use sparse mixer</dd>
</dl>

#### Inputs (7 - 11)
Expand All @@ -4362,19 +4370,19 @@ This version of the operator has been available since version 1 of the 'com.micr
<dt><tt>router_probs</tt> : T</dt>
<dd>2D input tensor with shape (num_rows, num_experts)</dd>
<dt><tt>fc1_experts_weights</tt> : T1</dt>
<dd>3D input tensor with shape (num_experts, hidden_size, inter_size / 2)</dd>
<dd>3D input tensor with shape (num_experts, hidden_size, inter_size) or (num_experts, hidden_size, inter_size / 2)</dd>
<dt><tt>fc1_scales</tt> : T</dt>
<dd>2D input tensor with shape (num_experts, inter_size)</dd>
<dt><tt>fc1_experts_bias</tt> (optional) : T</dt>
<dd>2D optional input tensor with shape (num_experts, inter_size)</dd>
<dt><tt>fc2_experts_weights</tt> : T1</dt>
<dd>3D input tensor with shape (num_experts, inter_size, hidden_size / 2)</dd>
<dd>3D input tensor with shape (num_experts, inter_size, hidden_size) or (num_experts, inter_size, hidden_size / 2)</dd>
<dt><tt>fc2_scales</tt> : T</dt>
<dd>2D input tensor with shape (num_experts, hidden_size)</dd>
<dt><tt>fc2_experts_bias</tt> (optional) : T</dt>
<dd>2D optional input tensor with shape (num_experts, hidden_size)</dd>
<dt><tt>fc3_experts_weights</tt> (optional) : T1</dt>
<dd>3D optional input tensor with shape (num_experts, hidden_size, inter_size / 2)</dd>
<dd>3D optional input tensor with shape (num_experts, hidden_size, inter_size) or (num_experts, hidden_size, inter_size / 2)</dd>
<dt><tt>fc3_scales</tt> (optional) : T</dt>
<dd>2D optional input tensor with shape (num_experts, inter_size)</dd>
<dt><tt>fc3_experts_bias</tt> (optional) : T</dt>
Expand Down
5 changes: 5 additions & 0 deletions docs/python/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@ For more information on ONNX Runtime, please see `aka.ms/onnxruntime <https://ak
Changes
-------

1.19.2
^^^^^^

Release Notes : https://github.com/Microsoft/onnxruntime/releases/tag/v1.19.2

1.19.1
^^^^^^

Expand Down
3 changes: 2 additions & 1 deletion include/onnxruntime/core/graph/graph_nodes.h
Original file line number Diff line number Diff line change
Expand Up @@ -117,13 +117,14 @@ class ValidNodes {
return (current_ != other.current_);
}

void operator++() {
NodeIterator<TIterator>& operator++() {
if (current_ < end_) {
while (++current_ != end_) {
if (*current_ != nullptr && (!apply_filter_ || (*filter_func_)((*current_)->Index()) == false))
break;
}
}
return *this;
}

NodeIterator<TIterator> operator++(int) {
Expand Down
2 changes: 1 addition & 1 deletion js/common/lib/version.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
// This file is generated by /js/scripts/update-version.ts
// Do not modify file content manually.

export const version = '1.19.1';
export const version = '1.19.2';
4 changes: 2 additions & 2 deletions js/common/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion js/common/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"license": "MIT",
"type": "module",
"name": "onnxruntime-common",
"version": "1.19.1",
"version": "1.19.2",
"repository": {
"url": "https://github.com/Microsoft/onnxruntime.git",
"type": "git"
Expand Down
2 changes: 1 addition & 1 deletion js/node/lib/version.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
// This file is generated by /js/scripts/update-version.ts
// Do not modify file content manually.

export const version = '1.19.1';
export const version = '1.19.2';
6 changes: 3 additions & 3 deletions js/node/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion js/node/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
3
]
},
"version": "1.19.1",
"version": "1.19.2",
"dependencies": {
"onnxruntime-common": "file:../common",
"tar": "^7.0.1"
Expand Down
2 changes: 1 addition & 1 deletion js/react_native/lib/version.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
// This file is generated by /js/scripts/update-version.ts
// Do not modify file content manually.

export const version = '1.19.1';
export const version = '1.19.2';
2 changes: 1 addition & 1 deletion js/react_native/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
"registry": "https://registry.npmjs.org/"
},
"source": "lib/index",
"version": "1.19.1",
"version": "1.19.2",
"main": "dist/commonjs/index",
"homepage": "https://github.com/microsoft/onnxruntime/blob/main/js/react_native/README.md",
"files": [
Expand Down
2 changes: 1 addition & 1 deletion js/react_native/yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -5254,7 +5254,7 @@ onetime@^5.1.0, onetime@^5.1.2:
mimic-fn "^2.1.0"

"onnxruntime-common@file:../common":
version "1.19.1"
version "1.19.2"

open@^6.2.0:
version "6.4.0"
Expand Down
2 changes: 1 addition & 1 deletion js/web/lib/version.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
// This file is generated by /js/scripts/update-version.ts
// Do not modify file content manually.

export const version = '1.19.1';
export const version = '1.19.2';
6 changes: 3 additions & 3 deletions js/web/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion js/web/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"type": "git"
},
"author": "fs-eire",
"version": "1.19.1",
"version": "1.19.2",
"jsdelivr": "dist/ort.min.js",
"dependencies": {
"flatbuffers": "^1.12.0",
Expand Down
2 changes: 1 addition & 1 deletion onnxruntime/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
For more information on ONNX Runtime, please see `aka.ms/onnxruntime <https://aka.ms/onnxruntime/>`_
or the `Github project <https://github.com/microsoft/onnxruntime/>`_.
"""
__version__ = "1.19.1"
__version__ = "1.19.2"
__author__ = "Microsoft"

# we need to do device version validation (for example to check Cuda version for an onnxruntime-training package).
Expand Down
2 changes: 2 additions & 0 deletions onnxruntime/contrib_ops/cpu/bert/attention_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ struct GroupQueryAttentionParameters {
int sequence_length; // sequence length of input query, key, value
int seqlen_past_kv_cache; // sequence length of past kv tensor
int seqlen_present_kv_cache; // sequence length of present kv tensor
int total_sequence_length; // maximum total sequence length (past_sequence_length + sequence_length) among keys
int hidden_size;
int num_heads;
int head_size;
Expand All @@ -113,6 +114,7 @@ struct GroupQueryAttentionParameters {
bool is_prompt; // determines if seqlens_k is past or kv sequence length tensor
bool do_rotary;
bool rotary_interleaved;
bool use_smooth_softmax;
float scale;
AttentionQkvFormat qkv_format;
AttentionQkvFormat past_kv_format;
Expand Down
Loading

0 comments on commit ffceed9

Please sign in to comment.