Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vectorize basic_string::find_last_of #4934

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 17 additions & 9 deletions benchmarks/src/find_first_of.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

using namespace std;

// Selects what a benchmark instantiation measures:
//   std_func         - the free function find_first_of over a vector<T>
//   str_member_first - basic_string<T>::find_first_of
//   str_member_last  - basic_string<T>::find_last_of
// Only this three-state definition is kept: a second, bool-backed definition of the same name would be a
// redefinition, and a bool underlying type cannot represent three enumerators.
enum class AlgType { std_func, str_member_first, str_member_last };

template <AlgType Alg, class T, T Start = T{'a'}>
void bm(benchmark::State& state) {
Expand All @@ -22,7 +22,7 @@ void bm(benchmark::State& state) {
const size_t HSize = Pos * 2;
const size_t Which = 0;

using container = conditional_t<Alg == AlgType::str_member, basic_string<T>, vector<T>>;
using container = conditional_t<Alg == AlgType::std_func, vector<T>, basic_string<T>>;

container h(HSize, T{'.'});
container n(NSize, T{0});
Expand All @@ -37,26 +37,34 @@ void bm(benchmark::State& state) {
for (auto _ : state) {
benchmark::DoNotOptimize(h);
benchmark::DoNotOptimize(n);
if constexpr (Alg == AlgType::str_member) {
benchmark::DoNotOptimize(h.find_first_of(n.data(), 0, n.size()));
if constexpr (Alg == AlgType::str_member_first) {
benchmark::DoNotOptimize(h.find_first_of(n));
} else if constexpr (Alg == AlgType::str_member_last) {
benchmark::DoNotOptimize(h.find_last_of(n));
} else {
benchmark::DoNotOptimize(find_first_of(h.begin(), h.end(), n.begin(), n.end()));
}
}
}

// Registers the argument sets shared by every benchmark in this file on `bm`.
// Each pair is registered exactly once; registering the same pair twice would only make the run repeat an
// identically-named case.
// NOTE(review): the pairs are presumably {match position, needle size} as consumed by bm() - confirm against the
// part of bm() that reads `state`.
void common_args(auto bm) {
    bm->Args({2, 3})->Args({7, 4})->Args({9, 3})->Args({22, 5})->Args({58, 2})->Args({102, 4});
    bm->Args({325, 1})->Args({400, 50})->Args({1011, 11})->Args({1502, 23})->Args({3056, 7});
}

// Free-function find_first_of over vectors, one instantiation per unsigned element width.
BENCHMARK(bm<AlgType::std_func, uint8_t>)->Apply(common_args);
BENCHMARK(bm<AlgType::std_func, uint16_t>)->Apply(common_args);
BENCHMARK(bm<AlgType::std_func, uint32_t>)->Apply(common_args);
BENCHMARK(bm<AlgType::std_func, uint64_t>)->Apply(common_args);

// basic_string::find_first_of. The third template argument overrides the default Start character ('a');
// 0x03B1 is a start value that does not fit in 8 bits.
BENCHMARK(bm<AlgType::str_member_first, char>)->Apply(common_args);
BENCHMARK(bm<AlgType::str_member_first, wchar_t>)->Apply(common_args);
BENCHMARK(bm<AlgType::str_member_first, wchar_t, L'\x03B1'>)->Apply(common_args);
BENCHMARK(bm<AlgType::str_member_first, char32_t>)->Apply(common_args);
// The literal prefix matches the element type: U'...' is a char32_t literal, L'...' is a wchar_t literal.
BENCHMARK(bm<AlgType::str_member_first, char32_t, U'\x03B1'>)->Apply(common_args);

// basic_string::find_last_of.
BENCHMARK(bm<AlgType::str_member_last, char>)->Apply(common_args);
BENCHMARK(bm<AlgType::str_member_last, wchar_t>)->Apply(common_args);
BENCHMARK(bm<AlgType::str_member_last, wchar_t, L'\x03B1'>)->Apply(common_args);

BENCHMARK_MAIN();
110 changes: 89 additions & 21 deletions stl/inc/__msvc_string_view.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,42 @@ _STL_DISABLE_CLANG_WARNINGS
#pragma push_macro("new")
#undef new

#if _USE_STD_VECTOR_ALGORITHMS
extern "C" {
// The "noalias" attribute tells the compiler optimizer that pointers going into these hand-vectorized algorithms
// won't be stored beyond the lifetime of the function, and that the function will only reference arrays denoted by
// those pointers. The optimizer also assumes in that case that a pointer parameter is not returned to the caller via
// the return value, so functions using "noalias" must usually return void. This attribute is valuable because these
// functions are in native code objects that the compiler cannot analyze. In the absence of the noalias attribute, the
// compiler has to assume that the denoted arrays are "globally address taken", and that any later calls to
// unanalyzable routines may modify those arrays.

// "Find last of" searches that report a position. The _1 / _2 suffix is the element size in bytes (see the sizeof
// dispatch in _Find_last_of_pos_vectorized). Both lengths are element counts, not byte counts. The result is a
// size_t rather than a pointer, so no pointer parameter is handed back through the return value.
// NOTE(review): the value returned when nothing matches is not visible here - presumably static_cast<size_t>(-1);
// confirm against the implementation.
__declspec(noalias) size_t __stdcall __std_find_last_of_trivial_pos_1(
const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept;
__declspec(noalias) size_t __stdcall __std_find_last_of_trivial_pos_2(
const void* _Haystack, size_t _Haystack_length, const void* _Needle, size_t _Needle_length) noexcept;

} // extern "C"

_STD_BEGIN

// Forwards a "find last of" position search to the hand-vectorized routine that matches the element width.
// _Haystack / _Needle point at the first elements of the two ranges; the lengths are passed through unchanged.
// Both element types must have the same size, and only 1-byte and 2-byte elements are supported.
template <class _Ty1, class _Ty2>
size_t _Find_last_of_pos_vectorized(const _Ty1* const _Haystack, const size_t _Haystack_length,
    const _Ty2* const _Needle, const size_t _Needle_length) noexcept {
    constexpr size_t _Elem_size = sizeof(_Ty1);
    _STL_INTERNAL_STATIC_ASSERT(_Elem_size == sizeof(_Ty2));

    if constexpr (_Elem_size == 2) {
        return ::__std_find_last_of_trivial_pos_2(_Haystack, _Haystack_length, _Needle, _Needle_length);
    } else if constexpr (_Elem_size == 1) {
        return ::__std_find_last_of_trivial_pos_1(_Haystack, _Haystack_length, _Needle, _Needle_length);
    } else {
        _STL_INTERNAL_STATIC_ASSERT(false); // unexpected size
    }
}

_STD_END

#endif // _USE_STD_VECTOR_ALGORITHMS

_STD_BEGIN
#ifdef __clang__
#define _HAS_MEMCPY_MEMMOVE_INTRINSICS 1
Expand Down Expand Up @@ -719,10 +755,14 @@ constexpr size_t _Traits_find_first_of(_In_reads_(_Hay_size) const _Traits_ptr_t
const bool _Try_vectorize = _Hay_size - _Start_at > _Threshold_find_first_of;

// Additional condition for when the vectorization outperforms the table lookup
const bool _Use_bitmap = !_Try_vectorize || (sizeof(_Elem) > 1 && sizeof(_Elem) * _Needle_size > 16);
#else
constexpr size_t _Find_first_of_bitmap_threshold = sizeof(_Elem) == 1 ? 48
: sizeof(_Elem) == 8 ? 8
: 16;

const bool _Use_bitmap = !_Try_vectorize || _Needle_size > _Find_first_of_bitmap_threshold;
#else // ^^^ _USE_STD_VECTOR_ALGORITHMS / !_USE_STD_VECTOR_ALGORITHMS vvv
const bool _Use_bitmap = true;
#endif // _USE_STD_VECTOR_ALGORITHMS
#endif // ^^^ !_USE_STD_VECTOR_ALGORITHMS ^^^

if (_Use_bitmap) {
_String_bitmap<_Elem> _Matches;
Expand Down Expand Up @@ -770,31 +810,59 @@ constexpr size_t _Traits_find_last_of(_In_reads_(_Hay_size) const _Traits_ptr_t<
const size_t _Needle_size) noexcept {
// in [_Haystack, _Haystack + _Hay_size), look for last of [_Needle, _Needle + _Needle_size), before _Start_at
if (_Needle_size != 0 && _Hay_size != 0) { // worth searching, do it
const auto _Hay_start = (_STD min)(_Start_at, _Hay_size - 1);

if constexpr (_Special) {
_String_bitmap<typename _Traits::char_type> _Matches;
if (!_Matches._Mark(_Needle, _Needle + _Needle_size)) { // couldn't put one of the characters into the
// bitmap, fall back to the serial algorithm
return _Traits_find_last_of<_Traits, false>(_Haystack, _Hay_size, _Start_at, _Needle, _Needle_size);
}
if (!_STD _Is_constant_evaluated()) {
using _Elem = typename _Traits::char_type;

for (auto _Match_try = _Haystack + (_STD min)(_Start_at, _Hay_size - 1);; --_Match_try) {
if (_Matches._Match(*_Match_try)) {
return static_cast<size_t>(_Match_try - _Haystack); // found a match
}
bool _Use_bitmap = true;
#if _USE_STD_VECTOR_ALGORITHMS
bool _Try_vectorize = false;

if (_Match_try == _Haystack) {
break; // at beginning, no more chance for match
if constexpr (sizeof(_Elem) <= 2) {
_Try_vectorize = _Hay_size - _Start_at > _Threshold_find_first_of;
// Additional condition for when the vectorization outperforms the table lookup
constexpr size_t _Find_last_of_bitmap_threshold = sizeof(_Elem) == 1 ? 48 : 8;

_Use_bitmap = !_Try_vectorize || _Needle_size > _Find_last_of_bitmap_threshold;
}
}
} else {
for (auto _Match_try = _Haystack + (_STD min)(_Start_at, _Hay_size - 1);; --_Match_try) {
if (_Traits::find(_Needle, _Needle_size, *_Match_try)) {
return static_cast<size_t>(_Match_try - _Haystack); // found a match
#endif // _USE_STD_VECTOR_ALGORITHMS

if (_Use_bitmap) {
_String_bitmap<_Elem> _Matches;
if (_Matches._Mark(
_Needle, _Needle + _Needle_size)) { // couldn't put one of the characters into the
// bitmap, fall back to the serial algorithm
for (auto _Match_try = _Haystack + _Hay_start;; --_Match_try) {
if (_Matches._Match(*_Match_try)) {
return static_cast<size_t>(_Match_try - _Haystack); // found a match
}

if (_Match_try == _Haystack) {
break; // at beginning, no more chance for match
}
}
}
}

if (_Match_try == _Haystack) {
break; // at beginning, no more chance for match
#if _USE_STD_VECTOR_ALGORITHMS
if constexpr (sizeof(_Elem) <= 2) {
if (_Try_vectorize) {
return _STD _Find_last_of_pos_vectorized(_Haystack, _Hay_start + 1, _Needle, _Needle_size);
}
}
#endif // _USE_STD_VECTOR_ALGORITHMS
}
}

for (auto _Match_try = _Haystack + _Hay_start;; --_Match_try) {
if (_Traits::find(_Needle, _Needle_size, *_Match_try)) {
return static_cast<size_t>(_Match_try - _Haystack); // found a match
}

if (_Match_try == _Haystack) {
break; // at beginning, no more chance for match
}
}
}
Expand Down
Loading