Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Help the compiler vectorize adjacent_difference #4958

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ function(add_benchmark name)
target_link_libraries(benchmark-${name} PRIVATE benchmark::benchmark)
endfunction()

add_benchmark(adjacent_difference src/adjacent_difference.cpp)
add_benchmark(bitset_to_string src/bitset_to_string.cpp)
add_benchmark(efficient_nonlocking_print src/efficient_nonlocking_print.cpp)
add_benchmark(find_and_count src/find_and_count.cpp)
Expand Down
55 changes: 55 additions & 0 deletions benchmarks/src/adjacent_difference.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include <algorithm>
#include <benchmark/benchmark.h>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <numeric>
#include <random>
#include <type_traits>
#include <vector>

using namespace std;

// Benchmarks adjacent_difference over state.range(0) pseudo-random elements of type T.
// Floating-point inputs are drawn from normal_distribution<T>(-100.0, 100.0); integer inputs are drawn
// uniformly from [0, numeric_limits<T>::max()]. The generator is seeded with a fixed value.
template <class T>
void bm(benchmark::State& state) {
    mt19937 rng(96337);

    const auto count = static_cast<size_t>(state.range(0));

    std::vector<T> src(count);
    std::vector<T> dst(count);

    // Fills src front to back with values drawn from the given distribution.
    const auto fill_src = [&](auto dist) { ranges::generate(src, [&] { return static_cast<T>(dist(rng)); }); };

    if constexpr (is_floating_point_v<T>) {
        fill_src(normal_distribution<T>(-100.0, 100.0));
    } else {
        // One-byte element types are drawn as unsigned int and narrowed afterwards.
        using drawn_type = conditional_t<sizeof(T) != 1, T, unsigned int>;
        fill_src(uniform_int_distribution<drawn_type>(0, numeric_limits<T>::max()));
    }

    for (auto _ : state) {
        benchmark::DoNotOptimize(src);
        adjacent_difference(src.begin(), src.end(), dst.begin());
        benchmark::DoNotOptimize(dst);
    }
}

// Applies the argument set shared by every registered benchmark: a single Arg of 2255.
void common_args(auto bm) {
    constexpr int element_count = 2255;
    bm->Arg(element_count);
}

// Only unsigned integer types are registered. Signed integers are avoided to avoid UB
// (presumably signed overflow when subtracting random values - confirm); they shouldn't
// perform differently from the unsigned types.

// The one- and two-byte registrations are wrapped in a warning suppression, because instantiating
// the algorithm for them triggers C4244.
#pragma warning(push)
#pragma warning(disable : 4244) // warning C4244: '=': conversion from 'int' to 'unsigned char', possible loss of data
BENCHMARK(bm<uint8_t>)->Apply(common_args);
BENCHMARK(bm<uint16_t>)->Apply(common_args);
#pragma warning(pop)

BENCHMARK(bm<uint32_t>)->Apply(common_args);
BENCHMARK(bm<uint64_t>)->Apply(common_args);

BENCHMARK(bm<float>)->Apply(common_args);
BENCHMARK(bm<double>)->Apply(common_args);

BENCHMARK_MAIN();
35 changes: 35 additions & 0 deletions stl/inc/numeric
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,23 @@ _FwdIt2 transform_inclusive_scan(_ExPo&& _Exec, _FwdIt1 _First, _FwdIt1 _Last, _
_UnaryOp _Transform_op) noexcept; // terminates
#endif // _HAS_CXX17

// Reports whether the storage of two arrays of _Common_count elements each intersects.
// _First and _Second point to the first element of the respective array. The element types are deduced
// separately, so each array's extent in bytes is computed from its own element size.
// Returns true when the byte ranges [_First, _First + _Common_count) and [_Second, _Second + _Common_count)
// share at least one byte. During constant evaluation the addresses cannot be inspected, so overlap is
// reported conservatively; a caller that treats "overlap" as "take the aliasing-safe path" stays correct.
template <class _Ty1, class _Ty2>
_CONSTEXPR20 bool _Arrays_overlap(
    const _Ty1* const _First, const _Ty2* const _Second, const size_t _Common_count) {
    if (_STD _Is_constant_evaluated()) {
        // reinterpret_cast is not permitted in a constant expression
        return true;
    }

    const uintptr_t _First_val  = reinterpret_cast<uintptr_t>(_First);
    const uintptr_t _Second_val = reinterpret_cast<uintptr_t>(_Second);
    const size_t _First_bytes   = _Common_count * sizeof(_Ty1);
    const size_t _Second_bytes  = _Common_count * sizeof(_Ty2);
    // Two half-open ranges intersect exactly when each one begins before the other one ends.
    return _First_val + _First_bytes > _Second_val && _Second_val + _Second_bytes > _First_val;
}

// Writes the adjacent differences of _Src[0, _Count) to _Dest[0, _Count): _Dest[0] receives _Src[0], and each
// later _Dest[_Ix] receives _Func(_Src[_Ix], _Src[_Ix - 1]).
// Preconditions: _Count is at least 1 (the first element is written unconditionally), and the two arrays do
// not overlap, which is what __restrict promises to the compiler.
// Both operands of every step are read from _Src instead of being carried over from the previous step,
// so no iteration depends on the one before it.
// The source and destination element types are deduced separately, so they are allowed to differ.
template <class _TyDest, class _TySrc, class _BinOp>
_CONSTEXPR20 void _Adjacent_difference_no_overlap(
    _TyDest* __restrict _Dest, const _TySrc* __restrict _Src, const size_t _Count, _BinOp _Func) {
    _Dest[0] = _Src[0];
    for (size_t _Ix = 1; _Ix != _Count; ++_Ix) {
        _Dest[_Ix] = _Func(_Src[_Ix], _Src[_Ix - 1]);
    }
}

_EXPORT_STD template <class _InIt, class _OutIt, class _BinOp>
_CONSTEXPR20 _OutIt adjacent_difference(const _InIt _First, const _InIt _Last, _OutIt _Dest, _BinOp _Func) {
// compute adjacent differences into _Dest
Expand All @@ -469,6 +486,24 @@ _CONSTEXPR20 _OutIt adjacent_difference(const _InIt _First, const _InIt _Last, _
const auto _ULast = _STD _Get_unwrapped(_Last);
auto _UDest = _STD _Get_unwrapped_n(_Dest, _STD _Idl_distance<_InIt>(_UFirst, _ULast));
if (_UFirst != _ULast) {
if constexpr (_Iterators_are_contiguous<_InIt, _OutIt> && !_Iterator_is_volatile<_InIt>
&& is_trivially_copyable_v<_Iter_value_t<_InIt>>) {
Copy link
Contributor

@frederick-vs-ja frederick-vs-ja Sep 14, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is !_Iterator_is_volatile<_InIt> or is_trivially_copyable_v<_Iter_value_t<_InIt>> meaningful here, given we're not performing any trivial bitwise copy in this branch?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure.

The Effects in [adjacent.difference]/4 specifically require making a copy at a certain time, whereas we make it later, so the difference may be observable. It is not observable, however, if the memory is not volatile and the copying is trivial.

// Go with pointers and without loop-carried dependency to enable vectorization
const auto _Count = _ULast - _UFirst;
const auto _Src_ptr = _To_address(_UFirst);
const auto _Dest_ptr = _To_address(_Dest);
// Need to perform aliasing analysis.
// The vectorizer is generally able to do that on its own, and would guard the vectorized code with that,
// but when we eliminate the loop-carried dependency we change the semantics of the unvectorized code too.
// So we need to perform this check manually, and after that can tell the compiler that there's no aliasing,
// to avoid it checking for that again.
if (!_Arrays_overlap(_Dest_ptr, _Src_ptr, static_cast<size_t>(_Count))) {
_Adjacent_difference_no_overlap(_Dest_ptr, _Src_ptr, static_cast<size_t>(_Count), _STD _Pass_fn(_Func));
_STD _Seek_wrapped(_Dest, _UDest + _Count);
return _Dest;
}
}

_Iter_value_t<_InIt> _Val(*_UFirst);
*_UDest = _Val;
while (++_UFirst != _ULast) { // compute another difference
Expand Down
66 changes: 66 additions & 0 deletions tests/std/tests/VSO_0000000_vector_algorithms/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <functional>
#include <limits>
#include <list>
#include <numeric>
#include <random>
#include <string>
#include <type_traits>
Expand All @@ -33,6 +34,66 @@ using namespace std;
#pragma clang diagnostic ignored "-Wc++17-extensions" // constexpr if is a C++17 extension
#endif // __clang__

// Reference implementation of adjacent_difference using operator-.
// Copies the first element of [first, last) to dest, then writes the difference between each element and
// its predecessor. Returns the position one past the last element written (dest itself for an empty range).
template <class InIt, class OutIt>
OutIt last_known_good_adj_diff(InIt first, InIt last, OutIt dest) {
    if (first != last) {
        auto prev = *first;
        *dest     = prev;
        ++first;
        ++dest;

        while (first != last) {
            auto cur = *first;
            *dest    = cur - prev;
            prev     = cur;
            ++first;
            ++dest;
        }
    }

    return dest;
}

// Runs adjacent_difference and the reference implementation over the same input, each into its own
// output vector, and asserts that both wrote the same number of elements and produced equal vectors.
template <class T>
void test_case_adj_diff(const vector<T>& input, vector<T>& output_expected, vector<T>& output_actual) {
    const auto actual_end   = adjacent_difference(input.begin(), input.end(), output_actual.begin());
    const auto expected_end = last_known_good_adj_diff(input.begin(), input.end(), output_expected.begin());

    const auto actual_written   = actual_end - output_actual.begin();
    const auto expected_written = expected_end - output_expected.begin();

    assert(actual_written == expected_written);
    assert(output_actual == output_expected);
}

// Checks adjacent_difference against the reference implementation for every input length from 0 up to
// dataCount, using values drawn from gen.
template <class T>
void test_adjacent_difference(mt19937_64& gen) {
    using Limits = numeric_limits<T>;

    // Signed types draw from half of their range, unsigned types from their full range.
    // NOTE(review): presumably the halving keeps the difference of any two signed values representable - confirm.
    uniform_int_distribution<conditional_t<sizeof(T) == 1, int, T>> dis(
        is_signed_v<T> ? static_cast<T>(Limits::min() / 2) : Limits::min(),
        is_signed_v<T> ? static_cast<T>(Limits::max() / 2) : Limits::max());

    vector<T> input;
    vector<T> output_actual;
    vector<T> output_expected;

    vector<T>* const all_output_vectors[] = {&output_actual, &output_expected};
    vector<T>* const all_vectors[]        = {&input, &output_actual, &output_expected};

    for (auto v : all_vectors) {
        v->reserve(dataCount);
    }

    // Start with the empty range, then grow all three vectors by one element per round.
    test_case_adj_diff(input, output_expected, output_actual);
    for (size_t attempts = 0; attempts < dataCount; ++attempts) {
        // Overwrite both outputs with fresh random values before every run; presumably so that results left
        // over from the previous round cannot make the comparison pass.
        for (auto v : all_output_vectors) {
            generate(v->begin(), v->end(), [&] { return static_cast<T>(dis(gen)); });
        }

        for (auto v : all_vectors) {
            v->push_back(static_cast<T>(dis(gen)));
        }

        test_case_adj_diff(input, output_expected, output_actual);
    }
}

template <class FwdIt, class T>
ptrdiff_t last_known_good_count(FwdIt first, FwdIt last, T v) {
ptrdiff_t result = 0;
Expand Down Expand Up @@ -776,6 +837,11 @@ void test_swap_ranges(mt19937_64& gen) {
}

void test_vector_algorithms(mt19937_64& gen) {
test_adjacent_difference<int>(gen);
test_adjacent_difference<unsigned int>(gen);
test_adjacent_difference<long long>(gen);
test_adjacent_difference<unsigned long long>(gen);

test_count<char>(gen);
test_count<signed char>(gen);
test_count<unsigned char>(gen);
Expand Down
Loading