Skip to content

Commit

Permalink
Always use cmpxchg16b on x64 for atomic_ref<16 bytes> (#4751)
Browse files Browse the repository at this point in the history
  • Loading branch information
StephanTLavavej committed Jul 5, 2024
1 parent 53f1faf commit ef1d621
Show file tree
Hide file tree
Showing 6 changed files with 34 additions and 150 deletions.
84 changes: 16 additions & 68 deletions stl/inc/atomic
Original file line number Diff line number Diff line change
Expand Up @@ -28,29 +28,10 @@ _STL_DISABLE_CLANG_WARNINGS
#pragma push_macro("new")
#undef new

#ifdef _WIN64
#if _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 1
#define _STD_COMPARE_EXCHANGE_128 _InterlockedCompareExchange128
#else // ^^^ _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 1 / _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 0 vvv
// 16-byte atomics are separately compiled for x64, as not all x64 hardware has the cmpxchg16b
// instruction; in the event this instruction is not available, the fallback is a global
// synchronization object shared by all 16-byte atomics.
// (Note: machines without this instruction typically have 2 cores or fewer, so this isn't too bad)
// All pointer parameters must be 16-byte aligned.
extern "C" _NODISCARD unsigned char __stdcall __std_atomic_compare_exchange_128(
_Inout_bytecount_(16) long long* _Destination, _In_ long long _ExchangeHigh, _In_ long long _ExchangeLow,
_Inout_bytecount_(16) long long* _ComparandResult) noexcept;
extern "C" _NODISCARD char __stdcall __std_atomic_has_cmpxchg16b() noexcept;
#define _STD_COMPARE_EXCHANGE_128 __std_atomic_compare_exchange_128
#endif // ^^^ _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 0 ^^^
#endif // defined(_WIN64)

// Controls whether atomic::is_always_lock_free triggers for sizeof(void *) or 2 * sizeof(void *)
#if _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 1 || !defined(_M_X64) || defined(_M_ARM64EC)
#define _ATOMIC_HAS_DCAS 1
#else // ^^^ We always have DCAS / We only sometimes have DCAS vvv
#define _ATOMIC_HAS_DCAS 0
#endif // ^^^ We only sometimes have DCAS ^^^
// Allow _InterlockedCompareExchange128 to be used:
#if defined(__clang__) && defined(_M_X64)
#pragma clang attribute _STD_ATOMIC_HEADER.push([[gnu::target("cx16")]], apply_to = function)
#endif // ^^^ defined(__clang__) && defined(_M_X64) ^^^

// Controls whether ARM64 ldar/ldapr/stlr should be used
#ifndef _STD_ATOMIC_USE_ARM64_LDAR_STLR
Expand Down Expand Up @@ -603,7 +584,7 @@ inline bool __stdcall _Atomic_wait_compare_16_bytes(const void* _Storage, void*
const auto _Cmp = static_cast<const long long*>(_Comparand);
alignas(16) long long _Tmp[2] = {_Cmp[0], _Cmp[1]};
#if defined(_M_X64) && !defined(_M_ARM64EC)
return _STD_COMPARE_EXCHANGE_128(_Dest, _Tmp[1], _Tmp[0], _Tmp) != 0;
return _InterlockedCompareExchange128(_Dest, _Tmp[1], _Tmp[0], _Tmp) != 0;
#else // ^^^ _M_X64 / ARM64, _M_ARM64EC vvv
return _InterlockedCompareExchange128_nf(_Dest, _Tmp[1], _Tmp[0], _Tmp) != 0;
#endif // ^^^ ARM64, _M_ARM64EC ^^^
Expand Down Expand Up @@ -1199,7 +1180,7 @@ struct _Atomic_storage<_Ty&, 16> { // lock-free using 16-byte intrinsics
_NODISCARD _TVal load() const noexcept { // load with sequential consistency
long long* const _Storage_ptr = const_cast<long long*>(_STD _Atomic_address_as<const long long>(_Storage));
_Int128 _Result{}; // atomic CAS 0 with 0
(void) _STD_COMPARE_EXCHANGE_128(_Storage_ptr, 0, 0, &_Result._Low);
(void) _InterlockedCompareExchange128(_Storage_ptr, 0, 0, &_Result._Low);
return reinterpret_cast<_TVal&>(_Result);
}

Expand Down Expand Up @@ -1270,7 +1251,7 @@ struct _Atomic_storage<_Ty&, 16> { // lock-free using 16-byte intrinsics
&_Expected_temp._Low);
#else // ^^^ _M_ARM64, _M_ARM64EC / _M_X64 vvv
(void) _Order;
_Result = _STD_COMPARE_EXCHANGE_128(&reinterpret_cast<long long&>(_Storage), _Desired_bytes._High,
_Result = _InterlockedCompareExchange128(&reinterpret_cast<long long&>(_Storage), _Desired_bytes._High,
_Desired_bytes._Low, &_Expected_temp._Low);
#endif // ^^^ _M_X64 ^^^
if (_Result) {
Expand All @@ -1296,7 +1277,7 @@ struct _Atomic_storage<_Ty&, 16> { // lock-free using 16-byte intrinsics
&_Expected_temp._Low);
#else // ^^^ _M_ARM64, _M_ARM64EC / _M_X64 vvv
(void) _Order;
_Result = _STD_COMPARE_EXCHANGE_128(
_Result = _InterlockedCompareExchange128(
&reinterpret_cast<long long&>(_Storage), _Desired_bytes._High, _Desired_bytes._Low, &_Expected_temp._Low);
#endif // ^^^ _M_X64 ^^^
if (_Result == 0) {
Expand Down Expand Up @@ -1649,13 +1630,8 @@ struct _Atomic_integral<_Ty, 8> : _Atomic_storage<_Ty> { // atomic integral oper
template <size_t _TypeSize>
constexpr bool _Is_always_lock_free = _TypeSize <= 8 && (_TypeSize & (_TypeSize - 1)) == 0;
#else // ^^^ don't break ABI / break ABI vvv
#if _ATOMIC_HAS_DCAS
template <size_t _TypeSize>
constexpr bool _Is_always_lock_free = _TypeSize <= 2 * sizeof(void*);
#else // ^^^ _ATOMIC_HAS_DCAS / !_ATOMIC_HAS_DCAS vvv
template <size_t _TypeSize>
constexpr bool _Is_always_lock_free = _TypeSize <= sizeof(void*);
#endif // ^^^ !_ATOMIC_HAS_DCAS ^^^
#endif // ^^^ break ABI ^^^

template <class _Ty, bool _Is_lock_free = _Is_always_lock_free<sizeof(_Ty)>>
Expand Down Expand Up @@ -2175,23 +2151,10 @@ public:
static constexpr bool is_always_lock_free = _Is_always_lock_free<sizeof(_Ty)>;
#endif // _HAS_CXX17

#if 1 // TRANSITION, ABI
_NODISCARD bool is_lock_free() const volatile noexcept {
constexpr bool _Result = sizeof(_Ty) <= 8 && (sizeof(_Ty) & sizeof(_Ty) - 1) == 0;
return _Result;
return _Is_always_lock_free<sizeof(_Ty)>;
}

#else // ^^^ don't break ABI / break ABI vvv
_NODISCARD bool is_lock_free() const volatile noexcept {
#if _ATOMIC_HAS_DCAS
return sizeof(_Ty) <= 2 * sizeof(void*);
#else // ^^^ _ATOMIC_HAS_DCAS / !_ATOMIC_HAS_DCAS vvv
return sizeof(_Ty) <= sizeof(void*) || (sizeof(_Ty) <= 2 * sizeof(void*) && __std_atomic_has_cmpxchg16b());
#endif // ^^^ !_ATOMIC_HAS_DCAS ^^^
}
#endif // ^^^ break ABI ^^^

_NODISCARD bool is_lock_free() const noexcept {
return static_cast<const volatile atomic*>(this)->is_lock_free();
}
Expand Down Expand Up @@ -2341,7 +2304,7 @@ public:
using value_type = _Ty;

explicit atomic_ref(_Ty& _Value) noexcept /* strengthened */ : _Base(_Value) {
if constexpr (_Is_potentially_lock_free) {
if constexpr (is_always_lock_free) {
_Check_alignment(_Value);
} else {
this->_Init_spinlock_for_ref();
Expand All @@ -2352,30 +2315,13 @@ public:

atomic_ref& operator=(const atomic_ref&) = delete;

static constexpr bool _Is_potentially_lock_free =
sizeof(_Ty) <= 2 * sizeof(void*) && (sizeof(_Ty) & (sizeof(_Ty) - 1)) == 0;

static constexpr bool is_always_lock_free =
#if _ATOMIC_HAS_DCAS
_Is_potentially_lock_free;
#else // ^^^ _ATOMIC_HAS_DCAS / !_ATOMIC_HAS_DCAS vvv
_Is_potentially_lock_free && sizeof(_Ty) <= sizeof(void*);
#endif // ^^^ !_ATOMIC_HAS_DCAS ^^^
sizeof(_Ty) <= 2 * sizeof(void*) && (sizeof(_Ty) & (sizeof(_Ty) - 1)) == 0;

static constexpr size_t required_alignment = _Is_potentially_lock_free ? sizeof(_Ty) : alignof(_Ty);
static constexpr size_t required_alignment = is_always_lock_free ? sizeof(_Ty) : alignof(_Ty);

_NODISCARD bool is_lock_free() const noexcept {
#if _ATOMIC_HAS_DCAS
return is_always_lock_free;
#else // ^^^ _ATOMIC_HAS_DCAS / !_ATOMIC_HAS_DCAS vvv
if constexpr (is_always_lock_free) {
return true;
} else if constexpr (_Is_potentially_lock_free) {
return __std_atomic_has_cmpxchg16b() != 0;
} else {
return false;
}
#endif // ^^^ !_ATOMIC_HAS_DCAS ^^^
}

void store(const _Ty _Value) const noexcept {
Expand Down Expand Up @@ -3049,16 +2995,18 @@ _STD_END
#undef _ATOMIC_STORE_32_SEQ_CST
#undef _ATOMIC_STORE_64_SEQ_CST
#undef _ATOMIC_STORE_64_SEQ_CST_IX86
#undef _ATOMIC_HAS_DCAS
#undef _ATOMIC_STORE_SEQ_CST_ARM64
#undef __LOAD_ACQUIRE_ARM64
#undef _ATOMIC_LOAD_ARM64
#undef __STORE_RELEASE
#undef _STD_ATOMIC_USE_ARM64_LDAR_STLR

#undef _STD_COMPARE_EXCHANGE_128
#undef _INVALID_MEMORY_ORDER

#if defined(__clang__) && defined(_M_X64)
#pragma clang attribute _STD_ATOMIC_HEADER.pop
#endif // ^^^ defined(__clang__) && defined(_M_X64) ^^^

#pragma pop_macro("new")
_STL_RESTORE_CLANG_WARNINGS
#pragma warning(pop)
Expand Down
16 changes: 0 additions & 16 deletions stl/inc/yvals.h
Original file line number Diff line number Diff line change
Expand Up @@ -315,22 +315,6 @@ _EMIT_STL_WARNING(STL4001, "/clr:pure is deprecated and will be REMOVED.");
#define _LOCK_STREAM 2
#define _LOCK_DEBUG 3

#ifndef _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B
#if _STL_WIN32_WINNT >= _STL_WIN32_WINNT_WINBLUE && defined(_WIN64)
#define _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B 1
#else // ^^^ modern 64-bit / less modern or 32-bit vvv
#define _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B 0
#endif // _STL_WIN32_WINNT >= _STL_WIN32_WINNT_WINBLUE && defined(_WIN64)
#endif // !defined(_STD_ATOMIC_ALWAYS_USE_CMPXCHG16B)

#if _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 0 && defined(_M_ARM64)
#error ARM64 requires _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B to be 1.
#endif // _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 0 && defined(_M_ARM64)

#if _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 1 && !defined(_WIN64)
#error _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 1 requires 64-bit.
#endif // _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 1 && !defined(_WIN64)

_STD_BEGIN
enum _Uninitialized { // tag for suppressing initialization
_Noinit
Expand Down
7 changes: 3 additions & 4 deletions stl/inc/yvals_core.h
Original file line number Diff line number Diff line change
Expand Up @@ -1936,10 +1936,9 @@ compiler option, or define _ALLOW_RTCc_IN_STL to suppress this error.
#error In yvals_core.h, defined(MRTDLL) implies defined(_M_CEE_PURE); !defined(_M_CEE_PURE) implies !defined(MRTDLL)
#endif // defined(MRTDLL) && !defined(_M_CEE_PURE)

#define _STL_WIN32_WINNT_VISTA 0x0600 // _WIN32_WINNT_VISTA from sdkddkver.h
#define _STL_WIN32_WINNT_WIN8 0x0602 // _WIN32_WINNT_WIN8 from sdkddkver.h
#define _STL_WIN32_WINNT_WINBLUE 0x0603 // _WIN32_WINNT_WINBLUE from sdkddkver.h
#define _STL_WIN32_WINNT_WIN10 0x0A00 // _WIN32_WINNT_WIN10 from sdkddkver.h
#define _STL_WIN32_WINNT_VISTA 0x0600 // _WIN32_WINNT_VISTA from sdkddkver.h
#define _STL_WIN32_WINNT_WIN8 0x0602 // _WIN32_WINNT_WIN8 from sdkddkver.h
#define _STL_WIN32_WINNT_WIN10 0x0A00 // _WIN32_WINNT_WIN10 from sdkddkver.h

// Note that the STL DLL builds will set this to XP for ABI compatibility with VS2015 which supported XP.
#ifndef _STL_WIN32_WINNT
Expand Down
61 changes: 15 additions & 46 deletions stl/src/atomic_wait.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

#include <atomic>
#include <cstdint>
#include <cstdlib>
#include <new>
#include <thread>

Expand Down Expand Up @@ -90,24 +91,6 @@ namespace {
}
#endif // defined(_DEBUG)
}

[[nodiscard]] unsigned char __std_atomic_compare_exchange_128_fallback(
    _Inout_bytecount_(16) long long* _Destination, _In_ long long _ExchangeHigh, _In_ long long _ExchangeLow,
    _Inout_bytecount_(16) long long* _ComparandResult) noexcept {
    // Software emulation of cmpxchg16b: all 16-byte atomics that reach this fallback serialize
    // on one process-wide SRWLOCK, so the compare-and-swap below is observed atomically.
    static SRWLOCK _Mtx = SRWLOCK_INIT;
    _SrwLock_guard _Guard{_Mtx};
    // Did the destination match the comparand, word for word?
    const bool _Matched = _Destination[0] == _ComparandResult[0] && _Destination[1] == _ComparandResult[1];
    // cmpxchg16b semantics: the comparand always receives the destination's prior value.
    _ComparandResult[0] = _Destination[0];
    _ComparandResult[1] = _Destination[1];
    if (_Matched) { // store the exchange value only on a successful comparison
        _Destination[0] = _ExchangeLow;
        _Destination[1] = _ExchangeHigh;
    }
    return static_cast<unsigned char>(_Matched);
}
} // unnamed namespace

extern "C" {
Expand Down Expand Up @@ -249,41 +232,27 @@ _Smtx_t* __stdcall __std_atomic_get_mutex(const void* const _Key) noexcept {
}
#pragma warning(pop)

// TRANSITION, ABI: preserved for binary compatibility
[[nodiscard]] unsigned char __stdcall __std_atomic_compare_exchange_128(_Inout_bytecount_(16) long long* _Destination,
_In_ long long _ExchangeHigh, _In_ long long _ExchangeLow,
_Inout_bytecount_(16) long long* _ComparandResult) noexcept {
#if !defined(_WIN64)
return __std_atomic_compare_exchange_128_fallback(_Destination, _ExchangeHigh, _ExchangeLow, _ComparandResult);
#elif _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 1
#ifdef _WIN64
return _InterlockedCompareExchange128(_Destination, _ExchangeHigh, _ExchangeLow, _ComparandResult);
#else // ^^^ _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 1 / _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 0 vvv
if (__std_atomic_has_cmpxchg16b()) {
return _InterlockedCompareExchange128(_Destination, _ExchangeHigh, _ExchangeLow, _ComparandResult);
}

return __std_atomic_compare_exchange_128_fallback(_Destination, _ExchangeHigh, _ExchangeLow, _ComparandResult);
#endif // ^^^ _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 0 ^^^
#else // ^^^ 64-bit / 32-bit vvv
(void) _Destination;
(void) _ExchangeHigh;
(void) _ExchangeLow;
(void) _ComparandResult;
_CSTD abort();
#endif // ^^^ 32-bit ^^^
}

// TRANSITION, ABI: preserved for binary compatibility
[[nodiscard]] char __stdcall __std_atomic_has_cmpxchg16b() noexcept {
#if !defined(_WIN64)
return false;
#elif _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 1
#ifdef _WIN64
return true;
#else // ^^^ _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 1 / _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 0 vvv
constexpr char _Cmpxchg_Absent = 0;
constexpr char _Cmpxchg_Present = 1;
constexpr char _Cmpxchg_Unknown = 2;

static std::atomic<char> _Cached_value{_Cmpxchg_Unknown};

char _Value = _Cached_value.load(std::memory_order_relaxed);
if (_Value == _Cmpxchg_Unknown) {
_Value = IsProcessorFeaturePresent(PF_COMPARE_EXCHANGE128) ? _Cmpxchg_Present : _Cmpxchg_Absent;
_Cached_value.store(_Value, std::memory_order_relaxed);
}

return _Value;
#endif // ^^^ _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B == 0 ^^^
#else
_CSTD abort();
#endif
}
} // extern "C"
3 changes: 0 additions & 3 deletions tests/std/tests/P0019R8_atomic_ref/env.lst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,3 @@
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

RUNALL_INCLUDE ..\usual_20_matrix.lst
RUNALL_CROSSLIST
* PM_CL=""
* PM_CL="/D_STD_ATOMIC_ALWAYS_USE_CMPXCHG16B=1 /DTEST_CMPXCHG16B"
13 changes: 0 additions & 13 deletions tests/std/tests/P0019R8_atomic_ref/test.cpp
Original file line number Diff line number Diff line change
@@ -1,12 +1,6 @@
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#if defined(TEST_CMPXCHG16B) && (defined(__clang__) || !defined(_M_X64))
// Skip Clang because it would require the -mcx16 compiler option.
// Skip non-x64 because _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B is always 1 for ARM64, and is forbidden to be 1 for 32-bit.
int main() {}
#else // ^^^ skip test / run test vvv

#include <atomic>
#include <cassert>
#include <cstddef>
Expand Down Expand Up @@ -436,13 +430,8 @@ void test_gh_4472() {

static_assert(std::atomic_ref<two_pointers_t>::required_alignment == sizeof(two_pointers_t));

#ifdef _WIN64
static_assert(std::atomic_ref<two_pointers_t>::is_always_lock_free == _STD_ATOMIC_ALWAYS_USE_CMPXCHG16B);
#else
static_assert(std::atomic_ref<two_pointers_t>::is_always_lock_free);
#endif

// We expect tests to run on machines that support DCAS, which is required by Win8+.
std::atomic_ref<two_pointers_t> ar{two_pointers};
assert(ar.is_lock_free());
}
Expand Down Expand Up @@ -518,5 +507,3 @@ int main() {
test_gh_4472();
test_gh_4728();
}

#endif // ^^^ run test ^^^

0 comments on commit ef1d621

Please sign in to comment.