From b5ce42bb7540faa4db0e1dc3a7ec632efc984e1d Mon Sep 17 00:00:00 2001 From: Julius Goryavsky Date: Wed, 14 Dec 2016 02:01:56 -0500 Subject: [PATCH] - #PXC-393: Optimized and accurate way to get mmap file gcache size using mincore syscall MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch implements a new information schema table ("WSREP_STATS"), which contains the "wsrep_gcache_actual_pool_size" variable that reflects the exact amount of memory that is used by the "gcache" memory pool. Using a separate information schema table for this variable avoids slowing down regular queries, such as SHOW STATUS. To determine the actual amount of memory occupied by the gcache, the mincore() syscall is used. At the same time, I limited the size of the additional vector used by the mincore() syscall by dividing large memory ranges into relatively small chunks, which are passed to the mincore() syscall one at a time. This introduces a slight performance loss due to the additional syscalls; however, these costs are less than 1%, while only 256 kilobytes of additional memory are allocated for the mincore()-related vector. I made sure that the state of gcache pages is measured only up to the currently achieved "watermark", but not over the entire allocated pool - to accelerate scenarios such as https://bugs.launchpad.net/percona-xtradb-cluster/+bug/1462674 In addition, measurement of the actual amount of memory occupied by the gcache avoids long-term locking of the other gcache operations, except when the mutex is acquired by the allocation or deallocation of overflow pages (in the "Pages" storage). In this case mid-term locking is possible – until the code that uses the mincore() syscall stops working. However, as long as we do not create/free the overflow pages, a global lock does not happen. 
--- common/wsrep_api.h | 9 +++ galera/src/replicator.hpp | 1 + galera/src/replicator_smm.cpp | 4 +- galera/src/replicator_smm.hpp | 3 + galera/src/replicator_smm_stats.cpp | 70 +++++++++++++++++++++ galera/src/wsrep_provider.cpp | 12 ++++ galerautils/src/gu_mmap.cpp | 97 +++++++++++++++++++++++++++++ galerautils/src/gu_mmap.hpp | 3 + gcache/src/GCache.cpp | 8 +++ gcache/src/GCache.hpp | 5 ++ gcache/src/gcache_mem_store.cpp | 5 ++ gcache/src/gcache_mem_store.hpp | 1 + gcache/src/gcache_page.cpp | 11 ++++ gcache/src/gcache_page.hpp | 6 +- gcache/src/gcache_page_store.cpp | 23 +++++++ gcache/src/gcache_page_store.hpp | 4 ++ gcache/src/gcache_rb_store.cpp | 11 ++++ gcache/src/gcache_rb_store.hpp | 2 + 18 files changed, 272 insertions(+), 3 deletions(-) diff --git a/common/wsrep_api.h b/common/wsrep_api.h index ae249b5be..18711d633 100644 --- a/common/wsrep_api.h +++ b/common/wsrep_api.h @@ -998,6 +998,15 @@ struct wsrep { */ struct wsrep_stats_var* (*stats_get) (wsrep_t* wsrep); + /*! + * @brief Returns an array of extended status variables. + * Array is terminated by Null variable name. + * + * @param wsrep provider handle + * @return array of struct wsrep_status_var. + */ + struct wsrep_stats_var* (*stats_ext_get) (wsrep_t* wsrep); + /*! * @brief Release resources that might be associated with the array. 
* diff --git a/galera/src/replicator.hpp b/galera/src/replicator.hpp index 76d521c75..e161b103f 100644 --- a/galera/src/replicator.hpp +++ b/galera/src/replicator.hpp @@ -112,6 +112,7 @@ namespace galera virtual void process_sync(wsrep_seqno_t seqno_l) = 0; virtual const struct wsrep_stats_var* stats_get() = 0; + virtual const struct wsrep_stats_var* stats_ext_get() = 0; virtual void stats_reset() = 0; // static void stats_free(struct wsrep_stats_var*) must be declared in // the child class diff --git a/galera/src/replicator_smm.cpp b/galera/src/replicator_smm.cpp index e34fcf0e5..1ff802aad 100644 --- a/galera/src/replicator_smm.cpp +++ b/galera/src/replicator_smm.cpp @@ -197,7 +197,8 @@ galera::ReplicatorSMM::ReplicatorSMM(const struct wsrep_init_args* args) preordered_id_ (), incoming_list_ (""), incoming_mutex_ (), - wsrep_stats_ () + wsrep_stats_ (), + wsrep_stats_ext_ () { /* Register the application callback that should be called @@ -278,6 +279,7 @@ galera::ReplicatorSMM::ReplicatorSMM(const struct wsrep_init_args* args) cert_.assign_initial_position(seqno, trx_proto_ver()); build_stats_vars(wsrep_stats_); + build_stats_ext_vars(wsrep_stats_ext_); } galera::ReplicatorSMM::~ReplicatorSMM() diff --git a/galera/src/replicator_smm.hpp b/galera/src/replicator_smm.hpp index a49f61148..0b6680c1f 100644 --- a/galera/src/replicator_smm.hpp +++ b/galera/src/replicator_smm.hpp @@ -136,6 +136,7 @@ namespace galera void process_sync(wsrep_seqno_t seqno_l); const struct wsrep_stats_var* stats_get(); + const struct wsrep_stats_var* stats_ext_get(); void stats_reset(); void stats_free(struct wsrep_stats_var*); @@ -457,6 +458,7 @@ namespace galera }; void build_stats_vars (std::vector& stats); + void build_stats_ext_vars (std::vector& stats); void establish_protocol_versions (int version); @@ -616,6 +618,7 @@ namespace galera mutable gu::Mutex incoming_mutex_; mutable std::vector wsrep_stats_; + mutable std::vector wsrep_stats_ext_; }; std::ostream& operator<<(std::ostream& 
os, ReplicatorSMM::State state); diff --git a/galera/src/replicator_smm_stats.cpp b/galera/src/replicator_smm_stats.cpp index d134d6012..7a982d36b 100644 --- a/galera/src/replicator_smm_stats.cpp +++ b/galera/src/replicator_smm_stats.cpp @@ -333,7 +333,77 @@ galera::ReplicatorSMM::stats_get() log_warn << "Failed to allocate stats vars buffer to " << (vec_size + tail_size) << " bytes. System is running out of memory."; + } + + return buf; +} +typedef enum status_vars_ext +{ + STATS_EXT_GCACHE_ACTUAL_POOL_SIZE = 0, + STATS_EXT_MAX +} StatusExtVars; + +static const struct wsrep_stats_var wsrep_stats_ext[STATS_EXT_MAX + 1] = +{ + { "gcache_actual_pool_size", WSREP_VAR_INT64, { 0 } }, + { 0, WSREP_VAR_STRING, { 0 } } +}; + +void +galera::ReplicatorSMM::build_stats_ext_vars ( + std::vector& stats) +{ + const struct wsrep_stats_var* ptr(wsrep_stats_ext); + + do + { + stats.push_back(*ptr); + } + while (ptr++->name != 0); + + stats[STATS_EXT_GCACHE_ACTUAL_POOL_SIZE].value._int64 = 0; +} + +const struct wsrep_stats_var* +galera::ReplicatorSMM::stats_ext_get() +{ + if (S_DESTROYED == state_()) return 0; + + std::vector sv(wsrep_stats_ext_); + + sv[STATS_EXT_GCACHE_ACTUAL_POOL_SIZE].value._int64 = + gcache_.actual_pool_size(); + + /* Create a buffer to be passed to the caller. */ + // The buffer size needed: + // * Space for wsrep_stats_ext_ array + // * Trailing space for string store + size_t const vec_size( + (sv.size())*sizeof(struct wsrep_stats_var)); + struct wsrep_stats_var* const buf( + reinterpret_cast( + gu_malloc(vec_size))); + + if (buf) + { + size_t sv_pos(STATS_EXT_MAX); + + assert(sv_pos == sv.size() - 1); + + // NULL terminate + sv[sv_pos].name = 0; + sv[sv_pos].type = WSREP_VAR_STRING; + sv[sv_pos].value._string = 0; + + // Finally copy sv vector to buf + memcpy(buf, &sv[0], vec_size); + } + else + { + log_warn << "Failed to allocate extended stats vars buffer to " + << (vec_size) + << " bytes. 
System is running out of memory."; } return buf; diff --git a/galera/src/wsrep_provider.cpp b/galera/src/wsrep_provider.cpp index b3f77a2ee..d4a58a5b8 100644 --- a/galera/src/wsrep_provider.cpp +++ b/galera/src/wsrep_provider.cpp @@ -926,6 +926,17 @@ struct wsrep_stats_var* galera_stats_get (wsrep_t* gh) } +struct wsrep_stats_var* galera_stats_ext_get (wsrep_t* gh) +{ + assert(gh != 0); + assert(gh->ctx != 0); + + REPL_CLASS* repl(reinterpret_cast< REPL_CLASS * >(gh->ctx)); + + return const_cast(repl->stats_ext_get()); +} + + extern "C" void galera_stats_free (wsrep_t* gh, struct wsrep_stats_var* s) { @@ -1095,6 +1106,7 @@ static wsrep_t galera_str = { &galera_sst_received, &galera_snapshot, &galera_stats_get, + &galera_stats_ext_get, &galera_stats_free, &galera_stats_reset, &galera_pause, diff --git a/galerautils/src/gu_mmap.cpp b/galerautils/src/gu_mmap.cpp index 188421b6d..3bff81687 100644 --- a/galerautils/src/gu_mmap.cpp +++ b/galerautils/src/gu_mmap.cpp @@ -120,3 +120,100 @@ namespace gu } } } + +/** Returns actual memory usage by allocated page range: **/ + +/* + * Check the feature test macros to make sure we have the mincore syscall: + */ +#if defined(_BSD_SOURCE) || defined(_SVID_SOURCE) + +/* + * The buffer size for mincore. 256 kilobytes is enough to request + * information on the status of a 1GB memory map (256K * 4096 bytes per + * page = 1GB) in one syscall (with 4096-byte pages). Increasing this + * parameter allows us to save a few syscalls (for very large memory maps), + * but it also raises the memory requirements for the temporary buffer: + */ +#define GU_AMU_CHUNK 0x40000 /* Currently 256K, must be power of two. 
*/ + +size_t gu_actual_memory_usage (const void * const ptr, const size_t length) +{ + size_t size= 0; + if (length) + { + /* + * -PAGE_SIZE is same as ~(PAGE_SIZE-1), but creates less + * potential problems due to implicit type cast in expressions: + */ + uintptr_t first= + reinterpret_cast (ptr) & -GU_PAGE_SIZE; + const uintptr_t last= + (reinterpret_cast (ptr) + length - 1) & -GU_PAGE_SIZE; + const ptrdiff_t total= last - first + GU_PAGE_SIZE; + size_t pages= total / GU_PAGE_SIZE; + size_t chunks= pages / GU_AMU_CHUNK; + unsigned char * const map= + reinterpret_cast (malloc(chunks ? GU_AMU_CHUNK : pages)); + if (map) + { + while (chunks--) + { + if (mincore(reinterpret_cast (first), + (size_t) GU_AMU_CHUNK * GU_PAGE_SIZE, map) == 0) + { + for (size_t i = 0; i < GU_AMU_CHUNK; i++) + { + if (map[i]) + { + size += GU_PAGE_SIZE; + } + } + } + else + { + log_fatal << "Unable to get in-core state vector " + "for page range. Aborting."; + abort(); + } + first += (size_t) GU_AMU_CHUNK * GU_PAGE_SIZE; + } + pages &= GU_AMU_CHUNK - 1; + if (mincore(reinterpret_cast (first), + pages * GU_PAGE_SIZE, map) == 0) + { + for (size_t i = 0; i < pages; i++) + { + if (map[i]) size += GU_PAGE_SIZE; + } + } + else + { + log_fatal << "Unable to get in-core state vector " + "for page range. Aborting."; + abort(); + } + free(map); + } + else + { + log_fatal << "Unable to allocate memory for in-core state vector. 
" + << "Aborting."; + abort(); + } + } + return size; +} + +#else + +/* + * In case of absence mincore syscall we simply return the total size + * of memory-mapped region: + */ +size_t gu_actual_memory_usage (const void * const ptr, const size_t length) +{ + return length; +} + +#endif diff --git a/galerautils/src/gu_mmap.hpp b/galerautils/src/gu_mmap.hpp index 37214d159..11161e414 100644 --- a/galerautils/src/gu_mmap.hpp +++ b/galerautils/src/gu_mmap.hpp @@ -40,4 +40,7 @@ class MMap } /* namespace gu */ +/** Returns actual memory usage by allocated page range: **/ +size_t gu_actual_memory_usage (const void * const ptr, const size_t length); + #endif /* __GCACHE_MMAP__ */ diff --git a/gcache/src/GCache.cpp b/gcache/src/GCache.cpp index 1d1cce43c..6f558bd0e 100644 --- a/gcache/src/GCache.cpp +++ b/gcache/src/GCache.cpp @@ -73,6 +73,14 @@ namespace gcache << "\n" << "GCache frees : " << frees; } + size_t GCache::actual_pool_size () + { + gu::Lock lock(mtx); + return mem.actual_pool_size() + + rb.actual_pool_size(&mtx) + + ps.actual_pool_size(&mtx); + } + size_t GCache::allocated_pool_size () { gu::Lock lock(mtx); diff --git a/gcache/src/GCache.hpp b/gcache/src/GCache.hpp index 82198b9cb..1a7b78884 100644 --- a/gcache/src/GCache.hpp +++ b/gcache/src/GCache.hpp @@ -99,6 +99,11 @@ namespace gcache int64_t& seqno_d, ssize_t& size); + /*! + * Returns actual gcache memory pool size (in bytes). + */ + size_t actual_pool_size (); + /*! * Returns allocated gcache memory pool size (in bytes). 
*/ diff --git a/gcache/src/gcache_mem_store.cpp b/gcache/src/gcache_mem_store.cpp index 0e8e500b8..1f35cd5d8 100644 --- a/gcache/src/gcache_mem_store.cpp +++ b/gcache/src/gcache_mem_store.cpp @@ -75,6 +75,11 @@ MemStore::seqno_reset() } } +size_t MemStore::actual_pool_size () +{ + return size_; +} + size_t MemStore::allocated_pool_size () { return size_; diff --git a/gcache/src/gcache_mem_store.hpp b/gcache/src/gcache_mem_store.hpp index 6bd5147a6..a503f1f9e 100644 --- a/gcache/src/gcache_mem_store.hpp +++ b/gcache/src/gcache_mem_store.hpp @@ -135,6 +135,7 @@ namespace gcache // for unit tests only size_t _allocd () const { return size_; } + size_t actual_pool_size (); size_t allocated_pool_size (); private: diff --git a/gcache/src/gcache_page.cpp b/gcache/src/gcache_page.cpp index 2f3f117a9..a08f02be3 100644 --- a/gcache/src/gcache_page.cpp +++ b/gcache/src/gcache_page.cpp @@ -169,6 +169,17 @@ gcache::Page::realloc (void* ptr, size_type size) } } +size_t gcache::Page::actual_pool_size (gu::Mutex * mtx) +{ + void* ptr= reinterpret_cast (mmap_.ptr); + size_t used= mmap_.size - min_space_; + size_t size; + mtx->unlock(); + size = gu_actual_memory_usage(ptr, used); + mtx->lock(); + return size; +} + size_t gcache::Page::allocated_pool_size () { return mmap_.size - min_space_; diff --git a/gcache/src/gcache_page.hpp b/gcache/src/gcache_page.hpp index fd5069b72..942373f6c 100644 --- a/gcache/src/gcache_page.hpp +++ b/gcache/src/gcache_page.hpp @@ -10,8 +10,9 @@ #include "gcache_memops.hpp" #include "gcache_bh.hpp" -#include "gu_fdesc.hpp" -#include "gu_mmap.hpp" +#include +#include +#include #include @@ -53,6 +54,7 @@ namespace gcache void* parent() const { return ps_; } + size_t actual_pool_size (gu::Mutex * mtx); size_t allocated_pool_size (); private: diff --git a/gcache/src/gcache_page_store.cpp b/gcache/src/gcache_page_store.cpp index 8edaec6c1..c5345794b 100644 --- a/gcache/src/gcache_page_store.cpp +++ b/gcache/src/gcache_page_store.cpp @@ -119,6 +119,7 @@ 
gcache::PageStore::cleanup () #ifndef NDEBUG size_t counter = 0; #endif + gu::Lock lock(mtx_); /* * 1. We must release the page if we have exceeded the limit on the * overall size of the page pool (which is set by the user explicitly, @@ -172,6 +173,7 @@ gcache::PageStore::cleanup () void gcache::PageStore::reset () { + gu::Lock lock(mtx_); while (pages_.size() > 0 && delete_page()) {}; } @@ -180,6 +182,8 @@ gcache::PageStore::new_page (size_type size) { Page* const page(new Page(this, make_page_name (base_name_, count_), size)); + gu::Lock lock(mtx_); + pages_.push_back (page); total_size_ += page->size(); current_ = page; @@ -191,6 +195,7 @@ gcache::PageStore::PageStore (const std::string& dir_name, size_t page_size, size_t keep_page) : + mtx_ (), base_name_ (make_base_name(dir_name)), keep_size_ (keep_size), page_size_ (page_size), @@ -226,6 +231,8 @@ gcache::PageStore::PageStore (const std::string& dir_name, gcache::PageStore::~PageStore () { + mtx_.lock(); + try { while (pages_.size() && delete_page()) {}; @@ -244,6 +251,8 @@ gcache::PageStore::~PageStore () << " page files: some buffers are still \"mmapped\"."; } + mtx_.unlock(); + pthread_attr_destroy (&delete_page_attr_); } @@ -315,9 +324,23 @@ gcache::PageStore::realloc (void* ptr, size_type const size) return ret; } +size_t gcache::PageStore::actual_pool_size (gu::Mutex * mtx) +{ + size_t size= 0; + gu::Lock lock(mtx_); + std::deque::iterator ptr= pages_.begin(); + while (ptr != pages_.end()) + { + Page* page= *ptr++; + size += page->actual_pool_size(mtx); + } + return size; +} + size_t gcache::PageStore::allocated_pool_size () { size_t size= 0; + gu::Lock lock(mtx_); std::deque::iterator ptr= pages_.begin(); while (ptr != pages_.end()) { diff --git a/gcache/src/gcache_page_store.hpp b/gcache/src/gcache_page_store.hpp index fb7e9cb87..053d6f029 100644 --- a/gcache/src/gcache_page_store.hpp +++ b/gcache/src/gcache_page_store.hpp @@ -14,6 +14,8 @@ #include #include +#include + namespace gcache { class 
PageStore : public MemOps @@ -54,6 +56,7 @@ namespace gcache void set_keep_count (size_t count) { keep_page_ = count; cleanup();} + size_t actual_pool_size (gu::Mutex * mtx); size_t allocated_pool_size (); /* for unit tests */ @@ -63,6 +66,7 @@ namespace gcache private: + gu::Mutex mtx_; std::string const base_name_; /* /.../.../gcache.page. */ size_t keep_size_; /* how much pages to keep after freeing*/ size_t page_size_; /* min size of the individual page */ diff --git a/gcache/src/gcache_rb_store.cpp b/gcache/src/gcache_rb_store.cpp index 35cae6b0f..3e3b214ae 100644 --- a/gcache/src/gcache_rb_store.cpp +++ b/gcache/src/gcache_rb_store.cpp @@ -530,6 +530,17 @@ namespace gcache /* this is needed to avoid rescanning from start_ on recovery */ } + size_t RingBuffer::actual_pool_size (gu::Mutex * mtx) + { + void* ptr= reinterpret_cast (mmap_.ptr); + size_t used= max_used_; + size_t size; + mtx->unlock(); + size = gu_actual_memory_usage(ptr, used); + mtx->lock(); + return size; + } + size_t RingBuffer::allocated_pool_size () { return max_used_; diff --git a/gcache/src/gcache_rb_store.hpp b/gcache/src/gcache_rb_store.hpp index 98d85a0ed..a54faff32 100644 --- a/gcache/src/gcache_rb_store.hpp +++ b/gcache/src/gcache_rb_store.hpp @@ -14,6 +14,7 @@ #include #include #include +#include #include @@ -111,6 +112,7 @@ namespace gcache assert_size_free(); } + size_t actual_pool_size (gu::Mutex * mtx); size_t allocated_pool_size (); private: