From 7b0cb71a004ef27fb071651ffa2067aa25fc71c7 Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Wed, 18 Sep 2024 11:11:39 -0700 Subject: [PATCH] WIP --- sqlitecluster/SQLiteNode.cpp | 9 +++++++++ sqlitecluster/SQLiteNode.h | 4 ++++ 2 files changed, 13 insertions(+) diff --git a/sqlitecluster/SQLiteNode.cpp b/sqlitecluster/SQLiteNode.cpp index 6cc77cf94..00853d1bb 100644 --- a/sqlitecluster/SQLiteNode.cpp +++ b/sqlitecluster/SQLiteNode.cpp @@ -606,6 +606,7 @@ bool SQLiteNode::update() { // Find the freshest non-broken peer (including permafollowers). if (peer->loggedIn) { if (_forkedFrom.count(peer->name)) { + // UPDATE LOG LINE. SWARN("Hash mismatch. Forked from peer " << peer->name << " so not considering it."); continue; } @@ -1591,11 +1592,14 @@ void SQLiteNode::_onMESSAGE(SQLitePeer* peer, const SData& message) { uint64_t commitNum = SToUInt64(message["hashMismatchNumber"]); _db.getCommits(commitNum, commitNum, result); _forkedFrom.insert(peer->name); + + // UPDATE LOG LINE. SALERT("Hash mismatch. Peer " << peer->name << " and I have forked at commit " << message["hashMismatchNumber"] << ". I have forked from " << _forkedFrom.size() << " other nodes. I am " << stateName(_state) << " and have hash " << result[0][0] << " for that commit. Peer has hash " << message["hashMismatchValue"] << "."); if (_forkedFrom.size() > ((_peerList.size() + 1) / 2)) { + // UPDATE LOG LINE. SERROR("Hash mismatch. I have forked from over half the cluster. This is unrecoverable."); } @@ -1891,6 +1895,9 @@ void SQLiteNode::_onDisconnect(SQLitePeer* peer) { // It works for the sync thread as well, as there's handling in _changeState to rollback a commit when // dropping out of leading or standing down (and there can't be commits in progress in other states). SWARN("[clustersync] We were " << stateName(_state) << " but lost quorum (Disconnected from " << peer->name << "). Going to SEARCHING."); + + // Store the time at which this happened for diagnostic purposes. + _lastLostQuorum = STimeNow(); for (const auto* p : _peerList) { SWARN("[clustersync] Peer " << p->name << " logged in? " << (p->loggedIn ? "TRUE" : "FALSE") << (p->permaFollower ? " (permaFollower)" : "")); } @@ -2025,6 +2032,8 @@ void SQLiteNode::_changeState(SQLiteNodeState newState, uint64_t commitIDToCance // loop. It's entirely possible that we do this for valid reasons - it may be the peer that has the bad database and not us, and there are plenty of other reasons we could switch to // SEARCHING, but in those cases, we just wait an extra second before trying again. if (newState == SQLiteNodeState::SEARCHING && _forkedFrom.size()) { + + // UPDATE LOG LINE. SWARN("Going searching while forked peers present, sleeping 1 second."); sleep(1); } diff --git a/sqlitecluster/SQLiteNode.h b/sqlitecluster/SQLiteNode.h index e87090d68..c4b932149 100644 --- a/sqlitecluster/SQLiteNode.h +++ b/sqlitecluster/SQLiteNode.h @@ -320,6 +320,10 @@ class SQLiteNode : public STCPManager { // Set to true to indicate we're attempting to shut down. atomic _isShuttingDown; + // When we spontaneously lose quorum (do to an unexpected node disconnection) we log the time. Later, if we detect we've forked, + // We show this time in a log line as a diagnostic message. + atomic _lastLostQuorum = 0; + // Store the ID of the last transaction that we replicated to peers. Whenever we do an update, we will try and send // any new committed transactions to peers, and update this value. uint64_t _lastSentTransactionID;