From f46df8e2a1101886e1594eed66421cc6d92b0576 Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Mon, 23 Sep 2024 14:33:06 -0700 Subject: [PATCH 1/4] Notes --- sqlitecluster/SQLiteNode.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sqlitecluster/SQLiteNode.cpp b/sqlitecluster/SQLiteNode.cpp index 2070b657a..2daead7b0 100644 --- a/sqlitecluster/SQLiteNode.cpp +++ b/sqlitecluster/SQLiteNode.cpp @@ -1341,6 +1341,8 @@ void SQLiteNode::_onMESSAGE(SQLitePeer* peer, const SData& message) { peer->priority = message.calc("Priority"); peer->loggedIn = true; peer->version = message["Version"]; + + // Set this to STANDINGUP peer->state = stateFromName(message["State"]); // Let the server know that a peer has logged in. @@ -1362,6 +1364,7 @@ void SQLiteNode::_onMESSAGE(SQLitePeer* peer, const SData& message) { peer->state = stateFromName(message["State"]); const SQLiteNodeState to = peer->state; if (from == to) { + // This is what happens. // No state change, just new commits? PINFO("Peer received new commit in state '" << stateName(from) << "', commit #" << message["CommitCount"] << " (" << message["Hash"] << ")"); @@ -1603,6 +1606,8 @@ void SQLiteNode::_onMESSAGE(SQLitePeer* peer, const SData& message) { uint64_t commitNum = SToUInt64(message["hashMismatchNumber"]); _db.getCommits(commitNum, commitNum, result); _forkedFrom.insert(peer->name); + + // Remove the forked peer as the sync peer. SALERT("Hash mismatch. Peer " << peer->name << " and I have forked at commit " << message["hashMismatchNumber"] << ". I have forked from " << _forkedFrom.size() << " other nodes. I am " << stateName(_state) @@ -1821,6 +1826,7 @@ void SQLiteNode::_onConnect(SQLitePeer* peer) { // If we're STANDINGUP when a peer connects, send them a STATE message so they know they need to APPROVE or DENY the standup. // Otherwise we will wait for their response that's not coming,and can eventually time out the standup. + // OH, If we're already standing up, both these messages have the same state. LOGIN sets the sate, and then it doesn't change here. if (_state == SQLiteNodeState::STANDINGUP) { SData state("STATE"); state["StateChangeCount"] = to_string(_stateChangeCount); @@ -1872,6 +1878,7 @@ void SQLiteNode::_onDisconnect(SQLitePeer* peer) { PHMMM("Lost our synchronization peer, re-SEARCHING."); SASSERTWARN(_state == SQLiteNodeState::SYNCHRONIZING); _syncPeer = nullptr; + // This happens. _changeState(SQLiteNodeState::SEARCHING); } @@ -2030,6 +2037,7 @@ void SQLiteNode::_changeState(SQLiteNodeState newState, uint64_t commitIDToCance if (newState >= SQLiteNodeState::STANDINGUP) { // Not forked from anyone. Note that this includes both LEADING and FOLLOWING. + // We clear this when standing up. _forkedFrom.clear(); } @@ -2321,6 +2329,8 @@ void SQLiteNode::_updateSyncPeer() nonChosenPeers.push_back(peer->name + ":" + to_string(peer->latency/1000) + "ms"); } } + + // Don't includ forked peers. SINFO("Updating SYNCHRONIZING peer from " << from << " to " << to << ". Not chosen: " << SComposeList(nonChosenPeers)); // And save the new sync peer internally. From 1b4028fbf47cef51085a3081ef4813bd0460ce9f Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Mon, 23 Sep 2024 14:56:03 -0700 Subject: [PATCH 2/4] Move standup response to separate function --- sqlitecluster/SQLiteNode.cpp | 193 ++++++++++++++++++----------------- sqlitecluster/SQLiteNode.h | 1 + 2 files changed, 101 insertions(+), 93 deletions(-) diff --git a/sqlitecluster/SQLiteNode.cpp b/sqlitecluster/SQLiteNode.cpp index 2daead7b0..3a0fadac8 100644 --- a/sqlitecluster/SQLiteNode.cpp +++ b/sqlitecluster/SQLiteNode.cpp @@ -1341,10 +1341,13 @@ void SQLiteNode::_onMESSAGE(SQLitePeer* peer, const SData& message) { peer->priority = message.calc("Priority"); peer->loggedIn = true; peer->version = message["Version"]; - - // Set this to STANDINGUP peer->state = stateFromName(message["State"]); + // If the peer is already standing up, go ahead and approve or deny immediately. + if (peer->state == SQLiteNodeState::STANDINGUP) { + _sendStandupResponse(peer, message); + } + // Let the server know that a peer has logged in. _server.onNodeLogin(peer); } else if (!peer->loggedIn) { @@ -1421,97 +1424,7 @@ void SQLiteNode::_onMESSAGE(SQLitePeer* peer, const SData& message) { peer->transactionResponse = SQLitePeer::Response::NONE; peer->subscribed = false; } else if (to == SQLiteNodeState::STANDINGUP) { - // STANDINGUP: When a peer announces it intends to stand up, we immediately respond with approval or - // denial. We determine this by checking to see if there is any other peer who is already leader or - // also trying to stand up. - SData response("STANDUP_RESPONSE"); - - // Parrot back the node's attempt count so that it can differentiate stale responses. - response["StateChangeCount"] = message["StateChangeCount"]; - - // Reason we would deny, if we do. - if (peer->permaFollower) { - // We think it's a permafollower, deny - PHMMM("Permafollower trying to stand up, denying."); - response["Response"] = "deny"; - response["Reason"] = "You're a permafollower"; - _sendToPeer(peer, response); - return; - } - - if (_forkedFrom.count(peer->name)) { - PHMMM("Forked from peer, can't approve standup."); - response["Response"] = "abstain"; - response["Reason"] = "We are forked"; - _sendToPeer(peer, response); - return; - } - - // What's our state - if (SWITHIN(SQLiteNodeState::STANDINGUP, _state, SQLiteNodeState::STANDINGDOWN)) { - // Oh crap, it's trying to stand up while we're leading. Who is higher priority? - if (peer->priority > _priority) { - // The other peer is a higher priority than us, so we should stand down (maybe it crashed, we - // came up as leader, and now it's been brought back up). We'll want to stand down here, but we - // do it gracefully so that we won't lose any transactions in progress. - if (_state == SQLiteNodeState::STANDINGUP) { - PWARN("Higher-priority peer is trying to stand up while we are STANDINGUP, SEARCHING."); - _changeState(SQLiteNodeState::SEARCHING); - } else if (_state == SQLiteNodeState::LEADING) { - PINFO("Higher-priority peer is trying to stand up while we are LEADING, STANDINGDOWN."); - _changeState(SQLiteNodeState::STANDINGDOWN); - } else { - PWARN("Higher-priority peer is trying to stand up while we are STANDINGDOWN, continuing."); - } - } else { - // Deny because we're currently in the process of leading and we're higher priority. - response["Response"] = "deny"; - response["Reason"] = "I am leading"; - - // Hmm, why is a lower priority peer trying to stand up? Is it possible we're no longer in - // control of the cluster? Let's see how many nodes are subscribed. - if (_majoritySubscribed()) { - // we have a majority of the cluster, so ignore this oddity. - PHMMM("Lower-priority peer is trying to stand up while we are " << stateName(_state) - << " with a majority of the cluster; denying and ignoring."); - } else { - // We don't have a majority of the cluster -- maybe it knows something we don't? For - // example, it could be that the rest of the cluster has forked away from us. This can - // happen if the leader hangs while processing a command: by the time it finishes, the - // cluster might have elected a new leader, forked, and be a thousand commits in the future. - // In this case, let's just reset everything anyway to be safe. - PWARN("Lower-priority peer is trying to stand up while we are " << stateName(_state) - << ", but we don't have a majority of the cluster so reconnecting and SEARCHING."); - _reconnectAll(); - // TODO: This puts us in an ambiguous state if we switch to SEARCHING from LEADING, - // without going through the STANDDOWN process. We'll need to handle it better, but it's - // unclear if this can ever happen at all. exit() may be a reasonable strategy here. - _changeState(SQLiteNodeState::SEARCHING); - } - } - } else { - // Approve if nobody else is trying to stand up - response["Response"] = "approve"; // Optimistic; will override - for (auto otherPeer : _peerList) { - if (otherPeer != peer) { - // See if it's trying to be leader - if (otherPeer->state == SQLiteNodeState::STANDINGUP || otherPeer->state == SQLiteNodeState::LEADING || otherPeer->state == SQLiteNodeState::STANDINGDOWN) { - // We need to contest this standup - response["Response"] = "deny"; - response["Reason"] = "peer '" + otherPeer->name + "' is '" + stateName(otherPeer->state) + "'"; - break; - } - } - } - } - - // Send the response - if (SIEquals(response["Response"], "approve")) { - PINFO("Approving standup request"); - } else { - PHMMM("Not approving standup request because " << response["Reason"]); - } - _sendToPeer(peer, response); + _sendStandupResponse(peer, message); } else if (from == SQLiteNodeState::STANDINGDOWN) { // STANDINGDOWN: When a peer stands down we double-check to make sure we don't have any outstanding // transaction (and if we do, we warn and rollback). @@ -2822,3 +2735,97 @@ string SQLiteNode::_getLostQuorumLogMessage() const { return lostQuorumMessage; } + +void SQLiteNode::_sendStandupResponse(SQLitePeer* peer, const SData& message) { + // STANDINGUP: When a peer announces it intends to stand up, we immediately respond with approval or + // denial. We determine this by checking to see if there is any other peer who is already leader or + // also trying to stand up. + SData response("STANDUP_RESPONSE"); + + // Parrot back the node's attempt count so that it can differentiate stale responses. + response["StateChangeCount"] = message["StateChangeCount"]; + + // Reason we would deny, if we do. + if (peer->permaFollower) { + // We think it's a permafollower, deny + PHMMM("Permafollower trying to stand up, denying."); + response["Response"] = "deny"; + response["Reason"] = "You're a permafollower"; + _sendToPeer(peer, response); + return; + } + + if (_forkedFrom.count(peer->name)) { + PHMMM("Forked from peer, can't approve standup."); + response["Response"] = "abstain"; + response["Reason"] = "We are forked"; + _sendToPeer(peer, response); + return; + } + + // What's our state + if (SWITHIN(SQLiteNodeState::STANDINGUP, _state, SQLiteNodeState::STANDINGDOWN)) { + // Oh crap, it's trying to stand up while we're leading. Who is higher priority? + if (peer->priority > _priority) { + // The other peer is a higher priority than us, so we should stand down (maybe it crashed, we + // came up as leader, and now it's been brought back up). We'll want to stand down here, but we + // do it gracefully so that we won't lose any transactions in progress. + if (_state == SQLiteNodeState::STANDINGUP) { + PWARN("Higher-priority peer is trying to stand up while we are STANDINGUP, SEARCHING."); + _changeState(SQLiteNodeState::SEARCHING); + } else if (_state == SQLiteNodeState::LEADING) { + PINFO("Higher-priority peer is trying to stand up while we are LEADING, STANDINGDOWN."); + _changeState(SQLiteNodeState::STANDINGDOWN); + } else { + PWARN("Higher-priority peer is trying to stand up while we are STANDINGDOWN, continuing."); + } + } else { + // Deny because we're currently in the process of leading and we're higher priority. + response["Response"] = "deny"; + response["Reason"] = "I am leading"; + + // Hmm, why is a lower priority peer trying to stand up? Is it possible we're no longer in + // control of the cluster? Let's see how many nodes are subscribed. + if (_majoritySubscribed()) { + // we have a majority of the cluster, so ignore this oddity. + PHMMM("Lower-priority peer is trying to stand up while we are " << stateName(_state) + << " with a majority of the cluster; denying and ignoring."); + } else { + // We don't have a majority of the cluster -- maybe it knows something we don't? For + // example, it could be that the rest of the cluster has forked away from us. This can + // happen if the leader hangs while processing a command: by the time it finishes, the + // cluster might have elected a new leader, forked, and be a thousand commits in the future. + // In this case, let's just reset everything anyway to be safe. + PWARN("Lower-priority peer is trying to stand up while we are " << stateName(_state) + << ", but we don't have a majority of the cluster so reconnecting and SEARCHING."); + _reconnectAll(); + // TODO: This puts us in an ambiguous state if we switch to SEARCHING from LEADING, + // without going through the STANDDOWN process. We'll need to handle it better, but it's + // unclear if this can ever happen at all. exit() may be a reasonable strategy here. + _changeState(SQLiteNodeState::SEARCHING); + } + } + } else { + // Approve if nobody else is trying to stand up + response["Response"] = "approve"; // Optimistic; will override + for (auto otherPeer : _peerList) { + if (otherPeer != peer) { + // See if it's trying to be leader + if (otherPeer->state == SQLiteNodeState::STANDINGUP || otherPeer->state == SQLiteNodeState::LEADING || otherPeer->state == SQLiteNodeState::STANDINGDOWN) { + // We need to contest this standup + response["Response"] = "deny"; + response["Reason"] = "peer '" + otherPeer->name + "' is '" + stateName(otherPeer->state) + "'"; + break; + } + } + } + } + + // Send the response + if (SIEquals(response["Response"], "approve")) { + PINFO("Approving standup request"); + } else { + PHMMM("Not approving standup request because " << response["Reason"]); + } + _sendToPeer(peer, response); +} \ No newline at end of file diff --git a/sqlitecluster/SQLiteNode.h b/sqlitecluster/SQLiteNode.h index 5f9e6e3e0..c820d1569 100644 --- a/sqlitecluster/SQLiteNode.h +++ b/sqlitecluster/SQLiteNode.h @@ -267,6 +267,7 @@ class SQLiteNode : public STCPManager { // Replicates any transactions that have been made on our database by other threads to peers. void _sendOutstandingTransactions(const set& commitOnlyIDs = {}); + void _sendStandupResponse(SQLitePeer* peer, const SData& message); void _sendPING(SQLitePeer* peer); void _sendToAllPeers(const SData& message, bool subscribedOnly = false); void _sendToPeer(SQLitePeer* peer, const SData& message); From 0a63e7e3c2f3c32c103290ecc612760bcbd003e7 Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Mon, 23 Sep 2024 14:56:39 -0700 Subject: [PATCH 3/4] Don't send separate state message --- sqlitecluster/SQLiteNode.cpp | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/sqlitecluster/SQLiteNode.cpp b/sqlitecluster/SQLiteNode.cpp index 3a0fadac8..130449389 100644 --- a/sqlitecluster/SQLiteNode.cpp +++ b/sqlitecluster/SQLiteNode.cpp @@ -1736,17 +1736,6 @@ void SQLiteNode::_onConnect(SQLitePeer* peer) { login["Version"] = _version; login["Permafollower"] = _originalPriority ? "false" : "true"; _sendToPeer(peer, login); - - // If we're STANDINGUP when a peer connects, send them a STATE message so they know they need to APPROVE or DENY the standup. - // Otherwise we will wait for their response that's not coming,and can eventually time out the standup. - // OH, If we're already standing up, both these messages have the same state. LOGIN sets the sate, and then it doesn't change here. - if (_state == SQLiteNodeState::STANDINGUP) { - SData state("STATE"); - state["StateChangeCount"] = to_string(_stateChangeCount); - state["State"] = stateName(_state); - state["Priority"] = SToStr(_priority); - _sendToPeer(peer, state); - } } // -------------------------------------------------------------------------- From 28ce2a3f484bb483afcfbcd092c9a31c8f6b94ae Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Mon, 23 Sep 2024 14:58:30 -0700 Subject: [PATCH 4/4] Cleanup --- sqlitecluster/SQLiteNode.cpp | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/sqlitecluster/SQLiteNode.cpp b/sqlitecluster/SQLiteNode.cpp index 130449389..aee6c2ac3 100644 --- a/sqlitecluster/SQLiteNode.cpp +++ b/sqlitecluster/SQLiteNode.cpp @@ -1367,7 +1367,6 @@ void SQLiteNode::_onMESSAGE(SQLitePeer* peer, const SData& message) { peer->state = stateFromName(message["State"]); const SQLiteNodeState to = peer->state; if (from == to) { - // This is what happens. // No state change, just new commits? PINFO("Peer received new commit in state '" << stateName(from) << "', commit #" << message["CommitCount"] << " (" << message["Hash"] << ")"); @@ -1520,8 +1519,6 @@ void SQLiteNode::_onMESSAGE(SQLitePeer* peer, const SData& message) { _db.getCommits(commitNum, commitNum, result); _forkedFrom.insert(peer->name); - // Remove the forked peer as the sync peer. - SALERT("Hash mismatch. Peer " << peer->name << " and I have forked at commit " << message["hashMismatchNumber"] << ". I have forked from " << _forkedFrom.size() << " other nodes. I am " << stateName(_state) << " and have hash " << result[0][0] << " for that commit. Peer has hash " << message["hashMismatchValue"] << "." @@ -1780,7 +1777,6 @@ void SQLiteNode::_onDisconnect(SQLitePeer* peer) { PHMMM("Lost our synchronization peer, re-SEARCHING."); SASSERTWARN(_state == SQLiteNodeState::SYNCHRONIZING); _syncPeer = nullptr; - // This happens. _changeState(SQLiteNodeState::SEARCHING); } @@ -1939,7 +1935,6 @@ void SQLiteNode::_changeState(SQLiteNodeState newState, uint64_t commitIDToCance if (newState >= SQLiteNodeState::STANDINGUP) { // Not forked from anyone. Note that this includes both LEADING and FOLLOWING. - // We clear this when standing up. _forkedFrom.clear(); } @@ -2231,8 +2226,6 @@ void SQLiteNode::_updateSyncPeer() nonChosenPeers.push_back(peer->name + ":" + to_string(peer->latency/1000) + "ms"); } } - - // Don't includ forked peers. SINFO("Updating SYNCHRONIZING peer from " << from << " to " << to << ". Not chosen: " << SComposeList(nonChosenPeers)); // And save the new sync peer internally. @@ -2817,4 +2810,4 @@ void SQLiteNode::_sendStandupResponse(SQLitePeer* peer, const SData& message) { PHMMM("Not approving standup request because " << response["Reason"]); } _sendToPeer(peer, response); -} \ No newline at end of file +}