Skip to content

Commit

Permalink
Merge pull request #1877 from Expensify/tyler-fix-connect-while-stand…
Browse files Browse the repository at this point in the history
…ing-up

Respond to standup at login correctly.
  • Loading branch information
tylerkaraszewski authored Sep 30, 2024
2 parents 926c960 + 28ce2a3 commit 1ef3e2d
Show file tree
Hide file tree
Showing 2 changed files with 102 additions and 102 deletions.
203 changes: 101 additions & 102 deletions sqlitecluster/SQLiteNode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1345,6 +1345,11 @@ void SQLiteNode::_onMESSAGE(SQLitePeer* peer, const SData& message) {
peer->version = message["Version"];
peer->state = stateFromName(message["State"]);

// If the peer is already standing up, go ahead and approve or deny immediately.
if (peer->state == SQLiteNodeState::STANDINGUP) {
_sendStandupResponse(peer, message);
}

// Let the server know that a peer has logged in.
_server.onNodeLogin(peer);
} else if (!peer->loggedIn) {
Expand Down Expand Up @@ -1420,97 +1425,7 @@ void SQLiteNode::_onMESSAGE(SQLitePeer* peer, const SData& message) {
peer->transactionResponse = SQLitePeer::Response::NONE;
peer->subscribed = false;
} else if (to == SQLiteNodeState::STANDINGUP) {
// STANDINGUP: When a peer announces it intends to stand up, we immediately respond with approval or
// denial. We determine this by checking to see if there is any other peer who is already leader or
// also trying to stand up.
SData response("STANDUP_RESPONSE");

// Parrot back the node's attempt count so that it can differentiate stale responses.
response["StateChangeCount"] = message["StateChangeCount"];

// Reason we would deny, if we do.
if (peer->permaFollower) {
// We think it's a permafollower, deny
PHMMM("Permafollower trying to stand up, denying.");
response["Response"] = "deny";
response["Reason"] = "You're a permafollower";
_sendToPeer(peer, response);
return;
}

if (_forkedFrom.count(peer->name)) {
PHMMM("Forked from peer, can't approve standup.");
response["Response"] = "abstain";
response["Reason"] = "We are forked";
_sendToPeer(peer, response);
return;
}

// What's our state
if (SWITHIN(SQLiteNodeState::STANDINGUP, _state, SQLiteNodeState::STANDINGDOWN)) {
// Oh crap, it's trying to stand up while we're leading. Who is higher priority?
if (peer->priority > _priority) {
// The other peer is a higher priority than us, so we should stand down (maybe it crashed, we
// came up as leader, and now it's been brought back up). We'll want to stand down here, but we
// do it gracefully so that we won't lose any transactions in progress.
if (_state == SQLiteNodeState::STANDINGUP) {
PWARN("Higher-priority peer is trying to stand up while we are STANDINGUP, SEARCHING.");
_changeState(SQLiteNodeState::SEARCHING);
} else if (_state == SQLiteNodeState::LEADING) {
PINFO("Higher-priority peer is trying to stand up while we are LEADING, STANDINGDOWN.");
_changeState(SQLiteNodeState::STANDINGDOWN);
} else {
PWARN("Higher-priority peer is trying to stand up while we are STANDINGDOWN, continuing.");
}
} else {
// Deny because we're currently in the process of leading and we're higher priority.
response["Response"] = "deny";
response["Reason"] = "I am leading";

// Hmm, why is a lower priority peer trying to stand up? Is it possible we're no longer in
// control of the cluster? Let's see how many nodes are subscribed.
if (_majoritySubscribed()) {
// we have a majority of the cluster, so ignore this oddity.
PHMMM("Lower-priority peer is trying to stand up while we are " << stateName(_state)
<< " with a majority of the cluster; denying and ignoring.");
} else {
// We don't have a majority of the cluster -- maybe it knows something we don't? For
// example, it could be that the rest of the cluster has forked away from us. This can
// happen if the leader hangs while processing a command: by the time it finishes, the
// cluster might have elected a new leader, forked, and be a thousand commits in the future.
// In this case, let's just reset everything anyway to be safe.
PWARN("Lower-priority peer is trying to stand up while we are " << stateName(_state)
<< ", but we don't have a majority of the cluster so reconnecting and SEARCHING.");
_reconnectAll();
// TODO: This puts us in an ambiguous state if we switch to SEARCHING from LEADING,
// without going through the STANDDOWN process. We'll need to handle it better, but it's
// unclear if this can ever happen at all. exit() may be a reasonable strategy here.
_changeState(SQLiteNodeState::SEARCHING);
}
}
} else {
// Approve if nobody else is trying to stand up
response["Response"] = "approve"; // Optimistic; will override
for (auto otherPeer : _peerList) {
if (otherPeer != peer) {
// See if it's trying to be leader
if (otherPeer->state == SQLiteNodeState::STANDINGUP || otherPeer->state == SQLiteNodeState::LEADING || otherPeer->state == SQLiteNodeState::STANDINGDOWN) {
// We need to contest this standup
response["Response"] = "deny";
response["Reason"] = "peer '" + otherPeer->name + "' is '" + stateName(otherPeer->state) + "'";
break;
}
}
}
}

// Send the response
if (SIEquals(response["Response"], "approve")) {
PINFO("Approving standup request");
} else {
PHMMM("Not approving standup request because " << response["Reason"]);
}
_sendToPeer(peer, response);
_sendStandupResponse(peer, message);
} else if (from == SQLiteNodeState::STANDINGDOWN) {
// STANDINGDOWN: When a peer stands down we double-check to make sure we don't have any outstanding
// transaction (and if we do, we warn and rollback).
Expand Down Expand Up @@ -1605,7 +1520,7 @@ void SQLiteNode::_onMESSAGE(SQLitePeer* peer, const SData& message) {
uint64_t commitNum = SToUInt64(message["hashMismatchNumber"]);
_db.getCommits(commitNum, commitNum, result);
_forkedFrom.insert(peer->name);

SALERT("Hash mismatch. Peer " << peer->name << " and I have forked at commit " << message["hashMismatchNumber"]
<< ". I have forked from " << _forkedFrom.size() << " other nodes. I am " << stateName(_state)
<< " and have hash " << result[0][0] << " for that commit. Peer has hash " << message["hashMismatchValue"] << "."
Expand Down Expand Up @@ -1820,16 +1735,6 @@ void SQLiteNode::_onConnect(SQLitePeer* peer) {
login["Version"] = _version;
login["Permafollower"] = _originalPriority ? "false" : "true";
_sendToPeer(peer, login);

// If we're STANDINGUP when a peer connects, send them a STATE message so they know they need to APPROVE or DENY the standup.
// Otherwise we will wait for their response that's not coming,and can eventually time out the standup.
if (_state == SQLiteNodeState::STANDINGUP) {
SData state("STATE");
state["StateChangeCount"] = to_string(_stateChangeCount);
state["State"] = stateName(_state);
state["Priority"] = SToStr(_priority);
_sendToPeer(peer, state);
}
}

// --------------------------------------------------------------------------
Expand Down Expand Up @@ -2814,3 +2719,97 @@ string SQLiteNode::_getLostQuorumLogMessage() const {

return lostQuorumMessage;
}

void SQLiteNode::_sendStandupResponse(SQLitePeer* peer, const SData& message) {
// STANDINGUP: When a peer announces it intends to stand up, we immediately respond with approval or
// denial. We determine this by checking to see if there is any other peer who is already leader or
// also trying to stand up.
SData response("STANDUP_RESPONSE");

// Parrot back the node's attempt count so that it can differentiate stale responses.
response["StateChangeCount"] = message["StateChangeCount"];

// Reason we would deny, if we do.
if (peer->permaFollower) {
// We think it's a permafollower, deny
PHMMM("Permafollower trying to stand up, denying.");
response["Response"] = "deny";
response["Reason"] = "You're a permafollower";
_sendToPeer(peer, response);
return;
}

if (_forkedFrom.count(peer->name)) {
PHMMM("Forked from peer, can't approve standup.");
response["Response"] = "abstain";
response["Reason"] = "We are forked";
_sendToPeer(peer, response);
return;
}

// What's our state
if (SWITHIN(SQLiteNodeState::STANDINGUP, _state, SQLiteNodeState::STANDINGDOWN)) {
// Oh crap, it's trying to stand up while we're leading. Who is higher priority?
if (peer->priority > _priority) {
// The other peer is a higher priority than us, so we should stand down (maybe it crashed, we
// came up as leader, and now it's been brought back up). We'll want to stand down here, but we
// do it gracefully so that we won't lose any transactions in progress.
if (_state == SQLiteNodeState::STANDINGUP) {
PWARN("Higher-priority peer is trying to stand up while we are STANDINGUP, SEARCHING.");
_changeState(SQLiteNodeState::SEARCHING);
} else if (_state == SQLiteNodeState::LEADING) {
PINFO("Higher-priority peer is trying to stand up while we are LEADING, STANDINGDOWN.");
_changeState(SQLiteNodeState::STANDINGDOWN);
} else {
PWARN("Higher-priority peer is trying to stand up while we are STANDINGDOWN, continuing.");
}
} else {
// Deny because we're currently in the process of leading and we're higher priority.
response["Response"] = "deny";
response["Reason"] = "I am leading";

// Hmm, why is a lower priority peer trying to stand up? Is it possible we're no longer in
// control of the cluster? Let's see how many nodes are subscribed.
if (_majoritySubscribed()) {
// we have a majority of the cluster, so ignore this oddity.
PHMMM("Lower-priority peer is trying to stand up while we are " << stateName(_state)
<< " with a majority of the cluster; denying and ignoring.");
} else {
// We don't have a majority of the cluster -- maybe it knows something we don't? For
// example, it could be that the rest of the cluster has forked away from us. This can
// happen if the leader hangs while processing a command: by the time it finishes, the
// cluster might have elected a new leader, forked, and be a thousand commits in the future.
// In this case, let's just reset everything anyway to be safe.
PWARN("Lower-priority peer is trying to stand up while we are " << stateName(_state)
<< ", but we don't have a majority of the cluster so reconnecting and SEARCHING.");
_reconnectAll();
// TODO: This puts us in an ambiguous state if we switch to SEARCHING from LEADING,
// without going through the STANDDOWN process. We'll need to handle it better, but it's
// unclear if this can ever happen at all. exit() may be a reasonable strategy here.
_changeState(SQLiteNodeState::SEARCHING);
}
}
} else {
// Approve if nobody else is trying to stand up
response["Response"] = "approve"; // Optimistic; will override
for (auto otherPeer : _peerList) {
if (otherPeer != peer) {
// See if it's trying to be leader
if (otherPeer->state == SQLiteNodeState::STANDINGUP || otherPeer->state == SQLiteNodeState::LEADING || otherPeer->state == SQLiteNodeState::STANDINGDOWN) {
// We need to contest this standup
response["Response"] = "deny";
response["Reason"] = "peer '" + otherPeer->name + "' is '" + stateName(otherPeer->state) + "'";
break;
}
}
}
}

// Send the response
if (SIEquals(response["Response"], "approve")) {
PINFO("Approving standup request");
} else {
PHMMM("Not approving standup request because " << response["Reason"]);
}
_sendToPeer(peer, response);
}
1 change: 1 addition & 0 deletions sqlitecluster/SQLiteNode.h
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,7 @@ class SQLiteNode : public STCPManager {

// Replicates any transactions that have been made on our database by other threads to peers.
void _sendOutstandingTransactions(const set<uint64_t>& commitOnlyIDs = {});
void _sendStandupResponse(SQLitePeer* peer, const SData& message);
void _sendPING(SQLitePeer* peer);
void _sendToAllPeers(const SData& message, bool subscribedOnly = false);
void _sendToPeer(SQLitePeer* peer, const SData& message);
Expand Down

0 comments on commit 1ef3e2d

Please sign in to comment.