Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Log the last time we spontaneously disconnected from the cluster when forked. #1874

Merged
merged 6 commits into from
Sep 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 19 additions & 4 deletions sqlitecluster/SQLiteNode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -606,7 +606,7 @@ bool SQLiteNode::update() {
// Find the freshest non-broken peer (including permafollowers).
if (peer->loggedIn) {
if (_forkedFrom.count(peer->name)) {
SWARN("Hash mismatch. Forked from peer " << peer->name << " so not considering it.");
SWARN("Hash mismatch. Forked from peer " << peer->name << " so not considering it." << _getLostQuorumLogMessage());
continue;
}

Expand Down Expand Up @@ -1591,12 +1591,14 @@ void SQLiteNode::_onMESSAGE(SQLitePeer* peer, const SData& message) {
uint64_t commitNum = SToUInt64(message["hashMismatchNumber"]);
_db.getCommits(commitNum, commitNum, result);
_forkedFrom.insert(peer->name);

SALERT("Hash mismatch. Peer " << peer->name << " and I have forked at commit " << message["hashMismatchNumber"]
<< ". I have forked from " << _forkedFrom.size() << " other nodes. I am " << stateName(_state)
<< " and have hash " << result[0][0] << " for that commit. Peer has hash " << message["hashMismatchValue"] << ".");
<< " and have hash " << result[0][0] << " for that commit. Peer has hash " << message["hashMismatchValue"] << "."
<< _getLostQuorumLogMessage());

if (_forkedFrom.size() > ((_peerList.size() + 1) / 2)) {
SERROR("Hash mismatch. I have forked from over half the cluster. This is unrecoverable.");
SERROR("Hash mismatch. I have forked from over half the cluster. This is unrecoverable." << _getLostQuorumLogMessage());
}

STHROW("Hash mismatch");
Expand Down Expand Up @@ -1891,6 +1893,9 @@ void SQLiteNode::_onDisconnect(SQLitePeer* peer) {
// It works for the sync thread as well, as there's handling in _changeState to rollback a commit when
// dropping out of leading or standing down (and there can't be commits in progress in other states).
SWARN("[clustersync] We were " << stateName(_state) << " but lost quorum (Disconnected from " << peer->name << "). Going to SEARCHING.");

// Store the time at which this happened for diagnostic purposes.
_lastLostQuorum = STimeNow();
for (const auto* p : _peerList) {
SWARN("[clustersync] Peer " << p->name << " logged in? " << (p->loggedIn ? "TRUE" : "FALSE") << (p->permaFollower ? " (permaFollower)" : ""));
}
Expand Down Expand Up @@ -2025,7 +2030,7 @@ void SQLiteNode::_changeState(SQLiteNodeState newState, uint64_t commitIDToCance
// loop. It's entirely possible that we do this for valid reasons - it may be the peer that has the bad database and not us, and there are plenty of other reasons we could switch to
// SEARCHING, but in those cases, we just wait an extra second before trying again.
if (newState == SQLiteNodeState::SEARCHING && _forkedFrom.size()) {
SWARN("Going searching while forked peers present, sleeping 1 second.");
SWARN("Going searching while forked peers present, sleeping 1 second." << _getLostQuorumLogMessage());
sleep(1);
}

Expand Down Expand Up @@ -2785,3 +2790,13 @@ void SQLiteNode::kill() {
peer->reset();
}
}

string SQLiteNode::_getLostQuorumLogMessage() const {
string lostQuorumMessage;
if (_lastLostQuorum) {
lostQuorumMessage = " Lost Quorum at: " + STIMESTAMP_MS(_lastLostQuorum) + " (" +
to_string((double)(STimeNow() - _lastLostQuorum) / 1000000.0) + " seconds ago).";
chiragsalian marked this conversation as resolved.
Show resolved Hide resolved
}

return lostQuorumMessage;
}
6 changes: 6 additions & 0 deletions sqlitecluster/SQLiteNode.h
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,8 @@ class SQLiteNode : public STCPManager {

void _changeState(SQLiteNodeState newState, uint64_t commitIDToCancelAfter = 0);

string _getLostQuorumLogMessage() const;

// Handlers for transaction messages.
void _handleBeginTransaction(SQLite& db, SQLitePeer* peer, const SData& message, bool wasConflict);
void _handlePrepareTransaction(SQLite& db, SQLitePeer* peer, const SData& message, uint64_t dequeueTime, uint64_t threadStartTime);
Expand Down Expand Up @@ -320,6 +322,10 @@ class SQLiteNode : public STCPManager {
// Set to true to indicate we're attempting to shut down.
atomic<bool> _isShuttingDown;

// When we spontaneously lose quorum (due to an unexpected node disconnection) we log the time. Later, if we detect we've forked,
// We show this time in a log line as a diagnostic message.
atomic<uint64_t> _lastLostQuorum = 0;

// Store the ID of the last transaction that we replicated to peers. Whenever we do an update, we will try and send
// any new committed transactions to peers, and update this value.
uint64_t _lastSentTransactionID;
Expand Down