Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
tylerkaraszewski committed Sep 18, 2024
1 parent 446f2ea commit 7b0cb71
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 0 deletions.
9 changes: 9 additions & 0 deletions sqlitecluster/SQLiteNode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -606,6 +606,7 @@ bool SQLiteNode::update() {
// Find the freshest non-broken peer (including permafollowers).
if (peer->loggedIn) {
if (_forkedFrom.count(peer->name)) {
// UPDATE LOG LINE.
SWARN("Hash mismatch. Forked from peer " << peer->name << " so not considering it.");
continue;
}
Expand Down Expand Up @@ -1591,11 +1592,14 @@ void SQLiteNode::_onMESSAGE(SQLitePeer* peer, const SData& message) {
uint64_t commitNum = SToUInt64(message["hashMismatchNumber"]);
_db.getCommits(commitNum, commitNum, result);
_forkedFrom.insert(peer->name);

// UPDATE LOG LINE.
SALERT("Hash mismatch. Peer " << peer->name << " and I have forked at commit " << message["hashMismatchNumber"]
<< ". I have forked from " << _forkedFrom.size() << " other nodes. I am " << stateName(_state)
<< " and have hash " << result[0][0] << " for that commit. Peer has hash " << message["hashMismatchValue"] << ".");

if (_forkedFrom.size() > ((_peerList.size() + 1) / 2)) {
// UPDATE LOG LINE.
SERROR("Hash mismatch. I have forked from over half the cluster. This is unrecoverable.");
}

Expand Down Expand Up @@ -1891,6 +1895,9 @@ void SQLiteNode::_onDisconnect(SQLitePeer* peer) {
// It works for the sync thread as well, as there's handling in _changeState to rollback a commit when
// dropping out of leading or standing down (and there can't be commits in progress in other states).
SWARN("[clustersync] We were " << stateName(_state) << " but lost quorum (Disconnected from " << peer->name << "). Going to SEARCHING.");

// Store the time at which this happened for diagnostic purposes.
_lastLostQuorum = STimeNow();
for (const auto* p : _peerList) {
SWARN("[clustersync] Peer " << p->name << " logged in? " << (p->loggedIn ? "TRUE" : "FALSE") << (p->permaFollower ? " (permaFollower)" : ""));
}
Expand Down Expand Up @@ -2025,6 +2032,8 @@ void SQLiteNode::_changeState(SQLiteNodeState newState, uint64_t commitIDToCance
// loop. It's entirely possible that we do this for valid reasons - it may be the peer that has the bad database and not us, and there are plenty of other reasons we could switch to
// SEARCHING, but in those cases, we just wait an extra second before trying again.
if (newState == SQLiteNodeState::SEARCHING && _forkedFrom.size()) {

// UPDATE LOG LINE.
SWARN("Going searching while forked peers present, sleeping 1 second.");
sleep(1);
}
Expand Down
4 changes: 4 additions & 0 deletions sqlitecluster/SQLiteNode.h
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,10 @@ class SQLiteNode : public STCPManager {
// Set to true to indicate we're attempting to shut down.
atomic<bool> _isShuttingDown;

// When we spontaneously lose quorum (do to an unexpected node disconnection) we log the time. Later, if we detect we've forked,
// We show this time in a log line as a diagnostic message.
atomic<uint64_t> _lastLostQuorum = 0;

// Store the ID of the last transaction that we replicated to peers. Whenever we do an update, we will try and send
// any new committed transactions to peers, and update this value.
uint64_t _lastSentTransactionID;
Expand Down

0 comments on commit 7b0cb71

Please sign in to comment.