From 3ab1cec3804a5a4f8510a159b45e76af0f58aa19 Mon Sep 17 00:00:00 2001 From: Lucian Petrut Date: Mon, 23 Sep 2024 13:35:54 +0000 Subject: [PATCH 1/9] Add "2ha.sh" script, managing 2-node Canonical K8s HA AA clusters Scenario overview: * Canonical K8s cluster containing 2 nodes * Dqlite data store (unable to obtain quorum) * Primary node dqlite files stored on DRBD * sync block-level replication between the two nodes * cluster monitoring and failover handled through Pacemaker Script functionality: * boostrap the service * wait for a DRBD primary to be elected * detect the node role based on the DRBD status and Dqlite state * have the replica wait for the primary to be ready before continuing * recover Dqlite after failovers * transfer and apply recovery files to secondary nodes * transfer Dqlite files to DRBD and other backup locations, creating necessary symlinks * install required packages * purge all K8s data * clear Pacemaker taints * remove recovery data "2ha.sh start_service" is intended to be used as part of a systemd unit that bootstraps the k8s services, coordinating with the other node and taking any necessary steps to recover Dqlite. --- k8s/hack/2ha.sh | 823 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 823 insertions(+) create mode 100755 k8s/hack/2ha.sh diff --git a/k8s/hack/2ha.sh b/k8s/hack/2ha.sh new file mode 100755 index 000000000..4df08f194 --- /dev/null +++ b/k8s/hack/2ha.sh @@ -0,0 +1,823 @@ +#!/bin/bash + +# Prerequisites: +# * required packages installed using the "install_packages" command. +# * initialized k8s cluster, both nodes joined +# * the current user has ssh access to the peer node. +# - used to handle k8s services and transfer dqlite data +# * the current user has passwordless sudo enabled. +sourced=0 + +DEBUG=${DEBUG:-0} +if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then + sourced=1 +else + sourced=0 + set -eEu -o pipefail + + if [[ $DEBUG -eq 1 ]]; then + export PS4='+(${BASH_SOURCE}:${LINENO}): ${FUNCNAME[0]:+${FUNCNAME[0]}(): }' + set -x + fi +fi + +SYSTEMD_SERVICE_NAME=${SYSTEMD_SERVICE_NAME:-"2ha_k8s"} +DRBD_MOUNT_DIR=${DRBD_MOUNT_DIR:-"/mnt/drbd0"} +SSH_USERNAME=${SSH_USERNAME:-"ubuntu"} +SSH_OPTS=${SSH_OPTS:-"-o StrictHostKeyChecking=no -o ConnectTimeout=5"} +K8SD_LOG_LEVEL=${K8SD_LOG_LEVEL:-"0"} +K8S_SNAP_CHANNEL=${K8S_SNAP_CHANNEL:-"latest/edge"} +DRBD_RES_NAME=${DRBD_RES_NAME:-"r0"} +DRBD_READY_TIMEOUT=${DRBD_READY_TIMEOUT:-30} +PEER_READY_TIMEOUT=${PEER_READY_TIMEOUT:-60} + +K8SD_PATH=${K8SD_PATH:-/snap/k8s/current/bin/k8sd} + +K8S_DQLITE_STATE_DIR=/var/snap/k8s/common/var/lib/k8s-dqlite +K8SD_STATE_DIR="/var/snap/k8s/common/var/lib/k8sd/state" + +K8S_DQLITE_STATE_BKP_DIR=/var/snap/k8s/common/var/lib/k8s-dqlite.bkp +K8SD_STATE_BKP_DIR="/var/snap/k8s/common/var/lib/k8sd/state.bkp" + +K8S_DQLITE_INFO_YAML="$K8S_DQLITE_STATE_DIR/info.yaml" +K8S_DQLITE_CLUSTER_YAML="$K8S_DQLITE_STATE_DIR/cluster.yaml" + +K8SD_INFO_YAML="$K8SD_STATE_DIR/database/info.yaml" +K8SD_CLUSTER_YAML="$K8SD_STATE_DIR/database/cluster.yaml" + +# Backup yamls are expected to contain the right node ids and +# addresses while the DRBD files may contain settings from the other node +# and have to be updated. 
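+#
+# For reference, the yq queries in this script assume info.yaml and
+# cluster.yaml layouts roughly like the sketch below (the IDs and addresses
+# are made-up examples, not values taken from a real cluster):
+#
+#   info.yaml:
+#     Address: 10.0.0.1:9000
+#     ID: "3297041220608546238"
+#     Role: 0
+#
+#   cluster.yaml:
+#     - Address: 10.0.0.1:9000
+#       ID: "3297041220608546238"
+#       Role: 0
+#     - Address: 10.0.0.2:9000
+#       ID: "8770122046085462383"
+#       Role: 2
+#
+# The numeric Role values map to the DQLITE_ROLE_* constants defined below
+# (0 = voter, 1 = standby, 2 = spare).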
+K8S_DQLITE_INFO_BKP_YAML="$K8S_DQLITE_STATE_BKP_DIR/info.yaml" +K8S_DQLITE_CLUSTER_BKP_YAML="$K8S_DQLITE_STATE_BKP_DIR/cluster.yaml" +K8SD_INFO_BKP_YAML="$K8SD_STATE_BKP_DIR/database/info.yaml" +K8SD_CLUSTER_BKP_YAML="$K8SD_STATE_BKP_DIR/database/cluster.yaml" + +K8SD_RECOVERY_TARBALL="$K8SD_STATE_DIR/recovery_db.tar.gz" +# K8SD will remove this file upon starting. We need to create a backup that +# can be transferred to other nodes. +K8SD_RECOVERY_TARBALL_BKP="$K8SD_STATE_DIR/recovery_db.bkp.tar.gz" + +DQLITE_ROLE_VOTER=0 +DQLITE_ROLE_STANDBY=1 +DQLITE_ROLE_SPARE=2 + +function log_message () { + local msg="[$(date -uIseconds)] $@" + >&2 echo -e "$msg" +} + +function get_dqlite_node_id() { + local infoYamlPath=$1 + sudo cat $infoYamlPath | yq -r '.ID' +} + +function get_dqlite_node_addr() { + local infoYamlPath=$1 + sudo cat $infoYamlPath | yq -r '.Address' +} + +function get_dqlite_node_role() { + local infoYamlPath=$1 + sudo cat $infoYamlPath | yq -r '.Role' +} + +function get_dqlite_role_from_cluster_yaml() { + # Note that the cluster.yaml role may not match the info.yaml role. + # In case of a freshly joined node, info.yaml will have "voter" role + # while cluster.yaml has "spare" role. + local clusterYamlPath=$1 + local nodeId=$2 + + # Update the specified node. + sudo cat $clusterYamlPath | \ + yq -r "(.[] | select(.ID == \"$nodeId\") | .Role )" +} + +function set_dqlite_node_role() { + # The yq snap installs in confined mode, so it's unable to access the + # dqlite config files. + # In order to modify files in-place, we're using sponge. It reads all + # the stdin data before opening the output file. + local infoYamlPath=$1 + local role=$2 + sudo cat $infoYamlPath | \ + yq ".Role = $role" | + sudo sponge $infoYamlPath +} + +# Update cluster.yaml, setting the specified node as voter (role = 0). +# The other nodes will become spares, having the role set to 2. +function set_dqlite_node_as_sole_voter() { + local clusterYamlPath=$1 + local nodeId=$2 + + # Update the specified node. + sudo cat $clusterYamlPath | \ + yq "(.[] | select(.ID == \"$nodeId\") | .Role ) = 0" | \ + sudo sponge $clusterYamlPath + + # Update the other nodes. + sudo cat $clusterYamlPath | \ + yq "(.[] | select(.ID != \"$nodeId\") | .Role ) = 2" | \ + sudo sponge $clusterYamlPath +} + +function get_dql_peer_ip() { + local clusterYamlPath=$1 + local nodeId=$2 + + local addresses=( $(sudo cat $clusterYamlPath | \ + yq "(.[] | select(.ID != \"$nodeId\") | .Address )") ) + + if [[ ${#addresses[@]} -gt 1 ]]; then + log_message "More than one dql peers found: ${addresses[@]}" + exit 1 + fi + + if [[ ${#addresses[@]} -lt 1 ]]; then + log_message "No dql peers found." + exit 1 + fi + + echo ${addresses[0]} | cut -d ":" -f 1 +} + +# This function moves the dqlite state directories to the DRBD mount, +# replacing them with symlinks. This ensures that the primary will always use +# the latest DRBD data. +# +# The existing contents are moved to a backup folder, which can be used as +# part of the recovery process. +function move_statedirs() { + sudo mkdir -p $DRBD_MOUNT_DIR/k8s-dqlite + sudo mkdir -p $DRBD_MOUNT_DIR/k8sd + + log_message "Validating dqlite state directories." + check_statedir $K8S_DQLITE_STATE_DIR $DRBD_MOUNT_DIR/k8s-dqlite + check_statedir $K8SD_STATE_DIR $DRBD_MOUNT_DIR/k8sd + + if [[ ! -L $K8S_DQLITE_STATE_DIR ]] || [[ ! -L $K8SD_STATE_DIR ]]; then + local k8sDqliteNodeId=`get_dqlite_node_id $K8S_DQLITE_INFO_YAML` + if [[ -z $k8sDqliteNodeId ]]; then + log_message "Couldn't retrieve k8s-dqlite node id." 
+ exit 1 + fi + + + local expRole=`get_expected_dqlite_role` + # For fresh k8s clusters, the info.yaml role may not match the cluster.yaml role. + local k8sDqliteRole=`get_dqlite_role_from_cluster_yaml \ + $K8S_DQLITE_CLUSTER_YAML $k8sDqliteNodeId` + + if [[ $expRole -ne $k8sDqliteRole ]]; then + # TODO: consider automating this. We may move the pacemaker resource + # ourselves and maybe even copy the remote files through scp or ssh. + # However, there's a risk of race conditions. + log_message "DRBD volume mounted on replica, refusing to transfer dqlite files." + log_message "Move the DRBD volume to the primary node (through the fs_res Pacemaker resource) and try again." + log_message "Example: sudo crm resource move fs_res && sudo crm resource clear fs_res" + exit 1 + fi + fi + + # Ensure that the k8s services are stopped. + log_message "Stopping k8s services." + sudo snap stop k8s + + if [[ ! -L $K8S_DQLITE_STATE_DIR ]]; then + log_message "Not a symlink: $K8S_DQLITE_STATE_DIR, " \ + "transferring to $DRBD_MOUNT_DIR/k8s-dqlite" + sudo cp -r $K8S_DQLITE_STATE_DIR/. $DRBD_MOUNT_DIR/k8s-dqlite + + log_message "Creating k8s-dqlite state dir backup: $K8S_DQLITE_STATE_BKP_DIR" + sudo rm -rf $K8S_DQLITE_STATE_BKP_DIR + sudo mv $K8S_DQLITE_STATE_DIR/ $K8S_DQLITE_STATE_BKP_DIR + + log_message "Creating symlink $K8S_DQLITE_STATE_DIR -> $DRBD_MOUNT_DIR/k8s-dqlite" + sudo ln -sf $DRBD_MOUNT_DIR/k8s-dqlite $K8S_DQLITE_STATE_DIR + else + log_message "Symlink $K8S_DQLITE_STATE_DIR points to $DRBD_MOUNT_DIR/k8s-dqlite" + fi + + if [[ ! -L $K8SD_STATE_DIR ]]; then + log_message "Not a symlink: $K8SD_STATE_DIR, " \ + "transferring to $DRBD_MOUNT_DIR/k8sd" + sudo cp -r $K8SD_STATE_DIR/. $DRBD_MOUNT_DIR/k8sd + + log_message "Creating k8sd state dir backup: $K8SD_STATE_BKP_DIR" + sudo rm -rf $K8SD_STATE_BKP_DIR + sudo mv $K8SD_STATE_DIR/ $K8SD_STATE_BKP_DIR + + log_message "Creating symlink $K8SD_STATE_DIR -> $DRBD_MOUNT_DIR/k8sd" + sudo ln -sf $DRBD_MOUNT_DIR/k8sd $K8SD_STATE_DIR + else + log_message "Symlink $K8SD_STATE_DIR points to $DRBD_MOUNT_DIR/k8sd" + fi +} + +function ensure_mount_rw() { + if ! mount | grep "on $DRBD_MOUNT_DIR type" &> /dev/null; then + log_message "Missing DRBD mount: $DRBD_MOUNT_DIR" + return 1 + fi + + if ! mount | grep "on $DRBD_MOUNT_DIR type" | grep "rw" &> /dev/null; then + log_message "DRBD mount read-only: $DRBD_MOUNT_DIR" + return 1 + fi +} + +function wait_drbd_promoted() { + log_message "Waiting for one of the DRBD nodes to be promoted." + + local pollInterval=2 + # Special parameter, no need to increase it ourselves. + SECONDS=0 + + while [[ $SECONDS -lt $DRBD_READY_TIMEOUT ]]; do + if sudo crm resource status drbd_master_slave | grep Promoted ; then + log_message "DRBD node promoted." + return 0 + else + log_message "No DRBD node promoted yet, retrying in ${pollInterval}s" + sleep $pollInterval + fi + done + + log_message "Timed out waiting for primary DRBD node." \ + "Waited: ${SECONDS}. Timeout: ${DRBD_READY_TIMEOUT}s." + return 1 +} + +function ensure_drbd_unmounted() { + if mount | grep "on $DRBD_MOUNT_DIR type" &> /dev/null ; then + log_message "DRBD device mounted: $DRBD_MOUNT_DIR" + return 1 + fi +} + +function ensure_drbd_ready() { + ensure_mount_rw + + diskStatus=`sudo drbdadm status r0 | grep disk | head -1 | cut -d ":" -f 2` + if [[ $diskStatus != "UpToDate" ]]; then + log_message "DRBD disk status not ready. Current status: $diskStatus" + return 1 + else + log_message "DRBD disk up to date." 
+ fi +} + +function wait_drbd_primary () { + log_message "Waiting for primary DRBD node to be ready." + + local pollInterval=2 + # Special parameter, no need to increase it ourselves. + SECONDS=0 + + while [[ $SECONDS -lt $DRBD_READY_TIMEOUT ]]; do + if ensure_drbd_ready; then + log_message "Primary DRBD node ready." + return 0 + else + log_message "Primary DRBD node not ready yet, retrying in ${pollInterval}s" + sleep $pollInterval + fi + done + + log_message "Timed out waiting for primary DRBD node." \ + "Waited: ${SECONDS}. Timeout: ${DRBD_READY_TIMEOUT}s." + return 1 +} + +function wait_for_peer_k8s() { + local k8sDqliteNodeId=`get_dqlite_node_id $K8S_DQLITE_INFO_BKP_YAML` + if [[ -z $k8sDqliteNodeId ]]; then + log_message "Couldn't retrieve k8s-dqlite node id." + exit 1 + fi + + local peerIp=`get_dql_peer_ip $K8S_DQLITE_CLUSTER_BKP_YAML $k8sDqliteNodeId` + if [[ -z $peerIp ]]; then + log_message "Couldn't retrieve dqlite peer ip." + exit 1 + fi + + log_message "Waiting for k8s to start on peer: $peerIp. Timeout: ${PEER_READY_TIMEOUT}s." + + local pollInterval=2 + # Special parameter, no need to increase it ourselves. + SECONDS=0 + + while [[ $SECONDS -lt $PEER_READY_TIMEOUT ]]; do + if ssh $SSH_OPTS $SSH_USERNAME@$peerIp sudo k8s status &> /dev/null; then + log_message "Peer ready." + return 0 + else + log_message "Peer not ready yet, retrying in ${pollInterval}s." + sleep $pollInterval + fi + done + + log_message "Timed out waiting for k8s services to start on peer." \ + "Waited: ${SECONDS}. Timeout: ${PEER_READY_TIMEOUT}s." + return 1 + +} + +# "drbdadm status" throws the following if our service starts before +# Pacemaker initialized DRBD (even on the secondary). +# +# r0: No such resource +# Command 'drbdsetup-84 status r0' terminated with exit code 10 +function wait_drbd_resource () { + log_message "Waiting for DRBD resource." + + local pollInterval=2 + # Special parameter, no need to increase it ourselves. + SECONDS=0 + + while [[ $SECONDS -lt $DRBD_READY_TIMEOUT ]]; do + if sudo drbdadm status &> /dev/null; then + log_message "DRBD ready." + return 0 + else + log_message "DRBD not ready yet, retrying in ${pollInterval}s" + sleep $pollInterval + fi + done + + log_message "Timed out waiting for DRBD resource." \ + "Waited: ${SECONDS}. Timeout: ${DRBD_READY_TIMEOUT}s." + return 1 +} + +# Based on the drbd volume state, we decide if this node should be a +# dqlite voter or a spare. +function get_expected_dqlite_role() { + drbdResRole=`sudo drbdadm status $DRBD_RES_NAME | head -1 | grep role | cut -d ":" -f 2` + + case $drbdResRole in + "Primary") + echo $DQLITE_ROLE_VOTER + ;; + "Secondary") + echo $DQLITE_ROLE_SPARE + ;; + *) + log_message "Unexpected DRBD role: $drbdResRole" + exit 1 + ;; + esac +} + +function validate_drbd_state() { + wait_drbd_promoted + + drbdResRole=`sudo drbdadm status $DRBD_RES_NAME | head -1 | grep role | cut -d ":" -f 2` + + case $drbdResRole in + "Primary") + wait_drbd_primary + ;; + "Secondary") + ensure_drbd_unmounted + ;; + *) + log_message "Unexpected DRBD role: $drbdResRole" + exit 1 + ;; + esac +} + +# After a failover, the state dir points to the shared DRBD volume. +# We need to restore the node certificate and config files. +function restore_dqlite_confs_and_certs() { + log_message "Restoring dqlite configs and certificates." 
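+    # The database contents themselves already live on the shared DRBD volume;
+    # what is restored here is the node-local metadata preserved in the backup
+    # directories created by move_statedirs: info.yaml (this node's ID and
+    # address), daemon.yaml and the TLS certificates.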
+ + sudo cp $K8S_DQLITE_STATE_BKP_DIR/info.yaml $K8S_DQLITE_STATE_DIR + + sudo cp $K8SD_STATE_BKP_DIR/database/info.yaml $K8SD_STATE_DIR/database/ + sudo cp $K8SD_STATE_BKP_DIR/daemon.yaml $K8SD_STATE_DIR/ + + # restore k8s-dqlite certificates + sudo cp $K8S_DQLITE_STATE_BKP_DIR/cluster.crt $K8S_DQLITE_STATE_DIR + sudo cp $K8S_DQLITE_STATE_BKP_DIR/cluster.key $K8S_DQLITE_STATE_DIR + + # restore k8sd certificates + sudo cp $K8SD_STATE_BKP_DIR/cluster.crt $K8SD_STATE_DIR + sudo cp $K8SD_STATE_BKP_DIR/cluster.key $K8SD_STATE_DIR + sudo cp $K8SD_STATE_BKP_DIR/server.crt $K8SD_STATE_DIR + sudo cp $K8SD_STATE_BKP_DIR/server.key $K8SD_STATE_DIR +} + +# Promote the current node as primary and prepare the recovery archives. +function promote_as_primary() { + local k8sDqliteNodeId=`get_dqlite_node_id $K8S_DQLITE_INFO_BKP_YAML` + if [[ -z $k8sDqliteNodeId ]]; then + log_message "Couldn't retrieve k8s-dqlite node id." + exit 1 + fi + + local k8sdNodeId=`get_dqlite_node_id $K8SD_INFO_BKP_YAML` + if [[ -z $k8sDqliteNodeId ]]; then + log_message "Couldn't retrieve k8s-dqlite node id." + exit 1 + fi + + local peerIp=`get_dql_peer_ip $K8S_DQLITE_CLUSTER_YAML $k8sDqliteNodeId` + if [[ -z $peerIp ]]; then + log_message "Couldn't retrieve dqlite peer ip." + exit 1 + fi + + log_message "Stopping local k8s services." + sudo snap stop k8s + + # After a node crash, there may be a leaked control socket file and + # k8sd will refuse to perform the recovery. We've just stopped the k8s snap, + # it should be safe to remove such stale unix sockets. + log_message "Removing stale control sockets." + sudo rm -f $K8SD_STATE_DIR/control.socket + + local stoppedPeer=0 + log_message "Checking peer k8s services: $peerIp" + if ssh $SSH_OPTS $SSH_USERNAME@$peerIp sudo snap services k8s | grep -v inactive | grep "active"; then + log_message "Attempting to stop peer k8s services." + # Stop the k8s snap directly instead of the wrapper service so that + # we won't cause failures if both nodes start at the same time. + # The secondary will wait for the k8s services to start on the primary. + if ssh $SSH_OPTS $SSH_USERNAME@$peerIp sudo snap stop k8s; then + stoppedPeer=1 + log_message "Successfully stopped peer k8s services." + log_message "The stopped services are going to be restarted after the recovery finishes." + else + log_message "Couldn't stop k8s services on the peer node." \ + "Assuming that it's stopped and proceeding with the recovery." + fi + fi + + log_message "Ensuring rw access to DRBD mount." + # Having RW access to the drbd mount implies that this is the primary node. + ensure_mount_rw + + restore_dqlite_confs_and_certs + + log_message "Updating dqlite roles." + # Update info.yaml + set_dqlite_node_role $K8S_DQLITE_INFO_YAML $DQLITE_ROLE_VOTER + set_dqlite_node_role $K8SD_INFO_YAML $DQLITE_ROLE_VOTER + + # Update cluster.yaml + set_dqlite_node_as_sole_voter $K8S_DQLITE_CLUSTER_YAML $k8sDqliteNodeId + set_dqlite_node_as_sole_voter $K8SD_CLUSTER_YAML $k8sdNodeId + + log_message "Restoring dqlite." + sudo $K8SD_PATH cluster-recover \ + --state-dir=$K8SD_STATE_DIR \ + --k8s-dqlite-state-dir=$K8S_DQLITE_STATE_DIR \ + --log-level $K8SD_LOG_LEVEL \ + --non-interactive + + # TODO: consider removing offending segments if the last snapshot is behind + # and then try again. + + log_message "Copying k8sd recovery tarball to $K8SD_RECOVERY_TARBALL_BKP" + sudo cp $K8SD_RECOVERY_TARBALL $K8SD_RECOVERY_TARBALL_BKP + + log_message "Restarting k8s services." 
+ sudo snap start k8s + + # TODO: validate k8s status + + if [[ $stoppedPeer -ne 0 ]]; then + log_message "Restarting peer k8s services: $peerIp" + # It's importand to issue a restart here since we stopped the k8s snap + # directly and the wrapper service doesn't currently monitor it. + ssh $SSH_OPTS $SSH_USERNAME@$peerIp sudo systemctl restart $SYSTEMD_SERVICE_NAME || + log_message "Couldn't start peer k8s services." + fi +} + +function process_recovery_files_on_secondary() { + local peerIp="$1" + + log_message "Ensuring that the drbd volume is unmounted." + ensure_drbd_unmounted + + log_message "Restoring local dqlite backup files." + sudo cp -r $K8S_DQLITE_STATE_BKP_DIR/. $DRBD_MOUNT_DIR/k8s-dqlite/ + sudo cp -r $K8SD_STATE_BKP_DIR/. $DRBD_MOUNT_DIR/k8sd/ + + sudo rm -f $DRBD_MOUNT_DIR/k8s-dqlite/00*-* + sudo rm -f $DRBD_MOUNT_DIR/k8s-dqlite/snapshot-* + sudo rm -f $DRBD_MOUNT_DIR/k8s-dqlite/metadata* + + sudo rm -f $DRBD_MOUNT_DIR/k8sd/database/00*-* + sudo rm -f $DRBD_MOUNT_DIR/k8sd/database/snapshot-* + sudo rm -f $DRBD_MOUNT_DIR/k8sd/database/metadata* + + log_message "Retrieving k8sd recovery tarball." + scp $SSH_OPTS $SSH_USERNAME@$peerIp:$K8SD_RECOVERY_TARBALL_BKP /tmp/ + sudo mv /tmp/`basename $K8SD_RECOVERY_TARBALL_BKP` \ + $K8SD_RECOVERY_TARBALL + + # TODO: do we really need to transfer recovery tarballs in this situation? + # the spare is simply forwarding the requests to the primary, it doesn't really + # hold any data. + lastK8sDqliteRecoveryTarball=`ssh $SSH_USERNAME@$peerIp \ + sudo ls /var/snap/k8s/common/ | \ + grep -P "recovery-k8s-dqlite-.*post-recovery" | \ + tail -1` + if [ -z "$lastK8sDqliteRecoveryTarball" ]; then + log_message "couldn't retrieve latest k8s-dqlite recovery tarball from $peerIp" + exit 1 + fi + + log_message "Retrieving k8s-dqlite recovery tarball." + scp $SSH_USERNAME@$peerIp:/var/snap/k8s/common/$lastK8sDqliteRecoveryTarball /tmp/ + sudo tar -xf /tmp/$lastK8sDqliteRecoveryTarball -C $K8S_DQLITE_STATE_DIR + + log_message "Updating dqlite roles." + # Update info.yaml + set_dqlite_node_role $K8S_DQLITE_INFO_YAML $DQLITE_ROLE_SPARE + set_dqlite_node_role $K8SD_INFO_YAML $DQLITE_ROLE_SPARE + # We're skipping cluster.yaml, we expect the recovery archives to contain + # updated cluster.yaml files. +} + +# Recover a former primary, now secondary dqlite node. +# Run "promote_as_primary" on the ther node first. +function rejoin_secondary() { + log_message "Recovering secondary node." + + local k8sDqliteNodeId=`get_dqlite_node_id $K8S_DQLITE_INFO_BKP_YAML` + if [[ -z $k8sDqliteNodeId ]]; then + log_message "Couldn't retrieve k8s-dqlite node id." + exit 1 + fi + + local peerIp=`get_dql_peer_ip $K8S_DQLITE_CLUSTER_BKP_YAML $k8sDqliteNodeId` + if [[ -z $peerIp ]]; then + log_message "Couldn't retrieve dqlite peer ip." + exit 1 + fi + + log_message "Stopping k8s services." + sudo snap stop k8s + + log_message "Adding temporary Pacemaker constraint." + # We need to prevent failovers from happening while restoring secondary + # dqlite data, otherwise we may end up overriding or deleting the primary + # node data. + # + # TODO: consider reducing the constraint scope (e.g. resource level constraint + # instead of putting the entire node in standby). + sudo crm node standby + if ! process_recovery_files_on_secondary $peerIp; then + log_message "Dqlite recovery filed, removing temporary Pacemaker constraints." + sudo crm node online + exit 1 + fi + + log_message "Restoring Pacemaker state." 
+ sudo crm node online + + log_message "Restarting k8s services" + sudo snap start k8s +} + +function install_packages() { + sudo apt-get update + + sudo DEBIAN_FRONTEND=noninteractive apt-get install \ + python3 python3-netaddr \ + pacemaker resource-agents-extra \ + drbd-utils ntp linux-image-generic snap moreutils -y + sudo modprobe drbd || sudo apt-get install -y linux-modules-extra-$(uname -r) + + sudo snap install jq + sudo snap install yq + sudo snap install install k8s --classic $K8S_SNAP_CHANNEL +} + +function check_statedir() { + local stateDir="$1" + local expLink="$2" + + if [[ ! -e $stateDir ]]; then + log_message "State directory missing: $stateDir" + exit 1 + fi + + target=`readlink -f $stateDir` + if [[ -L "$stateDir" ]] && [[ "$target" != "$expLink" ]]; then + log_message "Unexpected symlink target. " \ + "State directory: $stateDir. " \ + "Expected symlink target: $expLink. " \ + "Actual symlink target: $target." + exit 1 + fi + + if [[ ! -L $stateDir ]] && [[ ! -z "$( ls -A $expLink )" ]]; then + log_message "State directory is not a symlink, however the " \ + "expected link target exists and is not empty. " \ + "We can't know which files to keep, erroring out. " \ + "State directory: $stateDir. " \ + "Expected symlink target: $expLink." + exit 1 + fi +} + +function check_peer_recovery_tarballs() { + log_message "Retrieving k8s-dqlite node id." + local k8sDqliteNodeId=`get_dqlite_node_id $K8S_DQLITE_INFO_BKP_YAML` + if [[ -z $k8sDqliteNodeId ]]; then + log_message "Couldn't retrieve k8s-dqlite node id." + exit 1 + fi + + log_message "Retrieving dqlite peer ip." + local peerIp=`get_dql_peer_ip $K8S_DQLITE_CLUSTER_BKP_YAML $k8sDqliteNodeId` + if [[ -z $peerIp ]]; then + log_message "Couldn't retrieve dqlite peer ip." + exit 1 + fi + + log_message "Checking for recovery taballs on $peerIp." + + k8sdRecoveryTarball=`ssh $SSH_OPTS $SSH_USERNAME@$peerIp \ + sudo ls -A "$K8SD_RECOVERY_TARBALL_BKP"` + if [[ -z $k8sdRecoveryTarball ]]; then + log_message "Peer $peerIp doesn't have k8sd recovery tarball." + return 1 + fi + + lastK8sDqliteRecoveryTarball=`ssh $SSH_OPTS $SSH_USERNAME@$peerIp \ + sudo ls /var/snap/k8s/common/ | \ + grep -P "recovery-k8s-dqlite-.*post-recovery"` + if [[ -z $k8sdRecoveryTarball ]]; then + log_message "Peer $peerIp doesn't have k8s-dqlite recovery tarball." + return 1 + fi +} + +function start_service() { + log_message "Initializing node." + + # DRBD is the primary source of truth for the dqlite role. + # We need to wait for it to become available. + wait_drbd_resource + + # dump the drbd and pacemaker status for debugging purposes. + sudo drbdadm status + sudo crm status + + validate_drbd_state + + move_statedirs + + local expRole=`get_expected_dqlite_role` + case $expRole in + $DQLITE_ROLE_VOTER) + log_message "Assuming the dqlite voter role (primary)." + + # We'll assume that if the primary stopped, it needs to go through + # the recovery process. + promote_as_primary + ;; + $DQLITE_ROLE_SPARE) + log_message "Assuming the dqlite spare role (secondary)." + + wait_for_peer_k8s + + if check_peer_recovery_tarballs; then + log_message "Recovery tarballs found, initiating recovery." + rejoin_secondary + else + # Maybe the primary didn't change and we don't need to go + # through the recovery process. + # TODO: consider comparing the cluster.yaml files from the + # two nodes. + log_message "Recovery tarballs missing, skipping recovery." + log_message "Starting k8s services." 
+ sudo snap k8s start + fi + ;; + *) + log_message "Unexpected dqlite role: $expRole" + exit 1 + ;; + esac +} + +function clean_recovery_data() { + log_message "Cleaning up dqlite recovery data." + rm -f $K8SD_RECOVERY_TARBALL + rm -f $K8SD_RECOVERY_TARBALL_BKP + rm -f $K8S_DQLITE_STATE_DIR/recovery-k8s-dqlite* +} + +function purge() { + log_message "Removing the k8s snap and all the associated files." + + sudo snap remove --purge k8s + + if [[ -d $DRBD_MOUNT_DIR ]]; then + log_message "Cleaning up $DRBD_MOUNT_DIR." + sudo rm -rf $DRBD_MOUNT_DIR/k8sd + sudo rm -rf $DRBD_MOUNT_DIR/k8s-dqlite + + if ! ensure_drbd_unmounted; then + log_message "Cleaning up $DRBD_MOUNT_DIR mount point." + + # The replicas use the mount dir directly, without a block device + # attachment. We need to clean up the mount point as well. + # + # We're using another mount with "--bind" to bypass the drbd mount. + tempdir=`mktemp -d` + # We need to mount the parent dir. + sudo mount --bind `dirname $DRBD_MOUNT_DIR` $tempdir + sudo rm -rf $tempdir/`basename $DRBD_MOUNT_DIR`/k8sd + sudo rm -rf $tempdir/`basename $DRBD_MOUNT_DIR`/k8s-dqlite + sudo umount $tempdir + sudo rm -rf $tempdir + fi + fi +} + +function clear_taints() { + log_message "Clearing tainted Pacemaker resources." + sudo crm resource clear ha_k8s_failover_service + sudo crm resource clear fs_res + sudo crm resource clear drbd_master_slave + + sudo crm resource cleanup ha_k8s_failover_service + sudo crm resource cleanup fs_res + sudo crm resource cleanup drbd_master_slave +} + +function main() { + local command=$1 + + case $command in + "move_statedirs") + move_statedirs + ;; + "install_packages") + install_packages + ;; + "start_service") + start_service + ;; + "clean_recovery_data") + clean_recovery_data + ;; + "purge") + purge + ;; + "clear_taints") + clear_taints + ;; + *) + cat << EOF +Unknown command: $1 + +usage: $0 + +Commands: + move_statedirs Move the dqlite state directories to the DRBD mount, + replacing them with symlinks. + The existing contents are moved to a backup folder, + which can be used as part of the recovery process. + install_packages Install the packages required by the 2-node HA + cluster. + start_service Initialize the k8s services, taking the following + steps: + 1. Based on the drbd state, decide if this node + should assume the primary (dqlite voter) or + secondary (spare) role. + 2. If this is the first start, transfer the dqlite + state directories and create backups. + 3. If this node is a primary, promote it and initiate + the dqlite recovery, creating recovery tarballs. + Otherwise, copy over the recovery files and + join the existing cluster as a spare. + 4. Start the k8s services. + IMPORTANT: ensure that the DRBD volume is attached + to the primary node when running the command for + the first time. + clean_recovery_data Remove database recovery files. Should be called + after the cluster has been fully recovered. + purge Remove the k8s snap and all its associated files. + clear_taints Clear tainted Pacemaker resources. + +EOF + ;; + esac +} + +if [[ $sourced -ne 1 ]]; then + main $@ +fi From a4e88283c729790a958108e19a429be31987eab1 Mon Sep 17 00:00:00 2001 From: Lucian Petrut Date: Tue, 24 Sep 2024 16:12:14 +0300 Subject: [PATCH 2/9] Add 2-node HA guide We're adding a guide that covers the 2-node A-A HA scenario. 
--- docs/src/snap/howto/2-node-ha.md | 423 +++++++++++++++++++++++++++++++ docs/src/snap/howto/index.md | 1 + 2 files changed, 424 insertions(+) create mode 100644 docs/src/snap/howto/2-node-ha.md diff --git a/docs/src/snap/howto/2-node-ha.md b/docs/src/snap/howto/2-node-ha.md new file mode 100644 index 000000000..487a02b82 --- /dev/null +++ b/docs/src/snap/howto/2-node-ha.md @@ -0,0 +1,423 @@ +# 2-Node Active-Active HA using Dqlite + +## Rationale + +High availability is a mandatory requirement for most production-grade +Kubernetes deployments, usually implying three or more nodes. + +However, 2-node HA clusters are desired in some situations due to cost saving +and operational efficiency considerations. Follow this guide to learn how +Canonical Kubernetes can achieve high availability with just two nodes +while using the default datastore, Dqlite. + +Dqlite cannot achieve Raft quorum with less than three nodes. This means that +Dqlite will not be able to replicate data and the secondaries will simply +forward the queries to the primary node. + +In the event of a node failure, the database will have to be recovered by +following the steps outlined in the [Dqlite recovery guide]. + +## Proposed solution + +Since Dqlite data replication is not available in this situation, we propose +using synchronous block level replication through DRBD. + +The cluster monitoring and failover process will be handled by Pacemaker and +Corosync. In the event of a node failure, the DRBD volume will be mounted on +the replica, which can then access the most recent version of the Dqlite database. + +Additional recovery steps are automated and invoked through Pacemaker. + +## Alternatives + +Another possible approach is to use PostgreSQL with Kine and logical replication. +However, it is outside the scope of this document. + +See the [external datastore guide] for more information on how Canonical +Kubernetes can be configured to use other datastores. + +## Guide + +### Prerequisites + +Make sure that: + +* Both nodes have joined the Kubernetes cluster. + See the [getting started] and [add/remove nodes] guides. +* The user associated with the HA service has SSH access to the peer node and + passwordless sudo configured. For simplicity, the default "ubuntu" user can + be used. +* We recommend using static IP configuration. + +The [2ha.sh script] automates most operations related to the 2-node HA scenario. +Retrieve it like so: + +``` +sudo mkdir -p /var/snap/k8s/common +repo=https://raw.githubusercontent.com/petrutlucian94/k8s-snap +sudo curl $repo/refs/heads/KU-1606/2ha_script/k8s/hack/2ha.sh \ + -o /var/snap/k8s/common/2ha.sh +sudo chmod a+rx /var/snap/k8s/common/2ha.sh +``` + +The first step is to install the required packages: + +``` +/var/snap/k8s/common/2ha.sh install_packages +``` + +### DRBD + +For the purpose of this guide, we are going to use a loopback device as DRBD +backing storage: + +``` +sudo dd if=/dev/zero of=/opt/drbd0-backstore bs=1M count=2000 +``` + +Ensure that the loopback device is attached at boot time, before Pacemaker +starts. + +``` +cat < +HATWO_ADDR= + +cat < +HATWO_ADDR= + +sudo mv /etc/corosync/corosync.conf /etc/corosync/corosync.conf.orig + +cat < +HATWO_ADDR= +DRBD_MOUNT_DIR=${DRBD_MOUNT_DIR:-"/mnt/drbd0"} + +sudo crm configure < + +# remove the node constraint. +sudo crm resource clear fs_res +``` + +### Kubernetes services + +We can now turn our attention to the Kubernetes services. Ensure that the k8s +snap services no longer start automatically. 
Instead, they will be manged by a +wrapper service. + +``` +for f in `sudo snap services k8s | awk 'NR>1 {print $1}'`; do + echo "disabling snap.$f" + sudo systemctl disable "snap.$f"; +done +``` + +The next step is to define the wrapper service. Add the following to +``/etc/systemd/system/2ha_k8s.service``. Note that the sample uses the ``ubuntu`` +user, feel free to use a different one as long as the prerequisites are met. + +``` +[Unit] +Description=K8s service wrapper handling Dqlite recovery for 2-node HA setups. +After=network.target pacemaker.service + +[Service] +User=ubuntu +Group=ubuntu +Type=oneshot +ExecStart=/bin/bash /var/snap/k8s/common/2ha.sh start_service +ExecStop=/bin/bash sudo snap stop k8s +RemainAfterExit=true + +[Install] +WantedBy=multi-user.target +``` + +```{note} +The ``2ha.sh start_service`` command used by the service wrapper automatically +detects the expected Dqlite role based on the DRBD state and takes the necessary +steps to bootstrap the Dqlite state directories, synchronize with the peer node +(if available) and recover the database. +``` + +We need the ``2ha_k8s`` service to be restarted once a DRBD failover occurs. +For that, we are going to define a separate service that will be invoked by +Pacemaker. Create a file called ``/etc/systemd/system/2ha_k8s_failover.service`` +containing the following: + +``` +[Unit] +Description=Managed by Pacemaker, restarts 2ha_k8s on failover. +After=network.target home-ubuntu-workspace.mount + +[Service] +Type=oneshot +ExecStart=systemctl restart 2ha_k8s +RemainAfterExit=true +``` + +Reload the systemd configuration and set ``2ha_k8s`` to start automatically. +Notice that ``2ha_k8s_failover`` must not be configured to start automatically, +but instead is going to be managed through Pacemaker. + +``` +sudo systemctl enable 2ha_k8s +sudo systemctl daemon-reload +``` + +Make sure that both nodes have been configured using the above steps before +moving forward. + +We can now define a new Pacemaker resource that will invoke the +``2ha_k8s_failover`` service when a DRBD failover occurs. + +``` +sudo crm configure < +[Dqlite recovery guide]: restore-quorum +[external datastore guide]: external-datastore +[2ha.sh script]: https://github.com/canonical/k8s-snap/blob/main/k8s/hack/2ha.sh +[getting started]: ../tutorial/getting-started +[add/remove nodes]: ../tutorial/add-remove-nodes diff --git a/docs/src/snap/howto/index.md b/docs/src/snap/howto/index.md index 3ae545030..295245127 100644 --- a/docs/src/snap/howto/index.md +++ b/docs/src/snap/howto/index.md @@ -22,6 +22,7 @@ proxy backup-restore refresh-certs restore-quorum +2-node-ha epa contribute support From 33ed437101b8d9abb195a7ee46b83d931f60c28a Mon Sep 17 00:00:00 2001 From: Lucian Petrut Date: Wed, 25 Sep 2024 10:24:38 +0000 Subject: [PATCH 3/9] Update docs as per PR comments --- docs/src/snap/howto/2-node-ha.md | 78 +++++++++++++++----------------- k8s/hack/2ha.sh | 7 ++- 2 files changed, 41 insertions(+), 44 deletions(-) diff --git a/docs/src/snap/howto/2-node-ha.md b/docs/src/snap/howto/2-node-ha.md index 487a02b82..97589107c 100644 --- a/docs/src/snap/howto/2-node-ha.md +++ b/docs/src/snap/howto/2-node-ha.md @@ -1,37 +1,38 @@ -# 2-Node Active-Active HA using Dqlite +# 2-Node Active-Active High-Availability using Dqlite ## Rationale -High availability is a mandatory requirement for most production-grade +High availability (HA) is a mandatory requirement for most production-grade Kubernetes deployments, usually implying three or more nodes. 
-However, 2-node HA clusters are desired in some situations due to cost saving -and operational efficiency considerations. Follow this guide to learn how -Canonical Kubernetes can achieve high availability with just two nodes -while using the default datastore, Dqlite. +2-node HA clusters are sometimes preferred for cost savings and operational +efficiency considerations. Follow this guide to learn how Canonical Kubernetes +can achieve high availability with just two nodes while using the default +datastore, Dqlite. Dqlite cannot achieve Raft quorum with less than three nodes. This means that Dqlite will not be able to replicate data and the secondaries will simply forward the queries to the primary node. -In the event of a node failure, the database will have to be recovered by -following the steps outlined in the [Dqlite recovery guide]. +In the event of a node failure, database recovery will require following the +steps in the [Dqlite recovery guide]. ## Proposed solution Since Dqlite data replication is not available in this situation, we propose -using synchronous block level replication through DRBD. +using synchronous block level replication through +[Distributed Replicated Block Device] (DRBD). The cluster monitoring and failover process will be handled by Pacemaker and -Corosync. In the event of a node failure, the DRBD volume will be mounted on -the replica, which can then access the most recent version of the Dqlite database. +Corosync. After a node failure, the DRBD volume will be mounted on the standby +node, allowing access to the latest Dqlite database version. Additional recovery steps are automated and invoked through Pacemaker. ## Alternatives -Another possible approach is to use PostgreSQL with Kine and logical replication. -However, it is outside the scope of this document. +Another possible approach is to use PostgreSQL with Kine and logical +replication. However, it is outside the scope of this document. See the [external datastore guide] for more information on how Canonical Kubernetes can be configured to use other datastores. @@ -40,30 +41,20 @@ Kubernetes can be configured to use other datastores. ### Prerequisites -Make sure that: - -* Both nodes have joined the Kubernetes cluster. +* Ensure both nodes are part of the Kubernetes cluster. See the [getting started] and [add/remove nodes] guides. * The user associated with the HA service has SSH access to the peer node and passwordless sudo configured. For simplicity, the default "ubuntu" user can be used. * We recommend using static IP configuration. -The [2ha.sh script] automates most operations related to the 2-node HA scenario. -Retrieve it like so: - -``` -sudo mkdir -p /var/snap/k8s/common -repo=https://raw.githubusercontent.com/petrutlucian94/k8s-snap -sudo curl $repo/refs/heads/KU-1606/2ha_script/k8s/hack/2ha.sh \ - -o /var/snap/k8s/common/2ha.sh -sudo chmod a+rx /var/snap/k8s/common/2ha.sh -``` +The [2ha.sh script] automates most operations related to the 2-node HA scenario +and is included in the snap. The first step is to install the required packages: ``` -/var/snap/k8s/common/2ha.sh install_packages +/snap/k8s/current/k8s/hack/2ha.sh install_packages ``` ### DRBD @@ -112,7 +103,7 @@ sudo systemctl start rc-local.service ``` Let's configure the DRBD block device that will hold the Dqlite data. -Make sure to use the right node addresses. +Ensure the correct node addresses are used. ``` # Disable the DRBD service, it will be managed through Pacemaker. 
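# A rough sketch of the DRBD resource definition configured at this step.
# The host names, addresses and port are placeholders, not values from this
# guide; the "on <name>" entries must match each node's `uname -n` output.
HAONE_ADDR=10.0.0.1   # placeholder: first node address
HATWO_ADDR=10.0.0.2   # placeholder: second node address

cat <<EOF | sudo tee /etc/drbd.d/r0.res
resource r0 {
  protocol C;   # synchronous replication
  on haone {
    device    /dev/drbd0;
    disk      /dev/lodrbd;
    address   ${HAONE_ADDR}:7788;
    meta-disk internal;
  }
  on hatwo {
    device    /dev/drbd0;
    disk      /dev/lodrbd;
    address   ${HATWO_ADDR}:7788;
    meta-disk internal;
  }
}
EOF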
@@ -143,8 +134,8 @@ sudo drbdadm status ``` Let's create a mount point for the DRBD block device. Non-default mount points -need to be passed to the 2ha.sh script mentioned above, see the script for the -full list of configurable parameters. +need to be passed to the ``2ha.sh`` script mentioned above, see the script for +the full list of configurable parameters. ``` DRBD_MOUNT_DIR=/mnt/drbd0 @@ -231,7 +222,7 @@ Let's define a Pacemaker resource for the DRBD block device, which ensures that the block device will be mounted on the replica in case of a primary node failure. -Pacemaker fencing (stonith) configuration is environment specific and thus +[Pacemaker fencing] (stonith) configuration is environment specific and thus outside the scope of this guide. However, we highly recommend using fencing if possible to reduce the risk of cluster split-brain situations. @@ -257,8 +248,8 @@ EOF Before moving forward, let's ensure that the DRBD Pacemaker resource runs on the primary (voter) Dqlite node. -Remember that in our case only the primary node contains the latest Dqlite data, -which will be transfered to the DRBD device once the clustered service starts. +In this setup, only the primary node holds the latest Dqlite data, which will +be transferred to the DRBD device once the clustered service starts. This is automatically handled by the ``2ha_k8s.sh start_service`` command. ``` @@ -280,7 +271,7 @@ sudo crm resource clear fs_res ### Kubernetes services We can now turn our attention to the Kubernetes services. Ensure that the k8s -snap services no longer start automatically. Instead, they will be manged by a +snap services no longer start automatically. Instead, they will be managed by a wrapper service. ``` @@ -291,8 +282,9 @@ done ``` The next step is to define the wrapper service. Add the following to -``/etc/systemd/system/2ha_k8s.service``. Note that the sample uses the ``ubuntu`` -user, feel free to use a different one as long as the prerequisites are met. +``/etc/systemd/system/2ha_k8s.service``. Note that the sample uses the +``ubuntu`` user, feel free to use a different one as long as the prerequisites +are met. ``` [Unit] @@ -303,7 +295,7 @@ After=network.target pacemaker.service User=ubuntu Group=ubuntu Type=oneshot -ExecStart=/bin/bash /var/snap/k8s/common/2ha.sh start_service +ExecStart=/bin/bash /snap/k8s/current/k8s/hack/2ha.sh start_service ExecStop=/bin/bash sudo snap stop k8s RemainAfterExit=true @@ -313,15 +305,15 @@ WantedBy=multi-user.target ```{note} The ``2ha.sh start_service`` command used by the service wrapper automatically -detects the expected Dqlite role based on the DRBD state and takes the necessary -steps to bootstrap the Dqlite state directories, synchronize with the peer node -(if available) and recover the database. +detects the expected Dqlite role based on the DRBD state and takes the +necessary steps to bootstrap the Dqlite state directories, synchronize with the +peer node (if available) and recover the database. ``` We need the ``2ha_k8s`` service to be restarted once a DRBD failover occurs. For that, we are going to define a separate service that will be invoked by -Pacemaker. Create a file called ``/etc/systemd/system/2ha_k8s_failover.service`` -containing the following: +Pacemaker. 
Create a file called +``/etc/systemd/system/2ha_k8s_failover.service`` containing the following: ``` [Unit] @@ -416,8 +408,10 @@ sudo drbdadm connect r0 ``` +[Distributed Replicated Block Device]: https://ubuntu.com/server/docs/distributed-replicated-block-device-drbd [Dqlite recovery guide]: restore-quorum [external datastore guide]: external-datastore [2ha.sh script]: https://github.com/canonical/k8s-snap/blob/main/k8s/hack/2ha.sh [getting started]: ../tutorial/getting-started [add/remove nodes]: ../tutorial/add-remove-nodes +[Pacemaker fencing]: https://clusterlabs.org/pacemaker/doc/2.1/Pacemaker_Explained/html/fencing.html diff --git a/k8s/hack/2ha.sh b/k8s/hack/2ha.sh index 4df08f194..c3c2a0c3a 100755 --- a/k8s/hack/2ha.sh +++ b/k8s/hack/2ha.sh @@ -1,10 +1,13 @@ #!/bin/bash +# This script automates various operations on 2-node HA A-A Canonical K8s +# clusters that use the default datastore, Dqlite. +# # Prerequisites: # * required packages installed using the "install_packages" command. -# * initialized k8s cluster, both nodes joined +# * initialized K8s cluster, both nodes joined # * the current user has ssh access to the peer node. -# - used to handle k8s services and transfer dqlite data +# - used to handle K8s services and transfer Dqlite data # * the current user has passwordless sudo enabled. sourced=0 From 0f8bbca43e7118febe65c178bb69e5d712aefd73 Mon Sep 17 00:00:00 2001 From: Lucian Petrut Date: Mon, 30 Sep 2024 09:31:42 +0300 Subject: [PATCH 4/9] Rename 2ha.sh to two-node-ha.sh --- .../howto/{2-node-ha.md => two-node-ha.md} | 48 +++++++++---------- k8s/hack/{2ha.sh => two-node-ha.sh} | 2 +- 2 files changed, 25 insertions(+), 25 deletions(-) rename docs/src/snap/howto/{2-node-ha.md => two-node-ha.md} (85%) rename k8s/hack/{2ha.sh => two-node-ha.sh} (99%) diff --git a/docs/src/snap/howto/2-node-ha.md b/docs/src/snap/howto/two-node-ha.md similarity index 85% rename from docs/src/snap/howto/2-node-ha.md rename to docs/src/snap/howto/two-node-ha.md index 97589107c..7b84284f6 100644 --- a/docs/src/snap/howto/2-node-ha.md +++ b/docs/src/snap/howto/two-node-ha.md @@ -48,13 +48,13 @@ Kubernetes can be configured to use other datastores. be used. * We recommend using static IP configuration. -The [2ha.sh script] automates most operations related to the 2-node HA scenario -and is included in the snap. +The [two-node-ha.sh script] automates most operations related to the 2-node HA +scenario and is included in the snap. The first step is to install the required packages: ``` -/snap/k8s/current/k8s/hack/2ha.sh install_packages +/snap/k8s/current/k8s/hack/two-node-ha.sh install_packages ``` ### DRBD @@ -134,8 +134,8 @@ sudo drbdadm status ``` Let's create a mount point for the DRBD block device. Non-default mount points -need to be passed to the ``2ha.sh`` script mentioned above, see the script for -the full list of configurable parameters. +need to be passed to the ``two-node-ha.sh`` script mentioned above, see the +script for the full list of configurable parameters. ``` DRBD_MOUNT_DIR=/mnt/drbd0 @@ -250,7 +250,7 @@ the primary (voter) Dqlite node. In this setup, only the primary node holds the latest Dqlite data, which will be transferred to the DRBD device once the clustered service starts. -This is automatically handled by the ``2ha_k8s.sh start_service`` command. +This is automatically handled by the ``two-node-ha.sh start_service`` command. ``` sudo k8s status @@ -282,7 +282,7 @@ done ``` The next step is to define the wrapper service. 
Add the following to -``/etc/systemd/system/2ha_k8s.service``. Note that the sample uses the +``/etc/systemd/system/two-node-ha-k8s.service``. Note that the sample uses the ``ubuntu`` user, feel free to use a different one as long as the prerequisites are met. @@ -295,7 +295,7 @@ After=network.target pacemaker.service User=ubuntu Group=ubuntu Type=oneshot -ExecStart=/bin/bash /snap/k8s/current/k8s/hack/2ha.sh start_service +ExecStart=/bin/bash /snap/k8s/current/k8s/hack/two-node-ha.sh start_service ExecStop=/bin/bash sudo snap stop k8s RemainAfterExit=true @@ -304,34 +304,34 @@ WantedBy=multi-user.target ``` ```{note} -The ``2ha.sh start_service`` command used by the service wrapper automatically +The ``two-node-ha.sh start_service`` command used by the service wrapper automatically detects the expected Dqlite role based on the DRBD state and takes the necessary steps to bootstrap the Dqlite state directories, synchronize with the peer node (if available) and recover the database. ``` -We need the ``2ha_k8s`` service to be restarted once a DRBD failover occurs. -For that, we are going to define a separate service that will be invoked by -Pacemaker. Create a file called -``/etc/systemd/system/2ha_k8s_failover.service`` containing the following: +We need the ``two-node-ha-k8s`` service to be restarted once a DRBD failover +occurs. For that, we are going to define a separate service that will be +invoked by Pacemaker. Create a file called +``/etc/systemd/system/two-node-ha-k8s-failover.service`` containing the following: ``` [Unit] -Description=Managed by Pacemaker, restarts 2ha_k8s on failover. +Description=Managed by Pacemaker, restarts two-node-ha-k8s on failover. After=network.target home-ubuntu-workspace.mount [Service] Type=oneshot -ExecStart=systemctl restart 2ha_k8s +ExecStart=systemctl restart two-node-ha-k8s RemainAfterExit=true ``` -Reload the systemd configuration and set ``2ha_k8s`` to start automatically. -Notice that ``2ha_k8s_failover`` must not be configured to start automatically, -but instead is going to be managed through Pacemaker. +Reload the systemd configuration and set ``two-node-ha-k8s`` to start +automatically. Notice that ``two-node-ha-k8s-failover`` must not be configured +to start automatically, but instead is going to be managed through Pacemaker. ``` -sudo systemctl enable 2ha_k8s +sudo systemctl enable two-node-ha-k8s sudo systemctl daemon-reload ``` @@ -339,11 +339,11 @@ Make sure that both nodes have been configured using the above steps before moving forward. We can now define a new Pacemaker resource that will invoke the -``2ha_k8s_failover`` service when a DRBD failover occurs. +``two-node-ha-k8s-failover`` service when a DRBD failover occurs. ``` sudo crm configure < Date: Mon, 30 Sep 2024 09:53:02 +0300 Subject: [PATCH 5/9] s/2-node/two-node --- docs/src/snap/howto/two-node-ha.md | 10 +++++----- k8s/hack/two-node-ha.sh | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/src/snap/howto/two-node-ha.md b/docs/src/snap/howto/two-node-ha.md index 7b84284f6..ee314025c 100644 --- a/docs/src/snap/howto/two-node-ha.md +++ b/docs/src/snap/howto/two-node-ha.md @@ -1,11 +1,11 @@ -# 2-Node Active-Active High-Availability using Dqlite +# Two-Node Active-Active High-Availability using Dqlite ## Rationale High availability (HA) is a mandatory requirement for most production-grade Kubernetes deployments, usually implying three or more nodes. 
-2-node HA clusters are sometimes preferred for cost savings and operational +Two-node HA clusters are sometimes preferred for cost savings and operational efficiency considerations. Follow this guide to learn how Canonical Kubernetes can achieve high availability with just two nodes while using the default datastore, Dqlite. @@ -48,8 +48,8 @@ Kubernetes can be configured to use other datastores. be used. * We recommend using static IP configuration. -The [two-node-ha.sh script] automates most operations related to the 2-node HA -scenario and is included in the snap. +The [two-node-ha.sh script] automates most operations related to the two-node +HA scenario and is included in the snap. The first step is to install the required packages: @@ -288,7 +288,7 @@ are met. ``` [Unit] -Description=K8s service wrapper handling Dqlite recovery for 2-node HA setups. +Description=K8s service wrapper handling Dqlite recovery for two-node HA setups. After=network.target pacemaker.service [Service] diff --git a/k8s/hack/two-node-ha.sh b/k8s/hack/two-node-ha.sh index 0e68e354d..8a2bad9a9 100755 --- a/k8s/hack/two-node-ha.sh +++ b/k8s/hack/two-node-ha.sh @@ -1,6 +1,6 @@ #!/bin/bash -# This script automates various operations on 2-node HA A-A Canonical K8s +# This script automates various operations on two-node HA A-A Canonical K8s # clusters that use the default datastore, Dqlite. # # Prerequisites: @@ -794,7 +794,7 @@ Commands: replacing them with symlinks. The existing contents are moved to a backup folder, which can be used as part of the recovery process. - install_packages Install the packages required by the 2-node HA + install_packages Install the packages required by the two-node HA cluster. start_service Initialize the k8s services, taking the following steps: From 8d690c3351cb2b22e677a727a15d0d9937602cda Mon Sep 17 00:00:00 2001 From: Lucian Petrut Date: Mon, 30 Sep 2024 10:33:05 +0300 Subject: [PATCH 6/9] Address comments --- docs/src/snap/howto/index.md | 2 +- docs/src/snap/howto/two-node-ha.md | 103 ++++++++++++++++------------- 2 files changed, 57 insertions(+), 48 deletions(-) diff --git a/docs/src/snap/howto/index.md b/docs/src/snap/howto/index.md index 295245127..817b5a247 100644 --- a/docs/src/snap/howto/index.md +++ b/docs/src/snap/howto/index.md @@ -22,7 +22,7 @@ proxy backup-restore refresh-certs restore-quorum -2-node-ha +two-node-ha epa contribute support diff --git a/docs/src/snap/howto/two-node-ha.md b/docs/src/snap/howto/two-node-ha.md index ee314025c..1ff25f005 100644 --- a/docs/src/snap/howto/two-node-ha.md +++ b/docs/src/snap/howto/two-node-ha.md @@ -1,17 +1,15 @@ # Two-Node Active-Active High-Availability using Dqlite -## Rationale - High availability (HA) is a mandatory requirement for most production-grade Kubernetes deployments, usually implying three or more nodes. Two-node HA clusters are sometimes preferred for cost savings and operational efficiency considerations. Follow this guide to learn how Canonical Kubernetes can achieve high availability with just two nodes while using the default -datastore, Dqlite. +datastore, [Dqlite]. -Dqlite cannot achieve Raft quorum with less than three nodes. This means that -Dqlite will not be able to replicate data and the secondaries will simply +Dqlite cannot achieve a [Raft] quorum with fewer than three nodes. This means +that Dqlite will not be able to replicate data and the secondaries will simply forward the queries to the primary node. 
In the event of a node failure, database recovery will require following the @@ -23,25 +21,15 @@ Since Dqlite data replication is not available in this situation, we propose using synchronous block level replication through [Distributed Replicated Block Device] (DRBD). -The cluster monitoring and failover process will be handled by Pacemaker and -Corosync. After a node failure, the DRBD volume will be mounted on the standby -node, allowing access to the latest Dqlite database version. +The cluster monitoring and failover process will be handled by [Pacemaker] and +[Corosync]. After a node failure, the DRBD volume will be mounted on the +standby node, allowing access to the latest Dqlite database version. Additional recovery steps are automated and invoked through Pacemaker. -## Alternatives - -Another possible approach is to use PostgreSQL with Kine and logical -replication. However, it is outside the scope of this document. - -See the [external datastore guide] for more information on how Canonical -Kubernetes can be configured to use other datastores. - -## Guide +### Prerequisites: -### Prerequisites - -* Ensure both nodes are part of the Kubernetes cluster. +* Please ensure that both nodes are part of the Kubernetes cluster. See the [getting started] and [add/remove nodes] guides. * The user associated with the HA service has SSH access to the peer node and passwordless sudo configured. For simplicity, the default "ubuntu" user can @@ -57,10 +45,9 @@ The first step is to install the required packages: /snap/k8s/current/k8s/hack/two-node-ha.sh install_packages ``` -### DRBD +### Distributed Replicated Block Device (DRBD) -For the purpose of this guide, we are going to use a loopback device as DRBD -backing storage: +This example uses a loopback device as DRBD backing storage: ``` sudo dd if=/dev/zero of=/opt/drbd0-backstore bs=1M count=2000 @@ -77,6 +64,12 @@ losetup /dev/lodrbd /opt/drbd0-backstore EOF sudo chmod +x /etc/rc.local +``` + +Add a service to automatically execute the ``/etc/rc.local`` script. + +``` + cat < @@ -268,11 +261,10 @@ sudo crm resource move fs_res sudo crm resource clear fs_res ``` -### Kubernetes services +### Managing Kubernetes Snap Services -We can now turn our attention to the Kubernetes services. Ensure that the k8s -snap services no longer start automatically. Instead, they will be managed by a -wrapper service. +For the two-node HA setup k8s snap services should no longer start +automatically. Instead, they will be managed by a wrapper service. ``` for f in `sudo snap services k8s | awk 'NR>1 {print $1}'`; do @@ -281,10 +273,15 @@ for f in `sudo snap services k8s | awk 'NR>1 {print $1}'`; do done ``` +### Preparing the wrapper service + The next step is to define the wrapper service. Add the following to -``/etc/systemd/system/two-node-ha-k8s.service``. Note that the sample uses the -``ubuntu`` user, feel free to use a different one as long as the prerequisites +``/etc/systemd/system/two-node-ha-k8s.service``. + +```{note} +the sample uses the ``ubuntu`` user, feel free to use a different one as long as the prerequisites are met. +``` ``` [Unit] @@ -304,16 +301,17 @@ WantedBy=multi-user.target ``` ```{note} -The ``two-node-ha.sh start_service`` command used by the service wrapper automatically -detects the expected Dqlite role based on the DRBD state and takes the -necessary steps to bootstrap the Dqlite state directories, synchronize with the -peer node (if available) and recover the database. 
+The ``two-node-ha.sh start_service`` command used by the service wrapper +automatically detects the expected Dqlite role based on the DRBD state. +It then takes the necessary steps to bootstrap the Dqlite state directories, +synchronize with the peer node (if available) and recover the database. ``` -We need the ``two-node-ha-k8s`` service to be restarted once a DRBD failover -occurs. For that, we are going to define a separate service that will be -invoked by Pacemaker. Create a file called -``/etc/systemd/system/two-node-ha-k8s-failover.service`` containing the following: +When a DRBD failover occurs, the ``two-node-ha-k8s`` service needs to be +restarted. To accomplish this,, we are going to define a separate service that +will be invoked by Pacemaker. Create a file called +``/etc/systemd/system/two-node-ha-k8s-failover.service`` containing the +following: ``` [Unit] @@ -338,7 +336,9 @@ sudo systemctl daemon-reload Make sure that both nodes have been configured using the above steps before moving forward. -We can now define a new Pacemaker resource that will invoke the +### Automating the failover procedure + +Define a new Pacemaker resource that will invoke the ``two-node-ha-k8s-failover`` service when a DRBD failover occurs. ``` @@ -352,7 +352,8 @@ quit EOF ``` -The setup is ready, start the HA k8s service on both nodes: +Once the setup is complete on both nodes, start the two-node HA k8s service on +each node: ``` sudo systemctl start two-node-ha-k8s @@ -360,6 +361,9 @@ sudo systemctl start two-node-ha-k8s ## Troubleshooting +Here are some potential problems that may affect two-node HA clusters and how +to address them. + ### Dqlite recovery failing because of unexpected data segments Dqlite recovery may fail if there are data segments past the latest snapshot. @@ -376,7 +380,7 @@ Remove the offending segments and restart the ``two-node-ha-k8s`` service. ### DRBD split brain -The DRBD cluster may enter a split brain state and stop synchronizing. The +The DRBD cluster may enter a [split brain] state and stop synchronizing. The chances increase if fencing (stonith) is not enabled. 
 
 ```
@@ -408,10 +412,15 @@ sudo drbdadm connect r0
 ```
 
+[Dqlite]: https://dqlite.io/
+[Raft]: https://raft.github.io/
 [Distributed Replicated Block Device]: https://ubuntu.com/server/docs/distributed-replicated-block-device-drbd
 [Dqlite recovery guide]: restore-quorum
 [external datastore guide]: external-datastore
 [two-node-ha.sh script]: https://github.com/canonical/k8s-snap/blob/main/k8s/hack/two-node-ha.sh
 [getting started]: ../tutorial/getting-started
 [add/remove nodes]: ../tutorial/add-remove-nodes
+[Pacemaker]: https://clusterlabs.org/pacemaker/
+[Corosync]: https://clusterlabs.org/corosync.html
 [Pacemaker fencing]: https://clusterlabs.org/pacemaker/doc/2.1/Pacemaker_Explained/html/fencing.html
+[split brain]: https://en.wikipedia.org/wiki/Split-brain_(computing)

From 97c287bbadf60134f02616cbbd04fadf7e28e194 Mon Sep 17 00:00:00 2001
From: Lucian Petrut
Date: Mon, 30 Sep 2024 15:15:31 +0300
Subject: [PATCH 7/9] Remove empty lines and add separate note about the A-A cluster

---
 docs/src/snap/howto/two-node-ha.md | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/docs/src/snap/howto/two-node-ha.md b/docs/src/snap/howto/two-node-ha.md
index 1ff25f005..36ed7487d 100644
--- a/docs/src/snap/howto/two-node-ha.md
+++ b/docs/src/snap/howto/two-node-ha.md
@@ -1,4 +1,4 @@
-# Two-Node Active-Active High-Availability using Dqlite
+# Two-Node High-Availability with Dqlite
 
 High availability (HA) is a mandatory requirement for most production-grade
 Kubernetes deployments, usually implying three or more nodes.
@@ -6,7 +6,8 @@ Kubernetes deployments, usually implying three or more nodes.
 Two-node HA clusters are sometimes preferred for cost savings and operational
 efficiency considerations. Follow this guide to learn how Canonical Kubernetes
 can achieve high availability with just two nodes while using the default
-datastore, [Dqlite].
+datastore, [Dqlite]. Both nodes will be active members of the cluster, sharing
+the Kubernetes load.
 
 Dqlite cannot achieve a [Raft] quorum with fewer than three nodes. This means
 that Dqlite will not be able to replicate data and the secondaries will simply
@@ -69,8 +70,6 @@ sudo chmod +x /etc/rc.local
 Add a service to automatically execute the ``/etc/rc.local`` script.
 
 ```
-
-
 cat <
Date: Mon, 30 Sep 2024 16:33:03 +0300
Subject: [PATCH 8/9] Address comments

---
 docs/src/snap/howto/two-node-ha.md | 16 +++----
 k8s/hack/two-node-ha.sh            | 74 +++++++++++++++---------
 2 files changed, 45 insertions(+), 45 deletions(-)

diff --git a/docs/src/snap/howto/two-node-ha.md b/docs/src/snap/howto/two-node-ha.md
index 36ed7487d..8c750cfbe 100644
--- a/docs/src/snap/howto/two-node-ha.md
+++ b/docs/src/snap/howto/two-node-ha.md
@@ -126,8 +126,8 @@ sudo drbdadm status
 ```
 
 Create a mount point for the DRBD block device. Non-default mount points
-need to be passed to the ``two-node-ha.sh`` script mentioned above, see the
-script for the full list of configurable parameters.
+need to be passed to the ``two-node-ha.sh`` script mentioned above. Please
+refer to the script for the full list of configurable parameters.
 
 ```
 DRBD_MOUNT_DIR=/mnt/drbd0
@@ -214,9 +214,10 @@ Let's define a Pacemaker resource for the DRBD block device, which ensures
 that the block device will be mounted on the replica in case of a primary node
 failure.
 
-[Pacemaker fencing] (stonith) configuration is environment specific and thus
-outside the scope of this guide. Using fencing is highly recommended if it is
-possible to reduce the risk of cluster split-brain situations.
+[Pacemaker fencing] (Shoot The Other Node In The Head - STONITH) configuration
+is environment specific and thus outside the scope of this guide. Using fencing
+is highly recommended, if it is possible, to reduce the risk of cluster
+split-brain situations.
 
 ```
 HAONE_ADDR=
@@ -262,7 +263,7 @@ sudo crm resource clear fs_res
 
 ### Managing Kubernetes Snap Services
 
-For the two-node HA setup k8s snap services should no longer start
+For the two-node HA setup, k8s snap services should no longer start
 automatically. Instead, they will be managed by a wrapper service.
 
 ```
@@ -307,7 +308,7 @@ synchronize with the peer node (if available) and recover the database.
 ```
 
 When a DRBD failover occurs, the ``two-node-ha-k8s`` service needs to be
-restarted. To accomplish this,, we are going to define a separate service that
+restarted. To accomplish this, we are going to define a separate service that
 will be invoked by Pacemaker. Create a file called
 ``/etc/systemd/system/two-node-ha-k8s-failover.service`` containing the
 following:
@@ -395,7 +396,6 @@ srcversion: C7B8F7076B8D6DB066D84D9
 
 ubuntu@hatwo:~$ dmesg | grep "Split"
 [ +0.000082] block drbd0: Split-Brain detected but unresolved, dropping connection!
-
 ```
 
 To recover DRBD, use following procedure:
diff --git a/k8s/hack/two-node-ha.sh b/k8s/hack/two-node-ha.sh
index 8a2bad9a9..f3c4506e9 100755
--- a/k8s/hack/two-node-ha.sh
+++ b/k8s/hack/two-node-ha.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
-# This script automates various operations on two-node HA A-A Canonical K8s
-# clusters that use the default datastore, Dqlite.
+# This script automates various operations on two-node HA Active-Active
+# Canonical K8s clusters that use the default datastore, Dqlite.
 #
 # Prerequisites:
 # * required packages installed using the "install_packages" command.
@@ -87,8 +87,8 @@ function get_dqlite_node_role() {
 
 function get_dqlite_role_from_cluster_yaml() {
     # Note that the cluster.yaml role may not match the info.yaml role.
-    # In case of a freshly joined node, info.yaml will have "voter" role
-    # while cluster.yaml has "spare" role.
+    # In case of a freshly joined node, info.yaml will show it as a "voter"
+    # while cluster.yaml lists it as a "spare" node.
     local clusterYamlPath=$1
     local nodeId=$2
 
@@ -99,7 +99,7 @@ function get_dqlite_role_from_cluster_yaml() {
 
 function set_dqlite_node_role() {
     # The yq snap installs in confined mode, so it's unable to access the
-    # dqlite config files.
+    # Dqlite config files.
     # In order to modify files in-place, we're using sponge. It reads all
     # the stdin data before opening the output file.
     local infoYamlPath=$1
@@ -146,7 +146,7 @@ function get_dql_peer_ip() {
     echo ${addresses[0]} | cut -d ":" -f 1
 }
 
-# This function moves the dqlite state directories to the DRBD mount,
+# This function moves the Dqlite state directories to the DRBD mount,
 # replacing them with symlinks. This ensures that the primary will always use
 # the latest DRBD data.
 #
@@ -156,7 +156,7 @@ function move_statedirs() {
     sudo mkdir -p $DRBD_MOUNT_DIR/k8s-dqlite
     sudo mkdir -p $DRBD_MOUNT_DIR/k8sd
 
-    log_message "Validating dqlite state directories."
+    log_message "Validating Dqlite state directories."
     check_statedir $K8S_DQLITE_STATE_DIR $DRBD_MOUNT_DIR/k8s-dqlite
     check_statedir $K8SD_STATE_DIR $DRBD_MOUNT_DIR/k8sd
 
@@ -177,7 +177,7 @@ function move_statedirs() {
        # TODO: consider automating this. We may move the pacemaker resource
        # ourselves and maybe even copy the remote files through scp or ssh.
        # However, there's a risk of race conditions.
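# Illustrative sketch only (not part of the patch above): the state directory
# relocation that move_statedirs performs boils down to copying the contents
# onto the DRBD mount, keeping the originals as a backup and leaving a
# symlink behind, e.g.:
#
#   sudo cp -a "$K8S_DQLITE_STATE_DIR/." "$DRBD_MOUNT_DIR/k8s-dqlite/"
#   sudo mv "$K8S_DQLITE_STATE_DIR" "$K8S_DQLITE_STATE_DIR.bkp"
#   sudo ln -s "$DRBD_MOUNT_DIR/k8s-dqlite" "$K8S_DQLITE_STATE_DIR"
#
# The backup copy is what the recovery flow later relies on.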
-        log_message "DRBD volume mounted on replica, refusing to transfer dqlite files."
+        log_message "DRBD volume mounted on replica, refusing to transfer Dqlite files."
        log_message "Move the DRBD volume to the primary node (through the fs_res Pacemaker resource) and try again."
        log_message "Example: sudo crm resource move fs_res && sudo crm resource clear fs_res"
        exit 1
@@ -261,7 +261,7 @@ function ensure_drbd_unmounted() {
 }
 
 function ensure_drbd_ready() {
-    ensure_mount_rw 
+    ensure_mount_rw
 
     diskStatus=`sudo drbdadm status r0 | grep disk | head -1 | cut -d ":" -f 2`
     if [[ $diskStatus != "UpToDate" ]]; then
@@ -303,7 +303,7 @@ function wait_for_peer_k8s() {
 
     local peerIp=`get_dql_peer_ip $K8S_DQLITE_CLUSTER_BKP_YAML $k8sDqliteNodeId`
     if [[ -z $peerIp ]]; then
-        log_message "Couldn't retrieve dqlite peer ip."
+        log_message "Couldn't retrieve Dqlite peer ip."
        exit 1
     fi
 
@@ -356,8 +356,8 @@ function wait_drbd_resource () {
     return 1
 }
 
-# Based on the drbd volume state, we decide if this node should be a
-# dqlite voter or a spare.
+# Based on the DRBD volume state, we decide if this node should be a
+# Dqlite voter or a spare.
 function get_expected_dqlite_role() {
     drbdResRole=`sudo drbdadm status $DRBD_RES_NAME | head -1 | grep role | cut -d ":" -f 2`
 
@@ -397,7 +397,7 @@ function validate_drbd_state() {
 # After a failover, the state dir points to the shared DRBD volume.
 # We need to restore the node certificate and config files.
 function restore_dqlite_confs_and_certs() {
-    log_message "Restoring dqlite configs and certificates."
+    log_message "Restoring Dqlite configs and certificates."
 
     sudo cp $K8S_DQLITE_STATE_BKP_DIR/info.yaml $K8S_DQLITE_STATE_DIR
 
@@ -431,7 +431,7 @@ function promote_as_primary() {
 
     local peerIp=`get_dql_peer_ip $K8S_DQLITE_CLUSTER_YAML $k8sDqliteNodeId`
     if [[ -z $peerIp ]]; then
-        log_message "Couldn't retrieve dqlite peer ip."
+        log_message "Couldn't retrieve Dqlite peer ip."
        exit 1
     fi
 
@@ -457,17 +457,17 @@ function promote_as_primary() {
            log_message "The stopped services are going to be restarted after the recovery finishes."
        else
            log_message "Couldn't stop k8s services on the peer node." \
-                "Assuming that it's stopped and proceeding with the recovery."
+                "Assuming that the peer node is stopped and proceeding with the recovery."
        fi
     fi
 
     log_message "Ensuring rw access to DRBD mount."
-    # Having RW access to the drbd mount implies that this is the primary node.
+    # Having RW access to the DRBD mount implies that this is the primary node.
     ensure_mount_rw
 
     restore_dqlite_confs_and_certs
 
-    log_message "Updating dqlite roles."
+    log_message "Updating Dqlite roles."
     # Update info.yaml
     set_dqlite_node_role $K8S_DQLITE_INFO_YAML $DQLITE_ROLE_VOTER
     set_dqlite_node_role $K8SD_INFO_YAML $DQLITE_ROLE_VOTER
@@ -476,7 +476,7 @@ function promote_as_primary() {
     set_dqlite_node_as_sole_voter $K8S_DQLITE_CLUSTER_YAML $k8sDqliteNodeId
     set_dqlite_node_as_sole_voter $K8SD_CLUSTER_YAML $k8sdNodeId
 
-    log_message "Restoring dqlite."
+    log_message "Restoring Dqlite."
     sudo $K8SD_PATH cluster-recover \
        --state-dir=$K8SD_STATE_DIR \
        --k8s-dqlite-state-dir=$K8S_DQLITE_STATE_DIR \
@@ -506,10 +506,10 @@ function promote_as_primary() {
 function process_recovery_files_on_secondary() {
     local peerIp="$1"
 
-    log_message "Ensuring that the drbd volume is unmounted."
+    log_message "Ensuring that the DRBD volume is unmounted."
     ensure_drbd_unmounted
 
-    log_message "Restoring local dqlite backup files."
+    log_message "Restoring local Dqlite backup files."
     sudo cp -r $K8S_DQLITE_STATE_BKP_DIR/. $DRBD_MOUNT_DIR/k8s-dqlite/
     sudo cp -r $K8SD_STATE_BKP_DIR/. $DRBD_MOUNT_DIR/k8sd/
 
@@ -542,7 +542,7 @@ function process_recovery_files_on_secondary() {
     scp $SSH_USERNAME@$peerIp:/var/snap/k8s/common/$lastK8sDqliteRecoveryTarball /tmp/
     sudo tar -xf /tmp/$lastK8sDqliteRecoveryTarball -C $K8S_DQLITE_STATE_DIR
 
-    log_message "Updating dqlite roles."
+    log_message "Updating Dqlite roles."
     # Update info.yaml
     set_dqlite_node_role $K8S_DQLITE_INFO_YAML $DQLITE_ROLE_SPARE
     set_dqlite_node_role $K8SD_INFO_YAML $DQLITE_ROLE_SPARE
     # updated cluster.yaml files.
 }
 
-# Recover a former primary, now secondary dqlite node.
+# Recover a former primary, now secondary Dqlite node.
 # Run "promote_as_primary" on the ther node first.
 function rejoin_secondary() {
     log_message "Recovering secondary node."
@@ -563,7 +563,7 @@ function rejoin_secondary() {
 
     local peerIp=`get_dql_peer_ip $K8S_DQLITE_CLUSTER_BKP_YAML $k8sDqliteNodeId`
     if [[ -z $peerIp ]]; then
-        log_message "Couldn't retrieve dqlite peer ip."
+        log_message "Couldn't retrieve Dqlite peer ip."
        exit 1
     fi
 
@@ -572,7 +572,7 @@ function rejoin_secondary() {
 
     log_message "Adding temporary Pacemaker constraint."
     # We need to prevent failovers from happening while restoring secondary
-    # dqlite data, otherwise we may end up overriding or deleting the primary
+    # Dqlite data, otherwise we may end up overwriting or deleting the primary
     # node data.
     #
     # TODO: consider reducing the constraint scope (e.g. resource level constraint
@@ -641,10 +641,10 @@ function check_peer_recovery_tarballs() {
        exit 1
     fi
 
-    log_message "Retrieving dqlite peer ip."
+    log_message "Retrieving Dqlite peer ip."
     local peerIp=`get_dql_peer_ip $K8S_DQLITE_CLUSTER_BKP_YAML $k8sDqliteNodeId`
     if [[ -z $peerIp ]]; then
-        log_message "Couldn't retrieve dqlite peer ip."
+        log_message "Couldn't retrieve Dqlite peer ip."
        exit 1
     fi
 
@@ -669,11 +669,11 @@ function check_peer_recovery_tarballs() {
 function start_service() {
     log_message "Initializing node."
 
-    # DRBD is the primary source of truth for the dqlite role.
+    # DRBD is the primary source of truth for the Dqlite role.
     # We need to wait for it to become available.
     wait_drbd_resource
 
-    # dump the drbd and pacemaker status for debugging purposes.
+    # dump the DRBD and pacemaker status for debugging purposes.
     sudo drbdadm status
     sudo crm status
 
@@ -684,14 +684,14 @@ function start_service() {
     local expRole=`get_expected_dqlite_role`
     case $expRole in
        $DQLITE_ROLE_VOTER)
-            log_message "Assuming the dqlite voter role (primary)."
+            log_message "Assuming the Dqlite voter role (primary)."
 
            # We'll assume that if the primary stopped, it needs to go through
            # the recovery process.
            promote_as_primary
            ;;
        $DQLITE_ROLE_SPARE)
-            log_message "Assuming the dqlite spare role (secondary)."
+            log_message "Assuming the Dqlite spare role (secondary)."
 
            wait_for_peer_k8s
 
@@ -709,14 +709,14 @@ function start_service() {
            fi
            ;;
        *)
-            log_message "Unexpected dqlite role: $expRole"
+            log_message "Unexpected Dqlite role: $expRole"
            exit 1
            ;;
     esac
 }
 
 function clean_recovery_data() {
-    log_message "Cleaning up dqlite recovery data."
+    log_message "Cleaning up Dqlite recovery data."
     rm -f $K8SD_RECOVERY_TARBALL
     rm -f $K8SD_RECOVERY_TARBALL_BKP
     rm -f $K8S_DQLITE_STATE_DIR/recovery-k8s-dqlite*
@@ -738,7 +738,7 @@ function purge() {
     # The replicas use the mount dir directly, without a block device
     # attachment. We need to clean up the mount point as well.
     #
-    # We're using another mount with "--bind" to bypass the drbd mount.
+    # We're using another mount with "--bind" to bypass the DRBD mount.
     tempdir=`mktemp -d`
     # We need to mount the parent dir.
     sudo mount --bind `dirname $DRBD_MOUNT_DIR` $tempdir
@@ -790,7 +790,7 @@ Unknown command: $1
 usage: $0
 
 Commands:
-    move_statedirs      Move the dqlite state directories to the DRBD mount,
+    move_statedirs      Move the Dqlite state directories to the DRBD mount,
                        replacing them with symlinks. The existing contents
                        are moved to a backup folder, which can be used as
                        part of the recovery process.
@@ -798,13 +798,13 @@ Commands:
                        cluster.
     start_service       Initialize the k8s services, taking the following steps:
-                        1. Based on the drbd state, decide if this node
+                        1. Based on the DRBD state, decide if this node
                           should assume the primary (dqlite voter) or
                           secondary (spare) role.
-                        2. If this is the first start, transfer the dqlite
+                        2. If this is the first start, transfer the Dqlite
                           state directories and create backups.
                        3. If this node is a primary, promote it and initiate
-                           the dqlite recovery, creating recovery tarballs.
+                           the Dqlite recovery, creating recovery tarballs.
                           Otherwise, copy over the recovery files and join
                           the existing cluster as a spare.
                        4. Start the k8s services.

From 113652413483563eea79275d0b4ee285b76a3e3a Mon Sep 17 00:00:00 2001
From: Lucian Petrut
Date: Tue, 1 Oct 2024 11:30:04 +0300
Subject: [PATCH 9/9] Add warning to troubleshooting section

---
 docs/src/snap/howto/two-node-ha.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/src/snap/howto/two-node-ha.md b/docs/src/snap/howto/two-node-ha.md
index 8c750cfbe..cdd894a79 100644
--- a/docs/src/snap/howto/two-node-ha.md
+++ b/docs/src/snap/howto/two-node-ha.md
@@ -364,6 +364,11 @@ sudo systemctl start two-node-ha-k8s
 Here are some potential problems that may affect two-node HA clusters and how
 to address them.
 
+```{warning}
+Before taking any of the actions below, please back up the entire Dqlite data
+directory to avoid losing data in case something goes wrong.
+```
+
 ### Dqlite recovery failing because of unexpected data segments
 
 Dqlite recovery may fail if there are data segments past the latest snapshot.
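For reference, a minimal backup sketch before attempting any recovery steps
(the paths are assumptions based on the default k8s snap state directories;
adjust them to match your deployment):

```
# Stop the wrapper service so the databases are not modified mid-backup.
sudo systemctl stop two-node-ha-k8s

# --dereference follows the symlinks created by the HA script, so the actual
# database files on the DRBD mount are included in the archive.
sudo tar --dereference -czf /root/dqlite-backup-$(date +%Y%m%d%H%M%S).tar.gz \
    /var/snap/k8s/common/var/lib/k8s-dqlite \
    /var/snap/k8s/common/var/lib/k8sd/state
```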