From 3ab1cec3804a5a4f8510a159b45e76af0f58aa19 Mon Sep 17 00:00:00 2001 From: Lucian Petrut Date: Mon, 23 Sep 2024 13:35:54 +0000 Subject: [PATCH 1/9] Add "2ha.sh" script, managing 2-node Canonical K8s HA AA clusters Scenario overview: * Canonical K8s cluster containing 2 nodes * Dqlite data store (unable to obtain quorum) * Primary node dqlite files stored on DRBD * sync block-level replication between the two nodes * cluster monitoring and failover handled through Pacemaker Script functionality: * boostrap the service * wait for a DRBD primary to be elected * detect the node role based on the DRBD status and Dqlite state * have the replica wait for the primary to be ready before continuing * recover Dqlite after failovers * transfer and apply recovery files to secondary nodes * transfer Dqlite files to DRBD and other backup locations, creating necessary symlinks * install required packages * purge all K8s data * clear Pacemaker taints * remove recovery data "2ha.sh start_service" is intended to be used as part of a systemd unit that bootstraps the k8s services, coordinating with the other node and taking any necessary steps to recover Dqlite. --- k8s/hack/2ha.sh | 823 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 823 insertions(+) create mode 100755 k8s/hack/2ha.sh diff --git a/k8s/hack/2ha.sh b/k8s/hack/2ha.sh new file mode 100755 index 000000000..4df08f194 --- /dev/null +++ b/k8s/hack/2ha.sh @@ -0,0 +1,823 @@ +#!/bin/bash + +# Prerequisites: +# * required packages installed using the "install_packages" command. +# * initialized k8s cluster, both nodes joined +# * the current user has ssh access to the peer node. +# - used to handle k8s services and transfer dqlite data +# * the current user has passwordless sudo enabled. +sourced=0 + +DEBUG=${DEBUG:-0} +if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then + sourced=1 +else + sourced=0 + set -eEu -o pipefail + + if [[ $DEBUG -eq 1 ]]; then + export PS4='+(${BASH_SOURCE}:${LINENO}): ${FUNCNAME[0]:+${FUNCNAME[0]}(): }' + set -x + fi +fi + +SYSTEMD_SERVICE_NAME=${SYSTEMD_SERVICE_NAME:-"2ha_k8s"} +DRBD_MOUNT_DIR=${DRBD_MOUNT_DIR:-"/mnt/drbd0"} +SSH_USERNAME=${SSH_USERNAME:-"ubuntu"} +SSH_OPTS=${SSH_OPTS:-"-o StrictHostKeyChecking=no -o ConnectTimeout=5"} +K8SD_LOG_LEVEL=${K8SD_LOG_LEVEL:-"0"} +K8S_SNAP_CHANNEL=${K8S_SNAP_CHANNEL:-"latest/edge"} +DRBD_RES_NAME=${DRBD_RES_NAME:-"r0"} +DRBD_READY_TIMEOUT=${DRBD_READY_TIMEOUT:-30} +PEER_READY_TIMEOUT=${PEER_READY_TIMEOUT:-60} + +K8SD_PATH=${K8SD_PATH:-/snap/k8s/current/bin/k8sd} + +K8S_DQLITE_STATE_DIR=/var/snap/k8s/common/var/lib/k8s-dqlite +K8SD_STATE_DIR="/var/snap/k8s/common/var/lib/k8sd/state" + +K8S_DQLITE_STATE_BKP_DIR=/var/snap/k8s/common/var/lib/k8s-dqlite.bkp +K8SD_STATE_BKP_DIR="/var/snap/k8s/common/var/lib/k8sd/state.bkp" + +K8S_DQLITE_INFO_YAML="$K8S_DQLITE_STATE_DIR/info.yaml" +K8S_DQLITE_CLUSTER_YAML="$K8S_DQLITE_STATE_DIR/cluster.yaml" + +K8SD_INFO_YAML="$K8SD_STATE_DIR/database/info.yaml" +K8SD_CLUSTER_YAML="$K8SD_STATE_DIR/database/cluster.yaml" + +# Backup yamls are expected to contain the right node ids and +# addresses while the DRBD files may contain settings from the other node +# and have to be updated. 
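+#
+# For reference, the yq queries in this script assume info.yaml and
+# cluster.yaml layouts roughly like the sketch below (the IDs and addresses
+# are made-up examples, not values taken from a real cluster):
+#
+#   info.yaml:
+#     Address: 10.0.0.1:9000
+#     ID: "3297041220608546238"
+#     Role: 0
+#
+#   cluster.yaml:
+#     - Address: 10.0.0.1:9000
+#       ID: "3297041220608546238"
+#       Role: 0
+#     - Address: 10.0.0.2:9000
+#       ID: "8770122046085462383"
+#       Role: 2
+#
+# The numeric Role values map to the DQLITE_ROLE_* constants defined below
+# (0 = voter, 1 = standby, 2 = spare).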
+K8S_DQLITE_INFO_BKP_YAML="$K8S_DQLITE_STATE_BKP_DIR/info.yaml" +K8S_DQLITE_CLUSTER_BKP_YAML="$K8S_DQLITE_STATE_BKP_DIR/cluster.yaml" +K8SD_INFO_BKP_YAML="$K8SD_STATE_BKP_DIR/database/info.yaml" +K8SD_CLUSTER_BKP_YAML="$K8SD_STATE_BKP_DIR/database/cluster.yaml" + +K8SD_RECOVERY_TARBALL="$K8SD_STATE_DIR/recovery_db.tar.gz" +# K8SD will remove this file upon starting. We need to create a backup that +# can be transferred to other nodes. +K8SD_RECOVERY_TARBALL_BKP="$K8SD_STATE_DIR/recovery_db.bkp.tar.gz" + +DQLITE_ROLE_VOTER=0 +DQLITE_ROLE_STANDBY=1 +DQLITE_ROLE_SPARE=2 + +function log_message () { + local msg="[$(date -uIseconds)] $@" + >&2 echo -e "$msg" +} + +function get_dqlite_node_id() { + local infoYamlPath=$1 + sudo cat $infoYamlPath | yq -r '.ID' +} + +function get_dqlite_node_addr() { + local infoYamlPath=$1 + sudo cat $infoYamlPath | yq -r '.Address' +} + +function get_dqlite_node_role() { + local infoYamlPath=$1 + sudo cat $infoYamlPath | yq -r '.Role' +} + +function get_dqlite_role_from_cluster_yaml() { + # Note that the cluster.yaml role may not match the info.yaml role. + # In case of a freshly joined node, info.yaml will have "voter" role + # while cluster.yaml has "spare" role. + local clusterYamlPath=$1 + local nodeId=$2 + + # Update the specified node. + sudo cat $clusterYamlPath | \ + yq -r "(.[] | select(.ID == \"$nodeId\") | .Role )" +} + +function set_dqlite_node_role() { + # The yq snap installs in confined mode, so it's unable to access the + # dqlite config files. + # In order to modify files in-place, we're using sponge. It reads all + # the stdin data before opening the output file. + local infoYamlPath=$1 + local role=$2 + sudo cat $infoYamlPath | \ + yq ".Role = $role" | + sudo sponge $infoYamlPath +} + +# Update cluster.yaml, setting the specified node as voter (role = 0). +# The other nodes will become spares, having the role set to 2. +function set_dqlite_node_as_sole_voter() { + local clusterYamlPath=$1 + local nodeId=$2 + + # Update the specified node. + sudo cat $clusterYamlPath | \ + yq "(.[] | select(.ID == \"$nodeId\") | .Role ) = 0" | \ + sudo sponge $clusterYamlPath + + # Update the other nodes. + sudo cat $clusterYamlPath | \ + yq "(.[] | select(.ID != \"$nodeId\") | .Role ) = 2" | \ + sudo sponge $clusterYamlPath +} + +function get_dql_peer_ip() { + local clusterYamlPath=$1 + local nodeId=$2 + + local addresses=( $(sudo cat $clusterYamlPath | \ + yq "(.[] | select(.ID != \"$nodeId\") | .Address )") ) + + if [[ ${#addresses[@]} -gt 1 ]]; then + log_message "More than one dql peers found: ${addresses[@]}" + exit 1 + fi + + if [[ ${#addresses[@]} -lt 1 ]]; then + log_message "No dql peers found." + exit 1 + fi + + echo ${addresses[0]} | cut -d ":" -f 1 +} + +# This function moves the dqlite state directories to the DRBD mount, +# replacing them with symlinks. This ensures that the primary will always use +# the latest DRBD data. +# +# The existing contents are moved to a backup folder, which can be used as +# part of the recovery process. +function move_statedirs() { + sudo mkdir -p $DRBD_MOUNT_DIR/k8s-dqlite + sudo mkdir -p $DRBD_MOUNT_DIR/k8sd + + log_message "Validating dqlite state directories." + check_statedir $K8S_DQLITE_STATE_DIR $DRBD_MOUNT_DIR/k8s-dqlite + check_statedir $K8SD_STATE_DIR $DRBD_MOUNT_DIR/k8sd + + if [[ ! -L $K8S_DQLITE_STATE_DIR ]] || [[ ! -L $K8SD_STATE_DIR ]]; then + local k8sDqliteNodeId=`get_dqlite_node_id $K8S_DQLITE_INFO_YAML` + if [[ -z $k8sDqliteNodeId ]]; then + log_message "Couldn't retrieve k8s-dqlite node id." 
+ exit 1 + fi + + + local expRole=`get_expected_dqlite_role` + # For fresh k8s clusters, the info.yaml role may not match the cluster.yaml role. + local k8sDqliteRole=`get_dqlite_role_from_cluster_yaml \ + $K8S_DQLITE_CLUSTER_YAML $k8sDqliteNodeId` + + if [[ $expRole -ne $k8sDqliteRole ]]; then + # TODO: consider automating this. We may move the pacemaker resource + # ourselves and maybe even copy the remote files through scp or ssh. + # However, there's a risk of race conditions. + log_message "DRBD volume mounted on replica, refusing to transfer dqlite files." + log_message "Move the DRBD volume to the primary node (through the fs_res Pacemaker resource) and try again." + log_message "Example: sudo crm resource move fs_res && sudo crm resource clear fs_res" + exit 1 + fi + fi + + # Ensure that the k8s services are stopped. + log_message "Stopping k8s services." + sudo snap stop k8s + + if [[ ! -L $K8S_DQLITE_STATE_DIR ]]; then + log_message "Not a symlink: $K8S_DQLITE_STATE_DIR, " \ + "transferring to $DRBD_MOUNT_DIR/k8s-dqlite" + sudo cp -r $K8S_DQLITE_STATE_DIR/. $DRBD_MOUNT_DIR/k8s-dqlite + + log_message "Creating k8s-dqlite state dir backup: $K8S_DQLITE_STATE_BKP_DIR" + sudo rm -rf $K8S_DQLITE_STATE_BKP_DIR + sudo mv $K8S_DQLITE_STATE_DIR/ $K8S_DQLITE_STATE_BKP_DIR + + log_message "Creating symlink $K8S_DQLITE_STATE_DIR -> $DRBD_MOUNT_DIR/k8s-dqlite" + sudo ln -sf $DRBD_MOUNT_DIR/k8s-dqlite $K8S_DQLITE_STATE_DIR + else + log_message "Symlink $K8S_DQLITE_STATE_DIR points to $DRBD_MOUNT_DIR/k8s-dqlite" + fi + + if [[ ! -L $K8SD_STATE_DIR ]]; then + log_message "Not a symlink: $K8SD_STATE_DIR, " \ + "transferring to $DRBD_MOUNT_DIR/k8sd" + sudo cp -r $K8SD_STATE_DIR/. $DRBD_MOUNT_DIR/k8sd + + log_message "Creating k8sd state dir backup: $K8SD_STATE_BKP_DIR" + sudo rm -rf $K8SD_STATE_BKP_DIR + sudo mv $K8SD_STATE_DIR/ $K8SD_STATE_BKP_DIR + + log_message "Creating symlink $K8SD_STATE_DIR -> $DRBD_MOUNT_DIR/k8sd" + sudo ln -sf $DRBD_MOUNT_DIR/k8sd $K8SD_STATE_DIR + else + log_message "Symlink $K8SD_STATE_DIR points to $DRBD_MOUNT_DIR/k8sd" + fi +} + +function ensure_mount_rw() { + if ! mount | grep "on $DRBD_MOUNT_DIR type" &> /dev/null; then + log_message "Missing DRBD mount: $DRBD_MOUNT_DIR" + return 1 + fi + + if ! mount | grep "on $DRBD_MOUNT_DIR type" | grep "rw" &> /dev/null; then + log_message "DRBD mount read-only: $DRBD_MOUNT_DIR" + return 1 + fi +} + +function wait_drbd_promoted() { + log_message "Waiting for one of the DRBD nodes to be promoted." + + local pollInterval=2 + # Special parameter, no need to increase it ourselves. + SECONDS=0 + + while [[ $SECONDS -lt $DRBD_READY_TIMEOUT ]]; do + if sudo crm resource status drbd_master_slave | grep Promoted ; then + log_message "DRBD node promoted." + return 0 + else + log_message "No DRBD node promoted yet, retrying in ${pollInterval}s" + sleep $pollInterval + fi + done + + log_message "Timed out waiting for primary DRBD node." \ + "Waited: ${SECONDS}. Timeout: ${DRBD_READY_TIMEOUT}s." + return 1 +} + +function ensure_drbd_unmounted() { + if mount | grep "on $DRBD_MOUNT_DIR type" &> /dev/null ; then + log_message "DRBD device mounted: $DRBD_MOUNT_DIR" + return 1 + fi +} + +function ensure_drbd_ready() { + ensure_mount_rw + + diskStatus=`sudo drbdadm status r0 | grep disk | head -1 | cut -d ":" -f 2` + if [[ $diskStatus != "UpToDate" ]]; then + log_message "DRBD disk status not ready. Current status: $diskStatus" + return 1 + else + log_message "DRBD disk up to date." 
+ fi +} + +function wait_drbd_primary () { + log_message "Waiting for primary DRBD node to be ready." + + local pollInterval=2 + # Special parameter, no need to increase it ourselves. + SECONDS=0 + + while [[ $SECONDS -lt $DRBD_READY_TIMEOUT ]]; do + if ensure_drbd_ready; then + log_message "Primary DRBD node ready." + return 0 + else + log_message "Primary DRBD node not ready yet, retrying in ${pollInterval}s" + sleep $pollInterval + fi + done + + log_message "Timed out waiting for primary DRBD node." \ + "Waited: ${SECONDS}. Timeout: ${DRBD_READY_TIMEOUT}s." + return 1 +} + +function wait_for_peer_k8s() { + local k8sDqliteNodeId=`get_dqlite_node_id $K8S_DQLITE_INFO_BKP_YAML` + if [[ -z $k8sDqliteNodeId ]]; then + log_message "Couldn't retrieve k8s-dqlite node id." + exit 1 + fi + + local peerIp=`get_dql_peer_ip $K8S_DQLITE_CLUSTER_BKP_YAML $k8sDqliteNodeId` + if [[ -z $peerIp ]]; then + log_message "Couldn't retrieve dqlite peer ip." + exit 1 + fi + + log_message "Waiting for k8s to start on peer: $peerIp. Timeout: ${PEER_READY_TIMEOUT}s." + + local pollInterval=2 + # Special parameter, no need to increase it ourselves. + SECONDS=0 + + while [[ $SECONDS -lt $PEER_READY_TIMEOUT ]]; do + if ssh $SSH_OPTS $SSH_USERNAME@$peerIp sudo k8s status &> /dev/null; then + log_message "Peer ready." + return 0 + else + log_message "Peer not ready yet, retrying in ${pollInterval}s." + sleep $pollInterval + fi + done + + log_message "Timed out waiting for k8s services to start on peer." \ + "Waited: ${SECONDS}. Timeout: ${PEER_READY_TIMEOUT}s." + return 1 + +} + +# "drbdadm status" throws the following if our service starts before +# Pacemaker initialized DRBD (even on the secondary). +# +# r0: No such resource +# Command 'drbdsetup-84 status r0' terminated with exit code 10 +function wait_drbd_resource () { + log_message "Waiting for DRBD resource." + + local pollInterval=2 + # Special parameter, no need to increase it ourselves. + SECONDS=0 + + while [[ $SECONDS -lt $DRBD_READY_TIMEOUT ]]; do + if sudo drbdadm status &> /dev/null; then + log_message "DRBD ready." + return 0 + else + log_message "DRBD not ready yet, retrying in ${pollInterval}s" + sleep $pollInterval + fi + done + + log_message "Timed out waiting for DRBD resource." \ + "Waited: ${SECONDS}. Timeout: ${DRBD_READY_TIMEOUT}s." + return 1 +} + +# Based on the drbd volume state, we decide if this node should be a +# dqlite voter or a spare. +function get_expected_dqlite_role() { + drbdResRole=`sudo drbdadm status $DRBD_RES_NAME | head -1 | grep role | cut -d ":" -f 2` + + case $drbdResRole in + "Primary") + echo $DQLITE_ROLE_VOTER + ;; + "Secondary") + echo $DQLITE_ROLE_SPARE + ;; + *) + log_message "Unexpected DRBD role: $drbdResRole" + exit 1 + ;; + esac +} + +function validate_drbd_state() { + wait_drbd_promoted + + drbdResRole=`sudo drbdadm status $DRBD_RES_NAME | head -1 | grep role | cut -d ":" -f 2` + + case $drbdResRole in + "Primary") + wait_drbd_primary + ;; + "Secondary") + ensure_drbd_unmounted + ;; + *) + log_message "Unexpected DRBD role: $drbdResRole" + exit 1 + ;; + esac +} + +# After a failover, the state dir points to the shared DRBD volume. +# We need to restore the node certificate and config files. +function restore_dqlite_confs_and_certs() { + log_message "Restoring dqlite configs and certificates." 
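+    # The database contents themselves already live on the shared DRBD volume;
+    # what is restored here is the node-local metadata preserved in the backup
+    # directories created by move_statedirs: info.yaml (this node's ID and
+    # address), daemon.yaml and the TLS certificates.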
+ + sudo cp $K8S_DQLITE_STATE_BKP_DIR/info.yaml $K8S_DQLITE_STATE_DIR + + sudo cp $K8SD_STATE_BKP_DIR/database/info.yaml $K8SD_STATE_DIR/database/ + sudo cp $K8SD_STATE_BKP_DIR/daemon.yaml $K8SD_STATE_DIR/ + + # restore k8s-dqlite certificates + sudo cp $K8S_DQLITE_STATE_BKP_DIR/cluster.crt $K8S_DQLITE_STATE_DIR + sudo cp $K8S_DQLITE_STATE_BKP_DIR/cluster.key $K8S_DQLITE_STATE_DIR + + # restore k8sd certificates + sudo cp $K8SD_STATE_BKP_DIR/cluster.crt $K8SD_STATE_DIR + sudo cp $K8SD_STATE_BKP_DIR/cluster.key $K8SD_STATE_DIR + sudo cp $K8SD_STATE_BKP_DIR/server.crt $K8SD_STATE_DIR + sudo cp $K8SD_STATE_BKP_DIR/server.key $K8SD_STATE_DIR +} + +# Promote the current node as primary and prepare the recovery archives. +function promote_as_primary() { + local k8sDqliteNodeId=`get_dqlite_node_id $K8S_DQLITE_INFO_BKP_YAML` + if [[ -z $k8sDqliteNodeId ]]; then + log_message "Couldn't retrieve k8s-dqlite node id." + exit 1 + fi + + local k8sdNodeId=`get_dqlite_node_id $K8SD_INFO_BKP_YAML` + if [[ -z $k8sDqliteNodeId ]]; then + log_message "Couldn't retrieve k8s-dqlite node id." + exit 1 + fi + + local peerIp=`get_dql_peer_ip $K8S_DQLITE_CLUSTER_YAML $k8sDqliteNodeId` + if [[ -z $peerIp ]]; then + log_message "Couldn't retrieve dqlite peer ip." + exit 1 + fi + + log_message "Stopping local k8s services." + sudo snap stop k8s + + # After a node crash, there may be a leaked control socket file and + # k8sd will refuse to perform the recovery. We've just stopped the k8s snap, + # it should be safe to remove such stale unix sockets. + log_message "Removing stale control sockets." + sudo rm -f $K8SD_STATE_DIR/control.socket + + local stoppedPeer=0 + log_message "Checking peer k8s services: $peerIp" + if ssh $SSH_OPTS $SSH_USERNAME@$peerIp sudo snap services k8s | grep -v inactive | grep "active"; then + log_message "Attempting to stop peer k8s services." + # Stop the k8s snap directly instead of the wrapper service so that + # we won't cause failures if both nodes start at the same time. + # The secondary will wait for the k8s services to start on the primary. + if ssh $SSH_OPTS $SSH_USERNAME@$peerIp sudo snap stop k8s; then + stoppedPeer=1 + log_message "Successfully stopped peer k8s services." + log_message "The stopped services are going to be restarted after the recovery finishes." + else + log_message "Couldn't stop k8s services on the peer node." \ + "Assuming that it's stopped and proceeding with the recovery." + fi + fi + + log_message "Ensuring rw access to DRBD mount." + # Having RW access to the drbd mount implies that this is the primary node. + ensure_mount_rw + + restore_dqlite_confs_and_certs + + log_message "Updating dqlite roles." + # Update info.yaml + set_dqlite_node_role $K8S_DQLITE_INFO_YAML $DQLITE_ROLE_VOTER + set_dqlite_node_role $K8SD_INFO_YAML $DQLITE_ROLE_VOTER + + # Update cluster.yaml + set_dqlite_node_as_sole_voter $K8S_DQLITE_CLUSTER_YAML $k8sDqliteNodeId + set_dqlite_node_as_sole_voter $K8SD_CLUSTER_YAML $k8sdNodeId + + log_message "Restoring dqlite." + sudo $K8SD_PATH cluster-recover \ + --state-dir=$K8SD_STATE_DIR \ + --k8s-dqlite-state-dir=$K8S_DQLITE_STATE_DIR \ + --log-level $K8SD_LOG_LEVEL \ + --non-interactive + + # TODO: consider removing offending segments if the last snapshot is behind + # and then try again. + + log_message "Copying k8sd recovery tarball to $K8SD_RECOVERY_TARBALL_BKP" + sudo cp $K8SD_RECOVERY_TARBALL $K8SD_RECOVERY_TARBALL_BKP + + log_message "Restarting k8s services." 
+ sudo snap start k8s + + # TODO: validate k8s status + + if [[ $stoppedPeer -ne 0 ]]; then + log_message "Restarting peer k8s services: $peerIp" + # It's importand to issue a restart here since we stopped the k8s snap + # directly and the wrapper service doesn't currently monitor it. + ssh $SSH_OPTS $SSH_USERNAME@$peerIp sudo systemctl restart $SYSTEMD_SERVICE_NAME || + log_message "Couldn't start peer k8s services." + fi +} + +function process_recovery_files_on_secondary() { + local peerIp="$1" + + log_message "Ensuring that the drbd volume is unmounted." + ensure_drbd_unmounted + + log_message "Restoring local dqlite backup files." + sudo cp -r $K8S_DQLITE_STATE_BKP_DIR/. $DRBD_MOUNT_DIR/k8s-dqlite/ + sudo cp -r $K8SD_STATE_BKP_DIR/. $DRBD_MOUNT_DIR/k8sd/ + + sudo rm -f $DRBD_MOUNT_DIR/k8s-dqlite/00*-* + sudo rm -f $DRBD_MOUNT_DIR/k8s-dqlite/snapshot-* + sudo rm -f $DRBD_MOUNT_DIR/k8s-dqlite/metadata* + + sudo rm -f $DRBD_MOUNT_DIR/k8sd/database/00*-* + sudo rm -f $DRBD_MOUNT_DIR/k8sd/database/snapshot-* + sudo rm -f $DRBD_MOUNT_DIR/k8sd/database/metadata* + + log_message "Retrieving k8sd recovery tarball." + scp $SSH_OPTS $SSH_USERNAME@$peerIp:$K8SD_RECOVERY_TARBALL_BKP /tmp/ + sudo mv /tmp/`basename $K8SD_RECOVERY_TARBALL_BKP` \ + $K8SD_RECOVERY_TARBALL + + # TODO: do we really need to transfer recovery tarballs in this situation? + # the spare is simply forwarding the requests to the primary, it doesn't really + # hold any data. + lastK8sDqliteRecoveryTarball=`ssh $SSH_USERNAME@$peerIp \ + sudo ls /var/snap/k8s/common/ | \ + grep -P "recovery-k8s-dqlite-.*post-recovery" | \ + tail -1` + if [ -z "$lastK8sDqliteRecoveryTarball" ]; then + log_message "couldn't retrieve latest k8s-dqlite recovery tarball from $peerIp" + exit 1 + fi + + log_message "Retrieving k8s-dqlite recovery tarball." + scp $SSH_USERNAME@$peerIp:/var/snap/k8s/common/$lastK8sDqliteRecoveryTarball /tmp/ + sudo tar -xf /tmp/$lastK8sDqliteRecoveryTarball -C $K8S_DQLITE_STATE_DIR + + log_message "Updating dqlite roles." + # Update info.yaml + set_dqlite_node_role $K8S_DQLITE_INFO_YAML $DQLITE_ROLE_SPARE + set_dqlite_node_role $K8SD_INFO_YAML $DQLITE_ROLE_SPARE + # We're skipping cluster.yaml, we expect the recovery archives to contain + # updated cluster.yaml files. +} + +# Recover a former primary, now secondary dqlite node. +# Run "promote_as_primary" on the ther node first. +function rejoin_secondary() { + log_message "Recovering secondary node." + + local k8sDqliteNodeId=`get_dqlite_node_id $K8S_DQLITE_INFO_BKP_YAML` + if [[ -z $k8sDqliteNodeId ]]; then + log_message "Couldn't retrieve k8s-dqlite node id." + exit 1 + fi + + local peerIp=`get_dql_peer_ip $K8S_DQLITE_CLUSTER_BKP_YAML $k8sDqliteNodeId` + if [[ -z $peerIp ]]; then + log_message "Couldn't retrieve dqlite peer ip." + exit 1 + fi + + log_message "Stopping k8s services." + sudo snap stop k8s + + log_message "Adding temporary Pacemaker constraint." + # We need to prevent failovers from happening while restoring secondary + # dqlite data, otherwise we may end up overriding or deleting the primary + # node data. + # + # TODO: consider reducing the constraint scope (e.g. resource level constraint + # instead of putting the entire node in standby). + sudo crm node standby + if ! process_recovery_files_on_secondary $peerIp; then + log_message "Dqlite recovery filed, removing temporary Pacemaker constraints." + sudo crm node online + exit 1 + fi + + log_message "Restoring Pacemaker state." 
+ sudo crm node online + + log_message "Restarting k8s services" + sudo snap start k8s +} + +function install_packages() { + sudo apt-get update + + sudo DEBIAN_FRONTEND=noninteractive apt-get install \ + python3 python3-netaddr \ + pacemaker resource-agents-extra \ + drbd-utils ntp linux-image-generic snap moreutils -y + sudo modprobe drbd || sudo apt-get install -y linux-modules-extra-$(uname -r) + + sudo snap install jq + sudo snap install yq + sudo snap install install k8s --classic $K8S_SNAP_CHANNEL +} + +function check_statedir() { + local stateDir="$1" + local expLink="$2" + + if [[ ! -e $stateDir ]]; then + log_message "State directory missing: $stateDir" + exit 1 + fi + + target=`readlink -f $stateDir` + if [[ -L "$stateDir" ]] && [[ "$target" != "$expLink" ]]; then + log_message "Unexpected symlink target. " \ + "State directory: $stateDir. " \ + "Expected symlink target: $expLink. " \ + "Actual symlink target: $target." + exit 1 + fi + + if [[ ! -L $stateDir ]] && [[ ! -z "$( ls -A $expLink )" ]]; then + log_message "State directory is not a symlink, however the " \ + "expected link target exists and is not empty. " \ + "We can't know which files to keep, erroring out. " \ + "State directory: $stateDir. " \ + "Expected symlink target: $expLink." + exit 1 + fi +} + +function check_peer_recovery_tarballs() { + log_message "Retrieving k8s-dqlite node id." + local k8sDqliteNodeId=`get_dqlite_node_id $K8S_DQLITE_INFO_BKP_YAML` + if [[ -z $k8sDqliteNodeId ]]; then + log_message "Couldn't retrieve k8s-dqlite node id." + exit 1 + fi + + log_message "Retrieving dqlite peer ip." + local peerIp=`get_dql_peer_ip $K8S_DQLITE_CLUSTER_BKP_YAML $k8sDqliteNodeId` + if [[ -z $peerIp ]]; then + log_message "Couldn't retrieve dqlite peer ip." + exit 1 + fi + + log_message "Checking for recovery taballs on $peerIp." + + k8sdRecoveryTarball=`ssh $SSH_OPTS $SSH_USERNAME@$peerIp \ + sudo ls -A "$K8SD_RECOVERY_TARBALL_BKP"` + if [[ -z $k8sdRecoveryTarball ]]; then + log_message "Peer $peerIp doesn't have k8sd recovery tarball." + return 1 + fi + + lastK8sDqliteRecoveryTarball=`ssh $SSH_OPTS $SSH_USERNAME@$peerIp \ + sudo ls /var/snap/k8s/common/ | \ + grep -P "recovery-k8s-dqlite-.*post-recovery"` + if [[ -z $k8sdRecoveryTarball ]]; then + log_message "Peer $peerIp doesn't have k8s-dqlite recovery tarball." + return 1 + fi +} + +function start_service() { + log_message "Initializing node." + + # DRBD is the primary source of truth for the dqlite role. + # We need to wait for it to become available. + wait_drbd_resource + + # dump the drbd and pacemaker status for debugging purposes. + sudo drbdadm status + sudo crm status + + validate_drbd_state + + move_statedirs + + local expRole=`get_expected_dqlite_role` + case $expRole in + $DQLITE_ROLE_VOTER) + log_message "Assuming the dqlite voter role (primary)." + + # We'll assume that if the primary stopped, it needs to go through + # the recovery process. + promote_as_primary + ;; + $DQLITE_ROLE_SPARE) + log_message "Assuming the dqlite spare role (secondary)." + + wait_for_peer_k8s + + if check_peer_recovery_tarballs; then + log_message "Recovery tarballs found, initiating recovery." + rejoin_secondary + else + # Maybe the primary didn't change and we don't need to go + # through the recovery process. + # TODO: consider comparing the cluster.yaml files from the + # two nodes. + log_message "Recovery tarballs missing, skipping recovery." + log_message "Starting k8s services." 
+ sudo snap k8s start + fi + ;; + *) + log_message "Unexpected dqlite role: $expRole" + exit 1 + ;; + esac +} + +function clean_recovery_data() { + log_message "Cleaning up dqlite recovery data." + rm -f $K8SD_RECOVERY_TARBALL + rm -f $K8SD_RECOVERY_TARBALL_BKP + rm -f $K8S_DQLITE_STATE_DIR/recovery-k8s-dqlite* +} + +function purge() { + log_message "Removing the k8s snap and all the associated files." + + sudo snap remove --purge k8s + + if [[ -d $DRBD_MOUNT_DIR ]]; then + log_message "Cleaning up $DRBD_MOUNT_DIR." + sudo rm -rf $DRBD_MOUNT_DIR/k8sd + sudo rm -rf $DRBD_MOUNT_DIR/k8s-dqlite + + if ! ensure_drbd_unmounted; then + log_message "Cleaning up $DRBD_MOUNT_DIR mount point." + + # The replicas use the mount dir directly, without a block device + # attachment. We need to clean up the mount point as well. + # + # We're using another mount with "--bind" to bypass the drbd mount. + tempdir=`mktemp -d` + # We need to mount the parent dir. + sudo mount --bind `dirname $DRBD_MOUNT_DIR` $tempdir + sudo rm -rf $tempdir/`basename $DRBD_MOUNT_DIR`/k8sd + sudo rm -rf $tempdir/`basename $DRBD_MOUNT_DIR`/k8s-dqlite + sudo umount $tempdir + sudo rm -rf $tempdir + fi + fi +} + +function clear_taints() { + log_message "Clearing tainted Pacemaker resources." + sudo crm resource clear ha_k8s_failover_service + sudo crm resource clear fs_res + sudo crm resource clear drbd_master_slave + + sudo crm resource cleanup ha_k8s_failover_service + sudo crm resource cleanup fs_res + sudo crm resource cleanup drbd_master_slave +} + +function main() { + local command=$1 + + case $command in + "move_statedirs") + move_statedirs + ;; + "install_packages") + install_packages + ;; + "start_service") + start_service + ;; + "clean_recovery_data") + clean_recovery_data + ;; + "purge") + purge + ;; + "clear_taints") + clear_taints + ;; + *) + cat << EOF +Unknown command: $1 + +usage: $0 + +Commands: + move_statedirs Move the dqlite state directories to the DRBD mount, + replacing them with symlinks. + The existing contents are moved to a backup folder, + which can be used as part of the recovery process. + install_packages Install the packages required by the 2-node HA + cluster. + start_service Initialize the k8s services, taking the following + steps: + 1. Based on the drbd state, decide if this node + should assume the primary (dqlite voter) or + secondary (spare) role. + 2. If this is the first start, transfer the dqlite + state directories and create backups. + 3. If this node is a primary, promote it and initiate + the dqlite recovery, creating recovery tarballs. + Otherwise, copy over the recovery files and + join the existing cluster as a spare. + 4. Start the k8s services. + IMPORTANT: ensure that the DRBD volume is attached + to the primary node when running the command for + the first time. + clean_recovery_data Remove database recovery files. Should be called + after the cluster has been fully recovered. + purge Remove the k8s snap and all its associated files. + clear_taints Clear tainted Pacemaker resources. + +EOF + ;; + esac +} + +if [[ $sourced -ne 1 ]]; then + main $@ +fi From a4e88283c729790a958108e19a429be31987eab1 Mon Sep 17 00:00:00 2001 From: Lucian Petrut Date: Tue, 24 Sep 2024 16:12:14 +0300 Subject: [PATCH 2/9] Add 2-node HA guide We're adding a guide that covers the 2-node A-A HA scenario. 
--- docs/src/snap/howto/2-node-ha.md | 423 +++++++++++++++++++++++++++++++ docs/src/snap/howto/index.md | 1 + 2 files changed, 424 insertions(+) create mode 100644 docs/src/snap/howto/2-node-ha.md diff --git a/docs/src/snap/howto/2-node-ha.md b/docs/src/snap/howto/2-node-ha.md new file mode 100644 index 000000000..487a02b82 --- /dev/null +++ b/docs/src/snap/howto/2-node-ha.md @@ -0,0 +1,423 @@ +# 2-Node Active-Active HA using Dqlite + +## Rationale + +High availability is a mandatory requirement for most production-grade +Kubernetes deployments, usually implying three or more nodes. + +However, 2-node HA clusters are desired in some situations due to cost saving +and operational efficiency considerations. Follow this guide to learn how +Canonical Kubernetes can achieve high availability with just two nodes +while using the default datastore, Dqlite. + +Dqlite cannot achieve Raft quorum with less than three nodes. This means that +Dqlite will not be able to replicate data and the secondaries will simply +forward the queries to the primary node. + +In the event of a node failure, the database will have to be recovered by +following the steps outlined in the [Dqlite recovery guide]. + +## Proposed solution + +Since Dqlite data replication is not available in this situation, we propose +using synchronous block level replication through DRBD. + +The cluster monitoring and failover process will be handled by Pacemaker and +Corosync. In the event of a node failure, the DRBD volume will be mounted on +the replica, which can then access the most recent version of the Dqlite database. + +Additional recovery steps are automated and invoked through Pacemaker. + +## Alternatives + +Another possible approach is to use PostgreSQL with Kine and logical replication. +However, it is outside the scope of this document. + +See the [external datastore guide] for more information on how Canonical +Kubernetes can be configured to use other datastores. + +## Guide + +### Prerequisites + +Make sure that: + +* Both nodes have joined the Kubernetes cluster. + See the [getting started] and [add/remove nodes] guides. +* The user associated with the HA service has SSH access to the peer node and + passwordless sudo configured. For simplicity, the default "ubuntu" user can + be used. +* We recommend using static IP configuration. + +The [2ha.sh script] automates most operations related to the 2-node HA scenario. +Retrieve it like so: + +``` +sudo mkdir -p /var/snap/k8s/common +repo=https://raw.githubusercontent.com/petrutlucian94/k8s-snap +sudo curl $repo/refs/heads/KU-1606/2ha_script/k8s/hack/2ha.sh \ + -o /var/snap/k8s/common/2ha.sh +sudo chmod a+rx /var/snap/k8s/common/2ha.sh +``` + +The first step is to install the required packages: + +``` +/var/snap/k8s/common/2ha.sh install_packages +``` + +### DRBD + +For the purpose of this guide, we are going to use a loopback device as DRBD +backing storage: + +``` +sudo dd if=/dev/zero of=/opt/drbd0-backstore bs=1M count=2000 +``` + +Ensure that the loopback device is attached at boot time, before Pacemaker +starts. + +``` +cat < +HATWO_ADDR= + +cat < +HATWO_ADDR= + +sudo mv /etc/corosync/corosync.conf /etc/corosync/corosync.conf.orig + +cat < +HATWO_ADDR= +DRBD_MOUNT_DIR=${DRBD_MOUNT_DIR:-"/mnt/drbd0"} + +sudo crm configure < + +# remove the node constraint. +sudo crm resource clear fs_res +``` + +### Kubernetes services + +We can now turn our attention to the Kubernetes services. Ensure that the k8s +snap services no longer start automatically. 
Instead, they will be manged by a +wrapper service. + +``` +for f in `sudo snap services k8s | awk 'NR>1 {print $1}'`; do + echo "disabling snap.$f" + sudo systemctl disable "snap.$f"; +done +``` + +The next step is to define the wrapper service. Add the following to +``/etc/systemd/system/2ha_k8s.service``. Note that the sample uses the ``ubuntu`` +user, feel free to use a different one as long as the prerequisites are met. + +``` +[Unit] +Description=K8s service wrapper handling Dqlite recovery for 2-node HA setups. +After=network.target pacemaker.service + +[Service] +User=ubuntu +Group=ubuntu +Type=oneshot +ExecStart=/bin/bash /var/snap/k8s/common/2ha.sh start_service +ExecStop=/bin/bash sudo snap stop k8s +RemainAfterExit=true + +[Install] +WantedBy=multi-user.target +``` + +```{note} +The ``2ha.sh start_service`` command used by the service wrapper automatically +detects the expected Dqlite role based on the DRBD state and takes the necessary +steps to bootstrap the Dqlite state directories, synchronize with the peer node +(if available) and recover the database. +``` + +We need the ``2ha_k8s`` service to be restarted once a DRBD failover occurs. +For that, we are going to define a separate service that will be invoked by +Pacemaker. Create a file called ``/etc/systemd/system/2ha_k8s_failover.service`` +containing the following: + +``` +[Unit] +Description=Managed by Pacemaker, restarts 2ha_k8s on failover. +After=network.target home-ubuntu-workspace.mount + +[Service] +Type=oneshot +ExecStart=systemctl restart 2ha_k8s +RemainAfterExit=true +``` + +Reload the systemd configuration and set ``2ha_k8s`` to start automatically. +Notice that ``2ha_k8s_failover`` must not be configured to start automatically, +but instead is going to be managed through Pacemaker. + +``` +sudo systemctl enable 2ha_k8s +sudo systemctl daemon-reload +``` + +Make sure that both nodes have been configured using the above steps before +moving forward. + +We can now define a new Pacemaker resource that will invoke the +``2ha_k8s_failover`` service when a DRBD failover occurs. + +``` +sudo crm configure < +[Dqlite recovery guide]: restore-quorum +[external datastore guide]: external-datastore +[2ha.sh script]: https://github.com/canonical/k8s-snap/blob/main/k8s/hack/2ha.sh +[getting started]: ../tutorial/getting-started +[add/remove nodes]: ../tutorial/add-remove-nodes diff --git a/docs/src/snap/howto/index.md b/docs/src/snap/howto/index.md index 3ae545030..295245127 100644 --- a/docs/src/snap/howto/index.md +++ b/docs/src/snap/howto/index.md @@ -22,6 +22,7 @@ proxy backup-restore refresh-certs restore-quorum +2-node-ha epa contribute support From 33ed437101b8d9abb195a7ee46b83d931f60c28a Mon Sep 17 00:00:00 2001 From: Lucian Petrut Date: Wed, 25 Sep 2024 10:24:38 +0000 Subject: [PATCH 3/9] Update docs as per PR comments --- docs/src/snap/howto/2-node-ha.md | 78 +++++++++++++++----------------- k8s/hack/2ha.sh | 7 ++- 2 files changed, 41 insertions(+), 44 deletions(-) diff --git a/docs/src/snap/howto/2-node-ha.md b/docs/src/snap/howto/2-node-ha.md index 487a02b82..97589107c 100644 --- a/docs/src/snap/howto/2-node-ha.md +++ b/docs/src/snap/howto/2-node-ha.md @@ -1,37 +1,38 @@ -# 2-Node Active-Active HA using Dqlite +# 2-Node Active-Active High-Availability using Dqlite ## Rationale -High availability is a mandatory requirement for most production-grade +High availability (HA) is a mandatory requirement for most production-grade Kubernetes deployments, usually implying three or more nodes. 
-However, 2-node HA clusters are desired in some situations due to cost saving -and operational efficiency considerations. Follow this guide to learn how -Canonical Kubernetes can achieve high availability with just two nodes -while using the default datastore, Dqlite. +2-node HA clusters are sometimes preferred for cost savings and operational +efficiency considerations. Follow this guide to learn how Canonical Kubernetes +can achieve high availability with just two nodes while using the default +datastore, Dqlite. Dqlite cannot achieve Raft quorum with less than three nodes. This means that Dqlite will not be able to replicate data and the secondaries will simply forward the queries to the primary node. -In the event of a node failure, the database will have to be recovered by -following the steps outlined in the [Dqlite recovery guide]. +In the event of a node failure, database recovery will require following the +steps in the [Dqlite recovery guide]. ## Proposed solution Since Dqlite data replication is not available in this situation, we propose -using synchronous block level replication through DRBD. +using synchronous block level replication through +[Distributed Replicated Block Device] (DRBD). The cluster monitoring and failover process will be handled by Pacemaker and -Corosync. In the event of a node failure, the DRBD volume will be mounted on -the replica, which can then access the most recent version of the Dqlite database. +Corosync. After a node failure, the DRBD volume will be mounted on the standby +node, allowing access to the latest Dqlite database version. Additional recovery steps are automated and invoked through Pacemaker. ## Alternatives -Another possible approach is to use PostgreSQL with Kine and logical replication. -However, it is outside the scope of this document. +Another possible approach is to use PostgreSQL with Kine and logical +replication. However, it is outside the scope of this document. See the [external datastore guide] for more information on how Canonical Kubernetes can be configured to use other datastores. @@ -40,30 +41,20 @@ Kubernetes can be configured to use other datastores. ### Prerequisites -Make sure that: - -* Both nodes have joined the Kubernetes cluster. +* Ensure both nodes are part of the Kubernetes cluster. See the [getting started] and [add/remove nodes] guides. * The user associated with the HA service has SSH access to the peer node and passwordless sudo configured. For simplicity, the default "ubuntu" user can be used. * We recommend using static IP configuration. -The [2ha.sh script] automates most operations related to the 2-node HA scenario. -Retrieve it like so: - -``` -sudo mkdir -p /var/snap/k8s/common -repo=https://raw.githubusercontent.com/petrutlucian94/k8s-snap -sudo curl $repo/refs/heads/KU-1606/2ha_script/k8s/hack/2ha.sh \ - -o /var/snap/k8s/common/2ha.sh -sudo chmod a+rx /var/snap/k8s/common/2ha.sh -``` +The [2ha.sh script] automates most operations related to the 2-node HA scenario +and is included in the snap. The first step is to install the required packages: ``` -/var/snap/k8s/common/2ha.sh install_packages +/snap/k8s/current/k8s/hack/2ha.sh install_packages ``` ### DRBD @@ -112,7 +103,7 @@ sudo systemctl start rc-local.service ``` Let's configure the DRBD block device that will hold the Dqlite data. -Make sure to use the right node addresses. +Ensure the correct node addresses are used. ``` # Disable the DRBD service, it will be managed through Pacemaker. 
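# A rough sketch of the DRBD resource definition configured at this step.
# The host names, addresses and port are placeholders, not values from this
# guide; the "on <name>" entries must match each node's `uname -n` output.
HAONE_ADDR=10.0.0.1   # placeholder: first node address
HATWO_ADDR=10.0.0.2   # placeholder: second node address

cat <<EOF | sudo tee /etc/drbd.d/r0.res
resource r0 {
  protocol C;   # synchronous replication
  on haone {
    device    /dev/drbd0;
    disk      /dev/lodrbd;
    address   ${HAONE_ADDR}:7788;
    meta-disk internal;
  }
  on hatwo {
    device    /dev/drbd0;
    disk      /dev/lodrbd;
    address   ${HATWO_ADDR}:7788;
    meta-disk internal;
  }
}
EOF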
@@ -143,8 +134,8 @@ sudo drbdadm status ``` Let's create a mount point for the DRBD block device. Non-default mount points -need to be passed to the 2ha.sh script mentioned above, see the script for the -full list of configurable parameters. +need to be passed to the ``2ha.sh`` script mentioned above, see the script for +the full list of configurable parameters. ``` DRBD_MOUNT_DIR=/mnt/drbd0 @@ -231,7 +222,7 @@ Let's define a Pacemaker resource for the DRBD block device, which ensures that the block device will be mounted on the replica in case of a primary node failure. -Pacemaker fencing (stonith) configuration is environment specific and thus +[Pacemaker fencing] (stonith) configuration is environment specific and thus outside the scope of this guide. However, we highly recommend using fencing if possible to reduce the risk of cluster split-brain situations. @@ -257,8 +248,8 @@ EOF Before moving forward, let's ensure that the DRBD Pacemaker resource runs on the primary (voter) Dqlite node. -Remember that in our case only the primary node contains the latest Dqlite data, -which will be transfered to the DRBD device once the clustered service starts. +In this setup, only the primary node holds the latest Dqlite data, which will +be transferred to the DRBD device once the clustered service starts. This is automatically handled by the ``2ha_k8s.sh start_service`` command. ``` @@ -280,7 +271,7 @@ sudo crm resource clear fs_res ### Kubernetes services We can now turn our attention to the Kubernetes services. Ensure that the k8s -snap services no longer start automatically. Instead, they will be manged by a +snap services no longer start automatically. Instead, they will be managed by a wrapper service. ``` @@ -291,8 +282,9 @@ done ``` The next step is to define the wrapper service. Add the following to -``/etc/systemd/system/2ha_k8s.service``. Note that the sample uses the ``ubuntu`` -user, feel free to use a different one as long as the prerequisites are met. +``/etc/systemd/system/2ha_k8s.service``. Note that the sample uses the +``ubuntu`` user, feel free to use a different one as long as the prerequisites +are met. ``` [Unit] @@ -303,7 +295,7 @@ After=network.target pacemaker.service User=ubuntu Group=ubuntu Type=oneshot -ExecStart=/bin/bash /var/snap/k8s/common/2ha.sh start_service +ExecStart=/bin/bash /snap/k8s/current/k8s/hack/2ha.sh start_service ExecStop=/bin/bash sudo snap stop k8s RemainAfterExit=true @@ -313,15 +305,15 @@ WantedBy=multi-user.target ```{note} The ``2ha.sh start_service`` command used by the service wrapper automatically -detects the expected Dqlite role based on the DRBD state and takes the necessary -steps to bootstrap the Dqlite state directories, synchronize with the peer node -(if available) and recover the database. +detects the expected Dqlite role based on the DRBD state and takes the +necessary steps to bootstrap the Dqlite state directories, synchronize with the +peer node (if available) and recover the database. ``` We need the ``2ha_k8s`` service to be restarted once a DRBD failover occurs. For that, we are going to define a separate service that will be invoked by -Pacemaker. Create a file called ``/etc/systemd/system/2ha_k8s_failover.service`` -containing the following: +Pacemaker. 
Create a file called +``/etc/systemd/system/2ha_k8s_failover.service`` containing the following: ``` [Unit] @@ -416,8 +408,10 @@ sudo drbdadm connect r0 ``` +[Distributed Replicated Block Device]: https://ubuntu.com/server/docs/distributed-replicated-block-device-drbd [Dqlite recovery guide]: restore-quorum [external datastore guide]: external-datastore [2ha.sh script]: https://github.com/canonical/k8s-snap/blob/main/k8s/hack/2ha.sh [getting started]: ../tutorial/getting-started [add/remove nodes]: ../tutorial/add-remove-nodes +[Pacemaker fencing]: https://clusterlabs.org/pacemaker/doc/2.1/Pacemaker_Explained/html/fencing.html diff --git a/k8s/hack/2ha.sh b/k8s/hack/2ha.sh index 4df08f194..c3c2a0c3a 100755 --- a/k8s/hack/2ha.sh +++ b/k8s/hack/2ha.sh @@ -1,10 +1,13 @@ #!/bin/bash +# This script automates various operations on 2-node HA A-A Canonical K8s +# clusters that use the default datastore, Dqlite. +# # Prerequisites: # * required packages installed using the "install_packages" command. -# * initialized k8s cluster, both nodes joined +# * initialized K8s cluster, both nodes joined # * the current user has ssh access to the peer node. -# - used to handle k8s services and transfer dqlite data +# - used to handle K8s services and transfer Dqlite data # * the current user has passwordless sudo enabled. sourced=0 From 0f8bbca43e7118febe65c178bb69e5d712aefd73 Mon Sep 17 00:00:00 2001 From: Lucian Petrut Date: Mon, 30 Sep 2024 09:31:42 +0300 Subject: [PATCH 4/9] Rename 2ha.sh to two-node-ha.sh --- .../howto/{2-node-ha.md => two-node-ha.md} | 48 +++++++++---------- k8s/hack/{2ha.sh => two-node-ha.sh} | 2 +- 2 files changed, 25 insertions(+), 25 deletions(-) rename docs/src/snap/howto/{2-node-ha.md => two-node-ha.md} (85%) rename k8s/hack/{2ha.sh => two-node-ha.sh} (99%) diff --git a/docs/src/snap/howto/2-node-ha.md b/docs/src/snap/howto/two-node-ha.md similarity index 85% rename from docs/src/snap/howto/2-node-ha.md rename to docs/src/snap/howto/two-node-ha.md index 97589107c..7b84284f6 100644 --- a/docs/src/snap/howto/2-node-ha.md +++ b/docs/src/snap/howto/two-node-ha.md @@ -48,13 +48,13 @@ Kubernetes can be configured to use other datastores. be used. * We recommend using static IP configuration. -The [2ha.sh script] automates most operations related to the 2-node HA scenario -and is included in the snap. +The [two-node-ha.sh script] automates most operations related to the 2-node HA +scenario and is included in the snap. The first step is to install the required packages: ``` -/snap/k8s/current/k8s/hack/2ha.sh install_packages +/snap/k8s/current/k8s/hack/two-node-ha.sh install_packages ``` ### DRBD @@ -134,8 +134,8 @@ sudo drbdadm status ``` Let's create a mount point for the DRBD block device. Non-default mount points -need to be passed to the ``2ha.sh`` script mentioned above, see the script for -the full list of configurable parameters. +need to be passed to the ``two-node-ha.sh`` script mentioned above, see the +script for the full list of configurable parameters. ``` DRBD_MOUNT_DIR=/mnt/drbd0 @@ -250,7 +250,7 @@ the primary (voter) Dqlite node. In this setup, only the primary node holds the latest Dqlite data, which will be transferred to the DRBD device once the clustered service starts. -This is automatically handled by the ``2ha_k8s.sh start_service`` command. +This is automatically handled by the ``two-node-ha.sh start_service`` command. ``` sudo k8s status @@ -282,7 +282,7 @@ done ``` The next step is to define the wrapper service. 
Add the following to -``/etc/systemd/system/2ha_k8s.service``. Note that the sample uses the +``/etc/systemd/system/two-node-ha-k8s.service``. Note that the sample uses the ``ubuntu`` user, feel free to use a different one as long as the prerequisites are met. @@ -295,7 +295,7 @@ After=network.target pacemaker.service User=ubuntu Group=ubuntu Type=oneshot -ExecStart=/bin/bash /snap/k8s/current/k8s/hack/2ha.sh start_service +ExecStart=/bin/bash /snap/k8s/current/k8s/hack/two-node-ha.sh start_service ExecStop=/bin/bash sudo snap stop k8s RemainAfterExit=true @@ -304,34 +304,34 @@ WantedBy=multi-user.target ``` ```{note} -The ``2ha.sh start_service`` command used by the service wrapper automatically +The ``two-node-ha.sh start_service`` command used by the service wrapper automatically detects the expected Dqlite role based on the DRBD state and takes the necessary steps to bootstrap the Dqlite state directories, synchronize with the peer node (if available) and recover the database. ``` -We need the ``2ha_k8s`` service to be restarted once a DRBD failover occurs. -For that, we are going to define a separate service that will be invoked by -Pacemaker. Create a file called -``/etc/systemd/system/2ha_k8s_failover.service`` containing the following: +We need the ``two-node-ha-k8s`` service to be restarted once a DRBD failover +occurs. For that, we are going to define a separate service that will be +invoked by Pacemaker. Create a file called +``/etc/systemd/system/two-node-ha-k8s-failover.service`` containing the following: ``` [Unit] -Description=Managed by Pacemaker, restarts 2ha_k8s on failover. +Description=Managed by Pacemaker, restarts two-node-ha-k8s on failover. After=network.target home-ubuntu-workspace.mount [Service] Type=oneshot -ExecStart=systemctl restart 2ha_k8s +ExecStart=systemctl restart two-node-ha-k8s RemainAfterExit=true ``` -Reload the systemd configuration and set ``2ha_k8s`` to start automatically. -Notice that ``2ha_k8s_failover`` must not be configured to start automatically, -but instead is going to be managed through Pacemaker. +Reload the systemd configuration and set ``two-node-ha-k8s`` to start +automatically. Notice that ``two-node-ha-k8s-failover`` must not be configured +to start automatically, but instead is going to be managed through Pacemaker. ``` -sudo systemctl enable 2ha_k8s +sudo systemctl enable two-node-ha-k8s sudo systemctl daemon-reload ``` @@ -339,11 +339,11 @@ Make sure that both nodes have been configured using the above steps before moving forward. We can now define a new Pacemaker resource that will invoke the -``2ha_k8s_failover`` service when a DRBD failover occurs. +``two-node-ha-k8s-failover`` service when a DRBD failover occurs. ``` sudo crm configure < Date: Mon, 30 Sep 2024 09:53:02 +0300 Subject: [PATCH 5/9] s/2-node/two-node --- docs/src/snap/howto/two-node-ha.md | 10 +++++----- k8s/hack/two-node-ha.sh | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/src/snap/howto/two-node-ha.md b/docs/src/snap/howto/two-node-ha.md index 7b84284f6..ee314025c 100644 --- a/docs/src/snap/howto/two-node-ha.md +++ b/docs/src/snap/howto/two-node-ha.md @@ -1,11 +1,11 @@ -# 2-Node Active-Active High-Availability using Dqlite +# Two-Node Active-Active High-Availability using Dqlite ## Rationale High availability (HA) is a mandatory requirement for most production-grade Kubernetes deployments, usually implying three or more nodes. 
-2-node HA clusters are sometimes preferred for cost savings and operational +Two-node HA clusters are sometimes preferred for cost savings and operational efficiency considerations. Follow this guide to learn how Canonical Kubernetes can achieve high availability with just two nodes while using the default datastore, Dqlite. @@ -48,8 +48,8 @@ Kubernetes can be configured to use other datastores. be used. * We recommend using static IP configuration. -The [two-node-ha.sh script] automates most operations related to the 2-node HA -scenario and is included in the snap. +The [two-node-ha.sh script] automates most operations related to the two-node +HA scenario and is included in the snap. The first step is to install the required packages: @@ -288,7 +288,7 @@ are met. ``` [Unit] -Description=K8s service wrapper handling Dqlite recovery for 2-node HA setups. +Description=K8s service wrapper handling Dqlite recovery for two-node HA setups. After=network.target pacemaker.service [Service] diff --git a/k8s/hack/two-node-ha.sh b/k8s/hack/two-node-ha.sh index 0e68e354d..8a2bad9a9 100755 --- a/k8s/hack/two-node-ha.sh +++ b/k8s/hack/two-node-ha.sh @@ -1,6 +1,6 @@ #!/bin/bash -# This script automates various operations on 2-node HA A-A Canonical K8s +# This script automates various operations on two-node HA A-A Canonical K8s # clusters that use the default datastore, Dqlite. # # Prerequisites: @@ -794,7 +794,7 @@ Commands: replacing them with symlinks. The existing contents are moved to a backup folder, which can be used as part of the recovery process. - install_packages Install the packages required by the 2-node HA + install_packages Install the packages required by the two-node HA cluster. start_service Initialize the k8s services, taking the following steps: From 8d690c3351cb2b22e677a727a15d0d9937602cda Mon Sep 17 00:00:00 2001 From: Lucian Petrut Date: Mon, 30 Sep 2024 10:33:05 +0300 Subject: [PATCH 6/9] Address comments --- docs/src/snap/howto/index.md | 2 +- docs/src/snap/howto/two-node-ha.md | 103 ++++++++++++++++------------- 2 files changed, 57 insertions(+), 48 deletions(-) diff --git a/docs/src/snap/howto/index.md b/docs/src/snap/howto/index.md index 295245127..817b5a247 100644 --- a/docs/src/snap/howto/index.md +++ b/docs/src/snap/howto/index.md @@ -22,7 +22,7 @@ proxy backup-restore refresh-certs restore-quorum -2-node-ha +two-node-ha epa contribute support diff --git a/docs/src/snap/howto/two-node-ha.md b/docs/src/snap/howto/two-node-ha.md index ee314025c..1ff25f005 100644 --- a/docs/src/snap/howto/two-node-ha.md +++ b/docs/src/snap/howto/two-node-ha.md @@ -1,17 +1,15 @@ # Two-Node Active-Active High-Availability using Dqlite -## Rationale - High availability (HA) is a mandatory requirement for most production-grade Kubernetes deployments, usually implying three or more nodes. Two-node HA clusters are sometimes preferred for cost savings and operational efficiency considerations. Follow this guide to learn how Canonical Kubernetes can achieve high availability with just two nodes while using the default -datastore, Dqlite. +datastore, [Dqlite]. -Dqlite cannot achieve Raft quorum with less than three nodes. This means that -Dqlite will not be able to replicate data and the secondaries will simply +Dqlite cannot achieve a [Raft] quorum with fewer than three nodes. This means +that Dqlite will not be able to replicate data and the secondaries will simply forward the queries to the primary node. 
In the event of a node failure, database recovery will require following the @@ -23,25 +21,15 @@ Since Dqlite data replication is not available in this situation, we propose using synchronous block level replication through [Distributed Replicated Block Device] (DRBD). -The cluster monitoring and failover process will be handled by Pacemaker and -Corosync. After a node failure, the DRBD volume will be mounted on the standby -node, allowing access to the latest Dqlite database version. +The cluster monitoring and failover process will be handled by [Pacemaker] and +[Corosync]. After a node failure, the DRBD volume will be mounted on the +standby node, allowing access to the latest Dqlite database version. Additional recovery steps are automated and invoked through Pacemaker. -## Alternatives - -Another possible approach is to use PostgreSQL with Kine and logical -replication. However, it is outside the scope of this document. - -See the [external datastore guide] for more information on how Canonical -Kubernetes can be configured to use other datastores. - -## Guide +### Prerequisites: -### Prerequisites - -* Ensure both nodes are part of the Kubernetes cluster. +* Please ensure that both nodes are part of the Kubernetes cluster. See the [getting started] and [add/remove nodes] guides. * The user associated with the HA service has SSH access to the peer node and passwordless sudo configured. For simplicity, the default "ubuntu" user can @@ -57,10 +45,9 @@ The first step is to install the required packages: /snap/k8s/current/k8s/hack/two-node-ha.sh install_packages ``` -### DRBD +### Distributed Replicated Block Device (DRBD) -For the purpose of this guide, we are going to use a loopback device as DRBD -backing storage: +This example uses a loopback device as DRBD backing storage: ``` sudo dd if=/dev/zero of=/opt/drbd0-backstore bs=1M count=2000 @@ -77,6 +64,12 @@ losetup /dev/lodrbd /opt/drbd0-backstore EOF sudo chmod +x /etc/rc.local +``` + +Add a service to automatically execute the ``/etc/rc.local`` script. + +``` + cat < @@ -268,11 +261,10 @@ sudo crm resource move fs_res sudo crm resource clear fs_res ``` -### Kubernetes services +### Managing Kubernetes Snap Services -We can now turn our attention to the Kubernetes services. Ensure that the k8s -snap services no longer start automatically. Instead, they will be managed by a -wrapper service. +For the two-node HA setup k8s snap services should no longer start +automatically. Instead, they will be managed by a wrapper service. ``` for f in `sudo snap services k8s | awk 'NR>1 {print $1}'`; do @@ -281,10 +273,15 @@ for f in `sudo snap services k8s | awk 'NR>1 {print $1}'`; do done ``` +### Preparing the wrapper service + The next step is to define the wrapper service. Add the following to -``/etc/systemd/system/two-node-ha-k8s.service``. Note that the sample uses the -``ubuntu`` user, feel free to use a different one as long as the prerequisites +``/etc/systemd/system/two-node-ha-k8s.service``. + +```{note} +the sample uses the ``ubuntu`` user, feel free to use a different one as long as the prerequisites are met. +``` ``` [Unit] @@ -304,16 +301,17 @@ WantedBy=multi-user.target ``` ```{note} -The ``two-node-ha.sh start_service`` command used by the service wrapper automatically -detects the expected Dqlite role based on the DRBD state and takes the -necessary steps to bootstrap the Dqlite state directories, synchronize with the -peer node (if available) and recover the database. 
+The ``two-node-ha.sh start_service`` command used by the service wrapper +automatically detects the expected Dqlite role based on the DRBD state. +It then takes the necessary steps to bootstrap the Dqlite state directories, +synchronize with the peer node (if available) and recover the database. ``` -We need the ``two-node-ha-k8s`` service to be restarted once a DRBD failover -occurs. For that, we are going to define a separate service that will be -invoked by Pacemaker. Create a file called -``/etc/systemd/system/two-node-ha-k8s-failover.service`` containing the following: +When a DRBD failover occurs, the ``two-node-ha-k8s`` service needs to be +restarted. To accomplish this,, we are going to define a separate service that +will be invoked by Pacemaker. Create a file called +``/etc/systemd/system/two-node-ha-k8s-failover.service`` containing the +following: ``` [Unit] @@ -338,7 +336,9 @@ sudo systemctl daemon-reload Make sure that both nodes have been configured using the above steps before moving forward. -We can now define a new Pacemaker resource that will invoke the +### Automating the failover procedure + +Define a new Pacemaker resource that will invoke the ``two-node-ha-k8s-failover`` service when a DRBD failover occurs. ``` @@ -352,7 +352,8 @@ quit EOF ``` -The setup is ready, start the HA k8s service on both nodes: +Once the setup is complete on both nodes, start the two-node HA k8s service on +each node: ``` sudo systemctl start two-node-ha-k8s @@ -360,6 +361,9 @@ sudo systemctl start two-node-ha-k8s ## Troubleshooting +Here are some potential problems that may affect two-node HA clusters and how +to address them. + ### Dqlite recovery failing because of unexpected data segments Dqlite recovery may fail if there are data segments past the latest snapshot. @@ -376,7 +380,7 @@ Remove the offending segments and restart the ``two-node-ha-k8s`` service. ### DRBD split brain -The DRBD cluster may enter a split brain state and stop synchronizing. The +The DRBD cluster may enter a [split brain] state and stop synchronizing. The chances increase if fencing (stonith) is not enabled. 
 
 ```
@@ -408,10 +412,15 @@ sudo drbdadm connect r0
 ```
 
+[Dqlite]: https://dqlite.io/
+[Raft]: https://raft.github.io/
 [Distributed Replicated Block Device]: https://ubuntu.com/server/docs/distributed-replicated-block-device-drbd
 [Dqlite recovery guide]: restore-quorum
 [external datastore guide]: external-datastore
 [two-node-ha.sh script]: https://github.com/canonical/k8s-snap/blob/main/k8s/hack/two-node-ha.sh
 [getting started]: ../tutorial/getting-started
 [add/remove nodes]: ../tutorial/add-remove-nodes
+[Pacemaker]: https://clusterlabs.org/pacemaker/
+[Corosync]: https://clusterlabs.org/corosync.html
 [Pacemaker fencing]: https://clusterlabs.org/pacemaker/doc/2.1/Pacemaker_Explained/html/fencing.html
+[split brain]: https://en.wikipedia.org/wiki/Split-brain_(computing)

From 97c287bbadf60134f02616cbbd04fadf7e28e194 Mon Sep 17 00:00:00 2001
From: Lucian Petrut
Date: Mon, 30 Sep 2024 15:15:31 +0300
Subject: [PATCH 7/9] Remove empty lines and add separate note about the A-A cluster

---
 docs/src/snap/howto/two-node-ha.md | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/docs/src/snap/howto/two-node-ha.md b/docs/src/snap/howto/two-node-ha.md
index 1ff25f005..36ed7487d 100644
--- a/docs/src/snap/howto/two-node-ha.md
+++ b/docs/src/snap/howto/two-node-ha.md
@@ -1,4 +1,4 @@
-# Two-Node Active-Active High-Availability using Dqlite
+# Two-Node High-Availability with Dqlite
 
 High availability (HA) is a mandatory requirement for most production-grade
 Kubernetes deployments, usually implying three or more nodes.
@@ -6,7 +6,8 @@ Kubernetes deployments, usually implying three or more nodes.
 Two-node HA clusters are sometimes preferred for cost savings and operational
 efficiency considerations. Follow this guide to learn how Canonical Kubernetes
 can achieve high availability with just two nodes while using the default
-datastore, [Dqlite].
+datastore, [Dqlite]. Both nodes will be active members of the cluster, sharing
+the Kubernetes load.
 
 Dqlite cannot achieve a [Raft] quorum with fewer than three nodes. This means
 that Dqlite will not be able to replicate data and the secondaries will simply
@@ -69,8 +70,6 @@ sudo chmod +x /etc/rc.local
 Add a service to automatically execute the ``/etc/rc.local`` script.
 
 ```
-
-
 cat <
Date: Mon, 30 Sep 2024 16:33:03 +0300
Subject: [PATCH 8/9] Address comments

---
 docs/src/snap/howto/two-node-ha.md | 16 +++----
 k8s/hack/two-node-ha.sh            | 74 +++++++++++++++---------
 2 files changed, 45 insertions(+), 45 deletions(-)

diff --git a/docs/src/snap/howto/two-node-ha.md b/docs/src/snap/howto/two-node-ha.md
index 36ed7487d..8c750cfbe 100644
--- a/docs/src/snap/howto/two-node-ha.md
+++ b/docs/src/snap/howto/two-node-ha.md
@@ -126,8 +126,8 @@ sudo drbdadm status
 ```
 
 Create a mount point for the DRBD block device. Non-default mount points
-need to be passed to the ``two-node-ha.sh`` script mentioned above, see the
-script for the full list of configurable parameters.
+need to be passed to the ``two-node-ha.sh`` script mentioned above. Please
+refer to the script for the full list of configurable parameters.
 
 ```
 DRBD_MOUNT_DIR=/mnt/drbd0
@@ -214,9 +214,10 @@ Let's define a Pacemaker resource for the DRBD block device, which ensures
 that the block device will be mounted on the replica in case of a primary node
 failure.
 
-[Pacemaker fencing] (stonith) configuration is environment specific and thus
-outside the scope of this guide. Using fencing is highly recommended if it is
-possible to reduce the risk of cluster split-brain situations.
+[Pacemaker fencing] (Shoot The Other Node In The Head - STONITH) configuration
+is environment specific and thus outside the scope of this guide. Using fencing
+is highly recommended, if it is possible, to reduce the risk of cluster
+split-brain situations.
 
 ```
 HAONE_ADDR=
@@ -262,7 +263,7 @@ sudo crm resource clear fs_res
 
 ### Managing Kubernetes Snap Services
 
-For the two-node HA setup k8s snap services should no longer start
+For the two-node HA setup, k8s snap services should no longer start
 automatically. Instead, they will be managed by a wrapper service.
 
 ```
@@ -307,7 +308,7 @@ synchronize with the peer node (if available) and recover the database.
 ```
 
 When a DRBD failover occurs, the ``two-node-ha-k8s`` service needs to be
-restarted. To accomplish this,, we are going to define a separate service that
+restarted. To accomplish this, we are going to define a separate service that
 will be invoked by Pacemaker. Create a file called
 ``/etc/systemd/system/two-node-ha-k8s-failover.service`` containing the
 following:
@@ -395,7 +396,6 @@ srcversion: C7B8F7076B8D6DB066D84D9
 
 ubuntu@hatwo:~$ dmesg | grep "Split"
 [ +0.000082] block drbd0: Split-Brain detected but unresolved, dropping connection!
-
 ```
 
 To recover DRBD, use following procedure:
diff --git a/k8s/hack/two-node-ha.sh b/k8s/hack/two-node-ha.sh
index 8a2bad9a9..f3c4506e9 100755
--- a/k8s/hack/two-node-ha.sh
+++ b/k8s/hack/two-node-ha.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
-# This script automates various operations on two-node HA A-A Canonical K8s
-# clusters that use the default datastore, Dqlite.
+# This script automates various operations on two-node HA Active-Active
+# Canonical K8s clusters that use the default datastore, Dqlite.
 #
 # Prerequisites:
 # * required packages installed using the "install_packages" command.
@@ -87,8 +87,8 @@ function get_dqlite_node_role() {
 
 function get_dqlite_role_from_cluster_yaml() {
     # Note that the cluster.yaml role may not match the info.yaml role.
-    # In case of a freshly joined node, info.yaml will have "voter" role
-    # while cluster.yaml has "spare" role.
+    # In case of a freshly joined node, info.yaml will show it as a "voter"
+    # while cluster.yaml lists it as a "spare" node.
     local clusterYamlPath=$1
     local nodeId=$2
 
@@ -99,7 +99,7 @@ function get_dqlite_role_from_cluster_yaml() {
 
 function set_dqlite_node_role() {
     # The yq snap installs in confined mode, so it's unable to access the
-    # dqlite config files.
+    # Dqlite config files.
     # In order to modify files in-place, we're using sponge. It reads all
     # the stdin data before opening the output file.
     local infoYamlPath=$1
@@ -146,7 +146,7 @@ function get_dql_peer_ip() {
     echo ${addresses[0]} | cut -d ":" -f 1
 }
 
-# This function moves the dqlite state directories to the DRBD mount,
+# This function moves the Dqlite state directories to the DRBD mount,
 # replacing them with symlinks. This ensures that the primary will always use
 # the latest DRBD data.
 #
@@ -156,7 +156,7 @@ function move_statedirs() {
     sudo mkdir -p $DRBD_MOUNT_DIR/k8s-dqlite
     sudo mkdir -p $DRBD_MOUNT_DIR/k8sd
 
-    log_message "Validating dqlite state directories."
+    log_message "Validating Dqlite state directories."
     check_statedir $K8S_DQLITE_STATE_DIR $DRBD_MOUNT_DIR/k8s-dqlite
     check_statedir $K8SD_STATE_DIR $DRBD_MOUNT_DIR/k8sd
 
@@ -177,7 +177,7 @@ function move_statedirs() {
        # TODO: consider automating this. We may move the pacemaker resource
        # ourselves and maybe even copy the remote files through scp or ssh.
        # However, there's a risk of race conditions.
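# Illustrative sketch only (not part of the patch above): the state directory
# relocation that move_statedirs performs boils down to copying the contents
# onto the DRBD mount, keeping the originals as a backup and leaving a
# symlink behind, e.g.:
#
#   sudo cp -a "$K8S_DQLITE_STATE_DIR/." "$DRBD_MOUNT_DIR/k8s-dqlite/"
#   sudo mv "$K8S_DQLITE_STATE_DIR" "$K8S_DQLITE_STATE_DIR.bkp"
#   sudo ln -s "$DRBD_MOUNT_DIR/k8s-dqlite" "$K8S_DQLITE_STATE_DIR"
#
# The backup copy is what the recovery flow later relies on.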
-        log_message "DRBD volume mounted on replica, refusing to transfer dqlite files."
+        log_message "DRBD volume mounted on replica, refusing to transfer Dqlite files."
        log_message "Move the DRBD volume to the primary node (through the fs_res Pacemaker resource) and try again."
        log_message "Example: sudo crm resource move fs_res && sudo crm resource clear fs_res"
        exit 1
@@ -261,7 +261,7 @@ function ensure_drbd_unmounted() {
 }
 
 function ensure_drbd_ready() {
-    ensure_mount_rw 
+    ensure_mount_rw
 
     diskStatus=`sudo drbdadm status r0 | grep disk | head -1 | cut -d ":" -f 2`
     if [[ $diskStatus != "UpToDate" ]]; then
@@ -303,7 +303,7 @@ function wait_for_peer_k8s() {
 
     local peerIp=`get_dql_peer_ip $K8S_DQLITE_CLUSTER_BKP_YAML $k8sDqliteNodeId`
     if [[ -z $peerIp ]]; then
-        log_message "Couldn't retrieve dqlite peer ip."
+        log_message "Couldn't retrieve Dqlite peer ip."
        exit 1
     fi
 
@@ -356,8 +356,8 @@ function wait_drbd_resource () {
     return 1
 }
 
-# Based on the drbd volume state, we decide if this node should be a
-# dqlite voter or a spare.
+# Based on the DRBD volume state, we decide if this node should be a
+# Dqlite voter or a spare.
 function get_expected_dqlite_role() {
     drbdResRole=`sudo drbdadm status $DRBD_RES_NAME | head -1 | grep role | cut -d ":" -f 2`
 
@@ -397,7 +397,7 @@ function validate_drbd_state() {
 # After a failover, the state dir points to the shared DRBD volume.
 # We need to restore the node certificate and config files.
 function restore_dqlite_confs_and_certs() {
-    log_message "Restoring dqlite configs and certificates."
+    log_message "Restoring Dqlite configs and certificates."
 
     sudo cp $K8S_DQLITE_STATE_BKP_DIR/info.yaml $K8S_DQLITE_STATE_DIR
 
@@ -431,7 +431,7 @@ function promote_as_primary() {
 
     local peerIp=`get_dql_peer_ip $K8S_DQLITE_CLUSTER_YAML $k8sDqliteNodeId`
     if [[ -z $peerIp ]]; then
-        log_message "Couldn't retrieve dqlite peer ip."
+        log_message "Couldn't retrieve Dqlite peer ip."
        exit 1
     fi
 
@@ -457,17 +457,17 @@ function promote_as_primary() {
            log_message "The stopped services are going to be restarted after the recovery finishes."
        else
            log_message "Couldn't stop k8s services on the peer node." \
-                "Assuming that it's stopped and proceeding with the recovery."
+                "Assuming that the peer node is stopped and proceeding with the recovery."
        fi
     fi
 
     log_message "Ensuring rw access to DRBD mount."
-    # Having RW access to the drbd mount implies that this is the primary node.
+    # Having RW access to the DRBD mount implies that this is the primary node.
     ensure_mount_rw
 
     restore_dqlite_confs_and_certs
 
-    log_message "Updating dqlite roles."
+    log_message "Updating Dqlite roles."
     # Update info.yaml
     set_dqlite_node_role $K8S_DQLITE_INFO_YAML $DQLITE_ROLE_VOTER
     set_dqlite_node_role $K8SD_INFO_YAML $DQLITE_ROLE_VOTER
@@ -476,7 +476,7 @@ function promote_as_primary() {
     set_dqlite_node_as_sole_voter $K8S_DQLITE_CLUSTER_YAML $k8sDqliteNodeId
     set_dqlite_node_as_sole_voter $K8SD_CLUSTER_YAML $k8sdNodeId
 
-    log_message "Restoring dqlite."
+    log_message "Restoring Dqlite."
     sudo $K8SD_PATH cluster-recover \
        --state-dir=$K8SD_STATE_DIR \
        --k8s-dqlite-state-dir=$K8S_DQLITE_STATE_DIR \
@@ -506,10 +506,10 @@ function promote_as_primary() {
 function process_recovery_files_on_secondary() {
     local peerIp="$1"
 
-    log_message "Ensuring that the drbd volume is unmounted."
+    log_message "Ensuring that the DRBD volume is unmounted."
     ensure_drbd_unmounted
 
-    log_message "Restoring local dqlite backup files."
+    log_message "Restoring local Dqlite backup files."
     sudo cp -r $K8S_DQLITE_STATE_BKP_DIR/. $DRBD_MOUNT_DIR/k8s-dqlite/
     sudo cp -r $K8SD_STATE_BKP_DIR/. $DRBD_MOUNT_DIR/k8sd/
 
@@ -542,7 +542,7 @@ function process_recovery_files_on_secondary() {
     scp $SSH_USERNAME@$peerIp:/var/snap/k8s/common/$lastK8sDqliteRecoveryTarball /tmp/
     sudo tar -xf /tmp/$lastK8sDqliteRecoveryTarball -C $K8S_DQLITE_STATE_DIR
 
-    log_message "Updating dqlite roles."
+    log_message "Updating Dqlite roles."
     # Update info.yaml
     set_dqlite_node_role $K8S_DQLITE_INFO_YAML $DQLITE_ROLE_SPARE
     set_dqlite_node_role $K8SD_INFO_YAML $DQLITE_ROLE_SPARE
     # updated cluster.yaml files.
 }
 
-# Recover a former primary, now secondary dqlite node.
+# Recover a former primary, now secondary Dqlite node.
 # Run "promote_as_primary" on the ther node first.
 function rejoin_secondary() {
     log_message "Recovering secondary node."
@@ -563,7 +563,7 @@ function rejoin_secondary() {
 
     local peerIp=`get_dql_peer_ip $K8S_DQLITE_CLUSTER_BKP_YAML $k8sDqliteNodeId`
     if [[ -z $peerIp ]]; then
-        log_message "Couldn't retrieve dqlite peer ip."
+        log_message "Couldn't retrieve Dqlite peer ip."
        exit 1
     fi
 
@@ -572,7 +572,7 @@ function rejoin_secondary() {
 
     log_message "Adding temporary Pacemaker constraint."
     # We need to prevent failovers from happening while restoring secondary
-    # dqlite data, otherwise we may end up overriding or deleting the primary
+    # Dqlite data, otherwise we may end up overwriting or deleting the primary
     # node data.
     #
     # TODO: consider reducing the constraint scope (e.g. resource level constraint
@@ -641,10 +641,10 @@ function check_peer_recovery_tarballs() {
        exit 1
     fi
 
-    log_message "Retrieving dqlite peer ip."
+    log_message "Retrieving Dqlite peer ip."
     local peerIp=`get_dql_peer_ip $K8S_DQLITE_CLUSTER_BKP_YAML $k8sDqliteNodeId`
     if [[ -z $peerIp ]]; then
-        log_message "Couldn't retrieve dqlite peer ip."
+        log_message "Couldn't retrieve Dqlite peer ip."
        exit 1
     fi
 
@@ -669,11 +669,11 @@ function check_peer_recovery_tarballs() {
 function start_service() {
     log_message "Initializing node."
 
-    # DRBD is the primary source of truth for the dqlite role.
+    # DRBD is the primary source of truth for the Dqlite role.
     # We need to wait for it to become available.
     wait_drbd_resource
 
-    # dump the drbd and pacemaker status for debugging purposes.
+    # dump the DRBD and pacemaker status for debugging purposes.
     sudo drbdadm status
     sudo crm status
 
@@ -684,14 +684,14 @@ function start_service() {
     local expRole=`get_expected_dqlite_role`
     case $expRole in
        $DQLITE_ROLE_VOTER)
-            log_message "Assuming the dqlite voter role (primary)."
+            log_message "Assuming the Dqlite voter role (primary)."
 
            # We'll assume that if the primary stopped, it needs to go through
            # the recovery process.
            promote_as_primary
            ;;
        $DQLITE_ROLE_SPARE)
-            log_message "Assuming the dqlite spare role (secondary)."
+            log_message "Assuming the Dqlite spare role (secondary)."
 
            wait_for_peer_k8s
 
@@ -709,14 +709,14 @@ function start_service() {
            fi
            ;;
        *)
-            log_message "Unexpected dqlite role: $expRole"
+            log_message "Unexpected Dqlite role: $expRole"
            exit 1
            ;;
     esac
 }
 
 function clean_recovery_data() {
-    log_message "Cleaning up dqlite recovery data."
+    log_message "Cleaning up Dqlite recovery data."
     rm -f $K8SD_RECOVERY_TARBALL
     rm -f $K8SD_RECOVERY_TARBALL_BKP
     rm -f $K8S_DQLITE_STATE_DIR/recovery-k8s-dqlite*
@@ -738,7 +738,7 @@ function purge() {
     # The replicas use the mount dir directly, without a block device
     # attachment. We need to clean up the mount point as well.
     #
-    # We're using another mount with "--bind" to bypass the drbd mount.
+    # We're using another mount with "--bind" to bypass the DRBD mount.
     tempdir=`mktemp -d`
     # We need to mount the parent dir.
     sudo mount --bind `dirname $DRBD_MOUNT_DIR` $tempdir
@@ -790,7 +790,7 @@ Unknown command: $1
 usage: $0
 
 Commands:
-    move_statedirs      Move the dqlite state directories to the DRBD mount,
+    move_statedirs      Move the Dqlite state directories to the DRBD mount,
                        replacing them with symlinks. The existing contents
                        are moved to a backup folder, which can be used as
                        part of the recovery process.
@@ -798,13 +798,13 @@ Commands:
                        cluster.
     start_service       Initialize the k8s services, taking the following steps:
-                        1. Based on the drbd state, decide if this node
+                        1. Based on the DRBD state, decide if this node
                           should assume the primary (dqlite voter) or
                           secondary (spare) role.
-                        2. If this is the first start, transfer the dqlite
+                        2. If this is the first start, transfer the Dqlite
                           state directories and create backups.
                        3. If this node is a primary, promote it and initiate
-                           the dqlite recovery, creating recovery tarballs.
+                           the Dqlite recovery, creating recovery tarballs.
                           Otherwise, copy over the recovery files and join
                           the existing cluster as a spare.
                        4. Start the k8s services.

From 113652413483563eea79275d0b4ee285b76a3e3a Mon Sep 17 00:00:00 2001
From: Lucian Petrut
Date: Tue, 1 Oct 2024 11:30:04 +0300
Subject: [PATCH 9/9] Add warning to troubleshooting section

---
 docs/src/snap/howto/two-node-ha.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/src/snap/howto/two-node-ha.md b/docs/src/snap/howto/two-node-ha.md
index 8c750cfbe..cdd894a79 100644
--- a/docs/src/snap/howto/two-node-ha.md
+++ b/docs/src/snap/howto/two-node-ha.md
@@ -364,6 +364,11 @@ sudo systemctl start two-node-ha-k8s
 Here are some potential problems that may affect two-node HA clusters and how
 to address them.
 
+```{warning}
+Before taking any of the actions below, please back up the entire Dqlite data
+directory to avoid losing data in case something goes wrong.
+```
+
 ### Dqlite recovery failing because of unexpected data segments
 
 Dqlite recovery may fail if there are data segments past the latest snapshot.
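For reference, a minimal backup sketch before attempting any recovery steps
(the paths are assumptions based on the default k8s snap state directories;
adjust them to match your deployment):

```
# Stop the wrapper service so the databases are not modified mid-backup.
sudo systemctl stop two-node-ha-k8s

# --dereference follows the symlinks created by the HA script, so the actual
# database files on the DRBD mount are included in the archive.
sudo tar --dereference -czf /root/dqlite-backup-$(date +%Y%m%d%H%M%S).tar.gz \
    /var/snap/k8s/common/var/lib/k8s-dqlite \
    /var/snap/k8s/common/var/lib/k8sd/state
```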