Helm Chart Enhancements: Cluster Mode Deployment, Adding, Resharding, and Deleting Clusters #534

Open · wants to merge 14 commits into base: main
132 changes: 132 additions & 0 deletions charts/garnet/templates/add-nodes-job.yaml
@@ -0,0 +1,132 @@
{{- if .Values.cluster.enabled }}
{{- if .Values.cluster.initJob.enabled }}
apiVersion: batch/v1
kind: Job
metadata:
name: {{ include "garnet.fullname" . }}-cluster-add
labels:
{{- include "garnet.labels" . | nindent 4 }}
annotations:
"helm.sh/hook": post-upgrade
spec:
backoffLimit: {{ .Values.cluster.initJob.backoffLimit }}
activeDeadlineSeconds: 1800
ttlSecondsAfterFinished: 600
template:
spec:
restartPolicy: Never
containers:
- name: add-node
image: "{{ .Values.cluster.initJob.image.registry }}/{{ .Values.cluster.initJob.image.repository }}:{{ .Values.cluster.initJob.image.tag | default "latest" }}"
command: ["/bin/sh", "-c"]
args:
- |
garnet_host="{{ include "garnet.fullname" . }}-0.{{ include "garnet.fullname" . }}-headless.{{ .Release.Namespace }}.svc.cluster.local"
@Xizt commented on Aug 9, 2024:

What happens when pod-0 is down or in an unknown state?
With reshard-delete-job.yaml, if pod-0 restarts, how will it be able to re-shard?

garnet_port="{{ .Values.containers.port }}"

echo "Starting node addition process..."

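# List the masters currently registered in the cluster, as reported by CLUSTER NODES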
get_current_nodes() {
/usr/local/bin/redis-cli -h "$garnet_host" -p "$garnet_port" CLUSTER NODES | grep master | awk -F ',' '{print $2}' | awk '{print $1}'
}

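# Ping a node up to 5 times, returning success once it answers PONG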
check_node_ready() {
local node=$1
local retries=5
while [ $retries -gt 0 ]; do
if /usr/local/bin/redis-cli -h "$node" -p "$garnet_port" ping | grep -q "PONG"; then
return 0
else
echo "Node $node is not ready, retrying..."
retries=$((retries - 1))
sleep 5
fi
done
return 1
}

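# Add a node to the cluster through the seed node, retrying up to 5 times on failure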
add_node_to_cluster() {
local node=$1
local retries=5
while [ $retries -gt 0 ]; do
if /usr/local/bin/redis-cli --cluster add-node "$node:$garnet_port" "$garnet_host:$garnet_port"; then
echo "Successfully added node $node to the cluster"
return 0
else
echo "Failed to add node $node, retrying..."
retries=$((retries - 1))
sleep 5
fi
done
return 1
}


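# Wait until the cluster check reports that all nodes agree about the slots configuration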
ensure_cluster_consistency() {
local consistent=false
while [ "$consistent" = false ]; do
output=$(/usr/local/bin/redis-cli --cluster check "$garnet_host:$garnet_port" 2>&1)
echo "$output"
if echo "$output" | grep -q "All nodes agree about slots configuration"; then
consistent=true
else
echo "Waiting for cluster consistency..."
sleep 10
fi
done
}

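# Rebalance slots across all masters (including newly added empty ones), retrying while the rebalance fails because a node is not yet known cluster-wide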
rebalance_cluster() {
local rebalanced=false
local attempts=0
local max_attempts=5
while [ "$rebalanced" = false ] && [ $attempts -lt $max_attempts ]; do
output=$(/usr/local/bin/redis-cli --cluster rebalance --cluster-use-empty-masters --cluster-yes "$garnet_host:$garnet_port" 2>&1)
echo "$output"
if echo "$output" | grep -q "ERR I don't know about node"; then
echo "Rebalancing encountered an error, retrying..."
sleep 10
attempts=$((attempts + 1))
else
rebalanced=true
fi
done

if [ "$rebalanced" = false ]; then
echo "Failed to rebalance the cluster after $max_attempts attempts."
exit 1
fi
}

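# Build the list of headless-service DNS names for every pod the StatefulSet is expected to run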
get_desired_nodes() {
for i in $(seq 0 $(({{ .Values.cluster.statefulSet.replicas }} - 1))); do
printf "%s-%d.%s-headless.%s.svc.cluster.local " "{{ include "garnet.fullname" . }}" "$i" "{{ include "garnet.fullname" . }}" "{{ .Release.Namespace }}"
done
}

current_nodes=$(get_current_nodes)
desired_nodes=$(get_desired_nodes)

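# Add every desired node that is not already part of the cluster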
for node in $desired_nodes; do
if ! echo "$current_nodes" | grep -q "$node"; then
echo "Checking readiness of node $node"
if check_node_ready "$node"; then
echo "Node $node is ready, adding to the cluster"
add_node_to_cluster "$node"
else
echo "Failed to add node $node after multiple attempts"
exit 1
fi
fi
done

echo "Ensuring cluster consistency..."
ensure_cluster_consistency

echo "Rebalancing the cluster..."
rebalance_cluster
echo "Cluster rebalancing completed."

{{- end }}
{{- end }}
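For reference, a minimal sketch of the values this job template reads (the key names mirror the references in the template above; the image coordinates, port, and numbers are illustrative assumptions rather than chart defaults):

containers:
  port: 6379                  # assumed Garnet port; must match the StatefulSet container port
cluster:
  enabled: true
  statefulSet:
    replicas: 6
  initJob:
    enabled: true
    backoffLimit: 10
    image:
      registry: docker.io     # assumption: any image that ships /usr/local/bin/redis-cli
      repository: redis
      tag: "7.2"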
124 changes: 124 additions & 0 deletions charts/garnet/templates/cluster-sts.yaml
@@ -0,0 +1,124 @@
{{- if .Values.cluster.enabled }}
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: {{ include "garnet.fullname" . }}
labels:
{{- include "garnet.labels" . | nindent 4 }}
{{- with .Values.statefulSet.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
serviceName: {{ include "garnet.fullname" . }}-headless
replicas: {{ .Values.cluster.statefulSet.replicas }}
revisionHistoryLimit: {{ .Values.statefulSet.revisionHistoryLimit }}
updateStrategy:
{{- with .Values.statefulSet.updateStrategy }}
{{- toYaml . | nindent 4 }}
{{- end }}
selector:
matchLabels:
{{- include "garnet.selectorLabels" . | nindent 6 }}
template:
metadata:
{{- with .Values.podAnnotations }}
annotations:
{{- toYaml . | nindent 8 }}
{{- end }}
labels:
{{- include "garnet.selectorLabels" . | nindent 8 }}
spec:
{{- with .Values.imagePullSecrets }}
imagePullSecrets:
{{- toYaml . | nindent 8 }}
{{- end }}
serviceAccountName: {{ include "garnet.serviceAccountName" . }}
securityContext:
{{- toYaml .Values.podSecurityContext | nindent 8 }}
containers:
- name: {{ .Chart.Name }}
securityContext:
{{- toYaml .Values.securityContext | nindent 12 }}
image: "{{ .Values.image.registry }}/{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
imagePullPolicy: {{ .Values.image.pullPolicy }}
command: ["./GarnetServer"]
{{- if .Values.cluster.containers.args }}
args:
{{- range .Values.cluster.containers.args }}
- {{ . | quote }}
{{- end }}
{{- end }}
ports:
- name: garnet
containerPort: {{ .Values.containers.port }}
protocol: TCP
{{- with .Values.containers.livenessProbe }}
livenessProbe:
{{- toYaml . | nindent 12 }}
{{- end }}
{{- with .Values.containers.readinessProbe }}
readinessProbe:
{{- toYaml . | nindent 12 }}
{{- end }}
resources:
{{- toYaml .Values.resources | nindent 12 }}
volumeMounts:
- name: data
mountPath: /data
- name: cluster
mountPath: /app/cluster
{{- with .Values.extraVolumeMounts }}
{{- toYaml . | nindent 10}}
{{- end }}
{{- with .Values.dnsConfig }}
dnsConfig:
{{- toYaml . | nindent 8 }}
{{- end }}
dnsPolicy: {{ .Values.dnsPolicy }}
{{- with .Values.initContainers }}
initContainers:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- if or (eq .Values.persistence.enabled false) .Values.extraVolumes }}
volumes:
{{- if (eq .Values.persistence.enabled false) }}
- emptyDir: {}
name: data
- emptyDir: {}
name: cluster
{{- end }}
{{- with .Values.extraVolumes }}
{{- toYaml . | nindent 6}}
{{- end }}
{{- end }}
{{- if .Values.persistence.enabled }}
volumeClaimTemplates:
- metadata:
name: data
spec:
accessModes: [ "ReadWriteOnce" ]
{{- if .Values.volumeClaimTemplates.storageClassName }}
{{- if (eq "-" .Values.volumeClaimTemplates.storageClassName) }}
storageClassName: ""
{{- else }}
storageClassName: {{ .Values.volumeClaimTemplates.storageClassName }}
{{- end }}
{{- end }}
resources:
requests:
storage: {{ .Values.volumeClaimTemplates.requestsStorage }}
{{- end }}
{{- end }}
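The cluster StatefulSet is driven by a small set of additional values on top of the chart's existing ones. A hedged sketch of those keys (the GarnetServer flags are assumptions and should be verified against GarnetServer --help; the key names mirror the template above):

cluster:
  enabled: true
  statefulSet:
    replicas: 6               # one Garnet node per pod
  containers:
    args:
      - "--cluster"           # assumption: enables cluster mode in GarnetServer
      - "--port"
      - "6379"                # should match .Values.containers.port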
54 changes: 54 additions & 0 deletions charts/garnet/templates/init-job.yaml
@@ -0,0 +1,54 @@
{{- if .Values.cluster.enabled }}
{{- if .Values.cluster.initJob.enabled }}
apiVersion: batch/v1
kind: Job
metadata:
name: "{{ .Release.Name }}-manage-cluster"
labels:
{{- include "garnet.labels" . | nindent 4 }}
{{- with .Values.cluster.initJob.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
backoffLimit: {{ .Values.cluster.initJob.backoffLimit }}
ttlSecondsAfterFinished: {{ .Values.cluster.initJob.ttlSecondsAfterFinished }}
template:
metadata:
labels:
{{- include "garnet.labels" . | nindent 8 }}
spec:
containers:
- name: init
image: "{{ .Values.cluster.initJob.image.registry }}/{{ .Values.cluster.initJob.image.repository }}:{{ .Values.cluster.initJob.image.tag | default "latest" }}"
imagePullPolicy: {{ .Values.image.pullPolicy }}
command:
- /bin/sh
- -c
- |
echo "Waiting for DNS propagation..."
sleep 10

# Wait for the last Garnet pod (index replicas-1) to answer PING before proceeding
echo "Waiting for redis to be ready..."
until /usr/local/bin/redis-cli -h {{ include "garnet.fullname" . }}-{{ sub (int .Values.cluster.statefulSet.replicas) 1 }}.{{ include "garnet.fullname" . }}-headless.{{ .Release.Namespace }}.svc.cluster.local -p {{ .Values.containers.port }} ping; do
echo "Waiting for redis to be ready..."
sleep 10
done

# Check how many hash slots the cluster reports as ok (cluster_slots_ok)
cluster_slots_ok=$(/usr/local/bin/redis-cli -h {{ include "garnet.fullname" . }}-0.{{ include "garnet.fullname" . }}-headless.{{ .Release.Namespace }}.svc.cluster.local -p {{ .Values.containers.port }} CLUSTER INFO 2>/dev/null | grep cluster_slots_ok | awk -F ':' '{print $2}')

# Create the cluster if not all 16384 slots are covered yet
if [ "$cluster_slots_ok" -ne 16384 ]; then
echo "Cluster is not fully created. Creating cluster..."
/usr/local/bin/redis-cli --cluster create {{- range $i := until (int .Values.cluster.statefulSet.replicas) }} {{ printf "%s-%d.%s-headless.%s.svc.cluster.local:%d " (include "garnet.fullname" $) $i (include "garnet.fullname" $) $.Release.Namespace (int $.Values.containers.port) }} {{- end }} --cluster-yes
else
echo "Cluster is already created and all slots are covered."
fi

# Additional wait time to ensure stability
sleep 10
restartPolicy: Never
{{- end }}
{{- end }}