default:
if status.Members.Writer == "" {
var candidate *corev1.Pod
// ... candidate selection here
// new writer gets selected and the labels for the new writer are updated:
// labels are now `reader=false; role=writer`
// Writer Status is updated: status.Members.Writer
status.Members.Writer = candidate.Name
if err := gc.podControl.PatchPodLabels(galera, candidate.DeepCopy(), RemoveReader(), AddWriter()); err != nil {
return err
}
}
if status.Members.BackupWriter == "" {
// unrelated logic
}
// here is where the problem starts - `status.Members.Writer` is updated and the writer node has the correct labels
// but the operation was done on a `DeepCopy` and this overwrites the labels for the writer node with the old labels
for _, pod := range readyPods {
if pod.Name == status.Members.Writer {
if pod.DeepCopy().Labels[apigalera.GaleraReaderLabel] == "true" {
return gc.podControl.PatchPodLabels(galera, pod.DeepCopy(), RemoveReader())
}
}
}
See my comments in the code, but basically what happens is, the newly selected writer nodes get's the role label added and right after there is another update for the new writer node which resets the labels to the old labels again and additionally sets the reader label to false, leaving the one node with only reader=false and no role.
This patch seems to fix the issue:
--- a/pkg/controllers/cluster/galera_control.go
+++ b/pkg/controllers/cluster/galera_control.go
@@ -978,38 +987,38 @@ func (gc *defaultGaleraControl) setLabels(galera *apigalera.Galera, readyPods []
if err := gc.podControl.PatchPodLabels(galera, candidate.DeepCopy(), RemoveReader(), AddWriter()); err != nil {
return err
}
- }
-
- if status.Members.BackupWriter == "" {
- var candidate *corev1.Pod
- for _, pod := range readyPods {
- if _, exist := pod.Labels[apigalera.GaleraBackupLabel]; exist {
- continue
- }
- if _, exist := pod.Labels[apigalera.GaleraRoleLabel]; exist {
- continue
- }
- // needed because readyPods is not update by previous patching for the Writer
- if status.Members.Writer != pod.Name {
- candidate = pod
- break
+ } else {
+ if status.Members.BackupWriter == "" {
+ var candidate *corev1.Pod
+ for _, pod := range readyPods {
+ if _, exist := pod.Labels[apigalera.GaleraBackupLabel]; exist {
+ continue
+ }
+ if _, exist := pod.Labels[apigalera.GaleraRoleLabel]; exist {
+ continue
+ }
+ // needed because readyPods is not update by previous patching for the Writer
+ if status.Members.Writer != pod.Name {
+ candidate = pod
+ break
+ }
}
- }
- if candidate == nil {
- return errors.New(fmt.Sprintf("no BackupWriter Role candidate for galera %s/%s ", galera.Namespace, galera.Name))
- }
+ if candidate == nil {
+ return errors.New(fmt.Sprintf("no BackupWriter Role candidate for galera %s/%s ", galera.Namespace, galera.Name))
+ }
- status.Members.Writer = candidate.Name
- if err := gc.podControl.PatchPodLabels(galera, candidate.DeepCopy(), AddReader(), AddBackupWriter()); err != nil {
- return err
+ status.Members.Writer = candidate.Name
+ if err := gc.podControl.PatchPodLabels(galera, candidate.DeepCopy(), AddReader(), AddBackupWriter()); err != nil {
+ return err
+ }
}
- }
- for _, pod := range readyPods {
- if pod.Name == status.Members.Writer {
- if pod.DeepCopy().Labels[apigalera.GaleraReaderLabel] == "true" {
- return gc.podControl.PatchPodLabels(galera, pod.DeepCopy(), RemoveReader())
+ for _, pod := range readyPods {
+ if pod.Name == status.Members.Writer {
+ if pod.DeepCopy().Labels[apigalera.GaleraReaderLabel] == "true" {
+ return gc.podControl.PatchPodLabels(galera, pod.DeepCopy(), RemoveReader(), AddWriter())
+ }
}
}
}
I am not sure if I am causing other sideeffects with this patch though.
When using more than 3 replicas (5, 7, 9) and deleting the writer pod on a fully populated cluster, the operator creates replicas+1 pods.
Situation:
Action: delete pod-1
Situation:
After a while:
After some more time:
Now there are 6 pods, pod-2 is not recognized as a member of the cluster anymore because its label is
reader=false
without any role. Caused by the logic here: https://github.com/Orange-OpenSource/galera-operator/blob/master/pkg/controllers/cluster/galera_control.go#L626The problem why this happens seems to be in
setLabels
: https://github.com/Orange-OpenSource/galera-operator/blob/master/pkg/controllers/cluster/galera_control.go#L959See my comments in the code, but basically what happens is, the newly selected writer nodes get's the role label added and right after there is another update for the new writer node which resets the labels to the old labels again and additionally sets the reader label to false, leaving the one node with only
reader=false
and no role.This patch seems to fix the issue:
I am not sure if I am causing other sideeffects with this patch though.