mariadb-operator / mariadb-operator

🦭 Run and operate MariaDB in a cloud native way
MIT License
462 stars 84 forks source link

[Bug] Update probes with galera is not working #744

Open mperochon opened 1 month ago

mperochon commented 1 month ago

Documentation

Hello,

I am trying to synchronize my nodes in recovery mode, but the data to replicate exceeds 120GB.

The replication fails because the probes fail (error code 500) and the pods keep restarting infinitely.

I tried updating the probes by modifying the values of readinessProbe and livenessProbe, but they are not applied to my pods. I still have the default values.

Expected behaviour Replication of all data on all nodes.

Steps to reproduce the bug

  1. Restore a large backup on the node 0 pvc storage
  2. Deploy Galeradb with 3 nodes
  3. Take a look at the node-1 logs.

Debug information

Environment details:

Additional context

This is my configuration:

{
                "apiVersion": "k8s.mariadb.com/v1alpha1",
                "kind": "MariaDB",
                "metadata": {
                    "name": `${CDK_PREFIX_STACK}-${CDK_ENVIRONMENT}-mariadb-test`,
                    "namespace": this.config.galeraDB.namespace,
                },
                "spec": {
                    "image": "docker-registry1.mariadb.com/library/mariadb:10.6.18",
                    "username": GALERA_DB_USER,
                    "passwordSecretKeyRef": {
                        "name": `${CDK_PREFIX_STACK}-${CDK_ENVIRONMENT}-galera-db-credentials`,
                        "key": "GALERA_DB_PASSWORD",
                    },
                    "database": this.config.galeraDB.defaultDBName,
                    "rootPasswordSecretKeyRef": {
                        "name": `${CDK_PREFIX_STACK}-${CDK_ENVIRONMENT}-galera-db-credentials`,
                        "key": "GALERA_DB_ROOT_PASSWORD",
                    },
                    "podSecurityContext": {
                        "runAsUser": 999,
                        "runAsGroup": 999,
                        "fsGroup": 999
                    },
                    "metrics": {
                        "enabled": true,
                    },
                    "storage": {
                        "volumeClaimTemplate": {
                            "storageClassName": this.config.galeraDB.storageClassName,
                            "accessModes": ["ReadWriteOnce"],
                            "resources": {
                                "requests": {
                                    "storage": this.config.galeraDB.storageSize
                                }
                            }
                        }
                    },
                    "replicas": 3,
                    "livenessProbe": {
                        "initialDelaySeconds": 3600,
                        "periodSeconds": 3600,
                        "successThreshold": 1,
                        "timeoutSeconds": 3600,
                    },
                    "readinessProbe": {
                        "initialDelaySeconds": 3600,
                        "periodSeconds": 3600,
                        "successThreshold": 1,
                        "timeoutSeconds": 3600,
                    },
                    "galera": {
                        "enabled": "true",
                        "recovery": {
                            "enabled": true,
                            "minClusterSize": 3,
                            "clusterMonitorInterval": "2h0m0s",
                            "clusterHealthyTimeout": "2h0m0s",
                            "clusterBootstrapTimeout": "2h0m0s",
                            "podRecoveryTimeout": "1h30m0s",
                            "podSyncTimeout": "2h0m0s"
                        },
                        "livenessProbe": {
                            "initialDelaySeconds": 3600,
                            "periodSeconds": 3600,
                            "successThreshold": 1,
                            "timeoutSeconds": 3600,
                        },
                        "readinessProbe": {
                            "initialDelaySeconds": 3600,
                            "periodSeconds": 3600,
                            "successThreshold": 1,
                            "timeoutSeconds": 3600,
                        },
                        "replicaThreads": 8,
                        "agent": {
                            "livenessProbe": {
                                "initialDelaySeconds": 3600,
                                "periodSeconds": 3600,
                                "successThreshold": 1,
                                "timeoutSeconds": 3600,
                            },
                            "readinessProbe": {
                                "initialDelaySeconds": 3600,
                                "periodSeconds": 3600,
                                "successThreshold": 1,
                                "timeoutSeconds": 3600,
                            },
                        }
                    },  
                    "affinity": {
                        "podAntiAffinity": {
                            "requiredDuringSchedulingIgnoredDuringExecution": [
                                {
                                    "topologyKey": "kubernetes.io/hostname"
                                }
                            ],
                        },
                    },
                    "podDisruptionBudget": {
                        "maxUnavailable": "66%"
                    },
                    "service": {
                        "type": "LoadBalancer",
                        "metadata": {
                            "annotations": {
                                "service.beta.kubernetes.io/ovh-loadbalancer-allowed-sources": `${this.config.privateNetwork.primarySubnet.cidr},${this.config.privateNetwork.secondarySubnet.cidr}`
                            }
                        },
                    },
                    "primaryService": {
                        "type": "LoadBalancer",
                        "metadata": {
                            "annotations": {
                                "service.beta.kubernetes.io/ovh-loadbalancer-allowed-sources": `${this.config.privateNetwork.primarySubnet.cidr},${this.config.privateNetwork.secondarySubnet.cidr}`
                            }
                        },
                    },
                    "secondaryService": {
                        "type": "LoadBalancer",
                        "metadata": {
                            "annotations": {
                                "service.beta.kubernetes.io/ovh-loadbalancer-allowed-sources": `${this.config.privateNetwork.primarySubnet.cidr},${this.config.privateNetwork.secondarySubnet.cidr}`
                            }
                        },
                    }
                }
            }
mmontes11 commented 1 month ago

Hey there @mperochon !

I have updated the v0.0.30 (yet to be released in the next few weeks) with some important considerations regarding restoring backups:

probes fail (error code 500)

Probes are failing most likely because the credentials provided via spec.rootPasswordSecretKeyRef don't match the database internal state after the backup is restored. In other words, spec.rootPasswordSecretKeyRef should match the root password credentials provided in the backup.

the data to replicate exceeds 120GB.

Be sure to provide enough compute resources in the restore job to make sure the restoration process doesn't get stuck:

mmontes11 commented 1 month ago

Just another note regarding the probes, you are able to tweak the probe thresholds but not the probe command. See:

mmontes11 commented 1 month ago

Another question @mperochon:

Has the backup you are trying to restore been taken on an external database? I'm especially curious about this: does it have a DROP TABLE mysql.global_priv; statement?

If so, please take a look at this, it describes exactly this case:

TL;DR;

mperochon commented 1 month ago

Hi @mmontes11,

We are copying all physical files to PVC storage of node 1.

To generate the backup, we are using this command:

mariabackup --copy-back --target-dir=/mnt/backupdata/latest/

Once the process is terminated, we are connecting the PVC with the backup on the first Galera node (node-0) (/var/lib/mysql).

After that, when Galera starts, the replication also starts but fails after 3 minutes due to the probes failure.

Let me know if you need any more tests from me.

mmontes11 commented 1 month ago

What I've mentioned here applies to logical backups taken with mariadb-backup, which is not your case. The Galera backup limitations still apply though.

It will be very useful to get the logs from your agent container to understand why the probes are failing:

 kubectl logs mariadb-galera-0 -c agent

I can't really advise on your procedure, but here is another way you can restore physical backups via initContainers. This approach will restore the physical backup on each node before it starts, so all the nodes will start with the same data:

As you can see, you will have to place the physical backup in a PVC named mariabackup beforehand.

We don't currently support physical backups natively, but we have plans for them in our roadmap. We plan to implement PITR based on physical backups and binary logs.

mperochon commented 1 month ago

Hi @mmontes11,

Thanks for your help. I ran the test with the initContainers option, and I get this error when I deploy the mariadbs object:

Error reconciling Init: Job.batch "k8-dev-mariadb-test-init" is invalid: spec.template.spec.initContainers[0].volumeMounts[3].name: Not found: "galera"

I just added these three parameters: initContainers, volumes and volumeMounts.

This is my deployment file:

{
                "apiVersion": "k8s.mariadb.com/v1alpha1",
                "kind": "MariaDB",
                "metadata": {
                    "name": `${CDK_PREFIX_STACK}-${CDK_ENVIRONMENT}-mariadb-test`,
                    "namespace": this.config.galeraDB.namespace,
                },
                "spec": {
                    "image": "docker-registry1.mariadb.com/library/mariadb:10.6.18",
                    "initContainers": [
                        {
                            "image": "docker-registry1.mariadb.com/library/mariadb:10.6.18",
                            "args": [
                                "mariadb-backup",
                                "--copy-back",
                                "--target-dir=/mnt/backup/latest/"
                            ]
                        }
                    ],
                    "volumeMounts": [
                        {
                            "name": "mariabackup",
                            "mountPath": "/mnt/backup/"
                        }
                    ],
                    "username": GALERA_DB_USER,
                    "passwordSecretKeyRef": {
                        "name": `${CDK_PREFIX_STACK}-${CDK_ENVIRONMENT}-galera-db-credentials`,
                        "key": "GALERA_DB_PASSWORD",
                    },
                    "database": this.config.galeraDB.defaultDBName,
                    "rootPasswordSecretKeyRef": {
                        "name": `${CDK_PREFIX_STACK}-${CDK_ENVIRONMENT}-galera-db-credentials`,
                        "key": "GALERA_DB_ROOT_PASSWORD",
                    },
                    "podSecurityContext": {
                        "runAsUser": 999,
                        "runAsGroup": 999,
                        "fsGroup": 999
                    },
                    "metrics": {
                        "enabled": true,
                    },
                    "storage": {
                        "size": "250Gi"
                    },
                    "volumes": [
                        {
                            "name": "mariabackup",
                            "persistentVolumeClaim": {
                                "claimName": "pvc-backup"
                            }
                        }
                    ],
                    "replicas": 4,
                    "galera": {
                        "enabled": "true",
                        "replicaThreads": 10,
                    },  
                    "affinity": {
                        "podAntiAffinity": {
                            "requiredDuringSchedulingIgnoredDuringExecution": [
                                {
                                    "topologyKey": "kubernetes.io/hostname"
                                }
                            ],
                        },
                    },
                    "podDisruptionBudget": {
                        "maxUnavailable": "66%"
                    },
                    "service": {
                        "type": "LoadBalancer",
                        "metadata": {
                            "annotations": {
                                "service.beta.kubernetes.io/ovh-loadbalancer-allowed-sources": `${this.config.privateNetwork.primarySubnet.cidr},${this.config.privateNetwork.secondarySubnet.cidr}`
                            }
                        },
                    },
                    "primaryService": {
                        "type": "LoadBalancer",
                        "metadata": {
                            "annotations": {
                                "service.beta.kubernetes.io/ovh-loadbalancer-allowed-sources": `${this.config.privateNetwork.primarySubnet.cidr},${this.config.privateNetwork.secondarySubnet.cidr}`
                            }
                        },
                    },
                    "secondaryService": {
                        "type": "LoadBalancer",
                        "metadata": {
                            "annotations": {
                                "service.beta.kubernetes.io/ovh-loadbalancer-allowed-sources": `${this.config.privateNetwork.primarySubnet.cidr},${this.config.privateNetwork.secondarySubnet.cidr}`
                            }
                        },
                    }
                }
}