cockroachdb / cockroach

CockroachDB — the cloud native, distributed SQL database designed for high availability, effortless scale, and control over data placement.
https://www.cockroachlabs.com
Other
29.77k stars 3.76k forks source link

bulkio: remove older backups after a certain number of days #56572

Open fabiog1901 opened 3 years ago

fabiog1901 commented 3 years ago

Within the Backup schedules introduced in 20.2, a new request is in to add the ability to be able to remove older backups after a certain number of days.

An example setup:

After x number of days we may no longer require the backups to be kept on S3. It would be easier and safer if there was a retention period within the scheduler to delete older backups, rather than manually delete them on S3 directly.

The main drive for this is to reduce the consumption of space on S3.

Zendesk: 6470

gz#6740

Epic CRDB-8014

Jira issue: CRDB-2926

a-vogel-tappert commented 2 years ago

Hi, any update on this topic? I am also trying to remove older backups from object storage based on a retention value. Thanks in advance.

dankinder commented 2 years ago

You can pretty much do this with S3 Lifecycle rules to expire objects in your bucket.

chilicheech commented 2 months ago

This would still be useful for nodelocal backups though.

dwt commented 2 months ago

I have implemented this as a cron job for our node local backups with something like this:

#!/usr/bin/env python

from datetime import datetime, timedelta, timezone
import os
from pathlib import Path
from pprint import pformat
from dataclasses import dataclass
import shutil

import click

@dataclass
class Backup:
    # Timestamp parsed from the backup's directory name; always UTC.
    backup_time: datetime
    # File under metadata/latest whose content registers this backup.
    metadata_path: Path
    # Directory holding the actual backup data (<root>/<year>/<month>/<day>-<timestamp>).
    data_path: Path

    def remove(self):
        """Delete this backup: its metadata entry and its entire data directory."""
        self.metadata_path.unlink()
        shutil.rmtree(self.data_path)

# REFACT consider to move into the Dataclass
def read_backups(backup_dir: Path) -> list[Backup]:
    """Read all backups registered under <backup_dir>/metadata/latest.

    Each file in the ``latest`` directory names one backup; its content is the
    backup's sub-path, e.g. ``/2023/05/24-151016.43``.

    Returns the backups sorted ascending by backup time (oldest first), or an
    empty list when no backups exist yet.
    """
    metadata_dir = backup_dir / 'metadata' / 'latest'

    # Bug fix: the previous check (`backup_dir.exists() and not
    # metadata_dir.exists()`) fell through to iterdir() and raised
    # FileNotFoundError when backup_dir itself did not exist.
    if not metadata_dir.exists():
        click.echo('No backups yet')
        return []

    backups = []
    for metadata_file in metadata_dir.iterdir():
        backup_name = metadata_file.read_text().strip().lstrip(os.path.sep)
        # format is <year>/<month>/<day>-<hour><minute><second>.<subseconds>
        # according to https://www.cockroachlabs.com/docs/stable/show-backup.html#view-a-list-of-the-available-full-backup-subdirectories
        backup_time = datetime.strptime(backup_name, '%Y/%m/%d-%H%M%S.%f').replace(tzinfo=timezone.utc)
        backups.append(Backup(
            backup_time=backup_time,
            metadata_path=metadata_file,
            data_path=backup_dir / backup_name,
        ))

    return sorted(backups, key=lambda backup: backup.backup_time)

# REFACT consider to move into the Dataclass
def find_unaccounted_backups(backup_dir: Path) -> list[Path]:
    """Return backup data directories that have no entry in metadata/latest.

    Note: this returns plain Paths, not Backup objects — the previous
    annotation ``list[Backup]`` was wrong. These directories have no metadata
    file, so no Backup can be constructed for them.
    """
    # Data directories that ARE referenced by a metadata file in 'latest'.
    accounted_backup_dirs = [
        backup.data_path.absolute()
        for backup in read_backups(backup_dir)
    ]

    # Backup data lives at <year>/<month>/<day>-<timestamp>; anything matching
    # that layout but not referenced by the metadata is unaccounted for.
    return [
        backup_path
        for backup_path in backup_dir.rglob('????/*/*/')
        if backup_path.is_dir() and backup_path.absolute() not in accounted_backup_dirs
    ]

# REFACT consider to move into the Dataclass
def backups_to_remove(backups: list[Backup], keep_by_number: int, keep_by_days: int) -> list[Backup]:
    """Select backups that are BOTH older than keep_by_days AND not among the
    keep_by_number most recent ones.

    Both retention rules have to agree before a backup is selected, so each
    acts as a safety net for the other. (Also adds the ``: int`` annotation on
    keep_by_days that its sibling parameter already had.)

    Returns the selected backups sorted ascending by backup time.
    """
    # Newest first, so the index doubles as "how many newer backups exist".
    newest_first = sorted(backups, key=lambda backup: backup.backup_time, reverse=True)
    oldest_to_keep = datetime.now(tz=timezone.utc) - timedelta(days=keep_by_days)

    removable = [
        backup
        for index, backup in enumerate(newest_first)
        if backup.backup_time <= oldest_to_keep and index >= keep_by_number
    ]
    return sorted(removable, key=lambda backup: backup.backup_time)

# REFACT remove, unused? Haven't yet seen this in practice.
def remove_backups_where_data_file_is_missing(backup_dir):
    """Remove metadata entries whose backup data directory no longer exists.

    The previous version was hard-coded to ``backups[1]``, used ``assert`` for
    validation (stripped under ``python -O``), and then called
    ``Backup.remove()``, whose ``shutil.rmtree`` would raise on the very data
    directory just asserted to be missing. This checks every backup and only
    unlinks the dangling metadata file.
    """
    for backup in read_backups(backup_dir):
        if not backup.data_path.exists():
            # Only the metadata file is left to delete; Backup.remove() would
            # fail on the missing data directory, so unlink directly.
            backup.metadata_path.unlink()

@click.command()
@click.argument('backup_root', type=click.Path(
    exists=True, dir_okay=True, file_okay=False, path_type=Path,
))
@click.option(
    'should_remove_old_backups',
    '--remove-old-backups',
    is_flag=True,
    help="Backups older than 10 days and more than 10 backups"
)
@click.option(
    '--keep-by-number',
    default=10,
    help="How many backups to keep at minimum"
)
@click.option(
    '--keep-by-days',
    default=10,
    help="How many days of backups to keep at minimum"
)
@click.option(
    'should_remove_unaccounted_backups',
    '--remove-unaccounted-backups',
    is_flag=True,
    help="Backups that are not listed in the latest folder"
)
def main(
    backup_root,
    should_remove_old_backups,
    should_remove_unaccounted_backups,
    keep_by_number,
    keep_by_days,
):
    # NOTE: no docstring on purpose — click would surface it as --help text.
    # Log the parameters first, while locals() still holds only the arguments.
    click.echo(f'backup-thinner was invoked with parameters: \n{pformat(locals())}')

    if should_remove_old_backups:
        click.echo('Removing old backups')
        removable = backups_to_remove(
            read_backups(backup_root),
            keep_by_days=keep_by_days,
            keep_by_number=keep_by_number
        )
        for old_backup in removable:
            click.echo(f"Remove {old_backup.data_path!s} ")
            old_backup.remove()

    if should_remove_unaccounted_backups:
        click.echo("Removing unaccounted backups")
        for unaccounted_path in find_unaccounted_backups(backup_root):
            # REFACT move into Backup() class?
            click.echo(f"Remove {unaccounted_path!s}")
            shutil.rmtree(unaccounted_path)

# Entry point when invoked as a script (e.g. from a cron job).
if __name__ == '__main__':
    main()

The accompanying tests:

from pathlib import Path
from datetime import datetime, timedelta, timezone
import re

import pytest
from click.testing import CliRunner
import thin_out_backups as mut

# TODO remove empty backup directories to make it more clear that they are gone

def test_keep_backups_by_age():
    """keep_by_days decides which backups are old enough to remove."""
    now = datetime.now(timezone.utc)
    old = mut.Backup(now - timedelta(days=30), None, None)
    young = mut.Backup(now - timedelta(days=1), None, None)

    def removable(days):
        return mut.backups_to_remove([old, young], keep_by_number=0, keep_by_days=days)

    assert old in removable(1)
    assert young not in removable(2)
    assert old in removable(29)
    assert old not in removable(31)

def test_keep_backups_by_number():
    """keep_by_number protects the N newest backups from removal."""
    now = datetime.now(timezone.utc)
    old = mut.Backup(now - timedelta(days=30), None, None)
    young = mut.Backup(now - timedelta(days=1), None, None)

    to_remove = mut.backups_to_remove([old, young], keep_by_number=1, keep_by_days=0)
    assert old in to_remove
    assert young not in to_remove
    assert len(mut.backups_to_remove([old, young], keep_by_number=2, keep_by_days=0)) == 0

def test_keep_backups_by_both():
    """Both retention rules must agree before anything is removed."""
    now = datetime.now(timezone.utc)
    old = mut.Backup(now - timedelta(days=3), None, None)
    young = mut.Backup(now - timedelta(days=1), None, None)

    assert len(mut.backups_to_remove([old, young], keep_by_number=1, keep_by_days=4)) == 0
    assert len(mut.backups_to_remove([old, young], keep_by_number=2, keep_by_days=2)) == 0

@pytest.fixture
def backup_dir(tmp_path):
    """A backup tree with one complete backup, one dangling metadata entry,
    and one data directory that has no metadata entry."""
    root = Path(tmp_path) / 'backups'
    latest = root / 'metadata' / 'latest'
    latest.mkdir(parents=True)

    # complete backup: metadata entry plus matching data directory
    (latest / 'LATEST-1').write_text('/2023/05/24-151016.43')
    complete = root / '2023' / '05' / '24-151016.43'
    complete.mkdir(parents=True)
    (complete / 'somefile').write_text('some content')

    # metadata entry whose data directory is missing
    (latest / 'LATEST-2').write_text('/2023/05/24-151016.52')

    # data directory without a metadata entry
    orphan = root / '2023' / '05' / '24-151016.23'
    orphan.mkdir(parents=True)
    (orphan / 'somefile').write_text('some content')

    yield root

def test_find_full_backup_sets(backup_dir):
    """read_backups returns both registered backups, sorted by time."""
    found = mut.read_backups(backup_dir)
    expected_times = [
        datetime(2023, 5, 24, 15, 10, 16, 430000, tzinfo=timezone.utc),
        datetime(2023, 5, 24, 15, 10, 16, 520000, tzinfo=timezone.utc),
    ]
    assert [backup.backup_time for backup in found] == expected_times

def test_remove_backups(backup_dir):
    """Backup.remove() deletes the metadata entry and the data directory."""
    first = mut.read_backups(backup_dir)[0]
    assert first.metadata_path.exists()
    assert first.data_path.exists()

    first.remove()
    assert not first.metadata_path.exists()
    assert not first.data_path.exists()

def test_find_files_not_belonging_to_any_backup(backup_dir):
    """Every data directory without a metadata entry is reported.

    Drops the leftover debug ``print(...)`` and replaces the write-mostly
    accumulator list with a small helper.
    """
    def add_unaccounted(year, month):
        # data directory with no corresponding entry in metadata/latest
        path = backup_dir / year / month / '24-151016.23'
        path.mkdir(parents=True)
        (path / 'somefile').write_text('some content')

    # the fixture already contains one unaccounted backup
    assert 1 == len(mut.find_unaccounted_backups(backup_dir))

    add_unaccounted('2023', '04')
    assert 2 == len(mut.find_unaccounted_backups(backup_dir))

    add_unaccounted('2022', '04')
    assert 3 == len(mut.find_unaccounted_backups(backup_dir))

@pytest.fixture
def runner():
    # Fresh CliRunner per test, so one invocation's output cannot leak into another.
    return CliRunner()

def test_logs_invoked_action(backup_dir, runner):
    """Each flag triggers only its own log line.

    Fixes the typo 'unnaccounted backups' in the first negative assertion,
    which made that check vacuously true regardless of the output.
    """
    result = runner.invoke(mut.main, [str(backup_dir), '--remove-old-backups'])
    assert 'old backups' in result.output
    assert 'unaccounted backups' not in result.output

    result = runner.invoke(mut.main, [str(backup_dir), '--remove-unaccounted-backups'])
    assert 'old backups' not in result.output
    assert 'unaccounted backups' in result.output

def test_delete_stuff_from_command_line(backup_dir, runner):
    """End to end: both removal modes actually delete from disk."""
    args = [
        str(backup_dir),
        '--remove-old-backups',
        '--keep-by-number', '1',
        '--keep-by-days', '0',
    ]
    result = runner.invoke(mut.main, args)
    assert 'old backups' in result.output
    assert re.search(r'Remove .*/2023/05/24-151016.43', result.output)

    orphan = backup_dir / '2023' / '05' / '24-151016.23'
    result = runner.invoke(mut.main, [str(backup_dir), '--remove-unaccounted-backups'])
    assert 'unaccounted backups' in result.output
    assert not orphan.exists()

def test_logs_deleted_and_kept_backups(backup_dir, runner):
    """Unaccounted-backup removal names each removed path and only runs its own mode."""
    result = runner.invoke(mut.main, [str(backup_dir), '--remove-unaccounted-backups'])

    assert 'unaccounted backups' in result.output
    assert 'old backups' not in result.output
    assert re.search(r"Remove .*/2023/05/24-151016.23", result.output)

def test_copes_with_empty_backup_directory_missing(tmp_path, runner):
    """An existing but empty backup root must not crash the tool."""
    empty_root = tmp_path / 'extern/backups/'
    empty_root.mkdir(parents=True)

    result = runner.invoke(
        mut.main,
        [str(empty_root), '--remove-old-backups'],
        catch_exceptions=False,
    )

    assert result.exit_code == 0, result.output
    assert 'No backups yet' in result.output, result.output

# TODO later: assert that each unaccounted backup path shows up in the tool's output
def test_logs_unaccounted_backups(): pass

Having something like this built into CockroachDB would have significantly eased a first local deployment, allowing other devs to try out and evaluate CockroachDB in a small setup.