man-group / ArcticDB

ArcticDB is a high performance, serverless DataFrame database built for the Python Data Science ecosystem.
http://arcticdb.io
Other
1.52k stars 93 forks source link

delete_snapshot can delete data keys that it shouldn't #1958

Closed alexowens90 closed 2 weeks ago

alexowens90 commented 3 weeks ago

Repro (could probably be reduced further). The final assertion fails because a data key needed for the read has been deleted.

from arcticdb.version_store._store import NativeVersionStore
import numpy as np
import pandas as pd

def create_data_frame(num_columns: int, start_index: int, end_index: int) -> pd.DataFrame:
    """Build a DataFrame of random integers with a contiguous integer index.

    The frame has ``end_index - start_index`` rows labelled ``start_index``
    through ``end_index - 1`` and ``num_columns`` columns named ``COL_0``,
    ``COL_1``, and so on. Every cell is drawn with ``np.random.randint``
    from the half-open interval ``[start_index, end_index)``.
    """
    row_count = end_index - start_index
    column_names = [f"COL_{col}" for col in range(num_columns)]
    values = np.random.randint(start_index, end_index, size=(row_count, num_columns))
    # The index labels are handed to pandas as a list of plain Python ints.
    row_labels = np.arange(start_index, end_index, 1).tolist()
    return pd.DataFrame(values, index=row_labels, columns=column_names)

def test_bug(basic_store):
    """Reproduce delete_snapshot deleting a data key that a read still needs.

    Two symbols each receive a write followed by an append. A snapshot is
    taken that references version 0 of ``sym1`` and version 1 of ``sym2``,
    those two versions are deleted, and then the snapshot itself is deleted.
    Reading ``sym2`` afterwards is expected to return the data of its first
    write; that final assertion is the one reported to fail.

    Args:
        basic_store: the library under test (supplied by the caller; presumably
            a pytest fixture -- confirm against the test suite).
    """
    lib = basic_store

    symbol1 = "sym1"
    df_1_1 = create_data_frame(1, 2, 3)
    df_1_2 = create_data_frame(1, 4, 5)
    lib.write(symbol1, df_1_1)
    lib.append(symbol1, df_1_2)

    symbol2 = "sym2"
    df_2_0 = create_data_frame(1, -10, -9)
    df_2_1 = create_data_frame(1, -8, -7)
    lib.write(symbol2, df_2_0)
    lib.append(symbol2, df_2_1)

    snap1 = "snap1"
    snap1_vers = {symbol1: 0, symbol2: 1}
    lib.snapshot(snap1, versions=snap1_vers)

    # Delete exactly the versions the snapshot references, so the snapshot
    # is the only thing still pointing at them.
    lib.delete_version(symbol1, 0)
    lib.delete_version(symbol2, 1)

    lib.delete_snapshot(snap1)

    # Confirm that after deleting the versions everything is as expected,
    # and that deleting the snapshot effectively wipes those versions.
    assert sorted(lib.list_snapshots()) == []
    assert df_2_0.equals(lib.read(symbol2).data)
vasil-pashov commented 3 weeks ago

Original repro:

from arcticdb.version_store._store import NativeVersionStore
import numpy as np
import pandas as pd

def create_data_frame(num_columns: int, start_index: int, end_index: int) -> pd.DataFrame:
    """Return a random-integer DataFrame indexed by consecutive integers.

    Rows are labelled ``start_index`` up to (but not including)
    ``end_index``; columns are named ``COL_0`` .. ``COL_<num_columns - 1>``.
    Cell values come from ``np.random.randint`` over the half-open range
    ``[start_index, end_index)``.
    """
    n_rows = end_index - start_index
    headers = [f"COL_{position}" for position in range(num_columns)]
    cells = np.random.randint(start_index, end_index, size=(n_rows, num_columns))
    # The index labels are handed to pandas as a list of plain Python ints.
    labels = np.arange(start_index, end_index, 1).tolist()
    return pd.DataFrame(cells, index=labels, columns=headers)

def test_bug(basic_store):
    """Original repro: delete versions pinned by a snapshot, then the snapshot.

    Each of two symbols is written once and appended to once. A snapshot
    referencing version 0 of ``sym1`` and version 1 of ``sym2`` is created,
    those versions are deleted, and the snapshot is deleted. Both symbols
    must still be readable afterwards: ``sym2`` as its first write, and
    ``sym1`` as its write and append combined.
    """
    store = basic_store

    first_symbol = "sym1"
    first_written = create_data_frame(1, 2, 3)
    first_appended = create_data_frame(1, 4, 5)
    first_expected = pd.concat([first_written, first_appended])
    store.write(first_symbol, first_written)
    store.append(first_symbol, first_appended)

    second_symbol = "sym2"
    second_written = create_data_frame(1, -10, -9)
    second_appended = create_data_frame(1, -8, -7)
    store.write(second_symbol, second_written)
    store.append(second_symbol, second_appended)

    snapshot_name = "snap1"
    pinned_versions = {first_symbol: 0, second_symbol: 1}
    store.snapshot(snapshot_name, versions=pinned_versions)

    # Delete exactly the versions the snapshot pins, in insertion order
    # (sym1 first, then sym2).
    for symbol, version in pinned_versions.items():
        store.delete_version(symbol, version)

    store.delete_snapshot(snapshot_name)

    # With the versions and the snapshot gone, no snapshots should remain
    # and the surviving versions must still be fully readable.
    assert sorted(store.list_snapshots()) == []
    assert second_written.equals(store.read(second_symbol).data)
    assert first_expected.equals(store.read(first_symbol).data)