apache / arrow

Apache Arrow is the universal columnar format and multi-language toolbox for fast data interchange and in-memory analytics
https://arrow.apache.org/
Apache License 2.0
14.64k stars 3.56k forks source link

[C++][Parquet] Cannot read encrypted parquet datasets via _metadata file #41719

Open AudriusButkevicius opened 6 months ago

AudriusButkevicius commented 6 months ago

Describe the bug, including details regarding any error messages, version, and platform.

Fails with:

Cannot decrypt ColumnMetadata. FileDecryption is not setup correctly

This is using plaintext footer.

Reproducer:

import os

import pyarrow.parquet.encryption as pe
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import pyarrow as pa
import base64
import polars as pl

class KmsClient(pe.KmsClient):
    """Toy KMS stub for the reproducer: keys are "wrapped" as base64 text."""

    def wrap_key(self, key_bytes, master_key_identifier):
        # The master key identifier is ignored by this stub.
        return base64.b64encode(key_bytes)

    def unwrap_key(self, wrapped_key, master_key_identifier):
        # Inverse of wrap_key: decode the base64 payload back to raw bytes.
        return base64.b64decode(wrapped_key)

def write(location):
    """Write a hive-partitioned parquet dataset with column encryption under
    *location*, plus a sidecar ``_metadata`` file built from the collected
    per-file metadata."""
    frame = pl.DataFrame({
        "col1": [1, 2, 3],
        "col2": [1, 2, 3],
        "year": [2020, 2020, 2021]
    })
    arrow_table = frame.to_arrow()

    crypto_factory = pe.CryptoFactory(lambda *a, **k: KmsClient())
    # Encrypt only "col2"; the footer is encrypted as well
    # (plaintext_footer=False) using the "TEST" master key.
    encryption_cfg = pe.EncryptionConfiguration(
        footer_key="TEST",
        column_keys={"TEST": ["col2"]},
        double_wrapping=False,
        plaintext_footer=False,
    )
    dataset_encryption_cfg = ds.ParquetEncryptionConfig(
        crypto_factory, pe.KmsConnectionConfig(), encryption_cfg
    )

    hive_partitioning = ds.partitioning(
        schema=pa.schema([pa.field("year", pa.int16())]),
        flavor="hive",
    )

    collected_metadata = []
    pq.write_to_dataset(
        arrow_table,
        location,
        partitioning=hive_partitioning,
        encryption_config=dataset_encryption_cfg,
        metadata_collector=collected_metadata,
    )

    # The partition column is excluded from the schema stored in _metadata.
    metadata_schema = pa.schema(
        f for f in arrow_table.schema if f.name != "year"
    )
    pq.write_metadata(
        metadata_schema,
        os.path.join(location, "_metadata"),
        collected_metadata,
    )

def read(location):
    """Open the dataset through its ``_metadata`` file with decryption
    configured, and print the resulting table."""
    kms_cfg = pe.KmsConnectionConfig()
    decryption_cfg = pe.DecryptionConfiguration(cache_lifetime=300)
    crypto_factory = pe.CryptoFactory(lambda *a, **k: KmsClient())
    dataset_decryption_cfg = ds.ParquetDecryptionConfig(
        crypto_factory, kms_cfg, decryption_cfg
    )

    decryption_properties = crypto_factory.file_decryption_properties(
        kms_cfg, decryption_cfg
    )
    scan_options = ds.ParquetFragmentScanOptions(
        decryption_config=dataset_decryption_cfg,
        # If using build from master
        # decryption_properties=decryption_properties
    )
    file_format = pa.dataset.ParquetFileFormat(
        default_fragment_scan_options=scan_options
    )

    dataset = ds.parquet_dataset(
        os.path.join(location, "_metadata"),
        format=file_format,
        partitioning=ds.partitioning(
            schema=pa.schema([pa.field("year", pa.int16())]),
            flavor="hive",
        ),
    )
    print(dataset.to_table())

if __name__ == '__main__':
    # Round-trip: write the encrypted dataset, then read it back via _metadata.
    target_dir = r"/tmp/dataset-test"
    os.makedirs(target_dir, exist_ok=True)
    write(target_dir)
    read(target_dir)

Presumably the metadata read out of the _metadata file is not decrypted, or the footer indicates incorrectly whether it is encrypted or not.

Tried with latest master which contains: https://github.com/apache/arrow/commit/bd444106af494b3d4c6cce0af88f6ce2a6a327eb

Component(s)

C++, Python

AudriusButkevicius commented 6 months ago

It seems you can rebuild the dataset from what parquet_dataset returned:

    # Workaround: rebuild a FileSystemDataset from the fragments that
    # ds.parquet_dataset() returned, instead of scanning through the
    # _metadata-backed dataset directly.
    from pyarrow import fs
    filesystem = fs.LocalFileSystem()    
    remade_dataset = ds.FileSystemDataset(
        [
            # Recreate each fragment from its path, partition expression,
            # and row-group ids.
            # NOTE(review): presumably make_fragment re-reads each file's
            # own footer rather than reusing _metadata — confirm.
            pformat.make_fragment(
                fragment.path,
                filesystem,
                fragment.partition_expression,
                [rg.id for rg in fragment.row_groups]
            )
            for fragment in dataset.get_fragments()
        ],
        dataset.schema,
        pformat,
    )
    print(remade_dataset.to_table())

but I assume this re-fetches the metadata (instead of using it from the _metadata file), defeating the purpose of having the _metadata file in the first place.

AudriusButkevicius commented 6 months ago

Actually, I think the issue might be with writing the data. I think the _metadata file has no encryption algorithm set, so it doesn't even attempt to decrypt the metadata.

AudriusButkevicius commented 6 months ago

I think we'd need an equivalent of:

https://github.com/apache/arrow/blob/5e1a4fd8a4ed3630c9549c611222d2d6c32357ca/cpp/src/parquet/file_writer.cc#L546

but based on:

https://github.com/apache/arrow/blob/5e1a4fd8a4ed3630c9549c611222d2d6c32357ca/cpp/src/parquet/file_writer.cc#L551