deephaven / deephaven-core

Deephaven Community Core
Other
257 stars 80 forks source link

Parquet writing of partitioned tables allows duplication of data indexes #6415

Open lbooker42 opened 14 hours ago

lbooker42 commented 14 hours ago

Description

As a user, I am able to request that an index be created and written that duplicates the implicit index on a partitioning column. This results in an exception when the table is read from disk and coalesced.

Steps to reproduce

        final TableDefinition definition = TableDefinition.of(
                ColumnDefinition.ofInt("PC1").withPartitioning(),
                ColumnDefinition.ofLong("I"));
        final Table inputData = ((QueryTable) TableTools.emptyTable(10)
                .updateView("PC1 = (ii%2==0)? null : (int)(ii%2)",
                        "I = ii"))
                .withDefinitionUnsafe(definition);
        final PartitionedTable partitionedTable = inputData.partitionBy("PC1");
        final File parentDir = new File(rootFile, "writeKeyValuePartitionedDataWithNullKeys");
        final ParquetInstructions writeInstructions = ParquetInstructions.builder()
                .setGenerateMetadataFiles(true)
// FORCE AN INDEX TO BE WRITTEN FOR A PARTITIONED COLUMN
                .addIndexColumns("PC1")
////////////////////////////////////////////////////////
                .build();
        writeKeyValuePartitionedTable(partitionedTable, parentDir.getAbsolutePath(), writeInstructions);
        final Table fromDisk = readTable(parentDir.getPath(),
                EMPTY.withLayout(ParquetInstructions.ParquetFileLayout.KV_PARTITIONED)).select();
        assertTableEquals(inputData.sort("PC1"), fromDisk.sort("PC1"));
        final Table fromDiskWithMetadata = readTable(new File(parentDir, "_common_metadata").getPath()).select();
        assertTableEquals(inputData.sort("PC1"), fromDiskWithMetadata.sort("PC1"));

Expected results

Not sure what the correct behavior is — should we throw an exception explaining that this index is already implied by the partitioning column? Or should we silently ignore the request and allow the partitioning-column index to populate this index on read?

Actual results

Error initializing location sizes
io.deephaven.engine.table.impl.locations.TableDataException: Error initializing location sizes
    at io.deephaven.engine.table.impl.SourceTable.lambda$initializeLocationSizes$3(SourceTable.java:214)
    at io.deephaven.engine.table.impl.perf.QueryPerformanceRecorder.withNugget(QueryPerformanceRecorder.java:353)
    at io.deephaven.engine.table.impl.SourceTable.initializeLocationSizes(SourceTable.java:208)
    at io.deephaven.engine.table.impl.SourceTable.initialize(SourceTable.java:122)
    at io.deephaven.engine.table.impl.SourceTable.doCoalesce(SourceTable.java:292)
    at io.deephaven.engine.table.impl.SourceTable.doCoalesce(SourceTable.java:35)
    at io.deephaven.engine.table.impl.UncoalescedTable.coalesce(UncoalescedTable.java:78)
    at io.deephaven.engine.table.impl.UncoalescedTable.select(UncoalescedTable.java:236)
    at io.deephaven.engine.table.impl.UncoalescedTable.select(UncoalescedTable.java:43)
    at io.deephaven.api.TableOperationsDefaults.select(TableOperationsDefaults.java:109)
    at io.deephaven.parquet.table.ParquetTableReadWriteTest.writeKeyValuePartitionedDataWithNullKeysAndExtraIndex(ParquetTableReadWriteTest.java:1402)
    at java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:103)
    at java.base/java.lang.reflect.Method.invoke(Method.java:580)
    at org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:59)
    at org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12)
    at org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:56)
    at org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17)
    at org.junit.internal.runners.statements.RunBefores.evaluate(RunBefores.java:26)
    at org.junit.internal.runners.statements.RunAfters.evaluate(RunAfters.java:27)
    at io.deephaven.engine.testutil.junit4.EngineCleanup$1.evaluate(EngineCleanup.java:34)
    at org.junit.runners.ParentRunner$3.evaluate(ParentRunner.java:306)
    at org.junit.runners.BlockJUnit4ClassRunner$1.evaluate(BlockJUnit4ClassRunner.java:100)
    at org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:366)
    at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:103)
    at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:63)
    at org.junit.runners.ParentRunner$4.run(ParentRunner.java:331)
    at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:79)
    at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:329)
    at org.junit.runners.ParentRunner.access$100(ParentRunner.java:66)
    at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:293)
    at org.junit.runners.ParentRunner$3.evaluate(ParentRunner.java:306)
    at org.junit.runners.ParentRunner.run(ParentRunner.java:413)
    at org.gradle.api.internal.tasks.testing.junit.JUnitTestClassExecutor.runTestClass(JUnitTestClassExecutor.java:112)
    at org.gradle.api.internal.tasks.testing.junit.JUnitTestClassExecutor.execute(JUnitTestClassExecutor.java:58)
    at org.gradle.api.internal.tasks.testing.junit.JUnitTestClassExecutor.execute(JUnitTestClassExecutor.java:40)
    at org.gradle.api.internal.tasks.testing.junit.AbstractJUnitTestClassProcessor.processTestClass(AbstractJUnitTestClassProcessor.java:54)
    at org.gradle.api.internal.tasks.testing.SuiteTestClassProcessor.processTestClass(SuiteTestClassProcessor.java:53)
    at java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:103)
    at java.base/java.lang.reflect.Method.invoke(Method.java:580)
    at org.gradle.internal.dispatch.ReflectionDispatch.dispatch(ReflectionDispatch.java:36)
    at org.gradle.internal.dispatch.ReflectionDispatch.dispatch(ReflectionDispatch.java:24)
    at org.gradle.internal.dispatch.ContextClassLoaderDispatch.dispatch(ContextClassLoaderDispatch.java:33)
    at org.gradle.internal.dispatch.ProxyDispatchAdapter$DispatchingInvocationHandler.invoke(ProxyDispatchAdapter.java:92)
    at jdk.proxy1/jdk.proxy1.$Proxy4.processTestClass(Unknown Source)
    at org.gradle.api.internal.tasks.testing.worker.TestWorker$2.run(TestWorker.java:183)
    at org.gradle.api.internal.tasks.testing.worker.TestWorker.executeAndMaintainThreadName(TestWorker.java:132)
    at org.gradle.api.internal.tasks.testing.worker.TestWorker.execute(TestWorker.java:103)
    at org.gradle.api.internal.tasks.testing.worker.TestWorker.execute(TestWorker.java:63)
    at org.gradle.process.internal.worker.child.ActionExecutionWorker.execute(ActionExecutionWorker.java:56)
    at org.gradle.process.internal.worker.child.SystemApplicationClassLoaderWorker.call(SystemApplicationClassLoaderWorker.java:121)
    at org.gradle.process.internal.worker.child.SystemApplicationClassLoaderWorker.call(SystemApplicationClassLoaderWorker.java:71)
    at worker.org.gradle.process.internal.worker.GradleWorkerMain.run(GradleWorkerMain.java:69)
    at worker.org.gradle.process.internal.worker.GradleWorkerMain.main(GradleWorkerMain.java:74)
Caused by: java.lang.IllegalStateException: Attempted to add a duplicate index MergedDataIndex-1791706561[1] for key columns [io.deephaven.engine.table.impl.sources.regioned.RegionedColumnSourceInt$Partitioning@3f80d8c]
    at io.deephaven.engine.table.impl.indexer.DataIndexer.addDataIndex(DataIndexer.java:305)
    at io.deephaven.engine.table.impl.sources.regioned.RegionedColumnSourceManager.initialize(RegionedColumnSourceManager.java:344)
    at io.deephaven.engine.table.impl.SourceTable.lambda$initializeLocationSizes$3(SourceTable.java:212)

Versions