apache / pinot

Apache Pinot - A realtime distributed OLAP datastore
https://pinot.apache.org/
Apache License 2.0

Error building segments on real-time table with inverted index #11969

[Open] dragondgold opened this issue 1 year ago

dragondgold commented 1 year ago

I'm using Apache Pinot 1.0.0 and I'm having trouble creating an inverted index on a multi-value column. This is my table config:

{
    "tableName": "devices",
    "tableType": "REALTIME",
    "upsertConfig": {
        "mode": "FULL",
        "comparisonColumn": "timestamp",
        "enableSnapshot": true,
        "enablePreload": true
    },
    "tenants": {},
    "segmentsConfig": {
        "timeColumnName": "timestamp",
        "timeType": "SECONDS",
        "retentionTimeUnit": "DAYS",
        "retentionTimeValue": "90",
        "replication": "1"
    },
    "tableIndexConfig": {
        "loadMode": "MMAP",
        "invertedIndexColumns": [
            "segments"
        ],
        "streamConfigs": {
            "streamType": "kafka",
            "stream.kafka.topic.name": "events-realtime",
            "stream.kafka.decoder.class.name": "org.apache.pinot.plugin.stream.kafka.KafkaJSONMessageDecoder",
            "stream.kafka.consumer.type": "simple",
            "stream.kafka.consumer.factory.class.name": "org.apache.pinot.plugin.stream.kafka20.KafkaConsumerFactory",
            "stream.kafka.broker.list": "{brokers}",
            "security.protocol": "SSL",
            "realtime.segment.flush.threshold.time": "3600000",
            "realtime.segment.flush.threshold.size": "20000"
        }
    },
    "routing": {
        "instanceSelectorType": "strictReplicaGroup"
    },
    "fieldConfigList": [],
    "metadata": {
        "customConfigs": {}
    }
}

And this is my schema:

{
    "metricFieldSpecs": [],
    "primaryKeyColumns": ["device_id"],
    "dimensionFieldSpecs": [
        {
            "name": "country",
            "dataType": "STRING"
        },
        {
            "name": "device_id",
            "dataType": "STRING"
        },
        {
            "name": "device_type",
            "dataType": "STRING"
        },
        {
            "name": "segments",
            "dataType": "INT",
            "singleValueField": false
        },
        {
            "name": "options",
            "dataType": "INT",
            "singleValueField": false
        },
        {
            "name": "relation_id",
            "dataType": "STRING"
        },
        {
            "name": "client",
            "dataType": "INT"
        }
    ],
    "dateTimeFieldSpecs": [
        {
            "name": "timestamp",
            "dataType": "LONG",
            "format": "1:SECONDS:EPOCH",
            "granularity": "1:DAYS"
        }
    ],
    "schemaName": "devices"
}
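
For reference, the schema and table config above can be applied through the controller REST API (POST /schemas, then POST /tables). A minimal sketch, assuming a controller at the default localhost:9000 and the two JSON documents saved as devices_schema.json and devices_table.json (both file names are illustrative):

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.file.Files;
import java.nio.file.Path;

public class CreateDevicesTable {
  public static void main(String[] args) throws Exception {
    HttpClient client = HttpClient.newHttpClient();
    // The schema must exist before the table config that references it.
    post(client, "http://localhost:9000/schemas", Files.readString(Path.of("devices_schema.json")));
    post(client, "http://localhost:9000/tables", Files.readString(Path.of("devices_table.json")));
  }

  private static void post(HttpClient client, String url, String json) throws Exception {
    HttpRequest request = HttpRequest.newBuilder(URI.create(url))
        .header("Content-Type", "application/json")
        .POST(HttpRequest.BodyPublishers.ofString(json))
        .build();
    HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
    System.out.println(url + " -> " + response.statusCode());
  }
}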

As soon as Pinot starts consuming events from Kafka I get these errors:

pinot-server        | 2023/11/04 23:41:17.587 ERROR [LLRealtimeSegmentDataManager_devices__5__1__20231104T2341Z] [devices__5__1__20231104T2341Z] Could not build segment
pinot-server        | java.lang.RuntimeException: Error occurred while reading row during indexing
pinot-server        |   at org.apache.pinot.segment.local.segment.creator.impl.SegmentIndexCreationDriverImpl.build(SegmentIndexCreationDriverImpl.java:232) ~[pinot-all-1.0.0-jar-with-dependencies.jar:1.0.0-b6bdf6c9686b286a149d2d1aea4a385ee98f3e79]
pinot-server        |   at org.apache.pinot.segment.local.realtime.converter.RealtimeSegmentConverter.build(RealtimeSegmentConverter.java:121) ~[pinot-all-1.0.0-jar-with-dependencies.jar:1.0.0-b6bdf6c9686b286a149d2d1aea4a385ee98f3e79]
pinot-server        |   at org.apache.pinot.core.data.manager.realtime.LLRealtimeSegmentDataManager.buildSegmentInternal(LLRealtimeSegmentDataManager.java:935) [pinot-all-1.0.0-jar-with-dependencies.jar:1.0.0-b6bdf6c9686b286a149d2d1aea4a385ee98f3e79]
pinot-server        |   at org.apache.pinot.core.data.manager.realtime.LLRealtimeSegmentDataManager.buildSegmentForCommit(LLRealtimeSegmentDataManager.java:842) [pinot-all-1.0.0-jar-with-dependencies.jar:1.0.0-b6bdf6c9686b286a149d2d1aea4a385ee98f3e79]
pinot-server        |   at org.apache.pinot.core.data.manager.realtime.LLRealtimeSegmentDataManager$PartitionConsumer.run(LLRealtimeSegmentDataManager.java:754) [pinot-all-1.0.0-jar-with-dependencies.jar:1.0.0-b6bdf6c9686b286a149d2d1aea4a385ee98f3e79]
pinot-server        |   at java.lang.Thread.run(Thread.java:829) [?:?]
pinot-server        | Caused by: java.lang.IndexOutOfBoundsException
pinot-server        |   at java.nio.Buffer.checkIndex(Buffer.java:693) ~[?:?]
pinot-server        |   at java.nio.DirectByteBuffer.getInt(DirectByteBuffer.java:758) ~[?:?]
pinot-server        |   at org.apache.pinot.segment.spi.memory.PinotByteBuffer.getInt(PinotByteBuffer.java:137) ~[pinot-all-1.0.0-jar-with-dependencies.jar:1.0.0-b6bdf6c9686b286a149d2d1aea4a385ee98f3e79]
pinot-server        |   at org.apache.pinot.segment.local.io.reader.impl.FixedByteSingleValueMultiColReader.getInt(FixedByteSingleValueMultiColReader.java:105) ~[pinot-all-1.0.0-jar-with-dependencies.jar:1.0.0-b6bdf6c9686b286a149d2d1aea4a385ee98f3e79]
pinot-server        |   at org.apache.pinot.segment.local.realtime.impl.forward.FixedByteMVMutableForwardIndex.getDictIdMV(FixedByteMVMutableForwardIndex.java:250) ~[pinot-all-1.0.0-jar-with-dependencies.jar:1.0.0-b6bdf6c9686b286a149d2d1aea4a385ee98f3e79]
pinot-server        |   at org.apache.pinot.segment.spi.index.mutable.MutableForwardIndex.getDictIdMV(MutableForwardIndex.java:225) ~[pinot-all-1.0.0-jar-with-dependencies.jar:1.0.0-b6bdf6c9686b286a149d2d1aea4a385ee98f3e79]
pinot-server        |   at org.apache.pinot.segment.local.segment.readers.PinotSegmentColumnReader.getValue(PinotSegmentColumnReader.java:98) ~[pinot-all-1.0.0-jar-with-dependencies.jar:1.0.0-b6bdf6c9686b286a149d2d1aea4a385ee98f3e79]
pinot-server        |   at org.apache.pinot.segment.local.segment.readers.PinotSegmentRecordReader.getRecord(PinotSegmentRecordReader.java:227) ~[pinot-all-1.0.0-jar-with-dependencies.jar:1.0.0-b6bdf6c9686b286a149d2d1aea4a385ee98f3e79]
pinot-server        |   at org.apache.pinot.segment.local.segment.readers.PinotSegmentRecordReader.next(PinotSegmentRecordReader.java:210) ~[pinot-all-1.0.0-jar-with-dependencies.jar:1.0.0-b6bdf6c9686b286a149d2d1aea4a385ee98f3e79]
pinot-server        |   at org.apache.pinot.segment.local.segment.creator.impl.SegmentIndexCreationDriverImpl.build(SegmentIndexCreationDriverImpl.java:225) ~[pinot-all-1.0.0-jar-with-dependencies.jar:1.0.0-b6bdf6c9686b286a149d2d1aea4a385ee98f3e79]
pinot-server        |   ... 5 more
pinot-server        | 2023/11/04 23:41:17.589 ERROR [LLRealtimeSegmentDataManager_devices__5__1__20231104T2341Z] [devices__5__1__20231104T2341Z] Could not build segment for devices__5__1__20231104T2341Z
pinot-server        | 2023/11/04 23:41:17.635 ERROR [LLRealtimeSegmentDataManager_devices__2__1__20231104T2341Z] [devices__2__1__20231104T2341Z] Could not build segment
pinot-server        | (identical "Error occurred while reading row during indexing" / java.lang.IndexOutOfBoundsException stack trace as above)

The data is ingested into the table anyway, but when running a query like select count(*) from devices where segments = 560 (on a multi-value column, the predicate matches rows where any value equals 560) I get this error:

Error Code: 200

QueryExecutionError:
java.lang.IndexOutOfBoundsException
    at java.base/java.nio.Buffer.checkIndex(Buffer.java:693)
    at java.base/java.nio.DirectByteBuffer.getInt(DirectByteBuffer.java:758)
    at org.apache.pinot.segment.spi.memory.PinotByteBuffer.getInt(PinotByteBuffer.java:137)
    at org.apache.pinot.segment.local.io.reader.impl.FixedByteSingleValueMultiColReader.getInt(FixedByteSingleValueMultiColReader.java:105)
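
For completeness, a minimal sketch of running the failing query against the broker's SQL endpoint, assuming a broker at the default localhost:8099 (adjust for your deployment):

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class RunFailingQuery {
  public static void main(String[] args) throws Exception {
    String payload = "{\"sql\": \"select count(*) from devices where segments = 560\"}";
    HttpRequest request = HttpRequest.newBuilder(URI.create("http://localhost:8099/query/sql"))
        .header("Content-Type", "application/json")
        .POST(HttpRequest.BodyPublishers.ofString(payload))
        .build();
    HttpResponse<String> response = HttpClient.newHttpClient()
        .send(request, HttpResponse.BodyHandlers.ofString());
    // Against an affected segment, the response body carries the
    // QueryExecutionError (error code 200) shown above.
    System.out.println(response.body());
  }
}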

If I change the table flush threshold (the maximum number of rows consumed before a segment is committed) from:

"realtime.segment.flush.threshold.size": "20000"

To:

"realtime.segment.flush.threshold.size": "200000"

The errors are gone and everything works as expected. Here is some sample data in case it's useful: sample_data.csv.zip

xiangfu0 commented 1 year ago

cc: @KKcorps

KKcorps commented 1 year ago

According to the logs, the error occurs at segment commit. That also makes sense, since increasing the flush threshold appears to fix it: a larger threshold just postpones the commit, so it is delaying the error rather than actually fixing it.

Will take a look

dragondgold commented 1 year ago

> According to the logs, the error occurs at segment commit. That also makes sense, since increasing the flush threshold appears to fix it: a larger threshold just postpones the commit, so it is delaying the error rather than actually fixing it.
>
> Will take a look

I think it actually is fixing it: after increasing the flush threshold to 200k I ingested around 100 million rows without any issues, which is the weird part.

KKcorps commented 1 year ago

Tried with both 1.0.0 and master but was unable to repro this issue on my end.

I noticed that the number of values in the MV column is around 3,000 per row, so I took that into account while publishing the data.

Here's the code I am using for data generation:

// Repro data generator. Requires kafka-clients, jackson-databind,
// and pinot-spi (for JsonUtils) on the classpath.
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.Random;
import java.util.UUID;

import com.fasterxml.jackson.databind.PropertyNamingStrategies;
import com.fasterxml.jackson.databind.annotation.JsonNaming;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.serialization.StringSerializer;
import org.apache.pinot.spi.utils.JsonUtils;

public class DeviceEventPublisher {

  private static final Random random = new Random();
  private static final String KAFKA_TOPIC = "mv_ingestion_repro";
  private static final String BOOTSTRAP_SERVERS = "localhost:19092"; // Update with your Kafka server details
  public static final int NUM_EVENTS_TO_PUBLISH = 30000;
  public static final int MV_ARRAY_LENGTH = 3203;
  private KafkaProducer<String, String> producer;

  public DeviceEventPublisher() {
    Properties properties = new Properties();
    properties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, BOOTSTRAP_SERVERS);
    properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());
    properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());
    this.producer = new KafkaProducer<>(properties);
  }

  public void publishEvent(DeviceEvent event) {
    try {
      String jsonData = JsonUtils.objectToString(event);
      producer.send(new ProducerRecord<>(KAFKA_TOPIC, jsonData));
    } catch (Exception e) {
      System.out.println("Exception publishing event: " + e);
    }
  }

  public static void main(String[] args) {
    DeviceEventPublisher publisher = new DeviceEventPublisher();

// Generate and publish NUM_EVENTS_TO_PUBLISH random events
    for (int i = 0; i < NUM_EVENTS_TO_PUBLISH; i++) {
      DeviceEvent event = generateRandomEvent();
      publisher.publishEvent(event);

      if (i % 1000 == 0) {
        System.out.println("Published " + i + " events");
      }
    }

    System.out.println("Published " + NUM_EVENTS_TO_PUBLISH + " events to Kafka topic " + KAFKA_TOPIC);

    publisher.producer.close();
  }

  private static DeviceEvent generateRandomEvent() {
    DeviceEvent event = new DeviceEvent();

    // Random values for each field
    event.setCountry("Country" + random.nextInt(100));
    event.setDeviceId(UUID.randomUUID().toString());
    event.setDeviceType("Type" + random.nextInt(5));
    event.setSegments(generateRandomIntList());
    event.setOptions(generateRandomIntList());
    event.setRelationId(UUID.randomUUID().toString());
    event.setClient(random.nextInt(1000));
    event.setTimestamp(System.currentTimeMillis() / 1000L);

    return event;
  }

  @JsonNaming(PropertyNamingStrategies.SnakeCaseStrategy.class)
  private static class DeviceEvent {
    private String country;
    private String deviceId;
    private String deviceType;
    private List<Integer> segments;
    private List<Integer> options;
    private String relationId;
    private int client;
    private long timestamp;

    public String getCountry() {
      return country;
    }

    public void setCountry(String country) {
      this.country = country;
    }

    public String getDeviceId() {
      return deviceId;
    }

    public void setDeviceId(String deviceId) {
      this.deviceId = deviceId;
    }

    public String getDeviceType() {
      return deviceType;
    }

    public void setDeviceType(String deviceType) {
      this.deviceType = deviceType;
    }

    public List<Integer> getSegments() {
      return segments;
    }

    public void setSegments(List<Integer> segments) {
      this.segments = segments;
    }

    public List<Integer> getOptions() {
      return options;
    }

    public void setOptions(List<Integer> options) {
      this.options = options;
    }

    public String getRelationId() {
      return relationId;
    }

    public void setRelationId(String relationId) {
      this.relationId = relationId;
    }

    public int getClient() {
      return client;
    }

    public void setClient(int client) {
      this.client = client;
    }

    public long getTimestamp() {
      return timestamp;
    }

    public void setTimestamp(long timestamp) {
      this.timestamp = timestamp;
    }
  }

  private static List<Integer> generateRandomIntList() {
    List<Integer> list = new ArrayList<>();
    int size = MV_ARRAY_LENGTH;
    for (int i = 0; i < size; i++) {
      list.add(random.nextInt(10000));
    }
    return list;
  }
}

KKcorps commented 1 year ago

[screenshot attached: 2023-11-17 at 10:46 AM]

KKcorps commented 1 year ago

I feel like it could be some corrupt row that is causing this, although I'm not sure.