apache / pinot

Apache Pinot - A realtime distributed OLAP datastore
https://pinot.apache.org/
Apache License 2.0
5.39k stars 1.26k forks source link

NullPointer on TEXT_MATCH Queries #6295

Open pabrahamusa opened 3 years ago

pabrahamusa commented 3 years ago

Hello,

I currently have the following table config for real-time ingestion, with the text index enabled. When I try to query the data with TEXT_MATCH, an error is thrown. Why is that? Is this the right way to enable text indexing?

Error: [ { "errorCode": 200, "message": "QueryExecutionError:\njava.lang.NullPointerException\n\tat org.apache.pinot.core.operator.filter.TextMatchFilterOperator.getNextBlock(TextMatchFilterOperator.java:45)\n\tat org.apache.pinot.core.operator.filter.TextMatchFilterOperator.getNextBlock(TextMatchFilterOperator.java:30)\n\tat org.apache.pinot.core.operator.BaseOperator.nextBlock(BaseOperator.java:49)\n\tat org.apache.pinot.core.operator.DocIdSetOperator.getNextBlock(DocIdSetOperator.java:62)\n\tat org.apache.pinot.core.operator.DocIdSetOperator.getNextBlock(DocIdSetOperator.java:35)\n\tat org.apache.pinot.core.operator.BaseOperator.nextBlock(BaseOperator.java:49)\n\tat org.apache.pinot.core.operator.ProjectionOperator.getNextBlock(ProjectionOperator.java:57)\n\tat org.apache.pinot.core.operator.ProjectionOperator.getNextBlock(ProjectionOperator.java:30)\n\tat org.apache.pinot.core.operator.BaseOperator.nextBlock(BaseOperator.java:49)\n\tat org.apache.pinot.core.operator.transform.TransformOperator.getNextBlock(TransformOperator.java:92)\n\tat org.apache.pinot.core.operator.transform.TransformOperator.getNextBlock(TransformOperator.java:39)\n\tat org.apache.pinot.core.operator.BaseOperator.nextBlock(BaseOperator.java:49)\n\tat org.apache.pinot.core.operator.query.SelectionOnlyOperator.getNextBlock(SelectionOnlyOperator.java:77)\n\tat org.apache.pinot.core.operator.query.SelectionOnlyOperator.getNextBlock(SelectionOnlyOperator.java:38)" } ]

Config

 {
      "tableName": "log",
      "tableType": "REALTIME",
      "ingestionConfig": {
      },
      "segmentsConfig": {
        "timeColumnName": "five_mins_epoch",
        "timeType": "MINUTES",
        "retentionTimeUnit": "DAYS",
        "retentionTimeValue": "7",
        "schemaName": "log",
        "replication": "2",
        "replicasPerPartition": "2",
        "segmentPushType": "APPEND",
        "completionConfig": {
           "completionMode": "DOWNLOAD"
        }
      },
      "tenants": {
       },
      "tableIndexConfig": {
        "loadMode": "MMAP",
        "sortedColumn": ["timemillis"],
        "fieldConfigList": [
            {
              "name": "log",
              "encodingType": "RAW",
              "indexType": "TEXT",
              "properties": {
                "enableQueryCacheForTextIndex": "true",
                "deriveNumDocsPerChunkForRawIndex": "true"
                }
            },
            {
              "name": "container_name",
              "encodingType": "RAW",
              "indexType": "TEXT",
              "properties": {
                "enableQueryCacheForTextIndex": "true",
                "deriveNumDocsPerChunkForRawIndex": "true"
                }
            },
            {
              "name": "pod_name",
              "encodingType": "RAW",
              "indexType": "TEXT",
              "properties": {
                "enableQueryCacheForTextIndex": "true",
                "deriveNumDocsPerChunkForRawIndex": "true"
                }
            },
            {
              "name": "namespace_name",
              "encodingType": "RAW",
              "indexType": "TEXT",
              "properties": {
                "enableQueryCacheForTextIndex": "true",
                "deriveNumDocsPerChunkForRawIndex": "true"
                }
            },
            {
              "name": "host",
              "encodingType": "RAW",
              "indexType": "TEXT",
              "properties": {
                "enableQueryCacheForTextIndex": "true",
                "deriveNumDocsPerChunkForRawIndex": "true"
               }
            },
            {
              "name": "cluster",
              "encodingType": "RAW",
              "indexType": "TEXT",
              "properties": {
                "enableQueryCacheForTextIndex": "true",
                "deriveNumDocsPerChunkForRawIndex": "true"
              }
            }],
        "streamConfigs": {
          "streamType": "kafka",
          "stream.kafka.consumer.type": "simple",
          "stream.kafka.topic.name": "all_logs",
          "stream.kafka.decoder.class.name": "org.apache.pinot.plugin.stream.kafka.KafkaJSONMessageDecoder",
          "stream.kafka.consumer.factory.class.name": "org.apache.pinot.plugin.stream.kafka20.KafkaConsumerFactory",
          "stream.kafka.zk.broker.url": "cp-zookeeper-headless.logging.svc.cluster.local:2181",
          "stream.kafka.broker.list": "cp-kafka-headless.logging.svc.cluster.local:9092",
          "realtime.segment.flush.threshold.time": "12h",
          "realtime.segment.flush.threshold.size": "100000",
          "stream.kafka.consumer.prop.auto.offset.reset": "smallest"
        }
      },
      "metadata": {
        "customConfigs": {}
      }
    }

  schema.json: 
    {
      "schemaName": "log",
      "dimensionFieldSpecs": [
        {
          "name": "log",
          "dataType": "STRING",
          "maxLength": "10000",
          "defaultNullValue": ""
        },
        {
          "name": "cluster",
          "dataType": "STRING",
          "defaultNullValue": ""
        },
        {
          "name": "container_name",
          "dataType": "STRING",
          "defaultNullValue": ""
        },
        {
          "name": "namespace_name",
          "dataType": "STRING",
          "defaultNullValue": ""
        },
        {
          "name": "pod_name",
          "dataType": "STRING",
          "defaultNullValue": ""
        },
        {
          "name": "host",
          "dataType": "STRING",
          "defaultNullValue": ""
        },
        {
          "name": "timemillis",
          "dataType": "LONG",
          "defaultNullValue": "100000"
        }
      ],
      "metricFieldSpecs": [],
      "timeFieldSpec": {
        "incomingGranularitySpec": {
          "timeType": "MINUTES",
          "dataType": "LONG",
          "timeFormat": "EPOCH",
          "name": "five_mins_epoch"
        },
        "outgoingGranularitySpec": {
          "dataType": "LONG",
          "timeType": "MINUTES",
          "timeFormat": "EPOCH",
          "name": "five_mins_epoch"
        }
      }
    }
pabrahamusa commented 3 years ago

Fixed it: the fieldConfigList was out of place. I had nested it inside tableIndexConfig, but it should be a top-level attribute of the table config.

pabrahamusa commented 3 years ago

Is it possible to have both an inverted index and a text index at the same time?

kishoreg commented 3 years ago

Not as of now. Nothing in the design prevents it, but we have added some artificial checks. Why would you want this?

pabrahamusa commented 3 years ago

I am assuming regexp_like() will not work with a text index, so it would be nice if we could use both. I am also thinking of moving some fields to an inverted index and others to a text index.