trinodb / trino

Official repository of Trino, the distributed SQL query engine for big data, formerly known as PrestoSQL (https://trino.io)
https://trino.io
Apache License 2.0
10.08k stars 2.91k forks source link

HIVE_CANNOT_OPEN_SPLIT when reading AVRO with decimal fields. #21728

Open calloc opened 4 months ago

calloc commented 4 months ago

I ran the query on a Trino engine and by omitting decimal fields the query works. It used to work fine before 21 Apr, 2024. I suspect some changes

the query is very simple

SELECT * FROM test_table LIMIT 1 -- does NOT work 

while

-- does WORK
SELECT
    _timestamp 
    , driverid
    , course
    , horizontalaccuracy
    , locationtimestamp
    , provider
    , speed
    -- , loc_lat
    -- , loc_lon
    -- , raw_loc_lat
    -- , raw_loc_lon
    , raw_course
    , is_interpolated
    , is_cold_start
    , loc_provider
    , course_provider
    , vehicleid
    , vcuid
    , plate
    , infoupdatedat
    , _id
    , _index
    , synced_at
    , load_date
    , driveareaid
    , data_date
    , data_hour
FROM test_table 
LIMIT 1

The error message is

HIVE_CANNOT_OPEN_SPLIT: Avro type resolution error when initializing split from s3://....

Attached link is the sample avro files https://drive.google.com/file/d/1h-jj8SjW4Ob9Xcf-hesQbhM6HL7GbnCE/view?usp=sharing

and the table is created using this query

CREATE EXTERNAL TABLE `test_table`(
  `_timestamp` timestamp COMMENT 'from deserializer', 
  `driverid` string COMMENT 'from deserializer', 
  `course` double COMMENT 'from deserializer', 
  `horizontalaccuracy` double COMMENT 'from deserializer', 
  `locationtimestamp` timestamp COMMENT 'from deserializer', 
  `provider` string COMMENT 'from deserializer', 
  `speed` double COMMENT 'from deserializer', 
  `loc_lat` binary COMMENT 'from deserializer', 
  `loc_lon` binary COMMENT 'from deserializer', 
  `raw_loc_lat` binary COMMENT 'from deserializer', 
  `raw_loc_lon` binary COMMENT 'from deserializer', 
  `raw_course` double COMMENT 'from deserializer', 
  `is_interpolated` int COMMENT 'from deserializer', 
  `is_cold_start` int COMMENT 'from deserializer', 
  `loc_provider` string COMMENT 'from deserializer', 
  `course_provider` string COMMENT 'from deserializer', 
  `vehicleid` string COMMENT 'from deserializer', 
  `vcuid` string COMMENT 'from deserializer', 
  `plate` string COMMENT 'from deserializer', 
  `infoupdatedat` timestamp COMMENT 'from deserializer', 
  `_id` string COMMENT 'from deserializer', 
  `_index` string COMMENT 'from deserializer', 
  `synced_at` timestamp COMMENT 'from deserializer')
PARTITIONED BY ( 
  `load_date` string, 
  `driveareaid` string, 
  `data_date` string, 
  `data_hour` string)
ROW FORMAT SERDE 
  'org.apache.hadoop.hive.serde2.avro.AvroSerDe' 
WITH SERDEPROPERTIES ( 
  'avro.schema.literal'='{\"type\":\"record\",\"name\":\"topLevelRecord\",\"fields\":[{\"name\":\"_timestamp\",\"type\":[{\"type\":\"long\",\"logicalType\":\"timestamp-millis\"},\"null\"]},{\"name\":\"driverId\",\"type\":[\"string\",\"null\"]},{\"name\":\"course\",\"type\":[\"double\",\"null\"]},{\"name\":\"horizontalAccuracy\",\"type\":[\"double\",\"null\"]},{\"name\":\"locationTimestamp\",\"type\":[{\"type\":\"long\",\"logicalType\":\"timestamp-millis\"},\"null\"]},{\"name\":\"provider\",\"type\":[\"string\",\"null\"]},{\"name\":\"speed\",\"type\":[\"double\",\"null\"]},{\"name\":\"loc_lat\",\"type\":[{\"type\":\"fixed\",\"name\":\"fixed\",\"namespace\":\"topLevelRecord.loc_lat\",\"size\":9,\"logicalType\":\"decimal\",\"precision\":20,\"scale\":15},\"null\"]},{\"name\":\"loc_lon\",\"type\":[{\"type\":\"fixed\",\"name\":\"fixed\",\"namespace\":\"topLevelRecord.loc_lon\",\"size\":9,\"logicalType\":\"decimal\",\"precision\":20,\"scale\":15},\"null\"]},{\"name\":\"raw_loc_lat\",\"type\":[{\"type\":\"fixed\",\"name\":\"fixed\",\"namespace\":\"topLevelRecord.raw_loc_lat\",\"size\":9,\"logicalType\":\"decimal\",\"precision\":20,\"scale\":15},\"null\"]},{\"name\":\"raw_loc_lon\",\"type\":[{\"type\":\"fixed\",\"name\":\"fixed\",\"namespace\":\"topLevelRecord.raw_loc_lon\",\"size\":9,\"logicalType\":\"decimal\",\"precision\":20,\"scale\":15},\"null\"]},{\"name\":\"raw_course\",\"type\":[\"double\",\"null\"]},{\"name\":\"is_interpolated\",\"type\":[\"int\",\"null\"]},{\"name\":\"is_cold_start\",\"type\":[\"int\",\"null\"]},{\"name\":\"loc_provider\",\"type\":[\"string\",\"null\"]},{\"name\":\"course_provider\",\"type\":[\"string\",\"null\"]},{\"name\":\"vehicleId\",\"type\":[\"string\",\"null\"]},{\"name\":\"vcuId\",\"type\":[\"string\",\"null\"]},{\"name\":\"plate\",\"type\":[\"string\",\"null\"]},{\"name\":\"infoUpdatedAt\",\"type\":[{\"type\":\"long\",\"logicalType\":\"timestamp-millis\"},\"null\"]},{\"name\":\"_id\",\"type\":[\"string\",\"null\"]},{\"name\":\"_index\",\"type\":[\"string\",\"null\"]},{\"name\":\"synced_at\",\"type\":[{\"type\":\"long\",\"logicalType\":\"timestamp-millis\"},\"null\"]}]}') 
STORED AS INPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' 
OUTPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION
  's3://test_bucket/test_table/'
TBLPROPERTIES (
  'CrawlerSchemaDeserializerVersion'='1.0', 
  'CrawlerSchemaSerializerVersion'='1.0', 
  'UPDATED_BY_CRAWLER'='test_table', 
  'averageRecordSize'='193', 
  'avro.schema.literal'='{\"type\":\"record\",\"name\":\"topLevelRecord\",\"fields\":[{\"name\":\"_timestamp\",\"type\":[{\"type\":\"long\",\"logicalType\":\"timestamp-millis\"},\"null\"]},{\"name\":\"driverId\",\"type\":[\"string\",\"null\"]},{\"name\":\"course\",\"type\":[\"double\",\"null\"]},{\"name\":\"horizontalAccuracy\",\"type\":[\"double\",\"null\"]},{\"name\":\"locationTimestamp\",\"type\":[{\"type\":\"long\",\"logicalType\":\"timestamp-millis\"},\"null\"]},{\"name\":\"provider\",\"type\":[\"string\",\"null\"]},{\"name\":\"speed\",\"type\":[\"double\",\"null\"]},{\"name\":\"loc_lat\",\"type\":[{\"type\":\"fixed\",\"name\":\"fixed\",\"namespace\":\"topLevelRecord.loc_lat\",\"size\":9,\"logicalType\":\"decimal\",\"precision\":20,\"scale\":15},\"null\"]},{\"name\":\"loc_lon\",\"type\":[{\"type\":\"fixed\",\"name\":\"fixed\",\"namespace\":\"topLevelRecord.loc_lon\",\"size\":9,\"logicalType\":\"decimal\",\"precision\":20,\"scale\":15},\"null\"]},{\"name\":\"raw_loc_lat\",\"type\":[{\"type\":\"fixed\",\"name\":\"fixed\",\"namespace\":\"topLevelRecord.raw_loc_lat\",\"size\":9,\"logicalType\":\"decimal\",\"precision\":20,\"scale\":15},\"null\"]},{\"name\":\"raw_loc_lon\",\"type\":[{\"type\":\"fixed\",\"name\":\"fixed\",\"namespace\":\"topLevelRecord.raw_loc_lon\",\"size\":9,\"logicalType\":\"decimal\",\"precision\":20,\"scale\":15},\"null\"]},{\"name\":\"raw_course\",\"type\":[\"double\",\"null\"]},{\"name\":\"is_interpolated\",\"type\":[\"int\",\"null\"]},{\"name\":\"is_cold_start\",\"type\":[\"int\",\"null\"]},{\"name\":\"loc_provider\",\"type\":[\"string\",\"null\"]},{\"name\":\"course_provider\",\"type\":[\"string\",\"null\"]},{\"name\":\"vehicleId\",\"type\":[\"string\",\"null\"]},{\"name\":\"vcuId\",\"type\":[\"string\",\"null\"]},{\"name\":\"plate\",\"type\":[\"string\",\"null\"]},{\"name\":\"infoUpdatedAt\",\"type\":[{\"type\":\"long\",\"logicalType\":\"timestamp-millis\"},\"null\"]},{\"name\":\"_id\",\"type\":[\"string\",\"null\"]},{\"name\":\"_index\",\"type\":[\"string\",\"null\"]},{\"name\":\"synced_at\",\"type\":[{\"type\":\"long\",\"logicalType\":\"timestamp-millis\"},\"null\"]}]}', 
  'classification'='avro', 
  'compressionType'='none', 
  'objectCount'='42313', 
  'partition_filtering.enabled'='true', 
  'recordCount'='722328740', 
  'sizeKey'='159416670061', 
  'typeOfData'='file')
findinpath commented 3 months ago

cc @jklamer

jklamer commented 3 months ago

@calloc any chance you have the stack trace for the Avro type resolution error exception? That should give us an idea what the issue may be.