Open taiyang-li opened 14 hours ago
For comparison, if we make sure the written map column doesn't contain nulls, the issue no longer appears, and the written and read columns are consistent.
-- Comparison case for the reproduction: rebuild tmp.tnmpnn as a parquet table
-- in which every value placed into label_map is wrapped in coalesce(..., 0),
-- so no NULL is written into the map.
DROP TABLE IF EXISTS tmp.tnmpnn;

CREATE TABLE tmp.tnmpnn USING parquet AS
WITH generated_rows AS (
    -- One row per id produced by range(100000). Every column other than uid
    -- is NULL when random() < 0.1, otherwise floor(random() * 100).
    SELECT
        id AS uid,
        CASE WHEN random() < 0.1 THEN NULL ELSE floor(random() * 100) END AS rec_room_id,
        CASE WHEN random() < 0.1 THEN NULL ELSE floor(random() * 100) END AS room_id,
        CASE WHEN random() < 0.1 THEN NULL ELSE floor(random() * 100) END AS dispatch_id,
        CASE WHEN random() < 0.1 THEN NULL ELSE floor(random() * 100) END AS gift_value_total,
        CASE WHEN random() < 0.1 THEN NULL ELSE floor(random() * 100) END AS follow_channel,
        CASE WHEN random() < 0.1 THEN NULL ELSE floor(random() * 100) END AS follow_user,
        CASE WHEN random() < 0.1 THEN NULL ELSE floor(random() * 100) END AS followed_channel,
        CASE WHEN random() < 0.1 THEN NULL ELSE floor(random() * 100) END AS need_filter,
        CASE WHEN random() < 0.1 THEN NULL ELSE floor(random() * 100) END AS mic_time
    FROM range(100000)
)
SELECT
    uid,
    rec_room_id,
    room_id,
    dispatch_id,
    -- Assemble a 'key:value,key:value,...' string and hand it to str_to_map.
    -- No delimiters are passed, so str_to_map's defaults are relied upon.
    -- Note that need_filter is stored under the key 'filter'.
    str_to_map(
        concat(
            'gift_value_total:',  coalesce(gift_value_total, 0),
            ',follow_channel:',   coalesce(follow_channel, 0),
            ',follow_user:',      coalesce(follow_user, 0),
            ',followed_channel:', coalesce(followed_channel, 0),
            ',filter:',           coalesce(need_filter, 0),
            ',mic_time:',         coalesce(mic_time, 0)
        )
    ) AS label_map,
    -- mic_time is also written as its own column, without coalesce, so it can
    -- still be NULL here even though the map entry never is.
    mic_time
FROM generated_rows;
:) select count(1) from hdfs('hdfs://bigocluster/apps/hive/warehouse/tmp.db/tnmpnn/part*.parquet') where label_map is not null and toInt64OrZero(label_map['mic_time']) != mic_time;
SELECT count(1)
FROM hdfs('hdfs://bigocluster/apps/hive/warehouse/tmp.db/tnmpnn/part*.parquet')
WHERE (label_map IS NOT NULL) AND (toInt64OrZero(label_map['mic_time']) != mic_time)
Query id: 7d7d1518-6c24-4603-b0b7-8e5b6b61aa44
┌─count(1)─┐
1. │ 0 │
└──────────┘
1 row in set. Elapsed: 16.334 sec. Processed 100.00 thousand rows, 1.52 MB (6.12 thousand rows/s., 92.91 KB/s.)
Peak memory usage: 27.36 MiB.
Backend
CH (ClickHouse)
Bug description
Spark version
None
Spark configurations
No response
System information
No response
Relevant logs
No response