SELECT distinct - seems to only work on the source data and not on the results as expected/documented? or am i misunderstanding. Also does not work in sub-querry
example:
Using S3A connector through HIVE / Parquete tables.
select count(*) from minio.pedw.c920 --> 189,473
select count(distinct alf_pe) from minio.pedw.c920; --> 71,628
select distinct alf_pe from minio.pedw.c920; --> 189k rows but should be 71k rows
also as sub-query
create table minio.pedw.person_age as (
with t1 as (
select distinct
alf_e,
date_diff('year',wob,current_date) as person_age,
gndr_cd
from minio.pedw.sailf
)
select t1.*,
case
when t1.person_age <= 16 then 1
when t1.person_age > 16 and t1.person_age <= 30 then 2
when t1.person_age > 30 and t1.person_age <= 55 then 3
else 4
end as person_age_band
from t1;
)
person_age - 1.47PB rows - should be 4.7M
works if i create an intermediate table first
create table minio.pedw.person_age1 as (
select
alf_e,
date_diff('year',wob,current_date) as person_age,
gndr_cd
from minio.pedw.sailf ) ;
create table minio.pedw.person_age2 as
select distinct *,
case
when person_age <= 16 then 1
when person_age > 16 and person_age <= 30 then 2
when person_age > 30 and person_age <= 55 then 3
else 4
end as person_age_band
from minio.pedw.person_age1;
person_age1 - 1.47PB rows
person_age2 - 4.7M rows as should be
SELECT distinct - seems to only work on the source data and not on the results as expected/documented? or am i misunderstanding. Also does not work in sub-querry
example: Using S3A connector through HIVE / Parquete tables. select count(*) from minio.pedw.c920 --> 189,473 select count(distinct alf_pe) from minio.pedw.c920; --> 71,628 select distinct alf_pe from minio.pedw.c920; --> 189k rows but should be 71k rows
also as sub-query
person_age - 1.47PB rows - should be 4.7M
works if i create an intermediate table first
person_age1 - 1.47PB rows person_age2 - 4.7M rows as should be