NYCPlanning / db-factfinder

data ETL for population fact finder (decennial + acs)
https://nycplanning.github.io/db-factfinder/factfinder/
MIT License
2 stars 3 forks source link

QAQC Summary Tables #208

Closed mgraber closed 2 years ago

mgraber commented 3 years ago

Queries for QAQC

SELECT 
    pff_variable,
    geotype,
    -- c
    SUM(c_diff_flag) as c_diffs,
    AVG(ABS(NULLIF(c_diff, 0))) as avg_abs_c_diff,
    MAX(ABS(c_diff)) as max_abs_c_diff,
    SUM(c_pff_null_flag) as c_ppf_null_sample_not,
    SUM(c_sample_null_flag) as c_sample_null_pff_not,
    -- e
    SUM(e_diff_flag) as e_diffs,
    AVG(ABS(NULLIF(e_diff, 0))) as avg_abs_e_diff,
    MAX(ABS(e_diff)) as max_abs_e_diff,
    SUM(e_pff_null_flag) as e_ppf_null_sample_not,
    SUM(e_sample_null_flag) as e_sample_null_pff_not,
    -- m
    SUM(m_diff_flag) as m_diffs,
    AVG(ABS(NULLIF(m_diff, 0))) as avg_abs_m_diff,
    MAX(ABS(m_diff)) as max_abs_m_diff,
    SUM(m_pff_null_flag) as m_ppf_null_sample_not,
    SUM(m_sample_null_flag) as m_sample_null_pff_not,
    -- p
    SUM(p_diff_flag) as p_diffs,
    AVG(ABS(NULLIF(p_diff, 0))) as avg_abs_p_diff,
    MAX(ABS(p_diff)) as max_abs_p_diff,
    SUM(p_pff_null_flag) as p_ppf_null_sample_not,
    SUM(p_sample_null_flag) as p_sample_null_pff_not,
    -- z
    SUM(z_diff_flag) as z_diffs,
    AVG(ABS(NULLIF(z_diff, 0))) as avg_abs_z_diff,
    MAX(ABS(z_diff)) as max_abs_z_diff,
    SUM(z_pff_null_flag) as z_ppf_null_sample_not,
    SUM(z_sample_null_flag) as z_sample_null_pff_not
FROM (
    SELECT
        a.labs_geoid,
        a.census_geoid,
        a.geotype,
        a.pff_variable,
        a."domain" as sample_domain,
        CASE WHEN a.c <> 0 THEN ((b.c - a.c)/a.c) * 100 ELSE NULL END as c_pct_diff,
        CASE WHEN a.e <> 0 THEN ((b.e - a.e)/a.e) * 100 ELSE NULL END as e_pct_diff,
        CASE WHEN a.m <> 0 THEN ((b.m - a.m)/a.m) * 100 ELSE NULL END as m_pct_diff,
        CASE WHEN a.p <> 0 THEN ((b.p - a.p)/a.p) * 100 ELSE NULL END as p_pct_diff,
        CASE WHEN a.z <> 0 THEN ((b.z - a.z)/a.z) * 100 ELSE NULL END as z_pct_diff,
        b.c-a.c as c_diff,
        b.e-a.e as e_diff,
        b.m-a.m as m_diff,
        b.p-a.p as p_diff,
        b.z-a.z as z_diff,
        (ABS(b.c-a.c) > 0)::int as c_diff_flag,
        (ABS(b.e-a.e) > 0)::int as e_diff_flag,
        (ABS(b.m-a.m) > 0)::int as m_diff_flag,
        (ABS(b.p-a.p) > 0)::int as p_diff_flag,
        (ABS(b.z-a.z) > 0)::int as z_diff_flag,
        (b.c IS NULL AND a.c IS NOT NULL)::int as c_pff_null_flag,
        (b.c IS NOT NULL AND a.c IS NULL)::int as c_sample_null_flag,
        (b.e IS NULL AND a.e IS NOT NULL)::int as e_pff_null_flag,
        (b.e IS NOT NULL AND a.e IS NULL)::int as e_sample_null_flag,
        (b.m IS NULL AND a.m IS NOT NULL)::int as m_pff_null_flag,
        (b.m IS NOT NULL AND a.m IS NULL)::int as m_sample_null_flag,
        (b.p IS NULL AND a.p IS NOT NULL)::int as p_pff_null_flag,
        (b.p IS NOT NULL AND a.p IS NULL)::int as p_sample_null_flag,
        (b.z IS NULL AND a.z IS NOT NULL)::int as z_pff_null_flag,
        (b.z IS NOT NULL AND a.z IS NULL)::int as z_sample_null_flag,
        a.c as c_sample, b.c as c_ff, 
        a.e as e_sample, b.e as e_ff, 
        a.m as m_sample, b.m as m_ff,
        a.p as p_sample, b.p as p_ff,
        a.z as z_sample, b.z as z_ff
    FROM pff_acs."sample-Y2019-G2010_to_2020" a
    JOIN pff_acs."staging-Y2019-G2010_to_2020" b
    ON a.labs_geoid = b.labs_geoid
    AND a.geotype = b.geotype
    AND a.pff_variable = b.pff_variable) compare
GROUP BY pff_variable, geotype
ORDER BY geotype;
SPTKL commented 3 years ago

Major value difference check

SELECT * FROM (
    select
        a.census_geoid,
        a.pff_variable,
        a.e, b.e as e_sample,
        (CASE WHEN b.e != 0 THEN abs((a.e-b.e)/b.e) END) as e_diff_pct,
        a.m, b.m as m_sample,
        (CASE WHEN b.m != 0 THEN abs((a.m-b.m)/b.m) END)as m_diff_pct,
        a.p, b.p as p_sample,
        (CASE WHEN b.p != 0 THEN abs((a.p-b.p)/b.p) END) as p_diff_pct,
        a.z, b.z as z_sample,
        (CASE WHEN b.z != 0 THEN abs((a.z-b.z)/b.z) END) as z_diff_pct
    from pff_acs."staging-Y2019-G2010_to_2020" a
    LEFT JOIN pff_acs."sample-Y2019-G2010_to_2020" b
    ON a.pff_variable = b.pff_variable
    AND a.census_geoid = b.census_geoid
) a WHERE e_diff_pct > 0.1 or m_diff_pct > 0.1 or p_diff_pct > 0.1 or z_diff_pct > 0.1
and a.pff_variable not like 'avg%';

Null checks

this one is for detecting places where we are missing values in e,m,p,z

SELECT * 
FROM (
select
a.census_geoid,
a.pff_variable,
a.e, b.e as e_sample,
a.m, b.m as m_sample,
a.p, b.p as p_sample,
a.z, b.z as z_sample
from pff_acs."staging-Y2010-G2010_to_2020" a
LEFT JOIN pff_acs."sample-Y2010-G2010_to_2020" b
ON a.pff_variable = b.pff_variable
AND a.census_geoid = b.census_geoid
) a 
WHERE 
(e IS NULL and e_sample is not NULL) OR 
(m IS NULL and m_sample is not NULL) OR 
(p IS NULL and p_sample is not NULL) OR 
(z IS NULL and z_sample is not NULL) 
;
mgraber commented 3 years ago

Variables that aren't flagged as not in profile in 2018 meta, but where we're getting non-NULL p and z & sample has NULL p & z for 2019:

Variables that aren in profile in 2010 meta, but where we're getting non-NULL p and z & sample has NULL p & z for 2010: