NYCPlanning / db-safegraph

SafeGraph ETL
4 stars 1 forks source link

Sector numbers do not equal the total of all the subsectors in that sector for Zip code files (daily_zip_poivisits_by_sector and daily_zip_poivisits_by_subsector) #88

Open AmandaDoyle opened 3 years ago

AmandaDoyle commented 3 years ago

Example: For 6/30/20, the sector file contains 179 visits for sector 72, ZIP code 10001 in MN. The subsector file contains 1992 visits for subsector 722 for that ZIP code.

mgraber commented 3 years ago

Confirmed that this problem is for several dates, and also applies to borough aggregations. For example:

-- Check for differences between sector and subsector visits by boro

WITH daily_visits AS(
SELECT safegraph_place_id, poi_cbg, date_add('day', row_number() over(partition by safegraph_place_id), date_start) AS date_current, CAST(visits AS SMALLINT) as visits
FROM (
  SELECT
     safegraph_place_id,
     poi_cbg,
     CAST(SUBSTR(date_range_start, 1, 10) AS DATE) as date_start,
     CAST(SUBSTR(date_range_end, 1, 10) AS DATE) as date_end,
     cast(json_parse(visits_by_day) as array<varchar>) as a
  FROM safegraph.weekly_patterns
  WHERE SUBSTR(poi_cbg,1,5) IN ('36085','36081','36061','36047','36005')
  AND CAST('2020-10-01' AS DATE) < dt
  AND CAST('2020-11-01' AS DATE) > dt
) b
CROSS JOIN UNNEST(a) as t(visits)
),

sectors AS(
SELECT
   a.date_current as date,
   (CASE WHEN SUBSTR(a.poi_cbg,1,5) = '36005' THEN 'BX'
        WHEN SUBSTR(a.poi_cbg,1,5) = '36047' THEN 'BK'
        WHEN SUBSTR(a.poi_cbg,1,5) = '36061' THEN 'MN'
        WHEN SUBSTR(a.poi_cbg,1,5) = '36081' THEN 'QN'
        WHEN SUBSTR(a.poi_cbg,1,5) = '36085' THEN 'SI'
   END) as borough,
   (CASE WHEN SUBSTR(a.poi_cbg,1,5) = '36005' THEN 2
        WHEN SUBSTR(a.poi_cbg,1,5) = '36047' THEN 3
        WHEN SUBSTR(a.poi_cbg,1,5) = '36061' THEN 1
        WHEN SUBSTR(a.poi_cbg,1,5) = '36081' THEN 4
        WHEN SUBSTR(a.poi_cbg,1,5) = '36085' THEN 5
   END) as borocode,
   SUBSTR(a.poi_cbg,1,5) as fips_county,
   SUBSTR(b.naics_code,1,2) as sector,
   SUM(a.visits) as total_visits
FROM daily_visits a
LEFT JOIN (
      SELECT distinct safegraph_place_id, naics_code
      FROM "safegraph"."core_poi"
      WHERE region = 'NY' AND dt = CAST('2020-10-01' AS DATE)
    ) b  
    ON a.safegraph_place_id=b.safegraph_place_id
GROUP BY a.date_current,
         SUBSTR(a.poi_cbg,1,5),
         SUBSTR(b.naics_code,1,2)
ORDER BY date, borocode),

subsectors AS(
SELECT
   a.date_current as date,
   (CASE WHEN SUBSTR(a.poi_cbg,1,5) = '36005' THEN 'BX'
        WHEN SUBSTR(a.poi_cbg,1,5) = '36047' THEN 'BK'
        WHEN SUBSTR(a.poi_cbg,1,5) = '36061' THEN 'MN'
        WHEN SUBSTR(a.poi_cbg,1,5) = '36081' THEN 'QN'
        WHEN SUBSTR(a.poi_cbg,1,5) = '36085' THEN 'SI'
   END) as borough,
   (CASE WHEN SUBSTR(a.poi_cbg,1,5) = '36005' THEN 2
        WHEN SUBSTR(a.poi_cbg,1,5) = '36047' THEN 3
        WHEN SUBSTR(a.poi_cbg,1,5) = '36061' THEN 1
        WHEN SUBSTR(a.poi_cbg,1,5) = '36081' THEN 4
        WHEN SUBSTR(a.poi_cbg,1,5) = '36085' THEN 5
   END) as borocode,
   SUBSTR(a.poi_cbg,1,5) as fips_county,
   SUBSTR(b.naics_code,1,3) as subsector,
   SUM(a.visits) as total_visits
FROM daily_visits a
LEFT JOIN (
      SELECT distinct safegraph_place_id, naics_code
      FROM "safegraph"."core_poi"
      WHERE region = 'NY' AND dt = CAST('2020-10-01' AS DATE)
    ) b  
    ON a.safegraph_place_id=b.safegraph_place_id
GROUP BY a.date_current,
         SUBSTR(a.poi_cbg,1,5),
         SUBSTR(b.naics_code,1,3)
ORDER BY date, borocode)

SELECT a.borough, a.date, a.sector, a.total_visits as sector_visits, SUM(b.total_visits) as sum_subsector_visits,
ABS(SUM(b.total_visits) - a.total_visits) as diff
FROM sectors a
JOIN subsectors b
ON a.sector = SUBSTR(b.subsector,1,2)
AND a.date = b.date
AND a.borough = b.borough
GROUP BY a.borough, a.date, a.sector, a.total_visits
ORDER BY diff DESC