NYCPlanning / db-safegraph

SafeGraph ETL
4 stars 1 forks source link

OPS: Urgent Use Case #63

Open SPTKL opened 3 years ago

SPTKL commented 3 years ago

Objective:

Proposed schema adding onto poi/weekly_nyc_poivisits

-- poi info table
< poi info table > (
    safegraph_place_id text, 
    address text,
    poi_cbg text,
    naics_code varchar(6),
    latitude numeric,
    longitude numeric
    area_square_feet numeric,
    includes_parking_lot boolean,
    is_synthetic boolean,
    zonename text,
    zonecolor text
);

-- weekly visitation by poi
< weekly visitation > (
    safegraph_place_id text, 
    year_week text,
    poi text,
    poi_cbg text,
    visits_weekday numeric,
    visits_weekend numeric,
    visits_all numeric
);

-- cbg info table: the zone covering the majority of the cbg
< cbg zone lookup > (
    cbg text,
    zonename text,
    zonecolor text
);

-- weekly visitation by poi and visitor_home_cbg
< weekly visitation by visitor_home_cbg> (
    safegraph_place_id text, 
    year_week text,
    poi text,
    poi_cbg text,
    visitor_home_cbg text, 
    visits numeric
);

Data Loading

ETL Queries

POI Table with id, category, geom, square footage, zonename and zonecolor

SELECT
    a.safegraph_place_id, 
    a.naics_code, 
    a.top_category, 
    a.sub_category,
    a.latitude,
    a.longitude,
    a.area_square_feet,
    a.includes_parking_lot,
    a.is_synthetic,
    b.zonename,
    b.zonecolor
FROM (
SELECT 
    a.safegraph_place_id, 
    a.naics_code, 
    a.top_category, 
    a.sub_category,
    a.latitude,
    a.longitude,
    b.area_square_feet,
    b.includes_parking_lot,
    b.is_synthetic
FROM "safegraph"."core_poi" a
LEFT JOIN "safegraph"."geo_supplement" b
ON a.safegraph_place_id = b.safegraph_place_id
WHERE a.region = 'NY'
    AND a.dt = CAST('2020-10-01' AS DATE)
    AND b.dt = CAST('2020-08-17' AS DATE)
) a
LEFT JOIN "safegraph"."zones" b
ON ST_WITHIN(ST_POINT(a.longitude, a.latitude), ST_POLYGON(b.wkt))
mgraber commented 3 years ago

Weekly visitation by home cbg


WITH dataset AS(
SELECT date_start, safegraph_place_id, poi_cbg, map_keys(a) as origins, a
FROM (
  SELECT
     safegraph_place_id,
     poi_cbg,
     CAST(SUBSTR(date_range_start, 1, 10) AS DATE) as date_start,
     CAST(json_parse(visitor_home_cbgs) as map<varchar, varchar>) as a
  FROM safegraph.weekly_patterns
  WHERE SUBSTR(poi_cbg,1,5) IN ('36085','36081','36061','36047','36005')
  AND CAST('2020-10-01' AS DATE) < dt
  AND CAST('2020-10-20' AS DATE) > dt
) b
)

SELECT 
    safegraph_place_id,
    CAST(EXTRACT(year from date_start) AS VARCHAR)||'-'||LPAD(CAST(EXTRACT(week from date_start) AS VARCHAR),2,'0') as year_week,
    poi_cbg,
    visitor_home_cbg,
    CAST(a[visitor_home_cbg] AS SMALLINT) as visits
FROM dataset
CROSS JOIN unnest(origins) t(visitor_home_cbg)
mgraber commented 3 years ago

Weekly visitation by POI

WITH daily_visits AS(
SELECT safegraph_place_id, poi_cbg, date_add('day', row_number() over(partition by safegraph_place_id), date_start) AS date_current, CAST(visits AS SMALLINT) as visits
FROM (
  SELECT
     safegraph_place_id,
     poi_cbg,
     CAST(SUBSTR(date_range_start, 1, 10) AS DATE) as date_start,
     CAST(SUBSTR(date_range_end, 1, 10) AS DATE) as date_end,
     cast(json_parse(visits_by_day) as array<varchar>) as a
  FROM safegraph.weekly_patterns
  WHERE SUBSTR(poi_cbg,1,5) IN ('36085','36081','36061','36047','36005')
  AND CAST('2020-10-01' AS DATE) < dt
  AND CAST('2020-10-20' AS DATE) > dt
) b
CROSS JOIN UNNEST(a) as t(visits)
)
SELECT
   safegraph_place_id,
   CAST(EXTRACT(year from date_current) AS VARCHAR)||'-'||LPAD(CAST(EXTRACT(week from date_current) AS VARCHAR),2,'0') as year_week,
   poi_cbg,
   SUM(CASE WHEN EXTRACT(dow from date_current) NOT IN (0, 6) THEN visits END) as visits_weekday,
   SUM(CASE WHEN EXTRACT(dow from date_current) IN (0, 6) THEN visits END) as visits_weekend,
   SUM(visits) as visits_total
FROM daily_visits
GROUP BY EXTRACT(year from date_current), 
         EXTRACT(week from date_current),
         safegraph_place_id,
         poi_cbg
ORDER BY year_week, poi_cbg
SPTKL commented 3 years ago

Initial phase of implementation complete

SPTKL commented 3 years ago

70