Per our conversation on Slack, it would be great if this package could process the `open_hours` field from SafeGraph (see here for spec). I had hoped to write up a PR but, having compared my amateurish attempt to the existing codebase, maybe it's better if I just supply the code I put together here and you decide how to proceed.
library(data.table)
library(SafeGraphR)
library(fst)
library(magrittr)
# Load Core POI data ----
# Read the Core POI CSV files from the delivery directory below
# (read_many_csvs() is presumably the SafeGraphR helper; the result is
# subset with data.table syntax on the next statement).
core_poi <- read_many_csvs(dir = "/data1/safegraph/core_poi/2020/11/06/11/")
# Limit to POI that give open hours, i.e. rows whose open_hours field is not
# the empty string.
open_hours_only <- core_poi[open_hours != ""]
convert_hour_str <- function(time_str, midnight_is_zero = TRUE) {
  # Convert "%H:%M" time strings to decimal hours, e.g. "08:15" -> 8.25.
  #
  # time_str:         character vector of clock times.
  # midnight_is_zero: if TRUE (default) midnight is returned as 0; if FALSE it
  #                   is returned as 24, which suits closing times.
  # Returns a numeric vector the same length as time_str; strings that do not
  # parse yield NA.

  # Parse in UTC so the result never depends on the session time zone, and
  # read the hour/minute fields straight off the POSIXlt object rather than
  # through hour()/minute() helpers that other attached packages can mask.
  time_POSIX <- as.POSIXlt(time_str, format = "%H:%M", tz = "UTC")
  result <- time_POSIX$hour + time_POSIX$min / 60
  if (!midnight_is_zero) {
    # which() skips the NA entries produced by unparseable strings.
    result[which(result == 0)] <- 24
  }
  result
}
convert_JSON_hours <- function(hours_clean) {
  # Convert one JSON string listing opening intervals into a data.table with
  # one row per key/interval and the columns dow, open and close (open and
  # close as decimal hours).
  #
  # hours_clean: a single JSON string. Its top-level keys become `dow`; each
  #   non-empty value is presumably a list of [open, close] "%H:%M" pairs
  #   (the V1/V2 rename below relies on exactly two columns per entry).
  # Returns a zero-row table with the same columns when no key has intervals.
  hour_list <- jsonlite::fromJSON(hours_clean) # This takes a long, long time.
  # Keep only keys that actually carry intervals.
  hour_list <- hour_list[lengths(hour_list) > 0]
  if (length(hour_list) == 0L) {
    # rbindlist() on an empty list yields no V1/V2 columns to rename, so the
    # empty result is built explicitly with the usual column types.
    return(data.table(dow = character(), open = numeric(), close = numeric()))
  }
  hour_dt <- rbindlist(lapply(hour_list, as.data.table), idcol = "dow")
  setnames(hour_dt, c("V1", "V2"), c("open", "close"))
  # A closing time of midnight means "end of day", hence 24 rather than 0.
  hour_dt[, `:=`(
    open = convert_hour_str(open),
    close = convert_hour_str(close, midnight_is_zero = FALSE)
  )]
  hour_dt
}
expand_hours <- function(dt) {
  # Expand the open_hours JSON column of `dt` into long format.
  #
  # dt: a data.table with (at least) the columns placekey and open_hours.
  # Returns a data.table with the columns placekey, dow, open, close: one row
  # per placekey / day key / opening interval. Rows of `dt` whose open_hours
  # is NA or the empty string contribute no output rows.

  # Blank or missing schedules are not valid JSON, so drop them up front.
  has_hours <- dt[!is.na(open_hours) & open_hours != "", .(placekey, open_hours)]
  if (nrow(has_hours) == 0L) {
    # Nothing to parse: return the empty result with the usual columns.
    return(data.table(
      placekey = has_hours$placekey,
      dow = character(),
      open = numeric(),
      close = numeric()
    ))
  }
  # To save on parsing time, parse each distinct open_hours value only once.
  unique_hours <- unique(has_hours[, .(open_hours)])
  # Remove extra escaped quotes
  unique_hours[, open_hours_clean := stringr::str_replace_all(open_hours, '\\"\\"', '\\"')]
  # Get a data.table where each obs is row-by-dow-open/close interval
  unique_hours_dt <- unique_hours[, convert_JSON_hours(open_hours_clean), by = open_hours]
  # Merge (M:M) back to the original rows; many placekeys can share a schedule
  # and each schedule has many intervals, hence allow.cartesian.
  dt_final <- merge(
    has_hours,
    unique_hours_dt,
    by = "open_hours",
    allow.cartesian = TRUE
  )
  dt_final[, .(placekey, dow, open, close)]
}
# Demo: expand a random sample of at most 100 POI. The sample size is capped
# at the number of rows because sample() errors when asked for more.
expanded_hours <- expand_hours(open_hours_only[sample(.N, min(.N, 100L))])
Per our conversation on Slack, it would be great if this package could process the `open_hours` field from SafeGraph (see here for spec). I had hoped to write up a PR but, having compared my amateurish attempt to the existing codebase, maybe it's better if I just supply the code I put together here and you decide how to proceed.