huizezhang-sherry / cubble

A tidy structure for spatio-temporal vector data
https://huizezhang-sherry.github.io/cubble/
Other
55 stars 9 forks source link

Error on mismatched sizes in making a cubble #9

Closed dicook closed 1 year ago

dicook commented 1 year ago
library(tidyverse)
library(tsibble)
library(cubble)
library(ozmaps)

# Read and handle missing
covid <- read_csv("https://raw.githubusercontent.com/numbats/eda/master/data/melb_lga_covid.csv") %>%
  mutate(Buloke = as.numeric(ifelse(Buloke == "null", "0", Buloke))) %>%
  mutate(Hindmarsh = as.numeric(ifelse(Hindmarsh == "null", "0", Hindmarsh))) %>%
   mutate(Towong = as.numeric(ifelse(Towong == "null", "0", Towong))) %>%
  pivot_longer(cols = Alpine:Yarriambiack, names_to="NAME", values_to="cases") %>%
  mutate(Date = ydm(paste0("2020/",Date))) %>%
  mutate(cases=replace_na(cases, 0))

# Create a tsibble
covid_ts <- as_tsibble(covid, key=NAME, index=Date)

# Get spatial polygons from ozmaps
data("abs_lga")

# Create cubble
covid_cb <- as_cubble(
  list(spatial = abs_lga, temporal = covid_ts),
  key = NAME, index = Date, 
  output = "unmatch")

gives the following error:

Error:
! Tibble columns must have compatible sizes.
• Size 84: Existing data.
• Size 78: Column `temporal`.
ℹ Only values of size one are recycled.
Run `rlang::last_error()` to see where the error occurred.

All the time series are of the same length according to my check.

dicook commented 1 year ago

Getting a bit further, but now finding that the time series are empty:

covid <- read_csv("https://raw.githubusercontent.com/numbats/eda/master/data/melb_lga_covid.csv") %>%
  mutate(Buloke = as.numeric(ifelse(Buloke == "null", "0", Buloke))) %>%
  mutate(Hindmarsh = as.numeric(ifelse(Hindmarsh == "null", "0", Hindmarsh))) %>%
   mutate(Towong = as.numeric(ifelse(Towong == "null", "0", Towong))) %>%
  pivot_longer(cols = Alpine:Yarriambiack, names_to="NAME", values_to="cases") %>%
  mutate(Date = ydm(paste0("2020/",Date))) %>%
  mutate(cases=replace_na(cases, 0))

covid <- covid %>%
  group_by(NAME) %>%
  mutate(new_cases = cases - dplyr::lag(cases))

covid <- covid %>%
  select(-cases) %>%
  rename(lga = NAME, date=Date, cases = new_cases) 
covid_ts <- as_tsibble(covid, key=lga, index=date)

lga <- strayr::read_absmap("lga2018") |>
  rename(lga = lga_name_2018) |>
  filter(state_name_2016 == "Victoria") 

covid_matching <- as_cubble(
  list(spatial = lga, temporal = covid_ts),
  key = lga, index = date, 
  output = "unmatch")

lga <- lga %>% 
  mutate(lga = ifelse(lga == "Colac-Otway (S)", "Colac Otway (S)", lga)) 
covid_matching <- as_cubble(
  list(spatial = lga, temporal = covid_ts),
  key = lga, index = date, 
  output = "unmatch")
covid_matching$paired %>% print(n=80)
covid_matching$others
lga <- lga %>%
  filter(!(lga %in% covid_matching$others$spatial))

covid_cb <- as_cubble(
  list(spatial = lga, temporal = covid_ts),
  key = lga, index = date, coords = c(cent_long, cent_lat))

The cubble happily is created without any errors, but the time series component is empty:

> covid_cb
Error in data$ts[ts_size][[1]] : subscript out of bounds
huizezhang-sherry commented 1 year ago

Hi Di, this is me reproducing your example after some changes in the cubble code. Could you check if this meets your need?

library(tidyverse)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union
library(cubble)
library(tsibble)
#> 
#> Attaching package: 'tsibble'
#> The following objects are masked from 'package:cubble':
#> 
#>     index, key_data, key_vars
#> The following object is masked from 'package:lubridate':
#> 
#>     interval
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, union
covid <- read_csv("https://raw.githubusercontent.com/numbats/eda/master/data/melb_lga_covid.csv") %>%
  mutate(Buloke = as.numeric(ifelse(Buloke == "null", "0", Buloke))) %>%
  mutate(Hindmarsh = as.numeric(ifelse(Hindmarsh == "null", "0", Hindmarsh))) %>%
  mutate(Towong = as.numeric(ifelse(Towong == "null", "0", Towong))) %>%
  pivot_longer(cols = Alpine:Yarriambiack, names_to="NAME", values_to="cases") %>%
  mutate(Date = ydm(paste0("2020/",Date))) %>%
  mutate(cases=replace_na(cases, 0))
#> Rows: 112 Columns: 80
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr  (4): Date, Buloke, Hindmarsh, Towong
#> dbl (76): Alpine, Ararat, Ballarat, Banyule, Bass Coast, Baw Baw, Bayside, B...
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

covid <- covid %>%
  group_by(NAME) %>%
  mutate(new_cases = cases - dplyr::lag(cases)) %>% 
  select(-cases) %>%
  rename(lga = NAME, date=Date, cases = new_cases) 
covid_ts <- as_tsibble(covid, key=lga, index=date)

lga <- strayr::read_absmap("lga2018") |>
  rename(lga = lga_name_2018) |>
  filter(state_name_2016 == "Victoria") 

covid_matching <- as_cubble(
  list(spatial = lga, temporal = covid_ts),
  key = lga, index = date,coords = c(cent_long, cent_lat))
#> ! Some sites in the temporal table don't have spatial information
#> ! Some sites in the spatial table don't have temporal information
#> ! Use argument `output = "unmatch"` to check on the unmatched key

# instructed from above to use `output = "unmatch"` to check on matching
covid_matching <- as_cubble(
  list(spatial = lga, temporal = covid_ts),
  key = lga, index = date,coords = c(cent_long, cent_lat), 
  output = "unmatch")
covid_matching$paired
#> # A tibble: 78 × 2
#>    spatial        temporal  
#>    <chr>          <chr>     
#>  1 Alpine (S)     Alpine    
#>  2 Ararat (RC)    Ararat    
#>  3 Ballarat (C)   Ballarat  
#>  4 Banyule (C)    Banyule   
#>  5 Bass Coast (S) Bass Coast
#>  6 Baw Baw (S)    Baw Baw   
#>  7 Bayside (C)    Bayside   
#>  8 Benalla (RC)   Benalla   
#>  9 Boroondara (C) Boroondara
#> 10 Brimbank (C)   Brimbank  
#> # … with 68 more rows
covid_matching$others
#> $temporal
#> [1] "Colac Otway"
#> 
#> $spatial
#> [1] "Colac-Otway (S)"                       
#> [2] "Unincorporated Vic"                    
#> [3] "No usual address (Vic.)"               
#> [4] "Migratory - Offshore - Shipping (Vic.)"

# modify the lga suggested by the matching summary
lga <- lga %>% 
  mutate(lga = ifelse(lga == "Colac-Otway (S)", "Colac Otway", lga)) %>% 
  filter(!(lga %in% covid_matching$others$spatial))

# create a cubble again 
covid_cb <- as_cubble(
  list(spatial = lga, temporal = covid_ts),
  key = lga, index = date, coords = c(cent_long, cent_lat))
covid_cb
#> # cubble:   lga [78]: nested form [sf]
#> # bbox:     [141.27, -38.63, 148.3, -34.86]
#> # temporal: date [date], cases [dbl]
#>    lga_code_2018 state_code_2016 state_name_2016 areasqkm_2018 cent_long
#>    <chr>         <chr>           <chr>                   <dbl>     <dbl>
#>  1 20110         2               Victoria               4788.       147.
#>  2 20260         2               Victoria               4211.       143.
#>  3 20570         2               Victoria                739.       144.
#>  4 20660         2               Victoria                 62.5      145.
#>  5 20740         2               Victoria                866.       146.
#>  6 20830         2               Victoria               4028.       146.
#>  7 20910         2               Victoria                 37.2      145.
#>  8 21010         2               Victoria               2353.       146.
#>  9 21110         2               Victoria                 60.2      145.
#> 10 21180         2               Victoria                123.       145.
#> # … with 68 more rows, and 4 more variables: cent_lat <dbl>, lga <chr>,
#> #   ts <list>, geometry <MULTIPOLYGON [°]>

Created on 2022-09-19 by the reprex package (v2.0.1)

dicook commented 1 year ago

Yes, this works! Great, I like it and makes it much easier for the user now.