poissonconsulting / newdata

An R Package to Generate New Data Frames for Prediction
https://poissonconsulting.github.io/newdata/
Other
2 stars 1 forks source link

Add function to manually add factor, ordered, date, time etc as text that inherits levels, tz etc of existing column? #17

Open joethorley opened 1 year ago

joethorley commented 1 year ago

This would be really helpful when manually setting values.

joethorley commented 12 months ago

It could be named something like xreplace_values() and would be a non-empty vector of unique values.

joethorley commented 12 months ago

@krlmlr and @aylapear - for discussion on Thursday

krlmlr commented 12 months ago

Can you please sketch, for some simple input, what the desired output should look like?

joethorley commented 12 months ago

This is what I am imagining for factors

library(tibble)
library(newdata)

data <- tibble(fct = factor(c("a", "d"), levels = c("a", "b", "c", "d")))
data
# A tibble: 2 × 1
fct  
<fct>
1 a    
2 d    

xnew_data(data, xreplace_values(fct, "c"))
# A tibble: 2 × 1
fct  
<fct>
1 c

xnew_data(data, xreplace_values(fct, c("c", "d")))
# A tibble: 2 × 1
fct  
<fct>
  1 c
  2 d

xnew_data(data, xreplace_values(fct, c("c", "d", "d")))
  # A tibble: 2 × 1
fct  
<fct>
  1 c    
  2 d    

xnew_data(data, xreplace_values(fct, c("e")))
> warning factor level unrecognized
# A tibble: 2 × 1
fct  
<fct>
  1 NA

xnew_data(data, xreplace_values(fct, c("e", "f")))
> warning factor level unrecognized
# A tibble: 2 × 1
fct  
<fct>
  1 NA

xnew_data(data, xreplace_values(fct, c("e", "c")))
> warning factor level unrecognized
# A tibble: 2 × 1
fct  
<fct>
  1 NA
  2 c    
joethorley commented 12 months ago
library(tibble)
library(newdata)

data <- newdata::obs_data
print(data)
# lgl     int   dbl chr      fct      ord      dte        dtt                 hms   
# <lgl> <int> <dbl> <chr>    <fct>    <ord>    <date>     <dttm>              <time>
# 1 TRUE      1   1   most     most     most     1970-01-02 1969-12-31 16:00:01 00'01"
# 2 FALSE     4   4.5 most     most     most     1970-01-05 1969-12-31 16:00:04 00'04"
# 3 NA        6   8.2 a rarity a rarity a rarity 1970-01-07 1969-12-31 16:00:06 00'06"

xnew_data(data)
# # A tibble: 1 × 9
# lgl     int   dbl chr   fct     ord      dte        dtt                 hms   
# <lgl> <int> <dbl> <chr> <fct>   <ord>    <date>     <dttm>              <time>
# 1 FALSE     3  4.57 most  not obs a rarity 1970-01-04 1969-12-31 16:00:03 00'03"

# this replaces dte with integer -100
xnew_data(data, dte = 1L)
# # A tibble: 1 × 9
#   lgl     int   dbl chr   fct     ord        dte dtt                 hms   
#   <lgl> <int> <dbl> <chr> <fct>   <ord>    <int> <dttm>              <time>
# 1 FALSE     3  4.57 most  not obs a rarity     -100 1969-12-31 16:00:03 00'03"

# whereas this replaces dte with as.Date("2020-03-04)
xnew_data(data, dte = xreplace_values(1L))
# # A tibble: 1 × 9
#   lgl     int   dbl chr   fct     ord        dte dtt                 hms   
#   <lgl> <int> <dbl> <chr> <fct>   <ord>    <int> <dttm>              <time>
# 1 FALSE     3  4.57 most  not obs a rarity  2020-03-04 1969-12-31 16:00:03 00'03"

# whereas this replaces dte with as.Date(c(-100L, -200L))
xnew_data(data, dte = xreplace_values(1L))
# # A tibble: 1 × 9
#   lgl     int   dbl chr   fct     ord        dte dtt                 hms   
#   <lgl> <int> <dbl> <chr> <fct>   <ord>    <int> <dttm>              <time>
# 1 FALSE     3  4.57 most  not obs a rarity  1969-09-23 1969-12-31 16:00:03 00'03"
# 2 FALSE     3  4.57 most  not obs a rarity  1969-06-15 1969-12-31 16:00:03 00'03"

It would work the same in the sense that it attempts to coerce to the type but in the case of factors preserves the levels and date times preserves the time zone.

Maybe a name like xcoerce_values() would be better?

joethorley commented 12 months ago

I'm not sure whether to sort?

krlmlr commented 11 months ago

Not sure about sorting, can be done by the user? They might want a particular order too.

krlmlr commented 11 months ago
library(newdata)
library(vctrs)
library(tidyr)

obs_data
#> # A tibble: 3 × 9
#>   lgl     int   dbl chr      fct     ord   dte        dtt                 hms   
#>   <lgl> <int> <dbl> <chr>    <fct>   <ord> <date>     <dttm>              <time>
#> 1 TRUE      1   1   most     most    most  1970-01-02 1969-12-31 16:00:01 00'01"
#> 2 FALSE     4   4.5 most     most    most  1970-01-05 1969-12-31 16:00:04 00'04"
#> 3 NA        6   8.2 a rarity a rari… a ra… 1970-01-07 1969-12-31 16:00:06 00'06"

# Works today:

obs_data |>
  xnew_data(dte = vec_cast(Sys.Date(), dte))
#> # A tibble: 1 × 9
#>   lgl     int   dbl chr   fct     ord      dte        dtt                 hms   
#>   <lgl> <int> <dbl> <chr> <fct>   <ord>    <date>     <dttm>              <time>
#> 1 FALSE     3  4.57 most  not obs a rarity 2023-10-18 1969-12-31 16:00:03 00'03"

obs_data |>
  xnew_data(dtt = vec_cast(Sys.time(), dtt))
#> # A tibble: 1 × 9
#>   lgl     int   dbl chr   fct     ord      dte        dtt                 hms   
#>   <lgl> <int> <dbl> <chr> <fct>   <ord>    <date>     <dttm>              <time>
#> 1 FALSE     3  4.57 most  not obs a rarity 1970-01-04 2023-10-18 09:31:28 00'03"

obs_data |>
  xnew_data(nesting(fct = vec_cast("most", fct)))
#> # A tibble: 1 × 9
#>   lgl     int   dbl chr   fct   ord      dte        dtt                 hms   
#>   <lgl> <int> <dbl> <chr> <fct> <ord>    <date>     <dttm>              <time>
#> 1 FALSE     3  4.57 most  most  a rarity 1970-01-04 1969-12-31 16:00:03 00'03"

# Goal:

obs_data |>
  xnew_data(vec_cast(Sys.Date(), dte))
#> # A tibble: 1 × 9
#>   lgl     int   dbl chr   fct     ord      dte        dtt                 hms   
#>   <lgl> <int> <dbl> <chr> <fct>   <ord>    <date>     <dttm>              <time>
#> 1 FALSE     3  4.57 most  not obs a rarity 2023-10-18 1969-12-31 16:00:03 00'03"

obs_data |>
  xnew_data(vec_cast(Sys.time(), dtt))
#> # A tibble: 1 × 9
#>   lgl     int   dbl chr   fct     ord      dte        dtt                 hms   
#>   <lgl> <int> <dbl> <chr> <fct>   <ord>    <date>     <dttm>              <time>
#> 1 FALSE     3  4.57 most  not obs a rarity 1970-01-04 2023-10-18 09:31:28 00'03"

obs_data |>
  xnew_data(nesting(vec_cast("most", fct)))
#> # A tibble: 1 × 9
#>   lgl     int   dbl chr   fct   ord      dte        dtt                 hms   
#>   <lgl> <int> <dbl> <chr> <fct> <ord>    <date>     <dttm>              <time>
#> 1 FALSE     3  4.57 most  most  a rarity 1970-01-04 1969-12-31 16:00:03 00'03"

Faked on 2023-10-18 with reprex v2.0.2

krlmlr commented 11 months ago
library(newdata)
library(vctrs)
library(tidyr)

xcoerce <- function(..., .data = xnew_data_env$data) {
  values <- list2(...)
  stopifnot(all(names2(values) != ""))

  stopifnot(all(names(values) %in% names(.data)))

  vec_cast(as_tibble(values), .data[names(values)])
}

environment(xcoerce) <- asNamespace("newdata")

obs_data |>
  xnew_data(xcoerce(
    dte = Sys.Date(),
    dtt = Sys.time(),
    fct = "most"
  ))
#> # A tibble: 1 × 9
#>   lgl     int   dbl chr   fct   ord      dte        dtt                 hms   
#>   <lgl> <int> <dbl> <chr> <fct> <ord>    <date>     <dttm>              <time>
#> 1 FALSE     3  4.57 most  most  a rarity 2023-10-18 2023-10-18 09:43:34 00'03"

Created on 2023-10-18 with reprex v2.0.2

joethorley commented 11 months ago

We seem to be getting the sorting for free and I am fine with that.