Open joethorley opened 1 year ago
It could be named something like xreplace_values()
and would be a non-empty vector of unique values.
@krlmlr and @aylapear - for discussion on Thursday
Can you please sketch, for some simple input, what the desired output should look like?
This is what I am imagining for factors
library(tibble)
library(newdata)
data <- tibble(fct = factor(c("a", "d"), levels = c("a", "b", "c", "d")))
data
# A tibble: 2 × 1
fct
<fct>
1 a
2 d
xnew_data(data, xreplace_values(fct, "c"))
# A tibble: 2 × 1
fct
<fct>
1 c
xnew_data(data, xreplace_values(fct, c("c", "d")))
# A tibble: 2 × 1
fct
<fct>
1 c
2 d
xnew_data(data, xreplace_values(fct, c("c", "d", "d")))
# A tibble: 2 × 1
fct
<fct>
1 c
2 d
xnew_data(data, xreplace_values(fct, c("e")))
> warning factor level unrecognized
# A tibble: 2 × 1
fct
<fct>
1 NA
xnew_data(data, xreplace_values(fct, c("e", "f")))
> warning factor level unrecognized
# A tibble: 2 × 1
fct
<fct>
1 NA
xnew_data(data, xreplace_values(fct, c("e", "c")))
> warning factor level unrecognized
# A tibble: 2 × 1
fct
<fct>
1 NA
2 c
library(tibble)
library(newdata)
data <- newdata::obs_data
print(data)
# lgl int dbl chr fct ord dte dtt hms
# <lgl> <int> <dbl> <chr> <fct> <ord> <date> <dttm> <time>
# 1 TRUE 1 1 most most most 1970-01-02 1969-12-31 16:00:01 00'01"
# 2 FALSE 4 4.5 most most most 1970-01-05 1969-12-31 16:00:04 00'04"
# 3 NA 6 8.2 a rarity a rarity a rarity 1970-01-07 1969-12-31 16:00:06 00'06"
xnew_data(data)
# # A tibble: 1 × 9
# lgl int dbl chr fct ord dte dtt hms
# <lgl> <int> <dbl> <chr> <fct> <ord> <date> <dttm> <time>
# 1 FALSE 3 4.57 most not obs a rarity 1970-01-04 1969-12-31 16:00:03 00'03"
# this replaces dte with integer -100
xnew_data(data, dte = 1L)
# # A tibble: 1 × 9
# lgl int dbl chr fct ord dte dtt hms
# <lgl> <int> <dbl> <chr> <fct> <ord> <int> <dttm> <time>
# 1 FALSE 3 4.57 most not obs a rarity -100 1969-12-31 16:00:03 00'03"
# whereas this replaces dte with as.Date("2020-03-04)
xnew_data(data, dte = xreplace_values(1L))
# # A tibble: 1 × 9
# lgl int dbl chr fct ord dte dtt hms
# <lgl> <int> <dbl> <chr> <fct> <ord> <int> <dttm> <time>
# 1 FALSE 3 4.57 most not obs a rarity 2020-03-04 1969-12-31 16:00:03 00'03"
# whereas this replaces dte with as.Date(c(-100L, -200L))
xnew_data(data, dte = xreplace_values(1L))
# # A tibble: 1 × 9
# lgl int dbl chr fct ord dte dtt hms
# <lgl> <int> <dbl> <chr> <fct> <ord> <int> <dttm> <time>
# 1 FALSE 3 4.57 most not obs a rarity 1969-09-23 1969-12-31 16:00:03 00'03"
# 2 FALSE 3 4.57 most not obs a rarity 1969-06-15 1969-12-31 16:00:03 00'03"
It would work the same in the sense that it attempts to coerce to the type but in the case of factors preserves the levels and date times preserves the time zone.
Maybe a name like xcoerce_values() would be better?
I'm not sure whether to sort?
Not sure about sorting, can be done by the user? They might want a particular order too.
library(newdata)
library(vctrs)
library(tidyr)
obs_data
#> # A tibble: 3 × 9
#> lgl int dbl chr fct ord dte dtt hms
#> <lgl> <int> <dbl> <chr> <fct> <ord> <date> <dttm> <time>
#> 1 TRUE 1 1 most most most 1970-01-02 1969-12-31 16:00:01 00'01"
#> 2 FALSE 4 4.5 most most most 1970-01-05 1969-12-31 16:00:04 00'04"
#> 3 NA 6 8.2 a rarity a rari… a ra… 1970-01-07 1969-12-31 16:00:06 00'06"
# Works today:
obs_data |>
xnew_data(dte = vec_cast(Sys.Date(), dte))
#> # A tibble: 1 × 9
#> lgl int dbl chr fct ord dte dtt hms
#> <lgl> <int> <dbl> <chr> <fct> <ord> <date> <dttm> <time>
#> 1 FALSE 3 4.57 most not obs a rarity 2023-10-18 1969-12-31 16:00:03 00'03"
obs_data |>
xnew_data(dtt = vec_cast(Sys.time(), dtt))
#> # A tibble: 1 × 9
#> lgl int dbl chr fct ord dte dtt hms
#> <lgl> <int> <dbl> <chr> <fct> <ord> <date> <dttm> <time>
#> 1 FALSE 3 4.57 most not obs a rarity 1970-01-04 2023-10-18 09:31:28 00'03"
obs_data |>
xnew_data(nesting(fct = vec_cast("most", fct)))
#> # A tibble: 1 × 9
#> lgl int dbl chr fct ord dte dtt hms
#> <lgl> <int> <dbl> <chr> <fct> <ord> <date> <dttm> <time>
#> 1 FALSE 3 4.57 most most a rarity 1970-01-04 1969-12-31 16:00:03 00'03"
# Goal:
obs_data |>
xnew_data(vec_cast(Sys.Date(), dte))
#> # A tibble: 1 × 9
#> lgl int dbl chr fct ord dte dtt hms
#> <lgl> <int> <dbl> <chr> <fct> <ord> <date> <dttm> <time>
#> 1 FALSE 3 4.57 most not obs a rarity 2023-10-18 1969-12-31 16:00:03 00'03"
obs_data |>
xnew_data(vec_cast(Sys.time(), dtt))
#> # A tibble: 1 × 9
#> lgl int dbl chr fct ord dte dtt hms
#> <lgl> <int> <dbl> <chr> <fct> <ord> <date> <dttm> <time>
#> 1 FALSE 3 4.57 most not obs a rarity 1970-01-04 2023-10-18 09:31:28 00'03"
obs_data |>
xnew_data(nesting(vec_cast("most", fct)))
#> # A tibble: 1 × 9
#> lgl int dbl chr fct ord dte dtt hms
#> <lgl> <int> <dbl> <chr> <fct> <ord> <date> <dttm> <time>
#> 1 FALSE 3 4.57 most most a rarity 1970-01-04 1969-12-31 16:00:03 00'03"
Faked on 2023-10-18 with reprex v2.0.2
library(newdata)
library(vctrs)
library(tidyr)
xcoerce <- function(..., .data = xnew_data_env$data) {
values <- list2(...)
stopifnot(all(names2(values) != ""))
stopifnot(all(names(values) %in% names(.data)))
vec_cast(as_tibble(values), .data[names(values)])
}
environment(xcoerce) <- asNamespace("newdata")
obs_data |>
xnew_data(xcoerce(
dte = Sys.Date(),
dtt = Sys.time(),
fct = "most"
))
#> # A tibble: 1 × 9
#> lgl int dbl chr fct ord dte dtt hms
#> <lgl> <int> <dbl> <chr> <fct> <ord> <date> <dttm> <time>
#> 1 FALSE 3 4.57 most most a rarity 2023-10-18 2023-10-18 09:43:34 00'03"
Created on 2023-10-18 with reprex v2.0.2
We seem to be getting the sorting for free and I am fine with that.
This would be really helpful when manually setting values.