tidyverse / lubridate

Make working with dates in R just that little bit easier
https://lubridate.tidyverse.org
GNU General Public License v3.0
724 stars 207 forks source link

Fractional Seconds with conversion and rounding/truncation? #1163

Open muschellij2 opened 3 months ago

muschellij2 commented 3 months ago

I think this is related to #502 so if the answer is "POSIXct precision < 1 second isn't expected", then please let me know. Also related to https://github.com/tidyverse/readr/issues/1394 in some ways.

Issue

Fractional seconds are not being converted one-to-one.

The overall issue is that I have data in a datetime format and needed to convert datetimes with milliseconds (high-frequency accelerometer signals) to character. My checks failed because the original data and the written data are now discrepant, and I am trying to understand why.

This is also related to readr functionality for reading in dates @jennybc

library(readr)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
options(digits.secs = 3)
# Print `df` using a line width of 1000 characters; the value of the
# underlying print() call is passed through unchanged.
print_wide <- function(df) {
  print(x = df, width = 1000)
}

Create a Data set

The record in question is record 2: watch its milliseconds (987) and how they are transformed.

# Build a small CSV fixture (header plus two rows whose timestamps carry
# millisecond precision) and write it to a temporary .csv file.
tfile <- tempfile(fileext = ".csv")
values <- c(
  "time,x,y,z",
  "2012-05-16 08:35:32.974,-0.909,0.182,-0.589",
  "2012-05-16 08:35:32.987,-0.927,0.082,-0.381"
)
writeLines(text = values, con = tfile)

We can confirm that the second record is still 987

readLines(tfile)
#> [1] "time,x,y,z"                                 
#> [2] "2012-05-16 08:35:32.974,-0.909,0.182,-0.589"
#> [3] "2012-05-16 08:35:32.987,-0.927,0.082,-0.381"

Read the data in using read.csv

# Load the CSV with base read.csv(), convert the result to a tibble, and
# print it at full width.
y <- tibble::as_tibble(read.csv(tfile))
print_wide(y)
#> # A tibble: 2 × 4
#>   time                         x     y      z
#>   <chr>                    <dbl> <dbl>  <dbl>
#> 1 2012-05-16 08:35:32.974 -0.909 0.182 -0.589
#> 2 2012-05-16 08:35:32.987 -0.927 0.082 -0.381

Time is 987 for the milliseconds

print_wide(y)
#> # A tibble: 2 × 4
#>   time                         x     y      z
#>   <chr>                    <dbl> <dbl>  <dbl>
#> 1 2012-05-16 08:35:32.974 -0.909 0.182 -0.589
#> 2 2012-05-16 08:35:32.987 -0.927 0.082 -0.381

# Remove unneeded columns: drop x, y and z so that only `time` remains.
y = y %>% 
  select(-x, -y, -z)

as_datetime appears to turn it into 986: numerically the value is still 987, but when it is converted to character it becomes 986

y$new_time = lubridate::as_datetime(y$time)

It is stored numerically as 987

as.numeric(y$new_time)[2] %% 1
#> [1] 0.987

Now format/strftime use 986 when converting to char, baking the 986 in vs 987. This is the main issue I see with the conversion and I don’t understand exactly why.

# Render the parsed times as character with three fractional-second digits
# (%OS3). NOTE(review): the stored fraction prints as 0.987 but the formatted
# string shows .986 -- this looks like %OSn truncating (not rounding) a double
# that sits just below .987; confirm against ?strptime.
y$format_time = format(y$new_time, "%Y-%m-%d %H:%M:%OS3")
# strftime() is called without `tz`; its hour differs from format() in the
# printed output (04:35 vs 08:35), presumably because it falls back to the
# session time zone (America/New_York per the session info) -- TODO confirm.
y$strftime_time = strftime(y$new_time, "%Y-%m-%d %H:%M:%OS3")
print_wide(y)
#> # A tibble: 2 × 4
#>   time                    new_time                format_time            
#>   <chr>                   <dttm>                  <chr>                  
#> 1 2012-05-16 08:35:32.974 2012-05-16 08:35:32.973 2012-05-16 08:35:32.973
#> 2 2012-05-16 08:35:32.987 2012-05-16 08:35:32.986 2012-05-16 08:35:32.986
#>   strftime_time          
#>   <chr>                  
#> 1 2012-05-16 04:35:32.973
#> 2 2012-05-16 04:35:32.986

The `digits` argument does nothing

# Retry the conversion with `digits = 4`; the printed result is the same as
# without it. NOTE(review): `digits` presumably only matters when no explicit
# `format` string is supplied -- confirm against ?format.POSIXct.
y$format_time = format(y$new_time, "%Y-%m-%d %H:%M:%OS3", digits = 4)
print_wide(y)
#> # A tibble: 2 × 4
#>   time                    new_time                format_time            
#>   <chr>                   <dttm>                  <chr>                  
#> 1 2012-05-16 08:35:32.974 2012-05-16 08:35:32.973 2012-05-16 08:35:32.973
#> 2 2012-05-16 08:35:32.987 2012-05-16 08:35:32.986 2012-05-16 08:35:32.986
#>   strftime_time          
#>   <chr>                  
#> 1 2012-05-16 04:35:32.973
#> 2 2012-05-16 04:35:32.986

Here we try different methods for extracting the date (parse_date_time, which as_datetime uses), with the same results

y$new_time = parse_date_time(y$time, orders = c("ymdTz", "ymdT"))
y$format_time = format(y$new_time, "%Y-%m-%d %H:%M:%OS3")
y$strftime_time = strftime(y$new_time, "%Y-%m-%d %H:%M:%OS3")
print_wide(y)
#> # A tibble: 2 × 4
#>   time                    new_time                format_time            
#>   <chr>                   <dttm>                  <chr>                  
#> 1 2012-05-16 08:35:32.974 2012-05-16 08:35:32.973 2012-05-16 08:35:32.973
#> 2 2012-05-16 08:35:32.987 2012-05-16 08:35:32.986 2012-05-16 08:35:32.986
#>   strftime_time          
#>   <chr>                  
#> 1 2012-05-16 04:35:32.973
#> 2 2012-05-16 04:35:32.986

Trying with train = FALSE

y$new_time = parse_date_time(y$time, orders = c("ymdTz", "ymdT"), train = FALSE)
y$format_time = format(y$new_time, "%Y-%m-%d %H:%M:%OS3")
y$strftime_time = strftime(y$new_time, "%Y-%m-%d %H:%M:%OS3")
print_wide(y)
#> # A tibble: 2 × 4
#>   time                    new_time                format_time            
#>   <chr>                   <dttm>                  <chr>                  
#> 1 2012-05-16 08:35:32.974 2012-05-16 08:35:32.973 2012-05-16 08:35:32.973
#> 2 2012-05-16 08:35:32.987 2012-05-16 08:35:32.986 2012-05-16 08:35:32.986
#>   strftime_time          
#>   <chr>                  
#> 1 2012-05-16 04:35:32.973
#> 2 2012-05-16 04:35:32.986

strptime, however, gives back 987 and works. This is a partial solution: do not use readr::read_csv (as below), and use strptime to get the exact data back.

# Parse with base strptime() instead; the printed output shows the .974/.987
# milliseconds surviving the round trip back to character.
y$new_time = strptime(y$time, "%Y-%m-%d %H:%M:%OS")
y$format_time = format(y$new_time, "%Y-%m-%d %H:%M:%OS3")
# NOTE(review): here strftime() agrees with format() on the hour (08:35),
# presumably because strptime() without `tz` parsed the strings in the
# session time zone -- confirm.
y$strftime_time = strftime(y$new_time, "%Y-%m-%d %H:%M:%OS3")
print_wide(y)
#> # A tibble: 2 × 4
#>   time                    new_time                format_time            
#>   <chr>                   <dttm>                  <chr>                  
#> 1 2012-05-16 08:35:32.974 2012-05-16 08:35:32.974 2012-05-16 08:35:32.974
#> 2 2012-05-16 08:35:32.987 2012-05-16 08:35:32.987 2012-05-16 08:35:32.987
#>   strftime_time          
#>   <chr>                  
#> 1 2012-05-16 08:35:32.974
#> 2 2012-05-16 08:35:32.987

Using read_csv

x = readr::read_csv(tfile)
#> Rows: 2 Columns: 4
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> dbl  (3): x, y, z
#> dttm (1): time
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
print_wide(x)
#> # A tibble: 2 × 4
#>   time                         x     y      z
#>   <dttm>                   <dbl> <dbl>  <dbl>
#> 1 2012-05-16 08:35:32.973 -0.909 0.182 -0.589
#> 2 2012-05-16 08:35:32.986 -0.927 0.082 -0.381

x = x %>% 
  select(-x, -y, -z)

We see the same results here as with lubridate: format/strftime will embed the 986

x$format_time = format(x$time, "%Y-%m-%d %H:%M:%OS3")
x$strftime_time = strftime(x$time, "%Y-%m-%d %H:%M:%OS3")
as.numeric(x$time)[2] %% 1
#> [1] 0.987
print_wide(x)
#> # A tibble: 2 × 3
#>   time                    format_time             strftime_time          
#>   <dttm>                  <chr>                   <chr>                  
#> 1 2012-05-16 08:35:32.973 2012-05-16 08:35:32.973 2012-05-16 04:35:32.973
#> 2 2012-05-16 08:35:32.986 2012-05-16 08:35:32.986 2012-05-16 04:35:32.986

Writing out the result now embeds 986, not 987

# Write the tibble (including the pre-formatted character columns) to a new
# temporary CSV, then read the raw lines back to inspect what was written.
tfile2 = tempfile(fileext = ".csv")
write_csv(x, tfile2)
readLines(tfile2)
#> [1] "time,format_time,strftime_time"                                          
#> [2] "2012-05-16T08:35:32.973Z,2012-05-16 08:35:32.973,2012-05-16 04:35:32.973"
#> [3] "2012-05-16T08:35:32.986Z,2012-05-16 08:35:32.986,2012-05-16 04:35:32.986"

Created on 2024-04-24 with reprex v2.1.0

Session info ``` r sessioninfo::session_info() #> ─ Session info ─────────────────────────────────────────────────────────────── #> setting value #> version R version 4.3.1 (2023-06-16) #> os macOS Sonoma 14.4.1 #> system x86_64, darwin20 #> ui X11 #> language (EN) #> collate en_US.UTF-8 #> ctype en_US.UTF-8 #> tz America/New_York #> date 2024-04-24 #> pandoc 3.1.11.1 @ /usr/local/bin/ (via rmarkdown) #> #> ─ Packages ─────────────────────────────────────────────────────────────────── #> package * version date (UTC) lib source #> bit 4.0.5 2022-11-15 [1] CRAN (R 4.3.0) #> bit64 4.0.5 2020-08-30 [1] CRAN (R 4.3.0) #> cli 3.6.2 2023-12-11 [1] CRAN (R 4.3.0) #> crayon 1.5.2 2022-09-29 [1] CRAN (R 4.3.0) #> digest 0.6.34 2024-01-11 [1] CRAN (R 4.3.0) #> dplyr * 1.1.4 2023-11-17 [1] CRAN (R 4.3.0) #> evaluate 0.23 2023-11-01 [1] CRAN (R 4.3.0) #> fansi 1.0.6 2023-12-08 [1] CRAN (R 4.3.0) #> fastmap 1.1.1 2023-02-24 [1] CRAN (R 4.3.0) #> fs 1.6.3 2023-07-20 [1] CRAN (R 4.3.0) #> generics 0.1.3 2022-07-05 [1] CRAN (R 4.3.0) #> glue 1.7.0 2024-01-09 [1] CRAN (R 4.3.0) #> hms 1.1.3 2023-03-21 [1] CRAN (R 4.3.0) #> htmltools 0.5.7 2023-11-03 [1] CRAN (R 4.3.0) #> knitr 1.45 2023-10-30 [1] CRAN (R 4.3.0) #> lifecycle 1.0.4 2023-11-07 [1] CRAN (R 4.3.0) #> lubridate * 1.9.3 2023-09-27 [1] CRAN (R 4.3.0) #> magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.3.0) #> pillar 1.9.0 2023-03-22 [1] CRAN (R 4.3.0) #> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.3.0) #> purrr 1.0.2 2023-08-10 [1] CRAN (R 4.3.0) #> R.cache 0.16.0 2022-07-21 [1] CRAN (R 4.3.0) #> R.methodsS3 1.8.2 2022-06-13 [1] CRAN (R 4.3.0) #> R.oo 1.26.0 2024-01-24 [1] CRAN (R 4.3.2) #> R.utils 2.12.3 2023-11-18 [1] CRAN (R 4.3.0) #> R6 2.5.1 2021-08-19 [1] CRAN (R 4.3.0) #> readr * 2.1.5 2024-01-10 [1] CRAN (R 4.3.0) #> reprex 2.1.0 2024-01-11 [1] CRAN (R 4.3.0) #> rlang 1.1.3 2024-01-10 [1] CRAN (R 4.3.0) #> rmarkdown 2.25 2023-09-18 [1] CRAN (R 4.3.0) #> rstudioapi 0.16.0 2024-03-24 [1] CRAN (R 4.3.2) #> sessioninfo 1.2.2 
2021-12-06 [1] CRAN (R 4.3.0) #> styler 1.10.2 2023-08-29 [1] CRAN (R 4.3.0) #> tibble 3.2.1 2023-03-20 [1] CRAN (R 4.3.0) #> tidyselect 1.2.0 2022-10-10 [1] CRAN (R 4.3.0) #> timechange 0.3.0 2024-01-18 [1] CRAN (R 4.3.0) #> tzdb 0.4.0 2023-05-12 [1] CRAN (R 4.3.0) #> utf8 1.2.4 2023-10-22 [1] CRAN (R 4.3.0) #> vctrs 0.6.5 2023-12-01 [1] CRAN (R 4.3.0) #> vroom 1.6.5 2023-12-05 [1] CRAN (R 4.3.0) #> withr 3.0.0 2024-01-16 [1] CRAN (R 4.3.0) #> xfun 0.41 2023-11-01 [1] CRAN (R 4.3.0) #> yaml 2.3.8 2023-12-11 [1] CRAN (R 4.3.0) #> #> [1] /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/library #> #> ────────────────────────────────────────────────────────────────────────────── ```