tidyverse / lubridate

Make working with dates in R just that little bit easier
https://lubridate.tidyverse.org
GNU General Public License v3.0
731 stars 207 forks source link

Spanish month labels #781

Open dominicroye opened 5 years ago

dominicroye commented 5 years ago

If I use a Spanish or French locale, the abbreviated month labels are displayed as "ene\.", but they should be just "ene.". I'm using Windows 10.

> library(lubridate)
> Sys.getlocale("LC_TIME")
## [1] "Spanish_Spain.1252"

> dt <- seq(ymd("2018-01-01"), ymd("2018-12-31"), "day")

> head(month(dt, label = TRUE))
## [1] ene\\. ene\\. ene\\. ene\\. ene\\. ene\\.
## 12 Levels: ene\\. < feb\\. < mar\\. < abr\\. < may\\. < ... < dic\\.

> Sys.setlocale("LC_TIME", "French")
## [1] "French_France.1252"
> head(month(dt, label = TRUE))
## [1] janv\\. janv\\. janv\\. janv\\. janv\\. janv\\.
## 12 Levels: janv\\. < févr\\. < mars < avr\\. < mai < juin < ... < déc\\.

> Sys.setlocale("LC_TIME", "English")
## [1] "English_United States.1252"
> head(month(dt, label = TRUE))
## [1] Jan Jan Jan Jan Jan Jan
## 12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec
hadley commented 4 years ago

This appears to be because the month names are regular expressions; so we also need to store canonical names for output.

vspinu commented 4 years ago

This is surely a Windows-only issue. Probably some regex bug indeed.

@dominicroye could you please provide the output of the following (with the locale replaced by your French and Spanish locale names)?

Sys.setlocale("LC_TIME", "es_ES.utf8")
format <- "%a@%A@%b@%B@%p@"
enc2utf8(unique(format(lubridate:::.date_template, format = format)))
##  [1] "jue@jueves@ene@enero@@"      "lun@lunes@feb@febrero@@"     "mar@martes@mar@marzo@@"     
##  [4] "dom@domingo@abr@abril@@"     "vie@viernes@may@mayo@@"      "mar@martes@jun@junio@@"     
##  [7] "vie@viernes@jul@julio@@"     "mié@miércoles@ago@agosto@@"  "mar@martes@sep@septiembre@@"
## [10] "vie@viernes@oct@octubre@@"   "mar@martes@nov@noviembre@@"  "sáb@sábado@dic@diciembre@@" 

Also the value of

str(.get_locale_regs("...your_locales..."))
dominicroye commented 4 years ago

es_ES.utf8 doesn't exist in Windows.

Here is my output from your code:

SPANISH

> Sys.setlocale("LC_TIME", "Spanish_Spain.1252")
> format <- "%a@%A@%b@%B@%p@"
> enc2utf8(unique(format(lubridate:::.date_template, format = format)))
 [1] "ju.@jueves@ene.@enero@@"      "lu.@lunes@feb.@febrero@@"     "ma.@martes@mar.@marzo@@"     
 [4] "do.@domingo@abr.@abril@@"     "vi.@viernes@may.@mayo@@"      "ma.@martes@jun.@junio@@"     
 [7] "vi.@viernes@jul.@julio@@"     "mi.@miércoles@ago.@agosto@@"  "ma.@martes@sep.@septiembre@@"
[10] "vi.@viernes@oct.@octubre@@"   "ma.@martes@nov.@noviembre@@"  "sá.@sábado@dic.@diciembre@@" 

> str(lubridate:::.get_locale_regs("Spanish_Spain.1252"))
List of 6
 $ alpha_flex : Named chr [1:6] "((?<b_b>ene\\.|feb\\.|mar\\.|abr\\.|may\\.|jun\\.|jul\\.|ago\\.|sep\\.|oct\\.|nov\\.|dic\\.)|(?<B_b>enero|febre"| __truncated__ "(?<B_B>enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre)(?![[:alpha:]])" "((?<a_a>ju\\.|lu\\.|ma\\.|do\\.|vi\\.|mi\\.|sá\\.)|(?<A_a>jueves|lunes|martes|domingo|viernes|miércoles|sábado)"| __truncated__ "(?<A_A>jueves|lunes|martes|domingo|viernes|miércoles|sábado)(?![[:alpha:]])" ...
  ..- attr(*, "names")= chr [1:6] "b" "B" "a" "A" ...
 $ num_flex   : Named chr [1:24] "(?<d>[012]?[1-9]|3[01]|[12]0)(?!\\d)" "(?<q>[0]?[1-4])(?!\\d)" "(?<H>2[0-4]|[01]?\\d)(?!\\d)" "(?<H>2[0-4]|[01]?\\d)(?!\\d)" ...
  ..- attr(*, "names")= chr [1:24] "d" "q" "H" "h" ...
 $ alpha_exact: Named chr [1:6] "((?<b_b_e>ene\\.|feb\\.|mar\\.|abr\\.|may\\.|jun\\.|jul\\.|ago\\.|sep\\.|oct\\.|nov\\.|dic\\.)|(?<B_b_e>enero|f"| __truncated__ "(?<B_B_e>enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre)(?![[:alpha:]])" "((?<a_a_e>ju\\.|lu\\.|ma\\.|do\\.|vi\\.|mi\\.|sá\\.)|(?<A_a_e>jueves|lunes|martes|domingo|viernes|miércoles|sáb"| __truncated__ "(?<A_A_e>jueves|lunes|martes|domingo|viernes|miércoles|sábado)(?![[:alpha:]])" ...
  ..- attr(*, "names")= chr [1:6] "b" "B" "a" "A" ...
 $ num_exact  : Named chr [1:24] "(?<d_e>[012][1-9]|3[01]|[12]0)" "(?<q_e>[0][1-4])" "(?<H_e>2[0-4]|[01]\\d)" "(?<H_e>2[0-4]|[01]\\d)" ...
  ..- attr(*, "names")= chr [1:24] "d" "q" "H" "h" ...
 $ wday_names :List of 2
  ..$ abr : chr [1:7] "do\\." "lu\\." "ma\\." "mi\\." ...
  ..$ full: chr [1:7] "domingo" "lunes" "martes" "miércoles" ...
 $ month_names:List of 2
  ..$ abr : chr [1:12] "ene\\." "feb\\." "mar\\." "abr\\." ...
  ..$ full: chr [1:12] "enero" "febrero" "marzo" "abril" ...

FRENCH

> Sys.setlocale("LC_TIME", "French_France.1252")
> format <- "%a@%A@%b@%B@%p@"
> enc2utf8(unique(format(lubridate:::.date_template, format = format)))
 [1] "jeu.@jeudi@janv.@janvier@@"    "lun.@lundi@févr.@février@@"    "mar.@mardi@mars@mars@@"       
 [4] "dim.@dimanche@avr.@avril@@"    "ven.@vendredi@mai@mai@@"       "mar.@mardi@juin@juin@@"       
 [7] "ven.@vendredi@juil.@juillet@@" "mer.@mercredi@août@août@@"     "mar.@mardi@sept.@septembre@@" 
[10] "ven.@vendredi@oct.@octobre@@"  "mar.@mardi@nov.@novembre@@"    "sam.@samedi@déc.@décembre@@"  

> str(lubridate:::.get_locale_regs("French_France.1252"))
List of 6
 $ alpha_flex : Named chr [1:6] "((?<b_b>janv\\.|févr\\.|mars|avr\\.|mai|juin|juil\\.|août|sept\\.|oct\\.|nov\\.|déc\\.)|(?<B_b>janvier|février|"| __truncated__ "(?<B_B>janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre)(?![[:alpha:]])" "((?<a_a>jeu\\.|lun\\.|mar\\.|dim\\.|ven\\.|mer\\.|sam\\.)|(?<A_a>jeudi|lundi|mardi|dimanche|vendredi|mercredi|s"| __truncated__ "(?<A_A>jeudi|lundi|mardi|dimanche|vendredi|mercredi|samedi)(?![[:alpha:]])" ...
  ..- attr(*, "names")= chr [1:6] "b" "B" "a" "A" ...
 $ num_flex   : Named chr [1:24] "(?<d>[012]?[1-9]|3[01]|[12]0)(?!\\d)" "(?<q>[0]?[1-4])(?!\\d)" "(?<H>2[0-4]|[01]?\\d)(?!\\d)" "(?<H>2[0-4]|[01]?\\d)(?!\\d)" ...
  ..- attr(*, "names")= chr [1:24] "d" "q" "H" "h" ...
 $ alpha_exact: Named chr [1:6] "((?<b_b_e>janv\\.|févr\\.|mars|avr\\.|mai|juin|juil\\.|août|sept\\.|oct\\.|nov\\.|déc\\.)|(?<B_b_e>janvier|févr"| __truncated__ "(?<B_B_e>janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre)(?![[:alpha:]])" "((?<a_a_e>jeu\\.|lun\\.|mar\\.|dim\\.|ven\\.|mer\\.|sam\\.)|(?<A_a_e>jeudi|lundi|mardi|dimanche|vendredi|mercre"| __truncated__ "(?<A_A_e>jeudi|lundi|mardi|dimanche|vendredi|mercredi|samedi)(?![[:alpha:]])" ...
  ..- attr(*, "names")= chr [1:6] "b" "B" "a" "A" ...
 $ num_exact  : Named chr [1:24] "(?<d_e>[012][1-9]|3[01]|[12]0)" "(?<q_e>[0][1-4])" "(?<H_e>2[0-4]|[01]\\d)" "(?<H_e>2[0-4]|[01]\\d)" ...
  ..- attr(*, "names")= chr [1:24] "d" "q" "H" "h" ...
 $ wday_names :List of 2
  ..$ abr : chr [1:7] "dim\\." "lun\\." "mar\\." "mer\\." ...
  ..$ full: chr [1:7] "dimanche" "lundi" "mardi" "mercredi" ...
 $ month_names:List of 2
  ..$ abr : chr [1:12] "janv\\." "févr\\." "mars" "avr\\." ...
  ..$ full: chr [1:12] "janvier" "février" "mars" "avril" ...
vspinu commented 4 years ago

Ok, so on Windows all the abbreviations come with dots at the end. Let me see what I can do.

vspinu commented 4 years ago

Should have been fixed. Would really appreciate if you guys could try the dev version and let me know if it works correctly now.

dominicroye commented 4 years ago

It is working correctly. Thank you!

dominicroye commented 3 years ago

I have to reopen this issue since it is still happening with weekdays. I am sorry that I noticed it now!

> library(lubridate)
> Sys.getlocale("LC_TIME")
[1] "Spanish_Spain.1252"

> dt <- seq(ymd("2018-01-01"), ymd("2018-12-31"), "day")

> head(wday(dt, label = TRUE))
[1] lu\\. ma\\. mi\\. ju\\. vi\\. sá\\.
Levels: do\\. < lu\\. < ma\\. < mi\\. < ju\\. < vi\\. < sá\\.
brianmsm commented 3 years ago

I confirm this bug. However, the solution for the guess_formats (https://github.com/tidyverse/lubridate/commit/cc5f1a6de86863f983fd3f69ac842c31997a03a0) function works and can be easily implemented in .get_locale_regs which is what is used in the wday function.

It is necessary to replace this line (https://github.com/tidyverse/lubridate/blob/6f26b02de432cd9373ad4ce7766c36eacfc29918/R/guess.r#L311) with this:

  mat[] <- gsub("\\.$", "", mat) # remove abbrev trailing dot in some locales (#781)
  mat[] <- gsub("([].|(){^$*+?[])", "\\\\\\1", mat) # escaping meta chars
DavisVaughan commented 3 years ago

I imagine this works correctly with clock, since we don't do anything with regular expressions:

library(clock)

dt <- seq(date_parse("2018-01-01"), date_parse("2018-12-31"), "day")

head(date_month_factor(dt, labels = "es", abbreviate = TRUE))
#> [1] ene. ene. ene. ene. ene. ene.
#> 12 Levels: ene. < feb. < mar. < abr. < may. < jun. < jul. < ago. < ... < dic.

head(date_weekday_factor(dt, labels = "es", abbreviate = TRUE))
#> [1] lun. mar. mié. jue. vie. sáb.
#> Levels: dom. < lun. < mar. < mié. < jue. < vie. < sáb.

If the labels aren't exactly what you expect, you can always create a custom clock_labels() object to use as the labels argument

augusto-umana commented 1 year ago

Hi, I found that this bug is still alive for month abbreviations that differ from their English equivalents (jan != ene, apr != abr, aug != ago, dec != dic). The bug isn't OS-specific: I reproduced it on both Linux and Windows.

Windows:

library(tidyverse)
library(lubridate)

test_dates <- tibble(abr_dates = c("ene-22", 
                                   "feb-22", 
                                   "mar-22", 
                                   "abr-22",
                                   "may-22",
                                   "jun-22",
                                   "jul-22",
                                   "ago-22",
                                   "sep-22", 
                                   "oct-22",
                                   "nov-22",
                                   "dic-22"))

test_dates %>% 
  mutate(dates_date = my(abr_dates))
#> Warning: There was 1 warning in `mutate()`.
#> ℹ In argument: `dates_date = my(abr_dates)`.
#> Caused by warning:
#> !  4 failed to parse.
#> # A tibble: 12 × 2
#>    abr_dates dates_date
#>    <chr>     <date>    
#>  1 ene-22    NA        
#>  2 feb-22    2022-02-01
#>  3 mar-22    2022-03-01
#>  4 abr-22    NA        
#>  5 may-22    2022-05-01
#>  6 jun-22    2022-06-01
#>  7 jul-22    2022-07-01
#>  8 ago-22    NA        
#>  9 sep-22    2022-09-01
#> 10 oct-22    2022-10-01
#> 11 nov-22    2022-11-01
#> 12 dic-22    NA

Created on 2023-06-11 with reprex v2.0.2

Session info ``` r sessionInfo() #> R version 4.3.0 (2023-04-21 ucrt) #> Platform: x86_64-w64-mingw32/x64 (64-bit) #> Running under: Windows 11 x64 (build 22621) #> #> Matrix products: default #> #> #> locale: #> [1] LC_COLLATE=Spanish_Colombia.utf8 LC_CTYPE=Spanish_Colombia.utf8 #> [3] LC_MONETARY=Spanish_Colombia.utf8 LC_NUMERIC=C #> [5] LC_TIME=Spanish_Colombia.utf8 #> #> time zone: America/Bogota #> tzcode source: internal #> #> attached base packages: #> [1] stats graphics grDevices utils datasets methods base #> #> other attached packages: #> [1] lubridate_1.9.2 forcats_1.0.0 stringr_1.5.0 dplyr_1.1.2 #> [5] purrr_1.0.1 readr_2.1.4 tidyr_1.3.0 tibble_3.2.1 #> [9] ggplot2_3.4.2 tidyverse_2.0.0 #> #> loaded via a namespace (and not attached): #> [1] gtable_0.3.3 compiler_4.3.0 reprex_2.0.2 tidyselect_1.2.0 #> [5] scales_1.2.1 yaml_2.3.7 fastmap_1.1.1 R6_2.5.1 #> [9] generics_0.1.3 knitr_1.43 munsell_0.5.0 pillar_1.9.0 #> [13] tzdb_0.4.0 rlang_1.1.1 utf8_1.2.3 stringi_1.7.12 #> [17] xfun_0.39 fs_1.6.2 timechange_0.2.0 cli_3.6.1 #> [21] withr_2.5.0 magrittr_2.0.3 digest_0.6.31 grid_4.3.0 #> [25] rstudioapi_0.14 hms_1.1.3 lifecycle_1.0.3 vctrs_0.6.2 #> [29] evaluate_0.21 glue_1.6.2 fansi_1.0.4 colorspace_2.1-0 #> [33] rmarkdown_2.22 tools_4.3.0 pkgconfig_2.0.3 htmltools_0.5.5 ```

Linux:

library(reprex)
library(tidyverse)
library(lubridate)
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union

test_dates <- tibble(abr_dates = c("ene-22", 
                                   "feb-22", 
                                   "mar-22", 
                                   "abr-22",
                                   "may-22",
                                   "jun-22",
                                   "jul-22",
                                   "ago-22",
                                   "sep-22", 
                                   "oct-22",
                                   "nov-22",
                                   "dic-22"))

test_dates %>% 
  mutate(dates_date = my(abr_dates))
#> Warning: 4 failed to parse.
#> # A tibble: 12 × 2
#>    abr_dates dates_date
#>    <chr>     <date>    
#>  1 ene-22    NA        
#>  2 feb-22    2022-02-01
#>  3 mar-22    2022-03-01
#>  4 abr-22    NA        
#>  5 may-22    2022-05-01
#>  6 jun-22    2022-06-01
#>  7 jul-22    2022-07-01
#>  8 ago-22    NA        
#>  9 sep-22    2022-09-01
#> 10 oct-22    2022-10-01
#> 11 nov-22    2022-11-01
#> 12 dic-22    NA

Created on 2023-06-11 with reprex v2.0.2

Session info ``` r sessionInfo() #> R version 4.2.1 (2022-06-23) #> Platform: x86_64-pc-linux-gnu (64-bit) #> Running under: Ubuntu 22.04.2 LTS #> #> Matrix products: default #> BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 #> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0 #> #> locale: #> [1] LC_CTYPE=es_CO.UTF-8 LC_NUMERIC=C #> [3] LC_TIME=es_CO.UTF-8 LC_COLLATE=es_CO.UTF-8 #> [5] LC_MONETARY=es_CO.UTF-8 LC_MESSAGES=es_CO.UTF-8 #> [7] LC_PAPER=es_CO.UTF-8 LC_NAME=C #> [9] LC_ADDRESS=C LC_TELEPHONE=C #> [11] LC_MEASUREMENT=es_CO.UTF-8 LC_IDENTIFICATION=C #> #> attached base packages: #> [1] stats graphics grDevices utils datasets methods base #> #> other attached packages: #> [1] lubridate_1.8.0 forcats_0.5.1 stringr_1.4.0 dplyr_1.0.9 #> [5] purrr_0.3.4 readr_2.1.2 tidyr_1.2.0 tibble_3.1.8 #> [9] ggplot2_3.4.0 tidyverse_1.3.1 reprex_2.0.2 #> #> loaded via a namespace (and not attached): #> [1] styler_1.7.0 tidyselect_1.2.0 xfun_0.30 haven_2.5.0 #> [5] colorspace_2.0-3 vctrs_0.5.1 generics_0.1.2 htmltools_0.5.3 #> [9] yaml_2.3.5 utf8_1.2.2 rlang_1.0.6 R.oo_1.25.0 #> [13] pillar_1.8.1 glue_1.6.2 withr_2.5.0 DBI_1.1.2 #> [17] R.utils_2.12.0 dbplyr_2.2.1 readxl_1.4.0 modelr_0.1.8 #> [21] R.cache_0.16.0 lifecycle_1.0.3 cellranger_1.1.0 munsell_0.5.0 #> [25] gtable_0.3.0 rvest_1.0.2 R.methodsS3_1.8.2 evaluate_0.15 #> [29] knitr_1.39 tzdb_0.3.0 fastmap_1.1.0 fansi_1.0.3 #> [33] highr_0.9 broom_0.8.0 backports_1.4.1 scales_1.2.0 #> [37] jsonlite_1.8.0 fs_1.5.2 hms_1.1.1 digest_0.6.29 #> [41] stringi_1.7.6 grid_4.2.1 cli_3.4.1 tools_4.2.1 #> [45] magrittr_2.0.3 crayon_1.5.1 pkgconfig_2.0.3 ellipsis_0.3.2 #> [49] xml2_1.3.3 assertthat_0.2.1 rmarkdown_2.14 httr_1.4.2 #> [53] rstudioapi_0.13 R6_2.5.1 compiler_4.2.1 ```
renatocava commented 9 months ago

If you are on Linux, you need to install the locale.

https://orcacore.com/set-up-system-locale-ubuntu-22-04/

SLLDeC commented 1 month ago

Hi, I'm having trouble with a script that previously worked fine: it broke after I updated R, RStudio, and lubridate, and I'm not sure whether it is related to this issue.

The following is a minimal example. In English the output is correct:

> library(lubridate)

Attaching package: ‘lubridate’

The following objects are masked from ‘package:base’:

    date, intersect, setdiff, union

> Sys.setlocale("LC_TIME", "English")
[1] "English_United States.1252"
> sessionInfo()
R version 4.4.1 (2024-06-14 ucrt)
Platform: x86_64-w64-mingw32/x64
Running under: Windows 10 x64 (build 19045)

Matrix products: default

locale:
[1] LC_COLLATE=English_World.utf8      LC_CTYPE=English_World.utf8        LC_MONETARY=English_World.utf8    
[4] LC_NUMERIC=C                       LC_TIME=English_United States.1252

time zone: America/Buenos_Aires
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] lubridate_1.9.3

loaded via a namespace (and not attached):
[1] compiler_4.4.1   generics_0.1.3   tools_4.4.1      timechange_0.3.0

> month(ymd(080101),label = T)
[1] Jan
Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < Oct < Nov < Dec

But in Spanish:

> Sys.setlocale("LC_TIME", "spanish")
[1] "Spanish_Spain.1252"
> sessionInfo()
R version 4.4.1 (2024-06-14 ucrt)
Platform: x86_64-w64-mingw32/x64
Running under: Windows 10 x64 (build 19045)

Matrix products: default

locale:
[1] LC_COLLATE=English_World.utf8  LC_CTYPE=English_World.utf8    LC_MONETARY=English_World.utf8
[4] LC_NUMERIC=C                   LC_TIME=Spanish_Spain.1252    

time zone: America/Buenos_Aires
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] lubridate_1.9.3

loaded via a namespace (and not attached):
[1] compiler_4.4.1   generics_0.1.3   tools_4.4.1      timechange_0.3.0
> month(ymd(080101),label = T)
Error in factor(x, ..., ordered = TRUE) : 
  invalid 'labels'; length 11 should be 1 or 12
In addition: Warning messages:
1: In strsplit(L, "@", fixed = TRUE) : input string 8 is invalid UTF-8
2: In strsplit(L, "@", fixed = TRUE) : input string 12 is invalid UTF-8

So I checked the labels for both locales (English and Spanish):

> names_EN <- lubridate:::.get_locale_regs("English_United States.1252")
> names_EN[["month_names"]][["full"]]
 [1] "January"   "February"  "March"     "April"     "May"       "June"      "July"      "August"    "September" "October"  
[11] "November"  "December" 
> names_EN[["month_names"]][["abr"]]
 [1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov" "Dec"

> names_ESP <- lubridate:::.get_locale_regs("Spanish_Spain.1252")
> names_ESP[["month_names"]][["full"]]
 [1] "enero"      "febrero"    "marzo"      "abril"      "mayo"       "junio"      "julio"      NA           "septiembre"
[10] "octubre"    "noviembre" 
> names_ESP[["month_names"]][["abr"]]
 [1] "ene" "feb" "mar" "abr" "may" "jun" "jul" NA    "sep" "oct" "nov"

Indeed, the Spanish labels have one month fewer. If I understand correctly, Diciembre is missing, and so is Agosto (which should be in the NA position), in both the full and abbreviated labels.

renatocava commented 1 month ago
Spanish_Spain.1252 

Busca un formato UTF-8.

Por ejemplo, prueba: Sys.setlocale("LC_TIME", "Spanish_Peru.utf8").

El problema es el formato que estás usando en español ("Spanish_Spain.1252").

SLLDeC commented 1 month ago
Spanish_Spain.1252 

Busca un formato UTF-8.

Por ejemplo, prueba: Sys.setlocale("LC_TIME", "Spanish_Peru.utf8").

El problema es el formato que estás usando en español ("Spanish_Spain.1252").

Thank you! It worked :)