tidyverse / dbplyr

Database (DBI) backend for dplyr
https://dbplyr.tidyverse.org
Other
474 stars 173 forks source link

Possible inconsistency in translation of stringr::str_like() with respect to ignore_case argument #1488

Open edward-burn opened 6 months ago

edward-burn commented 6 months ago

Hi @hadley @mgirlich, I might be misunderstanding but I think there might be some inconsistency in the way that stringr::str_like() is being translated with respect to the argument ignore_case

In the documentation it says ignore_case: Ignore case of matches? Defaults to TRUE to match the SQL LIKE operator.

In postgres and redshift I see what I would expect - that ILIKE is used when ignore_case is true, while LIKE is used when ignore_case is false. But for the backends without ILIKE we have LIKE being used when ignore_case is true and error thrown when false. But for these latter backends, isn't this the wrong way around (as in I would have expected an error if ignore_case was true [or maybe cast both to lower and use LIKE] and use LIKE if ignore_case is false)

library(dbplyr)

lf_postgres <- lazy_frame(a = TRUE, b = 1, c = 2, d = "zzzyzzz", 
                 con = simulate_postgres())
lf_redshift <- lazy_frame(a = TRUE, b = 1, c = 2, d = "zzzyzzz", 
                          con = simulate_redshift())
lf_mssql <- lazy_frame(a = TRUE, b = 1, c = 2, d = "zzzyzzz", 
                 con = simulate_mssql()) 
lf_snowflake <- lazy_frame(a = TRUE, b = 1, c = 2, d = "zzzyzzz", 
                       con = simulate_snowflake()) 
lf_spark <- lazy_frame(a = TRUE, b = 1, c = 2, d = "zzzyzzz", 
                           con = simulate_spark_sql()) 

# ilike if we ignore case - should we not get an error here for backends without ilike?
lf_postgres |> 
  dplyr::filter(stringr::str_like(d, "Y", ignore_case = TRUE))
#> <SQL>
#> SELECT `df`.*
#> FROM `df`
#> WHERE (`d` ILIKE 'Y')
lf_redshift |> 
  dplyr::filter(stringr::str_like(d, "Y", ignore_case = TRUE))
#> <SQL>
#> SELECT `df`.*
#> FROM `df`
#> WHERE (`d` ILIKE 'Y')
lf_mssql |> 
  dplyr::filter(stringr::str_like(d, "Y", ignore_case = TRUE))
#> <SQL>
#> SELECT `df`.*
#> FROM `df`
#> WHERE (`d` LIKE 'Y')
lf_snowflake |> 
  dplyr::filter(stringr::str_like(d, "Y", ignore_case = TRUE))
#> <SQL>
#> SELECT `df`.*
#> FROM `df`
#> WHERE (`d` LIKE 'Y')
lf_spark |> 
  dplyr::filter(stringr::str_like(d, "Y", ignore_case = TRUE))
#> <SQL>
#> SELECT `df`.*
#> FROM `df`
#> WHERE (`d` LIKE 'Y')

# like if we do not ignore case - should we not get like here for all?
lf_postgres |> 
  dplyr::filter(stringr::str_like(d, "Y", ignore_case = FALSE))
#> <SQL>
#> SELECT `df`.*
#> FROM `df`
#> WHERE (`d` LIKE 'Y')
lf_redshift |> 
  dplyr::filter(stringr::str_like(d, "Y", ignore_case = FALSE))
#> <SQL>
#> SELECT `df`.*
#> FROM `df`
#> WHERE (`d` LIKE 'Y')
lf_mssql |> 
  dplyr::filter(stringr::str_like(d, "Y", ignore_case = FALSE))
#> Error in `stringr::str_like()`:
#> ! Backend only supports case insensitve `str_like()`.
lf_snowflake |> 
  dplyr::filter(stringr::str_like(d, "Y", ignore_case = FALSE))
#> Error in `stringr::str_like()`:
#> ! Backend only supports case insensitve `str_like()`.
lf_spark |> 
  dplyr::filter(stringr::str_like(d, "Y", ignore_case = FALSE))
#> Error in `stringr::str_like()`:
#> ! Backend only supports case insensitve `str_like()`.

Created on 2024-04-04 with reprex v2.0.2

Here is an additional reprex with a live snowflake database

library(dplyr)
#> Warning: package 'dplyr' was built under R version 4.2.3
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(dbplyr)
#> 
#> Attaching package: 'dbplyr'
#> The following objects are masked from 'package:dplyr':
#> 
#>     ident, sql

con <- DBI::dbConnect(odbc::odbc(),
                      "ohdsi_omop_snowflake",
                      PWD = Sys.getenv("SNOWFLAKE_PASSWORD"))

concept_db <- tbl(con, in_catalog("OMOP_SYNTHETIC_DATASET", "CDM53", "CONCEPT"))
concept_db |> 
  head(5)
#> # Source:   SQL [5 x 10]
#> # Database: Snowflake 8.13.1[ohdsi@Snowflake/OMOP_SYNTHETIC_DATASET]
#>   CONCEPT_ID CONCEPT_NAME        DOMAIN_ID VOCABULARY_ID CONCEPT_CLASS_ID
#>        <dbl> <chr>               <chr>     <chr>         <chr>           
#> 1          0 No matching concept Metadata  None          Undefined       
#> 2          1 Domain              Metadata  Domain        Domain          
#> 3          2 Gender              Metadata  Domain        Domain          
#> 4          3 Race                Metadata  Domain        Domain          
#> 5          4 Ethnicity           Metadata  Domain        Domain          
#> # ℹ 5 more variables: STANDARD_CONCEPT <chr>, CONCEPT_CODE <chr>,
#> #   VALID_START_DATE <date>, VALID_END_DATE <date>, INVALID_REASON <chr>

# get record back if we bring locally, but not in db
concept_db |> 
  head(5) |> 
  collect() |> 
  dplyr::filter(stringr::str_like(CONCEPT_NAME, "ETHNICITY", ignore_case = TRUE))
#> # A tibble: 1 × 10
#>   CONCEPT_ID CONCEPT_NAME DOMAIN_ID VOCABULARY_ID CONCEPT_CLASS_ID
#>        <dbl> <chr>        <chr>     <chr>         <chr>           
#> 1          4 Ethnicity    Metadata  Domain        Domain          
#> # ℹ 5 more variables: STANDARD_CONCEPT <chr>, CONCEPT_CODE <chr>,
#> #   VALID_START_DATE <date>, VALID_END_DATE <date>, INVALID_REASON <chr>
concept_db |>  
  dplyr::filter(stringr::str_like(CONCEPT_NAME, "ETHNICITY", ignore_case = TRUE)) 
#> # Source:   SQL [0 x 10]
#> # Database: Snowflake 8.13.1[ohdsi@Snowflake/OMOP_SYNTHETIC_DATASET]
#> # ℹ 10 variables: CONCEPT_ID <dbl>, CONCEPT_NAME <chr>, DOMAIN_ID <chr>,
#> #   VOCABULARY_ID <chr>, CONCEPT_CLASS_ID <chr>, STANDARD_CONCEPT <chr>,
#> #   CONCEPT_CODE <chr>, VALID_START_DATE <date>, VALID_END_DATE <date>,
#> #   INVALID_REASON <chr>

# get no record back if we bring locally, not supported in db
concept_db |> 
  head(5) |> 
  collect() |> 
  dplyr::filter(stringr::str_like(CONCEPT_NAME, "ETHNICITY", ignore_case = FALSE))
#> # A tibble: 0 × 10
#> # ℹ 10 variables: CONCEPT_ID <dbl>, CONCEPT_NAME <chr>, DOMAIN_ID <chr>,
#> #   VOCABULARY_ID <chr>, CONCEPT_CLASS_ID <chr>, STANDARD_CONCEPT <chr>,
#> #   CONCEPT_CODE <chr>, VALID_START_DATE <date>, VALID_END_DATE <date>,
#> #   INVALID_REASON <chr>
# db use of stringr
concept_db |>  
  dplyr::filter(stringr::str_like(CONCEPT_NAME, "ETHNICITY", ignore_case = FALSE)) 
#> Error in `stringr::str_like()`:
#> ! Backend only supports case insensitve `str_like()`.

Created on 2024-04-04 with reprex v2.0.2

edward-burn commented 6 months ago

I'd be happy to work on a pr if you think this is a bug

edward-burn commented 6 months ago

Hopefully not too presumptuous, but I opened this pr #1490 that would switch the behaviour around