tidyverse / dplyr

dplyr: A grammar of data manipulation
https://dplyr.tidyverse.org/
Other
4.75k stars 2.12k forks source link

Request non-equal join: closest() supports dual-sided inequality <> #6992

Open zhjx19 opened 7 months ago

zhjx19 commented 7 months ago

For example, I now have the following Table 1 and Table 2. I want to find a value2 from both sides that is closest to the value based on matching the IDs. However, closest() only supports an inequality involving one of: >, >=, <, or <=.

library(purrr)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
df1 = tibble(
  ID = c("NO1", "NO1", "NO2", "NO3", "NO3", "NO3", "NO4", "NO5", 
         "NO5", "NO5", "NO5", "NO6", "NO7", "NO7", "NO7", "NO7", 
         "NO7", "NO8", "NO9", "NO9"), 
  value = c(11, 25, 22, 24, 29, 1.75, 26, 27, 55, 44, 2.1, 31, 
         35, 33, 410, 21, 1.44, 80, 75, 52))
df1
#> # A tibble: 20 × 2
#>    ID    value2
#>    <chr>  <dbl>
#>  1 NO1    11   
#>  2 NO1    25   
#>  3 NO2    22   
#>  4 NO3    24   
#>  5 NO3    29   
#>  6 NO3     1.75
#>  7 NO4    26   
#>  8 NO5    27   
#>  9 NO5    55   
#> 10 NO5    44   
#> 11 NO5     2.1 
#> 12 NO6    31   
#> 13 NO7    35   
#> 14 NO7    33   
#> 15 NO7   410   
#> 16 NO7    21   
#> 17 NO7     1.44
#> 18 NO8    80   
#> 19 NO9    75   
#> 20 NO9    52
df2 = tibble(
  编号 = c("NO1", "NO1", "NO2", "NO3", "NO3", "NO4", "NO5", "NO5", 
         "NO5", "NO5", "NO6", "NO7", "NO7", "NO7", "NO8", "NO9", "NO9"), 
  数值2 = c(11.2, 24.6, 21.6, 22, 29.5, 26.1, 26.9, 26.4, 54.3,
            42.1, 30.2, 34.7, 31.2, 20.59, 79.8, 74.2, 50.9))
df2
#> # A tibble: 17 × 2
#>    编号  数值2
#>    <chr> <dbl>
#>  1 NO1    11.2
#>  2 NO1    24.6
#>  3 NO2    21.6
#>  4 NO3    22  
#>  5 NO3    29.5
#>  6 NO4    26.1
#>  7 NO5    26.9
#>  8 NO5    26.4
#>  9 NO5    54.3
#> 10 NO5    42.1
#> 11 NO6    30.2
#> 12 NO7    34.7
#> 13 NO7    31.2
#> 14 NO7    20.6
#> 15 NO8    79.8
#> 16 NO9    74.2
#> 17 NO9    50.9

The manual method I can think of right now is:

df1 %>%
  left_join(df2, join_by(编号), relationship = "many-to-many") %>%
  group_nest(编号, 数值) %>%
  mutate(数值2 = map2_dbl(数值, data, \(x, y) y$数值2[which.min(abs(y$数值2 - x))]),
         误差 = abs(数值 - 数值2)) %>%
  select(-data)
#> # A tibble: 20 × 4
#>    编号    数值 数值2    误差
#>    <chr>  <dbl> <dbl>   <dbl>
#>  1 NO1    11     11.2   0.200
#>  2 NO1    25     24.6   0.400
#>  3 NO2    22     21.6   0.400
#>  4 NO3     1.75  22    20.2  
#>  5 NO3    24     22     2    
#>  6 NO3    29     29.5   0.5  
#>  7 NO4    26     26.1   0.100
#>  8 NO5     2.1   26.4  24.3  
#>  9 NO5    27     26.9   0.100
#> 10 NO5    44     42.1   1.90 
#> 11 NO5    55     54.3   0.700
#> 12 NO6    31     30.2   0.800
#> 13 NO7     1.44  20.6  19.2  
#> 14 NO7    21     20.6   0.41 
#> 15 NO7    33     34.7   1.70 
#> 16 NO7    35     34.7   0.300
#> 17 NO7   410     34.7 375.   
#> 18 NO8    80     79.8   0.200
#> 19 NO9    52     50.9   1.10 
#> 20 NO9    75     74.2   0.800