ropensci / elastic

R client for the Elasticsearch HTTP API
https://docs.ropensci.org/elastic
Other
245 stars 58 forks source link

Bulk search function in the package that I missed? #287

Open tedmoorman opened 2 years ago

tedmoorman commented 2 years ago

Just curious if there was a bulk search function in the package that I may have missed? I ended up writing a function to perform the bulk search. I'm not sure if something like this might be worth adding to the package if currently missing. If it exists, I'd love to know about it!

library(elastic)
library(dplyr)

# Search an Elasticsearch index for many phrases in a single request.
#
# Builds one `bool` query whose `should` clauses are `match_phrase` queries
# on the "name" field, one clause per value in the `name` column of `df`,
# and returns the `hits$hits` part of the response.
#
# Arguments:
#   df     Data frame holding the search terms.
#   name   Unquoted column of `df` that contains the terms.
#   con    Connection object created by elastic::connect().
#   index  Name of the index to search.
#   slop   `slop` given to every match_phrase clause (default 0).
#   limit  Passed to Search() as `size` (default 50).
elastic_search <- function(df, name, con, index, slop = 0, limit = 50) {
  terms <- as.character(dplyr::pull(df, {{ name }}))
  # An NA term would otherwise be searched for as the literal phrase "NA".
  terms <- terms[!is.na(terms)]

  # Describe the query as a nested list instead of pasting JSON text
  # together: Search() accepts a list body, so terms that contain quotes or
  # backslashes can no longer break (or alter) the query, and no temporary
  # file is left behind.
  clauses <- lapply(terms, function(term) {
    list(match_phrase = list(name = list(query = term, slop = slop)))
  })
  body <- list(query = list(bool = list(should = clauses)))

  # NOTE(review): "_score:asc" puts the lowest-scoring hits first, so with
  # `size = limit` the best matches are the ones cut off - confirm that this
  # ordering is intended.
  found <- elastic::Search(
    con,
    index = index,
    body = body,
    sort = "_score:asc",
    asdf = TRUE,
    size = limit
  )
  found$hits$hits
}

# Example: look up two names with a single request.
# One character column called "name", one row per phrase to search for.
my_df <- data.frame(
  name = c("john doe", "jane doe"),
  stringsAsFactors = FALSE
)

# connect() is called with its default arguments.
x <- connect()

result_df <- elastic_search(
  my_df, name, x, "index-name-here",
  slop = 100, limit = 200
)
sckott commented 2 years ago

Thanks for the issue, @tedmoorman. Looks like a solid use case.

Have you seen https://docs.ropensci.org/elastic/reference/msearch.html ? Does that work for you as is, or no?

tedmoorman commented 2 years ago

I took a look, but it didn't seem to be enough. For one, "msearch1" and "tf" looked like they needed to be wrapped in readLines. I may also have been confused by seeing the index referenced in the JSON instead of in the function call. I think more guidance is needed for users who have a long list of terms to search for. The examples are enumerated searches (from 0 to n, or whatever). In my case, I have a list of over 100 names that I need to search for within a large database, although I only provided 2 names in the example above. I looked at a number of different query options in Elasticsearch, and "match_phrase" with the "slop" option ended up being the best I could find for any kind of fuzzy search. Hopefully this is the kind of thing that package users will find helpful.

tedmoorman commented 2 years ago

Here is some more code that might be helpful. Feel free to modify.

#' Save login credentials for Elasticsearch
#'
#' Stores the username, password and host address as a one-row data frame in
#' `es_credentials.Rds` in the user's home directory, where [es_connect()]
#' looks for it by default.
#'
#' The password is written to disk unencrypted. Make sure the file is only
#' readable by you, or prefer environment variables or a keyring if that is
#' not acceptable.
#'
#' @param es_username A single string with the username.
#' @param es_password A single string with the user's Elasticsearch password.
#' @param es_ip A single string with the IP address of the Elasticsearch host.
#'
#' @family elastic_search
#'
#' @return `NULL`, invisibly. The function is called for its side effect of
#'   writing `es_credentials.Rds` to the home directory.
#' @export
#'
#' @examples
#' \dontrun{
#' es_credentials("user_name", "pa$$W0rd", "xx.xxx.xx.xxx")
#' }
es_credentials <- function(es_username, es_password, es_ip) {
  inputs <- list(
    es_username = es_username,
    es_password = es_password,
    es_ip = es_ip
  )
  for (arg in names(inputs)) {
    value <- inputs[[arg]]
    # Longer vectors used to be silently scrambled across the three columns.
    if (!is.character(value) || length(value) != 1L) {
      stop("`", arg, "` must be a single character string.", call. = FALSE)
    }
  }

  credentials <- data.frame(
    es_username = es_username,
    es_password = es_password,
    es_ip = es_ip,
    stringsAsFactors = FALSE
  )

  # Write straight to the home directory instead of calling setwd(), which
  # used to leave the caller's working directory changed after the call.
  target <- file.path(path.expand("~"), "es_credentials.Rds")
  saveRDS(credentials, file = target)
}
#' Connect to Elasticsearch with saved credentials
#'
#' Reads the credentials file written by [es_credentials()] and uses it to
#' open an HTTPS connection with [elastic::connect()].
#'
#' @param file Path to the `.Rds` file created by [es_credentials()].
#'   Relative paths are resolved against the user's home directory.
#'
#' @family elastic_search
#'
#' @importFrom elastic connect
#'
#' @return The connection object returned by [elastic::connect()].
#' @export
#'
#' @examples
#' \dontrun{
#' es <- es_connect()
#' }
es_connect <- function(file = "es_credentials.Rds") {
  # Resolve `file` relative to the home directory, and restore the caller's
  # working directory even when reading the file or connecting fails.
  previous_wd <- setwd("~/")
  on.exit(setwd(previous_wd), add = TRUE)

  credentials <- readRDS(file)
  for (field in c("es_username", "es_password", "es_ip")) {
    if (!is.character(credentials[[field]])) {
      stop(
        "`", field, "` in the credentials file must be a character vector.",
        call. = FALSE
      )
    }
  }

  # NOTE(review): ssl_verifypeer = FALSE switches off TLS certificate
  # verification. It is kept because existing setups may rely on it (for
  # example self-signed certificates), but it should be confirmed.
  elastic::connect(
    host = credentials$es_ip,
    transport_schema = "https",
    user = credentials$es_username,
    pwd = credentials$es_password,
    errors = "complete",
    ssl_verifypeer = FALSE
  )
}
#' Total hits from Elasticsearch
#'
#' Sends one `bool` query with a `match_phrase` clause per search term and
#' returns the total-hits entry of the response.
#'
#' @param df A single column R data frame of search terms.
#' @param name The column in the R data frame with the search terms.
#'   Typically, a column of names.
#' @param con The connection to the Elasticsearch database.
#' @param index The index in the Elasticsearch database to be searched.
#' @param property The property in the Elasticsearch database index to be
#'   searched.
#' @param slop The lack of precision in matching. The default is 0, which is
#'   exact matching.
#'
#' @family elastic
#'
#' @importFrom dplyr pull
#' @importFrom elastic Search
#'
#' @return The `hits$total` element of the search response, returned as
#'   received from [elastic::Search()].
#' @export
#'
#' @examples
#' \dontrun{
#' es <- es_connect()
#'
#' my_df <- data.frame(
#'   name = c("john doe", "jane doe"),
#'   stringsAsFactors = FALSE
#' )
#'
#' elastic_search_total(
#'   my_df, name, es, "index-name-here", "property-name-here",
#'   slop = 100
#' )
#' }
elastic_search_total <- function(df, name, con, index, property = "name",
                                 slop = 0) {
  if (!is.data.frame(df)) {
    stop("`df` must be a data frame.", call. = FALSE)
  }
  if (ncol(df) != 1L) {
    stop("`df` must have exactly one column.", call. = FALSE)
  }
  if (!inherits(con, "Elasticsearch")) {
    stop("`con` must be an Elasticsearch connection.", call. = FALSE)
  }
  if (!is.character(property) || length(property) != 1L) {
    stop("`property` must be a single character string.", call. = FALSE)
  }
  if (!is.numeric(slop) || length(slop) != 1L || is.na(slop) || slop < 0) {
    stop("`slop` must be a single non-negative number.", call. = FALSE)
  }

  terms <- as.character(dplyr::pull(df, {{ name }}))
  # An NA term would otherwise be searched for as the literal phrase "NA".
  terms <- terms[!is.na(terms)]

  # Describe the query as a nested list instead of pasting JSON text
  # together: Search() accepts a list body, so terms that contain quotes or
  # backslashes can no longer break (or alter) the query, and no temporary
  # file is left behind. `_name` labels each clause with its own term.
  clauses <- lapply(terms, function(term) {
    field_query <- list(query = term, `_name` = term, slop = slop)
    list(match_phrase = stats::setNames(list(field_query), property))
  })
  body <- list(query = list(bool = list(should = clauses)))

  elastic::Search(con, index = index, body = body)$hits$total
}
#' Search results from Elasticsearch
#'
#' Sends one `bool` query with a `match_phrase` clause per search term and
#' returns the hits as a data frame.
#'
#' @param df A single column R data frame of search terms.
#' @param name The column in the R data frame with the search terms.
#'   Typically, a column of names.
#' @param con The connection to the Elasticsearch database.
#' @param index The index in the Elasticsearch database to be searched.
#' @param property The property in the Elasticsearch database index to be
#'   searched.
#' @param slop The lack of precision in matching. The default is 0, which is
#'   exact matching.
#' @param limit The number of results to include in the results data frame.
#'   The default is 50, and the maximum is 10000.
#'
#' @family elastic
#'
#' @importFrom dplyr pull
#' @importFrom elastic Search
#'
#' @return The `hits$hits` element of the search response, requested with
#'   `asdf = TRUE`. Its columns depend on the documents in the index.
#' @export
#'
#' @examples
#' \dontrun{
#' es <- es_connect()
#'
#' my_df <- data.frame(
#'   name = c("john doe", "jane doe"),
#'   stringsAsFactors = FALSE
#' )
#'
#' result_df <- elastic_search(
#'   my_df, name, es, "index-name-here", "property-name-here",
#'   slop = 100, limit = 100
#' )
#' }
elastic_search <- function(df, name, con, index, property = "name",
                           slop = 0, limit = 50) {
  if (!is.data.frame(df)) {
    stop("`df` must be a data frame.", call. = FALSE)
  }
  if (ncol(df) != 1L) {
    stop("`df` must have exactly one column.", call. = FALSE)
  }
  if (!inherits(con, "Elasticsearch")) {
    stop("`con` must be an Elasticsearch connection.", call. = FALSE)
  }
  if (!is.character(property) || length(property) != 1L) {
    stop("`property` must be a single character string.", call. = FALSE)
  }
  if (!is.numeric(slop) || length(slop) != 1L || is.na(slop) || slop < 0) {
    stop("`slop` must be a single non-negative number.", call. = FALSE)
  }
  if (!is.numeric(limit) || length(limit) != 1L || is.na(limit) ||
        limit < 0 || limit > 10000) {
    stop("`limit` must be a single number between 0 and 10000.",
         call. = FALSE)
  }

  terms <- as.character(dplyr::pull(df, {{ name }}))
  # An NA term would otherwise be searched for as the literal phrase "NA".
  terms <- terms[!is.na(terms)]

  # Describe the query as a nested list instead of pasting JSON text
  # together: Search() accepts a list body, so terms that contain quotes or
  # backslashes can no longer break (or alter) the query, and no temporary
  # file is left behind. `_name` labels each clause with its own term.
  clauses <- lapply(terms, function(term) {
    field_query <- list(query = term, `_name` = term, slop = slop)
    list(match_phrase = stats::setNames(list(field_query), property))
  })
  body <- list(query = list(bool = list(should = clauses)))

  # NOTE(review): "_score:asc" puts the lowest-scoring hits first, so with
  # `size = limit` the best matches are the ones cut off - confirm that this
  # ordering is intended.
  found <- elastic::Search(
    con,
    index = index,
    body = body,
    sort = "_score:asc",
    asdf = TRUE,
    size = limit
  )
  found$hits$hits
}
sckott commented 2 years ago

Thanks for sharing @tedmoorman

I think this is beyond the scope of the package.

tedmoorman commented 2 years ago

No problem. I made an edit to the code above so that it is generalizable to searching any property - not just a "name" property.