spsanderson / healthyR.ai

healthyR.ai - AI package for the healthyverse
http://www.spsanderson.com/healthyR.ai/
Other
16 stars 6 forks source link

`hai_umap_list()` #286

Closed spsanderson closed 2 years ago

spsanderson commented 2 years ago

Function:

#' UMAP Projection
#'
#' @author Steven P. Sanderson II, MPH
#'
#' @description Create a umap object from the [uwot::umap()] function.
#'
#' @seealso
#' *  \url{https://cran.r-project.org/package=uwot} (CRAN)
#' *  \url{https://github.com/jlmelville/uwot} (GitHub)
#' *  \url{https://github.com/jlmelville/uwot} (arXiv paper)
#'
#' @details This takes in the user item table/matix that is produced by [kmeans_user_item_tbl()]
#' function. This function uses the defaults of [uwot::umap()].
#'
#' @param .data The data from the [kmeans_user_item_tbl()] function.
#' @param .kmeans_map_tbl The data from the [kmeans_mapped_tbl()].
#' @param .k_cluster Pick the desired amount of clusters from your analysis of the scree plot.
#'
#' @examples
#' library(healthyR.data)
#' library(healthyR)
#' library(dplyr)
#' library(broom)
#'
#' data_tbl <- healthyR_data %>%
#'     filter(ip_op_flag == "I") %>%
#'     filter(payer_grouping != "Medicare B") %>%
#'     filter(payer_grouping != "?") %>%
#'     select(service_line, payer_grouping) %>%
#'     mutate(record = 1) %>%
#'     as_tibble()
#'
#' uit_tbl <- kmeans_user_item_tbl(
#'    .data           = data_tbl
#'    , .row_input    = service_line
#'    , .col_input    =  payer_grouping
#'    , .record_input = record
#'  )
#'
#' kmm_tbl <- kmeans_mapped_tbl(uit_tbl)
#'
#' umap_list(.data = uit_tbl, kmm_tbl, 3)
#'
#' @return A list of tibbles and the umap object
#'
#' @export
#'
umap_list <- function(.data
                      , .kmeans_map_tbl
                      , .k_cluster = 5) {
    # * Tidyeval ----
    k_cluster_var_expr <- .k_cluster

    # * Checks ----
    if (!is.data.frame(.data)) {
        stop(call. = FALSE,
             "(.data) is not a data.frame/tibble. Please supply.")
    }

    if (!is.data.frame(.kmeans_map_tbl)) {
        stop(call. = FALSE,
             "(.kmeans_map_tbl) is not a data.frame/tibble. Please supply.")
    }

    # * Data ----
    data           <- tibble::as_tibble(.data)
    kmeans_map_tbl <- tibble::as_tibble(.kmeans_map_tbl)

    # * Manipulation ----
    umap_obj <- data %>%
        dplyr::select(-1) %>%
        uwot::umap()

    umap_results_tbl <- umap_obj %>%
        tibble::as_tibble() %>%
        purrr::set_names("x", "y") %>%
        dplyr::bind_cols(data %>% dplyr::select(1))

    kmeans_obj <- kmeans_map_tbl %>%
        dplyr::pull(k_means) %>%
        purrr::pluck(k_cluster_var_expr)

    kmeans_cluster_tbl <- kmeans_obj %>%
        broom::augment(data) %>%
        dplyr::select(1, .cluster)

    umap_kmeans_cluster_results_tbl <- umap_results_tbl %>%
        dplyr::left_join(kmeans_cluster_tbl)

    # * Data List ----
    list_names <-
        df_list <- list(
            umap_obj                        = umap_obj,
            umap_results_tbl                = umap_results_tbl,
            kmeans_obj                      = kmeans_obj,
            kmeans_cluster_tbl              = kmeans_cluster_tbl,
            umap_kmeans_cluster_results_tbl = umap_kmeans_cluster_results_tbl
        )

    # * Return ----
    return(df_list)

}

Example:

ump_lst <- umap_list(uit_tbl, kmm_tbl, 3)

> ump_lst
$umap_obj
             [,1]       [,2]
 [1,] -0.74571982  2.2514666
 [2,] -0.15199175  2.1459239
 [3,]  0.68102262 -2.2383791
 [4,] -0.38277975  1.0841764
 [5,]  0.22941372  1.4139083
 [6,]  1.11321011 -1.9530446
 [7,] -0.58606782 -0.6579749
 [8,]  0.77251217 -1.1631352
 [9,]  0.32206175 -1.8929783
[10,]  0.03789804 -0.7778479
[11,] -0.02187505  1.7635368
[12,] -0.92922345  1.7515033
[13,]  0.22819314  0.9602579
[14,] -0.28279415 -0.2338883
[15,]  0.07320938 -1.5743767
[16,]  0.43210325 -1.3941590
[17,]  0.07226262  0.4439653
[18,] -0.90982023  1.2273057
[19,] -0.38926092  0.6284164
[20,]  0.48171482 -0.6196457
[21,] -0.27508488 -1.2331883
[22,] -0.50644556  1.8743180
[23,]  0.73746178 -1.8061606
attr(,"scaled:center")
[1] -1.965624 -9.547336

$umap_results_tbl
# A tibble: 23 × 3
         x      y service_line                 
     <dbl>  <dbl> <chr>                        
 1 -0.746   2.25  Alcohol Abuse                
 2 -0.152   2.15  Bariatric Surgery For Obesity
 3  0.681  -2.24  Carotid Endarterectomy       
 4 -0.383   1.08  Cellulitis                   
 5  0.229   1.41  Chest Pain                   
 6  1.11   -1.95  CHF                          
 7 -0.586  -0.658 COPD                         
 8  0.773  -1.16  CVA                          
 9  0.322  -1.89  GI Hemorrhage                
10  0.0379 -0.778 Joint Replacement            
# … with 13 more rows
# ℹ Use `print(n = ...)` to see more rows

$kmeans_obj
K-means clustering with 3 clusters of sizes 12, 5, 6

Cluster means:
  Blue Cross Commercial Compensation Exchange Plans        HMO   Medicaid
1  0.0783745 0.02182129 0.0043244347    0.006202137 0.04493860 0.03684344
2  0.1495475 0.03679700 0.0003066332    0.020729565 0.16252855 0.13072521
3  0.1170278 0.03141187 0.0101665392    0.013865190 0.09822472 0.08557952
  Medicaid HMO Medicare A Medicare HMO    No Fault    Self Pay
1   0.08001653  0.5625037   0.15152338 0.003475542 0.009976485
2   0.31446157  0.1318675   0.03192357 0.001364577 0.019748398
3   0.14652195  0.3535395   0.10524131 0.007067791 0.031353724

Clustering vector:
 [1] 2 2 1 3 3 1 1 1 1 1 2 2 3 1 1 1 3 3 3 1 1 2 1

Within cluster sum of squares by cluster:
[1] 0.09625399 0.19152559 0.08456928
 (between_SS / total_SS =  73.6 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      

$kmeans_cluster_tbl
# A tibble: 23 × 2
   service_line                  .cluster
   <chr>                         <fct>   
 1 Alcohol Abuse                 2       
 2 Bariatric Surgery For Obesity 2       
 3 Carotid Endarterectomy        1       
 4 Cellulitis                    3       
 5 Chest Pain                    3       
 6 CHF                           1       
 7 COPD                          1       
 8 CVA                           1       
 9 GI Hemorrhage                 1       
10 Joint Replacement             1       
# … with 13 more rows
# ℹ Use `print(n = ...)` to see more rows

$umap_kmeans_cluster_results_tbl
# A tibble: 23 × 4
         x      y service_line                  .cluster
     <dbl>  <dbl> <chr>                         <fct>   
 1 -0.746   2.25  Alcohol Abuse                 2       
 2 -0.152   2.15  Bariatric Surgery For Obesity 2       
 3  0.681  -2.24  Carotid Endarterectomy        1       
 4 -0.383   1.08  Cellulitis                    3       
 5  0.229   1.41  Chest Pain                    3       
 6  1.11   -1.95  CHF                           1       
 7 -0.586  -0.658 COPD                          1       
 8  0.773  -1.16  CVA                           1       
 9  0.322  -1.89  GI Hemorrhage                 1       
10  0.0379 -0.778 Joint Replacement             1       
# … with 13 more rows
# ℹ Use `print(n = ...)` to see more rows

``