posit-dev / positron

Positron, a next-generation data science IDE

Ark: LSP command sending over string longer than 512MB? #3117

Closed: DavisVaughan closed this issue 2 months ago

DavisVaughan commented 2 months ago

While investigating https://github.com/posit-dev/positron-beta/discussions/19#discussioncomment-9407240 with the reprex below, I was able to reproduce the issue after printing fits and then running fits |> tidyr::unnest(info).

After running everything below, at some point when you try to run more code the Console will hang, and you should see this in the Positron R Extension output channel:

2024-05-13 10:15:22.503 [warning] ARK (R 4.3.1) language client error occurred (port 45724). 'Error' with message: Cannot create a string longer than 0x1fffffe8 characters. This is error number undefined.

And this in the Console: R output channel:

[R] 2024-05-13T14:15:22.535939000Z [ark-unknown] TRACE crates/ark/src/lsp/backend.rs:611: LSP thread exiting gracefully after connection closed ("127.0.0.1:45724").

Interestingly, after waiting a full minute or so the Console does eventually process the code and return control to me. But the LSP is dead at that point and no longer processes any requests.
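
For reference, the 0x1fffffe8 figure in that message is where the "512MB" in the title comes from: it is V8's maximum string length (the language client runs in Positron's Node-based extension host), which works out to just under 512 MiB at one byte per character.

# 2^29 - 24 characters, V8's hard cap on string length
0x1fffffe8
#> [1] 536870888
0x1fffffe8 / 2^20  # in MiB, assuming one byte per character
#> [1] 512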

library(sf)
library(readr)
library(dplyr)
library(sfdep)

tf <- tempfile(fileext = ".csv")

download.file("https://raw.githubusercontent.com/xj-liu/spatial_feature_incorporation/main/houses1990.csv", tf)

# read and create an sf object
houses_raw <- read_csv(tf) |>
  st_as_sf(coords = c("longitude", "latitude"), crs = 4326) |>
  # apply a smidgen of jittering to the points because there are dupes
  mutate(geometry = st_jitter(geometry))

houses_nb <- houses_raw |>
  mutate(
    # use knn neighbors
    nb = st_knn(geometry, k = 25),
    # IDW weight
    wt = st_inverse_distance(nb, geometry)
  )

autocorrelation <- houses_nb |>
  reframe(across(where(is.numeric), \(.x) {
    broom::tidy(global_moran_test(.x, nb, wt))
  })) |>
  tidyr::pivot_longer(everything()) |>
  tidyr::unnest_wider(value)

# Identify which measures are spatially autocorrelated
auto_cor_fields <- autocorrelation |>
  select(name, I = estimate1, p_value = p.value) |>
  mutate(is_autocorrelated = p_value <= 0.01 & I > 0.3)

# these are the input fields that we want to use neighboring values for
auto_correlated_x <- auto_cor_fields |>
  filter(
    is_autocorrelated,
    # this will be the y so we cannot use this
    name != "houseValue"
  ) |>
  pull(name)

# calculate the spatial lag of these variables
# drop the neighbors and the weights and the geometry
clean_df <- houses_nb |>
  mutate(
    across(
      all_of(auto_correlated_x),
      \(.x) st_lag(.x, nb, wt),
      .names = "{.col}_lag"
    )
  ) |>
  st_drop_geometry() |>
  select(-c(nb, wt))

# Model Specifications ---------------------------------------------------

library(tidymodels)
# These packages are needed for the engines
# install.packages(c("ranger", "kernlab", "xgboost"))

boost_tree_xgboost_spec <-
  boost_tree(
    # tree_depth = tune(),
    # trees = tune(),
    # learn_rate = tune(),
    # min_n = tune(),
    # loss_reduction = tune(),
    # sample_size = tune(),
    # stop_iter = tune()
  ) |>
  set_engine("xgboost") |>
  set_mode("regression")

linear_reg_glm_spec <-
  linear_reg() |>
  set_engine("glm")

rand_forest_ranger_spec <-
  rand_forest(
    # mtry = tune(),
    # min_n = tune()
  ) |>
  set_engine("ranger") |>
  set_mode("regression")

svm_poly_kernlab_spec <-
  svm_poly(
    # cost = tune(),
    # degree = tune(),
    # scale_factor = tune(),
    # margin = tune()
  ) |>
  set_engine("kernlab") |>
  set_mode("regression")

models <- list(
  xgb = boost_tree_xgboost_spec,
  lm = linear_reg_glm_spec,
  rf = rand_forest_ranger_spec,
  svm_poly = svm_poly_kernlab_spec
)

# Partition --------------------------------------------------------------

# create initial split
init_split <- initial_split(clean_df)

# training and testing
train_df <- training(init_split)
test_df <- testing(init_split)

folds <- vfold_cv(train_df)

# Pre-processing steps ---------------------------------------------------

# just going to apply normalization to each of the models
base_rec <- recipe(
  houseValue ~ income + houseAge + rooms + bedrooms + population + households,
  data = train_df
)

sp_house_age <- recipe(
  houseValue ~ income + houseAge + rooms + bedrooms + population + households +
    # spatial component
    houseAge_lag,
  data = train_df
) |> step_scale(everything())

sp_income <- recipe(
  houseValue ~ income + houseAge + rooms + bedrooms + population + households +
    # spatial component
    income_lag,
  data = train_df
) |> step_scale(everything())

sp_income_age <- recipe(
  houseValue ~ income + houseAge + rooms + bedrooms + population + households +
    # spatial component
    income_lag + houseAge_lag,
  data = train_df
) |> step_scale(everything())

sp_all <- recipe(
  houseValue ~ .,
  data = train_df
) |> step_scale(everything())

# create a list of all the recipes we want to work with
recipes <- list(base_rec, sp_house_age, sp_income, sp_income_age, sp_all)

# Workflowsets -----------------------------------------------------------
doParallel::registerDoParallel(cores = 4)

wfs <- workflow_set(recipes, models, cross = TRUE)

fits <- workflow_map(
  wfs,
  resamples = folds,
  verbose = TRUE,
  control = control_grid(parallel_over = "everything")
)
DavisVaughan commented 2 months ago

Here's a slightly smaller and faster reprex: run everything up to wfs (full code at the bottom of this comment), then manually type out wfs %>% tidyr::unnest(c(inf)), put your cursor at inf<> (the <> marks the cursor position), and hit tab. That results in an absolute wall of completion() LSP request error text like what you see below, and I'm pretty sure that in the real case with fits it ends up sending over >512 MB of info. It looks like:

  1. We probably should not be logging an error here at all
  2. We are inlining objects into the call, which is the real issue: those objects can be huge and blow past this string limit (see the sketch below)
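
Point 2 is the crux. Here's a minimal sketch of the effect in plain R (a hypothetical illustration only; ark builds the call on the Rust side, and .ps.completions.formalNames is just the helper visible in the log):

# Inlining an object into a call embeds its full deparsed contents,
# so the request text scales with the object's size, not the expression's:
x <- data.frame(a = runif(1e5))
inlined <- bquote(.ps.completions.formalNames(.Primitive("c"), .(x)))
sum(nchar(deparse(inlined)))  # easily megabytes of call text

# Passing a reference by name instead keeps the request tiny:
by_name <- bquote(.ps.completions.formalNames(.Primitive("c"), wfs))
sum(nchar(deparse(by_name)))  # ~50 characters

The wall of text itself looks like this (a backtrace excerpt, then the start of the deparsed workflow_set tibble):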
             at /Users/davis/.cargo/registry/src/index.crates.io-6f17d22bba15001f/anyhow-1.0.80/src/error.rs:565:25
   4: <core::result::Result<T,F> as core::ops::try_trait::FromResidual<core::result::Result<core::convert::Infallible,E>>>::from_residual
             at /rustc/82e1608dfa6e0b5569232559e3d385fea5a93112/library/core/src/result.rs:1963:27
   5: ark::lsp::completions::sources::composite::call::completions_from_session_arguments
             at /Users/davis/files/programming/positron/amalthea/crates/ark/src/lsp/completions/sources/composite/call.rs:221:9
   6: ark::lsp::completions::sources::composite::call::completions_from_arguments
[Info  - 10:40:28 AM] completion(): Failed to provide completions: Error evaluating .ps.completions.formalNames(.Primitive("c"), structure(list(wflow_id = c("recipe_1_xgb", 
"recipe_1_lm", "recipe_1_rf", "recipe_1_svm_poly", "recipe_2_xgb", 
"recipe_2_lm", "recipe_2_rf", "recipe_2_svm_poly", "recipe_3_xgb", 
"recipe_3_lm", "recipe_3_rf", "recipe_3_svm_poly", "recipe_4_xgb", 
"recipe_4_lm", "recipe_4_rf", "recipe_4_svm_poly", "recipe_5_xgb", 
"recipe_5_lm", "recipe_5_rf", "recipe_5_svm_poly"), info = list(
    structure(list(workflow = list(structure(list(pre = structure(list(
        actions = list(recipe = structure(list(recipe = structure(list(
            var_info = structure(list(variable = c("income", 
            "houseAge", "rooms", "bedrooms", "population", "households", 
            "houseValue"), type = list(c("double", "numeric"), 
                c("double", "numeric"), c("double", "numeric"
                ), c("double", "numeric"), c("double", "numeric"
                ), c("double", "numeric"), c("double", "numeric"
                )), role = c("predictor", "predictor", "predictor", 
            "predictor", "predictor", "predictor", "outcome"), 
                source = c("original", "original", "original", 
                "original", "original", "original", "original"
                )), row.names = c(NA, -7L), class = c("tbl_df", 
            "tbl", "data.frame")), term_info = structure(list(
                variable = c("income", "houseAge", "rooms", "bedrooms", 
                "population", "households", "houseValue"), type = list(
                  c("double", "numeric"), c("double", "numeric"
                  ), c("double", "numeric"), c("double", "numeric"
                  ), c("double", "numeric"), c("double", "numeric"
                  ), c("double", "numeric")), role = c("predictor", 
                "predictor", "predictor", "predictor", "predictor", 
                "predictor", "outcome"), source = c("original", 
                "original", "original", "original", "original", 
                "original", "original")), row.names = c(NA, -7L
            ), class = c("tbl_df", "tbl", "data.frame")), steps = NULL, 
            template = structure(list(income = c(5.7143, 5.9683, 
            3.3903, 3.7973, 6.0574, 3.2841, 2.227, 3.7139, 1.845, 
            5.5983, 4.3438, 4.2679, 2.9817, 8.2049, 1.3375, 7.0177, 
            7.9187, 3.1062, 2.4861, 4.2917, 1.6483, 3.1856, 3.0973, 
            4.1359, 4.1552, 4.6731, 1.9074, 3.8581, 5.0234, 4.425, 
            3.25, 5.2589, 2.8, 4.2222, 5.6152, 3.6875, 3.0114, 
            5.5456, 12.8665, 3.8, 4.875, 3.0437, 4.6696, 4.3958, 
            3.0926, 2.4884, 3.8201, 2.065, 1.6667, 4.2434, 3.0938, 
            3.3833, 5.363, 7.0565, 4.2596, 1.9556, 2.6513, 2.0214, 
            5.7188, 4.3898, 3.0893, 2.9044, 2.1149, 2.0134, 3.9722, 
            1.6111, 5.0602, 4.5625, 2.3482, 0.4999, 1.125, 3.1719, 
            11.3176, 7.0469, 10.7569, 5.4337, 5.3107, 3.8125, 
            1.1384, 1.6318, 2.2277, 4.9643, 3.7958, 2.6803, 3.7813, 
            3.725, 2.5759, 5.2088, 7.1497, 15.0001, 3.631, 3.8897, 
            3.9443, 4.9517, 3.1771, 2.1522, 3.5524, 2.0549, 3.2813, 
            3.6121, 3.3371, 4.5484, 3.5404, 4.2348, 2.1667, 1.5455, 

<snip for brevity, this goes onnnnnnn and onnnnn>
library(sf)
library(readr)
library(dplyr)
library(sfdep)

tf <- tempfile(fileext = ".csv")

download.file("https://raw.githubusercontent.com/xj-liu/spatial_feature_incorporation/main/houses1990.csv", tf)

# read and create an sf object
houses_raw <- read_csv(tf) |>
  st_as_sf(coords = c("longitude", "latitude"), crs = 4326) |>
  # apply a smidgen of jittering to the points because there are dupes
  mutate(geometry = st_jitter(geometry))

houses_nb <- houses_raw |>
  mutate(
    # use knn neighbors
    nb = st_knn(geometry, k = 25),
    # IDW weight
    wt = st_inverse_distance(nb, geometry)
  )

autocorrelation <- houses_nb |>
  reframe(across(where(is.numeric), \(.x) {
    broom::tidy(global_moran_test(.x, nb, wt))
  })) |>
  tidyr::pivot_longer(everything()) |>
  tidyr::unnest_wider(value)

# Identify which measures are spatially autocorrelated
auto_cor_fields <- autocorrelation |>
  select(name, I = estimate1, p_value = p.value) |>
  mutate(is_autocorrelated = p_value <= 0.01 & I > 0.3)

# these are the input fields that we want to use neighboring values for
auto_correlated_x <- auto_cor_fields |>
  filter(
    is_autocorrelated,
    # this will be the y so we cannot use this
    name != "houseValue"
  ) |>
  pull(name)

# calculate the spatial lag of these variables
# drop the neighbors and the weights and the geometry
clean_df <- houses_nb |>
  mutate(
    across(
      all_of(auto_correlated_x),
      \(.x) st_lag(.x, nb, wt),
      .names = "{.col}_lag"
    )
  ) |>
  st_drop_geometry() |>
  select(-c(nb, wt))

# Model Specifications ---------------------------------------------------

library(tidymodels)
# These packages are needed for the engines
# install.packages(c("ranger", "kernlab", "xgboost"))

boost_tree_xgboost_spec <-
  boost_tree(
    # tree_depth = tune(),
    # trees = tune(),
    # learn_rate = tune(),
    # min_n = tune(),
    # loss_reduction = tune(),
    # sample_size = tune(),
    # stop_iter = tune()
  ) |>
  set_engine("xgboost") |>
  set_mode("regression")

linear_reg_glm_spec <-
  linear_reg() |>
  set_engine("glm")

rand_forest_ranger_spec <-
  rand_forest(
    # mtry = tune(),
    # min_n = tune()
  ) |>
  set_engine("ranger") |>
  set_mode("regression")

svm_poly_kernlab_spec <-
  svm_poly(
    # cost = tune(),
    # degree = tune(),
    # scale_factor = tune(),
    # margin = tune()
  ) |>
  set_engine("kernlab") |>
  set_mode("regression")

models <- list(
  xgb = boost_tree_xgboost_spec,
  lm = linear_reg_glm_spec,
  rf = rand_forest_ranger_spec,
  svm_poly = svm_poly_kernlab_spec
)

# Partition --------------------------------------------------------------

# create initial split
init_split <- initial_split(clean_df)

# training and testing
train_df <- training(init_split)
test_df <- testing(init_split)

folds <- vfold_cv(train_df)

# Pre-processing steps ---------------------------------------------------

# just going to apply normalization to each of the models
base_rec <- recipe(
  houseValue ~ income + houseAge + rooms + bedrooms + population + households,
  data = train_df
)

sp_house_age <- recipe(
  houseValue ~ income + houseAge + rooms + bedrooms + population + households +
    # spatial component
    houseAge_lag,
  data = train_df
) |> step_scale(everything())

sp_income <- recipe(
  houseValue ~ income + houseAge + rooms + bedrooms + population + households +
    # spatial component
    income_lag,
  data = train_df
) |> step_scale(everything())

sp_income_age <- recipe(
  houseValue ~ income + houseAge + rooms + bedrooms + population + households +
    # spatial component
    income_lag + houseAge_lag,
  data = train_df
) |> step_scale(everything())

sp_all <- recipe(
  houseValue ~ .,
  data = train_df
) |> step_scale(everything())

# create a list of all the recipes we want to work with
recipes <- list(base_rec, sp_house_age, sp_income, sp_income_age, sp_all)

wfs <- workflow_set(recipes, models, cross = TRUE)
juliasilge commented 2 months ago

In Positron 2024.05.0 (Universal) build 1251, I can walk through this reprex and no longer see any LSP completion errors:

https://github.com/posit-dev/positron/assets/12505835/06fc4a16-11cc-41a5-97b7-9d787fdfdf5e