insightsengineering / teal.gallery

A Gallery of Exploratory Web Apps used for Analyzing Clinical Trial Data
https://insightsengineering.github.io/teal.gallery/
Other
29 stars 7 forks source link

[Bug]: python connector not reproducing complete code #145

Closed averissimo closed 6 months ago

averissimo commented 6 months ago

What happened?

python connector is not reproducing the full code needed for the different modules.

Context

When running the genentech.shinyapps.io/nest_python_dev/ app the code does not reproduce the full R code

This is probably due to calling `within` inside the teal_data_module (the code parser does not recognize the dependencies between the statements).

Snippet showing example of failure

``` r
library(teal.data)
#> Loading required package: teal.code

data <- within(
  teal_data(),
  {
    library(reticulate)
    python_dependencies <- c("pip", "numpy", "pandas")
    virtualenv_dir <- Sys.getenv("VIRTUALENV_NAME", "example_env_name")
    python_path <- Sys.getenv("PYTHON_PATH")
    if (python_path == "") {
      python_path <- NULL
    }
    reticulate::virtualenv_create(envname = virtualenv_dir, python = python_path)
    reticulate::virtualenv_install(virtualenv_dir, packages = python_dependencies, ignore_installed = TRUE)
    reticulate::use_virtualenv(virtualenv_dir, required = TRUE)

    iris_raw <- cbind(id = seq_len(nrow(iris)), iris)

    python_code <- "import pandas as pd
data = r.iris_raw
def svd_whiten(dat):
  import numpy as np
  X = np.matrix(dat)
  U, s, Vt = np.linalg.svd(X, full_matrices=False)
  X_white = np.dot(U, Vt)
  return X_white

data_columns = data.columns
global numeric_cols_ix
global numeric_cols
numeric_cols_ix = list(range(5))[1:]
numeric_cols = [x for i,x in enumerate(data_columns) if i in numeric_cols_ix]
svd_res = svd_whiten(data.iloc[:, numeric_cols_ix])
data_new = pd.concat([data, pd.DataFrame(svd_res)], axis = 1)
data_new.columns = list(data_columns) + [i + '.whiten' for i in numeric_cols]
data_new = data_new.round(10)
data_new
"

    withr::with_options(
      list(reticulate.engine.environment = environment()),
      py_run_string(python_code)
    )
    IRIS <- py$data_new
  }
)
#> virtualenv: example_env_name
#> Using virtual environment 'example_env_name' ...

datanames(data) <- c("IRIS")
rlang::hash(data[["IRIS"]])
#> [1] "e9225d33975ec7ec986485068139f7af"

get_code(data, datanames = "IRIS") |> cat()
#> library(reticulate)
#> IRIS <- py$data_new
```

Created on 2024-02-19 with [reprex v2.0.2](https://reprex.tidyverse.org)
Possible alternative using `teal.code::eval_code` and `# @linksto ...` ``` r library(teal.data) #> Loading required package: teal.code data <- teal.code::eval_code( teal_data(), " library(reticulate) python_dependencies <- c(\"pip\", \"numpy\", \"pandas\") # @linksto IRIS virtualenv_dir <- Sys.getenv(\"VIRTUALENV_NAME\", \"example_env_name\") # @linksto IRIS python_path <- Sys.getenv(\"PYTHON_PATH\") # @linksto IRIS if (python_path == \"\") { python_path <- NULL } reticulate::virtualenv_create(envname = virtualenv_dir, python = python_path) # @linksto IRIS reticulate::virtualenv_install(virtualenv_dir, packages = python_dependencies, ignore_installed = TRUE) # @linksto IRIS reticulate::use_virtualenv(virtualenv_dir, required = TRUE) # @linksto IRIS iris_raw <- cbind(id = seq_len(nrow(iris)), iris) # @linksto IRIS " ) #> virtualenv: example_env_name #> Using virtual environment 'example_env_name' ... data <- within( data, { python_code <- "import pandas as pd data = r.iris_raw def svd_whiten(dat): import numpy as np X = np.matrix(dat) U, s, Vt = np.linalg.svd(X, full_matrices=False) X_white = np.dot(U, Vt) return X_white data_columns = data.columns global numeric_cols_ix global numeric_cols numeric_cols_ix = list(range(5))[1:] numeric_cols = [x for i,x in enumerate(data_columns) if i in numeric_cols_ix] svd_res = svd_whiten(data.iloc[:, numeric_cols_ix]) data_new = pd.concat([data, pd.DataFrame(svd_res)], axis = 1) data_new.columns = list(data_columns) + [i + '.whiten' for i in numeric_cols] data_new = data_new.round(10) data_new " withr::with_options( list(reticulate.engine.environment = environment()), value <- py_run_string(python_code) ) IRIS <- value$data_new } ) datanames(data) <- c("IRIS") rlang::hash(data[["IRIS"]]) #> [1] "e9225d33975ec7ec986485068139f7af" get_code(data, datanames = "IRIS") |> cat() #> library(reticulate) #> python_dependencies <- c("pip", "numpy", "pandas") #> virtualenv_dir <- Sys.getenv("VIRTUALENV_NAME", "example_env_name") #> 
python_path <- Sys.getenv("PYTHON_PATH") #> if (python_path == "") { #> python_path <- NULL #> } #> reticulate::virtualenv_create(envname = virtualenv_dir, python = python_path) #> reticulate::virtualenv_install(virtualenv_dir, packages = python_dependencies, ignore_installed = TRUE) #> reticulate::use_virtualenv(virtualenv_dir, required = TRUE) #> iris_raw <- cbind(id = seq_len(nrow(iris)), iris) #> python_code <- "import pandas as pd\ndata = r.iris_raw\ndef svd_whiten(dat):\n import numpy as np\n X = np.matrix(dat)\n U, s, Vt = np.linalg.svd(X, full_matrices=False)\n X_white = np.dot(U, Vt)\n return X_white\n\ndata_columns = data.columns\nglobal numeric_cols_ix\nglobal numeric_cols\nnumeric_cols_ix = list(range(5))[1:]\nnumeric_cols = [x for i,x in enumerate(data_columns) if i in numeric_cols_ix]\nsvd_res = svd_whiten(data.iloc[:, numeric_cols_ix])\ndata_new = pd.concat([data, pd.DataFrame(svd_res)], axis = 1)\ndata_new.columns = list(data_columns) + [i + '.whiten' for i in numeric_cols]\ndata_new = data_new.round(10)\ndata_new\n" #> withr::with_options(list(reticulate.engine.environment = environment()), value <- py_run_string(python_code)) #> IRIS <- value$data_new ``` Created on 2024-02-19 with [reprex v2.0.2](https://reprex.tidyverse.org)

image

averissimo commented 6 months ago

@m7pr what do you think about the alternative code for the connector?

Using a mix of eval_code and within (This avoids having to escape the python code)

m7pr commented 6 months ago

@averissimo I think we should rewrite the app so that it uses eval_code that can handle comments, in which we add #@linksto tag so that python setup code is returned for IRIS. Would you like me to handle that, or would you like to try on your own?

averissimo commented 6 months ago

I can do it myself as the code above already does it.

Do you think it's a good idea to mix eval_code with within? My first thought is that it looks a bit better when defining the python_code string.

But I'm hesitant about it.

edit: copying it here without <details> tag.

# Reprex: build the IRIS dataset through a Python step in two stages so that
# get_code(data, datanames = "IRIS") returns the complete setup code.
#   1. eval_code() receives the R setup as a string, which lets the
#      `# @linksto IRIS` comment tags travel with the code.
#   2. within() runs the Python step, so the Python source can be written as
#      a plain multi-line string without escaping its quotes.
# Lines starting with `#>` are the output recorded by reprex.
library(teal.data)
#> Loading required package: teal.code

# Stage 1: environment setup, passed as text. Each statement tagged with
# `# @linksto IRIS` is meant to be reported as a dependency of IRIS; the
# recorded get_code() output at the bottom shows these statements included.
# NOTE(review): `library(reticulate)` carries no tag but still appears in the
# recorded output below.
data <- teal.code::eval_code(
  teal_data(),
    "
library(reticulate)
python_dependencies <- c(\"pip\", \"numpy\", \"pandas\") # @linksto IRIS
virtualenv_dir <- Sys.getenv(\"VIRTUALENV_NAME\", \"example_env_name\") # @linksto IRIS
python_path <- Sys.getenv(\"PYTHON_PATH\") # @linksto IRIS
if (python_path == \"\") {
  python_path <- NULL
}
reticulate::virtualenv_create(envname = virtualenv_dir, python = python_path)  # @linksto IRIS
reticulate::virtualenv_install(virtualenv_dir, packages = python_dependencies, ignore_installed = TRUE)  # @linksto IRIS
reticulate::use_virtualenv(virtualenv_dir, required = TRUE)  # @linksto IRIS

iris_raw <- cbind(id = seq_len(nrow(iris)), iris)  # @linksto IRIS
"
)
#> virtualenv: example_env_name
#> Using virtual environment 'example_env_name' ...

# Stage 2: run the Python transformation on top of the `data` object created
# in stage 1.
data <- within(
  data,
  {
    # Python source kept as one multi-line string. It reads `r.iris_raw`,
    # appends the SVD-whitened numeric columns (suffix '.whiten'), rounds to
    # 10 decimals and leaves the result in `data_new`.
    python_code <- "import pandas as pd
data = r.iris_raw
def svd_whiten(dat):
  import numpy as np
  X = np.matrix(dat)
  U, s, Vt = np.linalg.svd(X, full_matrices=False)
  X_white = np.dot(U, Vt)
  return X_white

data_columns = data.columns
global numeric_cols_ix
global numeric_cols
numeric_cols_ix = list(range(5))[1:]
numeric_cols = [x for i,x in enumerate(data_columns) if i in numeric_cols_ix]
svd_res = svd_whiten(data.iloc[:, numeric_cols_ix])
data_new = pd.concat([data, pd.DataFrame(svd_res)], axis = 1)
data_new.columns = list(data_columns) + [i + '.whiten' for i in numeric_cols]
data_new = data_new.round(10)
data_new
"

    # Run the Python code with the `reticulate.engine.environment` option set
    # to the current environment for the duration of the call.
    # NOTE(review): presumably this is what lets Python's `r.iris_raw` resolve
    # to the `iris_raw` created in stage 1 -- confirm against reticulate docs.
    # The result of py_run_string() is captured in `value`.
    withr::with_options(
      list(reticulate.engine.environment = environment()),
      value <- py_run_string(python_code)
    )
    # IRIS is taken from the captured result rather than from the global `py`
    # object.
    IRIS <- value$data_new
  }
)  

# Register IRIS as the only dataset name, then hash it; the recorded hash
# matches the one shown for the original snippet earlier in the thread.
datanames(data) <- c("IRIS")
rlang::hash(data[["IRIS"]])
#> [1] "e9225d33975ec7ec986485068139f7af"

# The reproduced code for IRIS now includes the whole setup and Python step.
get_code(data, datanames = "IRIS") |> cat()
#> library(reticulate)
#> python_dependencies <- c("pip", "numpy", "pandas")
#> virtualenv_dir <- Sys.getenv("VIRTUALENV_NAME", "example_env_name")
#> python_path <- Sys.getenv("PYTHON_PATH")
#> if (python_path == "") {
#>     python_path <- NULL
#> }
#> reticulate::virtualenv_create(envname = virtualenv_dir, python = python_path)
#> reticulate::virtualenv_install(virtualenv_dir, packages = python_dependencies, ignore_installed = TRUE)
#> reticulate::use_virtualenv(virtualenv_dir, required = TRUE)
#> iris_raw <- cbind(id = seq_len(nrow(iris)), iris)
#> python_code <- "import pandas as pd\ndata = r.iris_raw\ndef svd_whiten(dat):\n  import numpy as np\n  X = np.matrix(dat)\n  U, s, Vt = np.linalg.svd(X, full_matrices=False)\n  X_white = np.dot(U, Vt)\n  return X_white\n\ndata_columns = data.columns\nglobal numeric_cols_ix\nglobal numeric_cols\nnumeric_cols_ix = list(range(5))[1:]\nnumeric_cols = [x for i,x in enumerate(data_columns) if i in numeric_cols_ix]\nsvd_res = svd_whiten(data.iloc[:, numeric_cols_ix])\ndata_new = pd.concat([data, pd.DataFrame(svd_res)], axis = 1)\ndata_new.columns = list(data_columns) + [i + '.whiten' for i in numeric_cols]\ndata_new = data_new.round(10)\ndata_new\n"
#> withr::with_options(list(reticulate.engine.environment = environment()), value <- py_run_string(python_code))
#> IRIS <- value$data_new
m7pr commented 6 months ago

Do you think it's a good idea to mix eval_code with within? My first thought is that it looks a bit better when defining the python_code string.

Yes, in this case this looks fine. You have my blessing!