UK-Biobank / UKB-RAP-Notebooks-Access

UKB RAP Notebooks contains a collection of examples showing how to use the UK Biobank Research Analysis Platform (RAP).

Errors when running "JupyterNotebook_R/A108_Constructing-the-Olink-dataset_R.ipynb" #14

Closed by xiangzhu 4 weeks ago

xiangzhu commented 1 month ago

Dear UK Biobank team,

First of all, thank you so much for creating and sharing all these Jupyter notebooks with the public. They are incredibly useful for learning about UKB RAP.

I was able to run most of these notebooks in UKB RAP without any issues. However, I did encounter two types of errors when running "JupyterNotebook_R/A108_Constructing-the-Olink-dataset_R.ipynb".

I requested the same instance type (mem1_hdd1_v2_x8) in RAP as the one specified in your notebook.

Error about "protection stack overflow" for "Plate ID" and "Limit of detection"

# Plate ID
field_30901_df <- fread("field_30901.csv") %>%
    select(-participant.p30901_i1) %>%
    pivot_longer(cols = -c(participant.eid), names_to = "instance", values_to = "PlateID") %>%
    filter(!is.na(PlateID)) %>%
    mutate(instance = str_remove(instance, "participant.p30901_i"))
field_30901_sdf <- sparklyr::copy_to(sc, field_30901_df, overwrite = TRUE)
Error: protect(): protection stack overflow
Traceback:

1. sparklyr::copy_to(sc, field_30901_df, overwrite = TRUE)
2. copy_to.spark_connection(sc, field_30901_df, overwrite = TRUE)
3. sdf_copy_to(dest, df, name, memory, repartition, overwrite, ...)
4. sdf_copy_to.default(dest, df, name, memory, repartition, overwrite, 
 .     ...)
5. sdf_import(x, sc, name, memory, repartition, overwrite, struct_columns, 
 .     ...)
6. sdf_import.default(x, sc, name, memory, repartition, overwrite, 
 .     struct_columns, ...)
7. spark_data_copy(sc, x, name = name, repartition = repartition, 
 .     serializer = serializer, struct_columns = struct_columns)
8. spark_data_perform_copy(sc, serializers[[serializer]], df, repartition, 
 .     raw_columns)
9. serializer(sc, df, columns, repartition)
10. lapply(df, function(x) {
  .     as.list(if (inherits(x, "Date")) {
  .         as.integer(x)
  .     }
  .     else if (inherits(x, "POSIXt")) {
  .         as.numeric(x)
  .     }
  .     else if (inherits(x, "factor")) {
  .         as.character(x)
  .     }
  .     else {
  .         x
  .     })
  . })
11. FUN(X[[i]], ...)
12. as.list(if (inherits(x, "Date")) {
  .     as.integer(x)
  . } else if (inherits(x, "POSIXt")) {
  .     as.numeric(x)
  . } else if (inherits(x, "factor")) {
  .     as.character(x)
  . } else {
  .     x
  . })

Interestingly, this error only occurs for Data-Field 30901, not for Data-Fields 30900 or 30902.
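
In case it helps with diagnosis, the column classes of the offending table could be inspected; fread() sometimes assigns extended classes (for example bit64::integer64 for large integer IDs) that differ between fields, which might explain why only 30901 fails. A minimal check, reusing the object names above:

# Hypothetical diagnostic (not part of the notebook): inspect column classes
sapply(field_30901_df, class)
# For comparison, the classes as read straight from the exported file
sapply(fread("field_30901.csv"), class)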

The same error also occurs for the showcase resource "Limit of detection", as shown below.

# Limit of detection
system(" wget  -nd  biobank.ndph.ox.ac.uk/ukb/ukb/auxdata/olink_limit_of_detection.dat")
olink_limit_of_detection <- fread("olink_limit_of_detection.dat") %>% mutate(Assay = tolower(Assay))
olink_limit_of_detection_sdf <- sparklyr::copy_to(sc, olink_limit_of_detection, overwrite = TRUE)
Error: protect(): protection stack overflow
Traceback:

1. sparklyr::copy_to(sc, olink_limit_of_detection, overwrite = TRUE)
2. copy_to.spark_connection(sc, olink_limit_of_detection, overwrite = TRUE)
3. sdf_copy_to(dest, df, name, memory, repartition, overwrite, ...)
4. sdf_copy_to.default(dest, df, name, memory, repartition, overwrite, 
 .     ...)
5. sdf_import(x, sc, name, memory, repartition, overwrite, struct_columns, 
 .     ...)
6. sdf_import.default(x, sc, name, memory, repartition, overwrite, 
 .     struct_columns, ...)
7. spark_data_copy(sc, x, name = name, repartition = repartition, 
 .     serializer = serializer, struct_columns = struct_columns)
8. spark_data_perform_copy(sc, serializers[[serializer]], df, repartition, 
 .     raw_columns)
9. serializer(sc, df, columns, repartition)
10. lapply(df, function(x) {
  .     as.list(if (inherits(x, "Date")) {
  .         as.integer(x)
  .     }
  .     else if (inherits(x, "POSIXt")) {
  .         as.numeric(x)
  .     }
  .     else if (inherits(x, "factor")) {
  .         as.character(x)
  .     }
  .     else {
  .         x
  .     })
  . })
11. FUN(X[[i]], ...)
12. as.list(if (inherits(x, "Date")) {
  .     as.integer(x)
  . } else if (inherits(x, "POSIXt")) {
  .     as.numeric(x)
  . } else if (inherits(x, "factor")) {
  .     as.character(x)
  . } else {
  .     x
  . })
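
For what it is worth, one workaround I have not tried on RAP would be to bypass copy_to()'s in-memory serializer, where the overflow is raised, by writing the table to a CSV file and loading it with sparklyr::spark_read_csv(). The file name below is a placeholder, and whether a local file:// path is visible to the Spark executors may depend on the cluster configuration:

# Untested sketch: avoid the R-side serializer used by copy_to() by going through a CSV file
csv_path <- file.path(getwd(), "olink_limit_of_detection_tmp.csv")  # placeholder file name
data.table::fwrite(olink_limit_of_detection, csv_path)
olink_limit_of_detection_sdf <- sparklyr::spark_read_csv(
    sc,
    name = "olink_limit_of_detection",
    path = paste0("file://", csv_path)
)

Alternatively, R's pointer protection stack can be enlarged by starting R with the --max-ppsize option, although that is harder to arrange inside a managed Jupyter session.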

Java error about "Batch number" and "Processing start date"

For two other showcase resources, I am getting the following Java errors.

# Batch number
system(" wget  -nd  biobank.ndph.ox.ac.uk/ukb/ukb/auxdata/olink_batch_number.dat")
olink_batch_number <- fread("olink_batch_number.dat")
olink_batch_number_sdf <- sparklyr::copy_to(sc, olink_batch_number, overwrite = TRUE)
Error:
! java.lang.IllegalArgumentException: Invalid type 2

Run `sparklyr::spark_last_error()` to see the full Spark error (multiple lines)
To use the previous style of error message set
`options("sparklyr.simple.errors" = TRUE)`
Traceback:

1. sparklyr::copy_to(sc, olink_batch_number, overwrite = TRUE)
2. copy_to.spark_connection(sc, olink_batch_number, overwrite = TRUE)
3. sdf_copy_to(dest, df, name, memory, repartition, overwrite, ...)
4. sdf_copy_to.default(dest, df, name, memory, repartition, overwrite, 
 .     ...)
5. sdf_import(x, sc, name, memory, repartition, overwrite, struct_columns, 
 .     ...)
6. sdf_import.default(x, sc, name, memory, repartition, overwrite, 
 .     struct_columns, ...)
7. spark_data_copy(sc, x, name = name, repartition = repartition, 
 .     serializer = serializer, struct_columns = struct_columns)
8. spark_data_perform_copy(sc, serializers[[serializer]], df, repartition, 
 .     raw_columns)
9. serializer(sc, df, columns, repartition)
10. invoke_static(sc, "sparklyr.Utils", "parallelize", spark_context(sc), 
  .     num_rows, cols %>% unname() %>% lapply(function(x) {
  .         serialize(x, connection = NULL, version = 2L, xdr = TRUE)
  .     }), as.list(timestamp_col_idxes), as.list(string_col_idxes), 
  .     if (repartition > 0) as.integer(repartition) else 1L)
11. invoke_static.spark_shell_connection(sc, "sparklyr.Utils", "parallelize", 
  .     spark_context(sc), num_rows, cols %>% unname() %>% lapply(function(x) {
  .         serialize(x, connection = NULL, version = 2L, xdr = TRUE)
  .     }), as.list(timestamp_col_idxes), as.list(string_col_idxes), 
  .     if (repartition > 0) as.integer(repartition) else 1L)
12. invoke_method(sc, TRUE, class, method, ...)
13. invoke_method.spark_shell_connection(sc, TRUE, class, method, 
  .     ...)
14. core_invoke_method(sc, static, object, method, FALSE, ...)
15. core_invoke_method_impl(sc, static, noreply = FALSE, object, 
  .     method, return_jobj_ref, ...)
16. spark_error(msg)
17. rlang::abort(message = msg, use_cli_format = TRUE, call = NULL)
18. signal_abort(cnd, .file)

# Processing start date
system(" wget  -nd  biobank.ndph.ox.ac.uk/ukb/ukb/auxdata/olink_processing_start_date.dat")
olink_processing_start_date <- fread("olink_processing_start_date.dat")
olink_processing_start_date_sdf <- sparklyr::copy_to(sc, olink_processing_start_date, overwrite = TRUE)
Error:
! java.lang.IllegalArgumentException: Invalid type 2

Run `sparklyr::spark_last_error()` to see the full Spark error (multiple lines)
To use the previous style of error message set
`options("sparklyr.simple.errors" = TRUE)`
Traceback:

1. sparklyr::copy_to(sc, olink_processing_start_date, overwrite = TRUE)
2. copy_to.spark_connection(sc, olink_processing_start_date, overwrite = TRUE)
3. sdf_copy_to(dest, df, name, memory, repartition, overwrite, ...)
4. sdf_copy_to.default(dest, df, name, memory, repartition, overwrite, 
 .     ...)
5. sdf_import(x, sc, name, memory, repartition, overwrite, struct_columns, 
 .     ...)
6. sdf_import.default(x, sc, name, memory, repartition, overwrite, 
 .     struct_columns, ...)
7. spark_data_copy(sc, x, name = name, repartition = repartition, 
 .     serializer = serializer, struct_columns = struct_columns)
8. spark_data_perform_copy(sc, serializers[[serializer]], df, repartition, 
 .     raw_columns)
9. serializer(sc, df, columns, repartition)
10. invoke_static(sc, "sparklyr.Utils", "parallelize", spark_context(sc), 
  .     num_rows, cols %>% unname() %>% lapply(function(x) {
  .         serialize(x, connection = NULL, version = 2L, xdr = TRUE)
  .     }), as.list(timestamp_col_idxes), as.list(string_col_idxes), 
  .     if (repartition > 0) as.integer(repartition) else 1L)
11. invoke_static.spark_shell_connection(sc, "sparklyr.Utils", "parallelize", 
  .     spark_context(sc), num_rows, cols %>% unname() %>% lapply(function(x) {
  .         serialize(x, connection = NULL, version = 2L, xdr = TRUE)
  .     }), as.list(timestamp_col_idxes), as.list(string_col_idxes), 
  .     if (repartition > 0) as.integer(repartition) else 1L)
12. invoke_method(sc, TRUE, class, method, ...)
13. invoke_method.spark_shell_connection(sc, TRUE, class, method, 
  .     ...)
14. core_invoke_method(sc, static, object, method, FALSE, ...)
15. core_invoke_method_impl(sc, static, noreply = FALSE, object, 
  .     method, return_jobj_ref, ...)
16. spark_error(msg)
17. rlang::abort(message = msg, use_cli_format = TRUE, call = NULL)
18. signal_abort(cnd, .file)
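
As an untested guess on my side, this looks like the kind of error sparklyr raises when a column carries a class its serializer does not recognise, and fread() can produce such classes (for example bit64::integer64 for large integers or IDate for dates). Coercing those columns to base types before copy_to() might avoid it; the coercions below are illustrative only:

# Untested sketch: coerce fread()'s extended column classes to base R types before copying
olink_processing_start_date <- fread("olink_processing_start_date.dat",
                                     integer64 = "numeric") %>%         # read 64-bit integers as doubles
    mutate(across(where(function(x) inherits(x, "IDate")), as.Date))    # plain Date instead of IDate
olink_processing_start_date_sdf <- sparklyr::copy_to(sc, olink_processing_start_date, overwrite = TRUE)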

Could you please provide some guidance on how to resolve these errors?

Thanks a lot!

mel-lew commented 4 weeks ago

Thank you for bringing this to our attention. We have just merged a fix, #15, that addresses this problem.

Best wishes,
Melissa