GEDI-PA / vl_GEDI-PA_2024

GEDI global PA analysis v.2024
0 stars 0 forks source link

Troubleshooting extract_gedi in matching.R #1

Open wildintellect opened 1 month ago

wildintellect commented 1 month ago

@abarenblitt https://github.com/GEDI-PA/vl_GEDI-PA_2024/blob/main/matching_func_2024.r

  1. the Int64 issue, casting the shot_number as characters seems ok for now since they are unique IDs
  2. iso_matched_gedi_df is empty but still used in an rbind?
  3. There is no need to cast sf objects and recast as SpatialPointsDataFrame (this is probably really inefficient)
    # convert sf to SpatVect for Terra
    vsf <- st_read(f)
    v <- vect(vsf)

    https://github.com/rspatial/terra/issues/38#issuecomment-633094236

  4. There's an Extract that is then cbinded to the original table in the same function/loop where all the data is being read and combined. This is likely not memory efficient since the whole raster ends up in memory at the same time.
  5. The loop inside the function should probably be it's own function since listing the gpkg on s3 only need to be done once and doesn't change inside the function.

I didn't have the matched or mras so I couldn't fully test everything. Mostly tested the loading and joining of the tabular data.

Testing code


packages <- c("sp","sf","dplyr","terra","tidyr","magrittr","paws")

package.check <- lapply(packages, FUN = function(x) {
  suppressPackageStartupMessages(library(x, character.only = TRUE))
})

s3 <- paws::s3()

extract_gedi2b <- function(matched, mras){

  ###TODO: This section can probably be moved out of the function, the inputs don't change it
  results <- s3$list_objects_v2(Bucket = "maap-ops-workspace", 
                                Prefix=paste("shared/abarenblitt/GEDI_global_PA_v2/WDPA_gedi_L2A_tiles/",sep=""))
  all_gedil2_f <- sapply(results$Contents, function(x) {x$Key})
  pattern=paste(".gpkg",sep="")
  all_gedil2_f <- grep(pattern, all_gedil2_f, value=TRUE)
  all_gedil2_f <- basename(all_gedil2_f)[8]#[4:6] #Currently specifying working files

  results4 <- s3$list_objects_v2(Bucket = "maap-ops-workspace", 
                                 Prefix=paste("shared/abarenblitt/GEDI_global_PA_v2/WDPA_gedi_L4A_tiles/",sep=""))
  all_gedil4_f <- sapply(results4$Contents, function(x) {x$Key})
  pattern4=paste(".gpkg",sep="")
  all_gedil4_f <- grep(pattern4, all_gedil4_f, value=TRUE)
  all_gedil4_f <- basename(all_gedil4_f)[8]#[4:6] #Currently specifying working files

  results2b <- s3$list_objects_v2(Bucket = "maap-ops-workspace", 
                                  Prefix=paste("shared/abarenblitt/GEDI_global_PA_v2/WDPA_gedi_L2B_tiles/",sep=""))
  all_gedil2b_f <- sapply(results2b$Contents, function(x) {x$Key})
  pattern=paste(".gpkg",sep="")
  all_gedil2b_f <- grep(pattern, all_gedil2b_f, value=TRUE)
  all_gedil2b_f <- basename(all_gedil2b_f[8])#[4:6] #Currently specifying working files
  ###

  # Initialize an empty list to store results
  results_list <- list()
  ### TODO: Are you sure you need the next line?
  iso_matched_gedi_df <- NULL # Initialize before loop

  # Iterate over the sequence of indices for your files
  for (this_csvid in seq_along(all_gedil2_f)) {
    cat("Reading in no. ", this_csvid, "csv of ", length(all_gedil2_f), "csvs for iso3", iso3, "\n")

    ### Make this it's own function
    # Read GEDI L4A data
    gedil4_f_path <- paste(gedipath, "WDPA_gedi_L4A_tiles/", all_gedil4_f[this_csvid], sep = "")
    gedil4_f <- st_read(gedil4_f_path, int64_as_string = TRUE)

    # Read GEDI L2A data
    gedil2_f_path <- paste(gedipath, "WDPA_gedi_L2A_tiles/", all_gedil2_f[this_csvid], sep = "")
    gedil2_f <- st_read(gedil2_f_path, int64_as_string = TRUE)

    # Check if GEDI L4A data is empty
    if (nrow(gedil4_f) < 1) {
      cat("Error: No data for GEDI L4A\n")
      gedi_l24 <- gedil2_f
      gedi_l24$agbd <- NA
      gedi_l24$agbd_se <- NA
      gedi_l24$agbd_t <- NA
      gedi_l24$agbd_t_se <- NA
    } else {
      # Select relevant columns from GEDI L4A
      ### Drop the geometry, it's redundant
      gedi_l4_sub <- gedil4_f %>% st_drop_geometry() %>%
        dplyr::select(shot_number, agbd, agbd_se, agbd_t, agbd_t_se)
      ### Return here, do the join in the outer function
      # Join with GEDI L2A data
      gedi_l24 <- inner_join(gedil2_f, gedi_l4_sub, by = "shot_number")
    }

    print(dim(gedi_l24))

    ###TODO: Make this it's own function
    # Read GEDI L2B data
    gedil2b_f_path <- paste(gedipath, "WDPA_gedi_L2B_tiles/", all_gedil2b_f[this_csvid], sep = "")
    gedil2b_f <- st_read(gedil2b_f_path, int64_as_string = TRUE)
    names(gedil2b_f)[names(gedil2b_f) == "geolocation.lon_lowestmode"] <- "lon_lowestmode"
    names(gedil2b_f)[names(gedil2b_f) == "geolocation.lat_lowestmode"] <- "lat_lowestmode"
    names(gedil2b_f)[names(gedil2b_f) == "land_cover_data.landsat_treecover"] <- "landsat_treecover"

    # Check if GEDI L2B data is empty
    if (nrow(gedil2b_f) < 1) {
      cat("Error: No data for GEDI L4A\n")
      gedi_l24b <- gedi_l24
      gedi_l24b$agbd <- NA
      gedi_l24b$agbd_se <- NA
      gedi_l24b$agbd_t <- NA
      gedi_l24b$agbd_t_se <- NA
    } else {
      # Select relevant columns from GEDI L4A
      ### Drop the geometry, it's redundant
      gedi_l2b_sub <- gedil2b_f %>% st_drop_geometry() %>%
        dplyr::select(shot_number, landsat_treecover, pai, fhd_normal)
      ### Return here, do the join in the outer function
      # Join with GEDI L2A data
      gedi_l24b <- inner_join(gedi_l24, gedi_l2b_sub, by = "shot_number")
    }
    print(dim(gedi_l24b))

    # Initialize empty spatial object for the current iteration
    gedi_l24b_sp <- NULL

    # Convert to spatial points data frame if there is data
    if (nrow(gedi_l24b) > 0) {
      ### You don't want to use SPDF you want a SpatVect
      ### See https://github.com/rspatial/terra/issues/38#issuecomment-633094236
      ### since it's already an sf object above now, the conversion is much simpler
      ### As a SpatVect you could keep the shot_number but the cbind is ok
      ### There is an option to bind inside the extract function too
      gedi_l24b_sp <- SpatialPointsDataFrame(
        coords = gedi_l24b[, c("lon_lowestmode", "lat_lowestmode")],
        data = gedi_l24b,
        proj4string = CRS("+init=epsg:4326")
      ) %>% spTransform(CRS("+init=epsg:6933"))

      ### TODO: Move this to it's own function
      matched_gedi <- terra::extract(mras,vect(gedi_l24b_sp), df=TRUE)
      matched_gedi_metrics <- cbind(matched_gedi,gedi_l24b_sp@data)
      matched_gedi_metrics_filtered <- matched_gedi_metrics %>% dplyr::filter(!is.na(status)) %>% 
        convertFactor(matched0 = matched,exgedi = .) 

      iso_matched_gedi_df <- rbind(matched_gedi_metrics_filtered,iso_matched_gedi_df)
      print(dim(iso_matched_gedi_df))
    }

    # Store results in a list
    results_list[[this_csvid]] <- iso_matched_gedi_df
  }

  # Combine all results
  if (!is.null(iso_matched_gedi_df)) {
    iso_matched_gedi_df <- do.call(rbind, results_list)
  }

  cat("Done GEDI processing\n")
  return(iso_matched_gedi_df)
}
wildintellect commented 1 month ago

@abarenblitt I remembered one more thing.

  1. Move as many package installs to conda, when testing I added:
    - r-lme4
    - r-arm

    and cut the install.R just to optmatch (though now I see optmatch in the conda :thinking: )

abarenblitt commented 1 month ago

I've gone through and separated out some of the loop to run in pieces as suggested. It seems the last part (the extract and cbind) is where the memory is getting clogged up:

results <- s3$list_objects_v2(Bucket = "maap-ops-workspace", Prefix=paste("shared/abarenblitt/GEDI_global_PA_v2/WDPA_gedi_L2A_tiles/",sep="")) all_gedil2_f <- sapply(results$Contents, function(x) {x$Key}) pattern=paste(".gpkg",sep="") all_gedil2_f <- grep(pattern, all_gedil2_f, value=TRUE) all_gedil2_f <- basename(all_gedil2_f)[8]#[4:6] #Currently specifying working files

results4 <- s3$list_objects_v2(Bucket = "maap-ops-workspace", Prefix=paste("shared/abarenblitt/GEDI_global_PA_v2/WDPA_gedi_L4A_tiles/",sep="")) all_gedil4_f <- sapply(results4$Contents, function(x) {x$Key}) pattern4=paste(".gpkg",sep="") all_gedil4_f <- grep(pattern4, all_gedil4_f, value=TRUE) all_gedil4_f <- basename(all_gedil4_f)[8]#[4:6] #Currently specifying working files results2b <- s3$list_objects_v2(Bucket = "maap-ops-workspace", Prefix=paste("shared/abarenblitt/GEDI_global_PA_v2/WDPA_gedi_L2B_tiles/",sep="")) all_gedil2b_f <- sapply(results2b$Contents, function(x) {x$Key}) pattern=paste(".gpkg",sep="") all_gedil2b_f <- grep(pattern, all_gedil2b_f, value=TRUE) all_gedil2b_f <- basename(all_gedil2b_f[8])#[4:6] #Currently specifying working files

extract_gedi2b <- function(matched){

# Initialize an empty list to store results
iso_matched_gedi_df <- NULL # Initialize before loop

        # Iterate over the sequence of indices for your files
for (this_csvid in seq_along(all_gedil2_f)) {
            cat("Reading in no. ", this_csvid, "csv of ", length(all_gedil2_f), "csvs for iso3", iso3, "\n")

            # Read GEDI L4A data
            gedil4_f_path <- paste(gedipath, "WDPA_gedi_L4A_tiles/", all_gedil4_f[this_csvid], sep = "")
            gedil4_f <- as.data.frame(st_read(gedil4_f_path))

            # Read GEDI L2A data
            gedil2_f_path <- paste(gedipath, "WDPA_gedi_L2A_tiles/", all_gedil2_f[this_csvid], sep = "")
            gedil2_f <- as.data.frame(st_read(gedil2_f_path))

            # Read GEDI L2B data
            gedil2b_f_path <- paste(gedipath, "WDPA_gedi_L2B_tiles/", all_gedil2b_f[this_csvid], sep = "")
            gedil2b_f <- as.data.frame(st_read(gedil2b_f_path))
            names(gedil2b_f)[names(gedil2b_f) == "geolocation.lon_lowestmode"] <- "lon_lowestmode"
            names(gedil2b_f)[names(gedil2b_f) == "geolocation.lat_lowestmode"] <- "lat_lowestmode"
            names(gedil2b_f)[names(gedil2b_f) == "land_cover_data.landsat_treecover"] <- "landsat_treecover"

            # Check if GEDI L4A data is empty
            if (nrow(gedil4_f) < 1) {
                cat("Error: No data for GEDI L4A\n")
                gedi_l24 <- gedil2_f
                gedi_l24$agbd <- NA
                gedi_l24$agbd_se <- NA
                gedi_l24$agbd_t <- NA
                gedi_l24$agbd_t_se <- NA
            } else {
                # Select relevant columns from GEDI L4A
                gedi_l4_sub <- gedil4_f %>%st_drop_geometry()%>% 
                    dplyr::select(shot_number, agbd, agbd_se, agbd_t, agbd_t_se)

                # Join with GEDI L2A data
                gedi_l24 <- inner_join(gedil2_f, gedi_l4_sub, by = "shot_number")
            }

            # print(dim(gedi_l24))

            # Check if GEDI L2B data is empty
            if (nrow(gedil2b_f) < 1) {
                cat("Error: No data for GEDI L4A\n")
                gedi_l24b <- gedi_l24
                gedi_l24b$landsat_treecover<- NA
                gedi_l24b$pai <- NA
                gedi_l24b$fhd_normal <- NA

            } else {
                # Select relevant columns from GEDI L4A
                gedi_l2b_sub <- gedil2b_f %>%st_drop_geometry()%>% 
                    dplyr::select(shot_number, landsat_treecover, pai, fhd_normal)

                # Join with GEDI L2A data
                gedi_l24b <- inner_join(gedi_l24, gedi_l2b_sub, by = "shot_number")
            }
            # print(class(gedi_l24b))
    # Save results to RDS and CSV files
    saveRDS(gedi_l24b, file = paste(f.path3, "GNB_extractStep1/", iso3, "_pa_", id_pa, 
                                       "_gedi_wk_", gediwk, ".RDS", sep = ""))

    cat(id_pa, "in", iso3, "results are written to directory\n")

}}

iso_test<-extract_gedi2b(matched = matched) extracted<-list.files(paste(f.path3,"GNB_extractStep1/",sep=""), pattern=".RDS", full.names = TRUE)

/// THIS PART USES UP TOO MUCH MEMORY****///

extract_gediPart2 <- function(matched,mras){

results_list <- list()

# Initialize empty spatial object for the current iteration
for (this_csvid in seq_along(extracted)) {
gedi_l24b <- readRDS(extracted[this_csvid])
gedi_l24b_sp <- NULL
if (nrow(gedi_l24b) > 0) {
    gedi_l24b_sp <- SpatialPointsDataFrame(
        coords = gedi_l24b[, c("lon_lowestmode", "lat_lowestmode")],
        data = gedi_l24b,
        proj4string = CRS("+init=epsg:4326")
    ) %>% spTransform(CRS("+init=epsg:6933"))

  matched_gedi <- terra::extract(mras,vect(gedi_l24b_sp), df=TRUE)
  matched_gedi_metrics <- cbind(matched_gedi,gedi_l24b_sp@data)
  matched_gedi_metrics_filtered <- matched_gedi_metrics %>% dplyr::filter(!is.na(status)) %>% 
  convertFactor(matched0 = matched,exgedi = .) 

  iso_matched_gedi_df <- rbind(matched_gedi_metrics_filtered,iso_matched_gedi_df)
  print(dim(iso_matched_gedi_df))
}

# Store results in a list
# results_list[[this_csvid]] <- iso_matched_gedi_df

}

Combine all results

if (!is.null(iso_matched_gedi_df)) { iso_matched_gedi_df <- do.call(rbind, results_list) }

cat("Done GEDI processing\n") return(iso_matched_gedi_df) }

mras <- tryCatch({ matched2ras(matched) }, error = function(e) { cat("Error converting matched data to raster stack for PA", id_pa, ":", e$message, "\n") return(NULL) })

iso_test2<-extract_gediPart2(matched = matched,mras = mras)

abarenblitt commented 1 month ago

These are links to the necessary variables for testing:

extract: https://ade.maap-project.org/serverajb2pmem-jwtproxy/server-4401/lab/tree/output/WDPA_matching_results/GNB_extractStep1/GNB_pa_342659_gedi_wk_24.RDS

matched: https://maap-ops-workspace.s3.amazonaws.com/shared/abarenblitt/GEDI_global_PA_v2/GNB_wk24Work/GNB_pa_342659_matching_results_wk24.RDS?AWSAccessKeyId=ASIA43WBHR7NWUUNWH6J&Signature=2txLOewF9uzk231rQ5p0suPdGGY%3D&x-amz-security-token=IQoJb3JpZ2luX2VjECQaCXVzLXdlc3QtMiJHMEUCIFwTzw61j8xRaeVHxz%2BPjGik9XqBRwIC%2BvcjIY7Re%2FuQAiEAsHZmBWatxdfRSO5%2FqfcHvZxnDz1zNKtBuF4gYZMPIUcquAUIfRADGgw4ODQwOTQ3NjcwNjciDCo%2F3Zu1cEhyqNbo9SqVBQ8egk7uiyQP%2BcO97M5mSuJnbdSs7WIHcesVmGCsS%2BIl2N8xhjPFgm2VnIBq367NJygsXD8RyVLnAARYfqPdDhpqCeW0qBnIQ2AFYU4%2Ba89GH3iKYAWVAEXsnphdcizTO%2BLL6fNZQfKB8M1D%2FAClVWzc3cwtK8FT%2B%2FD%2FwCcTskmb7zfSHYHmoz2GagMkTVUyLQwINwVLGxav9QbgbRTyKqBC%2BrpG%2BNF3c1zKQR91fHeC4z2vOb8CZDYLWpiX3pAfEbXNtKZBKJ8%2BF%2FmVAXyQxS0TpBZDXQTgT2FyTEPyyw%2FhDhbR4A7ydDdJet8n7d6NvvRRzBAoEBo206UMbYTOgiBXws2LjalWB0pGqE2%2F60Pc2hXaX4zU9EBRU%2FGvWWTnRg5BFEv9mUuRboXikpCjwUuCYKVmxbMjUZVtGgalqip4g8O6Sp%2BI%2BAPjMu5iWutojOubY%2B7bevwmJv1OB9BdKw2lKuxqEs%2BMmVH0nQaQZ0uO11UoOlvRER4%2FskWh2xBqA87jUeT84G67IwPuKm8xIMwyriB6LA7rnh6pDTH7769xhf2RhLiMd81QnrVPsl5nw2Qqvf5uEe1Rg0DzU7%2F%2FXLkQ40QswxQseXYsfETqdnZKFLPn32eb9WOGROCzIcFc7%2B9%2FFhjrnRK2mDwU9KPGi1XH1M8YiqitZwLlk1vUle3InGbI4Z9r7F9ZVX3KgwiP2Dtmr013zL7Snra0bRp0LYkgYSrkumSlIkUCHHK2c7MIRw1DQ5WJgpdVAArRIdd0Zrwr6Bl5pFHdiKaFodxRP5D01QsIVn%2F5K6b58oRgwx5yQm6TkhHhXF4WDUrTLhy%2BVNCcAI0VO%2BlK3cpe%2FbjIls%2F2wIjLGhXBj1XWGu50aUosY4ddeWow7eSguAY6sQEqKecMDlJqYJpSrYlVOTAOXe8bJHsyPxoBLz%2BLWyCuZvM0Id2%2BLWWb3vKWuJHSztRoeOgTQLrpaasJgrKcQSJn9Y86zUcrSF3TQSXMkCmsTyddBwfcZKk70cpoOEbLJmduEB6l62b4tait417JKoDU05muiA0R6jOaSgQNZ5SJXbJHBQOxB52zJ2qA0m%2BV4LR%2BeNZdAn3h%2FgFba9kcUnTPxboPYPOfU8ch16yxsJNeR2A%3D&Expires=1728680479

mras: https://ade.maap-project.org/serverajb2pmem-jwtproxy/server-4401/lab/tree/output/WDPA_matching_results/GNBmras_pa_342659_gedi_wk_24.tif

wildintellect commented 1 month ago

@abarenblitt can you put all those files in your shared-bucket, extract and mras.

abarenblitt commented 1 month ago

Yes here!

https://ade.maap-project.org/serverajb2pmem-jwtproxy/server-4401/lab/tree/my-public-bucket/GEDI_global_PA_v2/GNB_prepped_control_wk24.RDS

https://ade.maap-project.org/serverajb2pmem-jwtproxy/server-4401/lab/tree/my-public-bucket/GEDI_global_PA_v2/GNBmras_pa_342659_gedi_wk_24.tif

wildintellect commented 1 month ago

I still can't test it properly because I don't have a gpkg L2A that corresponds to an mras but the following code seems pretty memory efficient and fast:

library(terra)
gedi_l2a <- "s3://maap-ops-workspace/shared/abarenblitt/GEDI_global_PA_v2/WDPA_gedi_L2A_tiles/tile_num_29780_L2A.gpkg"
points <- terra::vect(gsub("s3://","/vsis3/",gedi_l2a))

mras_path <- "s3://maap-ops-workspace/shared/abarenblitt/GEDI_global_PA_v2/GNBmras_pa_342659_gedi_wk_24.tif"
mras <- terra::rast(gsub("s3://","/vsis3/",mras_path))

#reproject to match the raster
points <- project(points, mras)
matched_gedi <- terra::extract(mras, points, id=FALSE, df=TRUE)

I would do this inside a function and before matching the tables.