r-spatial / stars

Spatiotemporal Arrays, Raster and Vector Data Cubes
https://r-spatial.github.io/stars/
Apache License 2.0

Looking for efficient (faster) code to execute a process using a stars object #319

Closed alexyshr closed 2 years ago

alexyshr commented 4 years ago

I have solved my task using an sf object, but it takes 18 minutes and the number of rows (cells in the stars object) is only 3381. Do you have any ideas for executing the process faster, working directly with the stars object?

Available data: For each cell in the NetCDF file there are related non-hurricane (nh) wind velocities (stored in an Excel file) and hurricane (h) wind velocities (stored in TIFF files) for different return periods in years (10, 20, 50, 100, 250, 500, 700, 1000, 1700, 3000, 7000). In the following graphic (for a specific cell), the attribute nh_50 (50 years) reports 30 km/h, whose annual exceedance probability is 0.02 (1/50), so the tuple (x = 0.02, y = 30) is one point on the nh hazard curve. With the available attributes nh_10, nh_20, ..., nh_7000 and h_10, h_20, ..., h_7000, it is possible to construct for each cell the whole nh hazard curve and the whole h hazard curve, respectively.
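
As a minimal sketch of how those hazard-curve points are formed (the velocities below are made up, except for the (0.02, 30) pair described above):

mri <- c(10, 20, 50, 100, 250, 500, 700, 1000, 1700, 3000, 7000) # return periods (years)
pa <- 1 / mri # annual exceedance probabilities: 0.1, 0.05, 0.02, ...
nh_rl <- c(25, 28, 30, 33, 36, 39, 40, 42, 44, 46, 49) # hypothetical nh velocities (km/h)
# (pa[3], nh_rl[3]) = (0.02, 30) is the nh_50 point of the nh hazard curve
plot(pa, nh_rl, type = "b", xlab = "annual exceedance probability", ylab = "wind velocity (km/h)")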

Description of the task: Create the combined (c) hazard curve (integration of both types of hazards) and get from it the integrated wind velocities as new attributes c_10, c_20, ..., c_7000. The values (probabilities) on the x axis of the c hazard curve are calculated from the probabilities of h and nh using the equation below. If there is no hurricane information, the combined wind velocity is equal to the non-hurricane one.

$$P_c = 1 - (1 - P_{nh})(1 - P_h)$$

where $P_c$ is the probability for the combined wind hazards, $P_{nh}$ is the probability for non-hurricane winds, and $P_h$ is the probability for hurricane winds (this is ASCE 7-16 equation C26.5-2, used in the code below).
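
For example, if both the hurricane and non-hurricane velocities for a cell are exceeded with probability 1/50, the combined event is roughly a 25-year one:

ph <- 1 / 50 # hurricane 50-year exceedance probability
pnh <- 1 / 50 # non-hurricane 50-year exceedance probability
pc <- 1 - (1 - ph) * (1 - pnh) # combined probability: 0.0396
1 / pc # combined return period: about 25.3 years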

Summary of the procedure: First, for each cell, get the hurricane and non-hurricane information for the specific return periods in years; then construct detailed hazard curves for hurricanes and non-hurricanes (since some pairs of points on each curve are available, it is possible to interpolate between them to refine the curve); next, integrate the hazard curves into a detailed combined hazard curve representing both types of events simultaneously; finally, interpolate/calculate the integrated wind velocities as attributes using the combined hazard curve.
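
A minimal sketch of this per-cell computation, using base approx() with rule = 2 (clamped tails) in place of the Hmisc::approxExtrap used in the actual code below:

mri <- c(10, 20, 50, 100, 250, 500, 700, 1000, 1700, 3000, 7000)
pa <- 1 / mri
combine_cell <- function(rl_h, rl_nh, rl_grid = 10:600) {
  if (any(is.na(rl_h))) return(rl_nh) # no hurricane data: keep the nh velocities
  p_h <- approx(rl_h, pa, xout = rl_grid, rule = 2)$y   # detailed h hazard curve
  p_nh <- approx(rl_nh, pa, xout = rl_grid, rule = 2)$y # detailed nh hazard curve
  p_c <- 1 - (1 - p_h) * (1 - p_nh)                     # combined probabilities
  approx(p_c, rl_grid, xout = pa, rule = 2)$y           # back to velocities at the typical pa
}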

# get data
myurl <- "http://geocorp.co/wind/hazard_curves.zip"
filename <- "hazard_curves.zip"
download.file(myurl, filename, mode = "wb")
unzip(filename)
library(stars)
#> Warning: package 'stars' was built under R version 3.6.3
#> Loading required package: abind
#> Loading required package: sf
#> Warning: package 'sf' was built under R version 3.6.3
#> Linking to GEOS 3.8.0, GDAL 3.0.4, PROJ 6.3.1
library(sf)
library(xlsx)
#> Warning: package 'xlsx' was built under R version 3.6.3
library(stringr)
path <- "./"
ncname <- "10fg_jan1_2017"
ncfile <- paste0(path, ncname, ".nc")

# Create stars object from netcdf file
# isdcolraster.st1 = read_stars(ncfile) # This changes attribute names
st1 <- read_ncdf(ncfile)
#> no 'var' specified, using fg10
#> other available variables:
#>  longitude, latitude, time
#> No projection information found in nc file. 
#>  Coordinate variable units found to be degrees, 
#>  assuming WGS84 Lat/Lon.

# Add the id to stars cells: top left to right, then down in zigzag
lon <- read_ncdf(ncfile, var = c("longitude"))
#> No projection information found in nc file. 
#>  Coordinate variable units found to be degrees, 
#>  assuming WGS84 Lat/Lon.
nlon <- dim(lon)
lat <- read_ncdf(ncfile, var = c("latitude"))
#> Warning in .get_nc_projection(meta$attribute, rep_var, all_coord_var): No
#> projection information found in nc file.
nlat <- dim(lat)
st1$cellindexlr <- 1:(nlon * nlat)

# Create sf object from stars
sf1 <- st_as_sf(st1)
names(sf1) <- c("fg10_2017-01-01", "cellindexlr", "geometry")

# NON HURRICANE info
# Read non hurricane return levels into nhrl, and add to lonlat.unstacklr
nhrl <- read.xlsx(file = "rlera5_original.xlsx", sheetName = "pp_pintar")
nhrl$cellindexlr <- as.integer(nhrl$cellindexlr)

# Join both st1 and nhrl in a sf object
sf_nh <- st_sf(sf1[, 1:2, drop = TRUE], nhrl[, 2:12], geometry = sf1$geometry)

# View return level nh_700
plot(sf_nh[, "nh_700"], key.pos = 1, reset = FALSE)


# stars version of non-hurricanes info
st_nh <- st_as_stars(sf_nh)

# HURRICANE INFO
# Read hurricane rasters (with grd there is a problem in the return level 500 file COL_CV-WIND_TR500_INT1.grd)
rasterfiles <- list.files(path = "./", pattern = "\\.tif$")
rasterfiles <- paste0("./", rasterfiles)
rasterfiles <- str_sort(rasterfiles, numeric = TRUE)
st_h <- read_stars(rasterfiles, quiet = TRUE)
mynames <- names(st_h)
start <- str_locate(mynames, "_")
stop <- str_locate(mynames, "\\.tif")
mynames <- str_sub(mynames, start = start[, 1] + 1, end = stop[, 1] - 1)
st_h <- setNames(st_h, mynames)
# sf version of hurricanes info
sf_h <- st_as_sf(st_h, as_points = FALSE, merge = FALSE)

# View return level h_700
plot(sf_h[, "h_700"], key.pos = 1, reset = FALSE)


# JOIN NON-HURRICANES AND HURRICANES INFO in DATA.FRAME
df_nh_h <- cbind(sf_nh[, 2:13, drop = TRUE], sf_h[, 2:12, drop = TRUE])

# ADD EMPTY COLUMNS FOR COMBINED(h integrated with nh) WIND INFO
df_nh_h$c_10 <- NA
df_nh_h$c_20 <- NA
df_nh_h$c_50 <- NA
df_nh_h$c_100 <- NA
df_nh_h$c_250 <- NA
df_nh_h$c_500 <- NA
df_nh_h$c_700 <- NA
df_nh_h$c_1000 <- NA
df_nh_h$c_1700 <- NA
df_nh_h$c_3000 <- NA
df_nh_h$c_7000 <- NA

# sf for non hurricanes (nh), hurricanes (h) and empty combined (c) info
sf_nh_h_c <- st_sf(df_nh_h, geometry = sf_nh$geometry)

# Function to combine hurricane (h_) and non-hurricane (nh_)
combined <- function(id,
                     sf = sf_nh_h_c,
                     nhcolumns = c(2:12),
                     hcolumns = c(13:23),
                     typicalmri = c(10, 20, 50, 100, 250, 500, 700, 1000, 1700, 3000, 7000)) {
  typicalpa <- 1 / typicalmri
  typicalrlh <- as.data.frame(sf[sf$cellindexlr == id, ])[hcolumns]
  typicalrlh <- t(typicalrlh)
  typicalrlnh <- as.data.frame(sf[sf$cellindexlr == id, ])[nhcolumns]
  typicalrlnh <- t(typicalrlnh)

  # Check that hurricane return levels have no NA values
  if (sum(is.na(typicalrlh)) == 0) { # all values must be non-NA
    rl <- seq(from = 10, to = 600, by = 1)
    peh <- Hmisc::approxExtrap(x = typicalrlh[, 1], y = typicalpa, xout = rl)$y
    peh[peh < 0] <- 0
    peh[peh > 1] <- 1

    penh <- Hmisc::approxExtrap(x = typicalrlnh[, 1], y = typicalpa, xout = rl)$y
    penh[penh < 0] <- 0
    penh[penh > 1] <- 1

    # Probability of combined hazard curve
    pec <- (1 - ((1 - peh) * (1 - penh)))
    # Return level of combined hazard curve
    typicalrlc <- Hmisc::approxExtrap(x = pec, y = rl, xout = typicalpa)$y
    typicalrlc[typicalrlc < 0] <- 0
  } else { # some hurricane return levels are NA
    # If hurricane data is not available, the combined rl equals the non-hurricane rl
    typicalrlc <- as.numeric(typicalrlnh[, 1])
  }
  return(typicalrlc)
}

# Perform the calculation of combined c based on h and nh
idlist <- sf_nh_h_c$cellindexlr
start_time <- Sys.time()
for (id in idlist) {
  typicalrlc <- combined(id)
  sf_nh_h_c[sf_nh_h_c$cellindexlr == id, ]$c_10 <- typicalrlc[1]
  sf_nh_h_c[sf_nh_h_c$cellindexlr == id, ]$c_20 <- typicalrlc[2]
  sf_nh_h_c[sf_nh_h_c$cellindexlr == id, ]$c_50 <- typicalrlc[3]
  sf_nh_h_c[sf_nh_h_c$cellindexlr == id, ]$c_100 <- typicalrlc[4]
  sf_nh_h_c[sf_nh_h_c$cellindexlr == id, ]$c_250 <- typicalrlc[5]
  sf_nh_h_c[sf_nh_h_c$cellindexlr == id, ]$c_500 <- typicalrlc[6]
  sf_nh_h_c[sf_nh_h_c$cellindexlr == id, ]$c_700 <- typicalrlc[7]
  sf_nh_h_c[sf_nh_h_c$cellindexlr == id, ]$c_1000 <- typicalrlc[8]
  sf_nh_h_c[sf_nh_h_c$cellindexlr == id, ]$c_1700 <- typicalrlc[9]
  sf_nh_h_c[sf_nh_h_c$cellindexlr == id, ]$c_3000 <- typicalrlc[10]
  sf_nh_h_c[sf_nh_h_c$cellindexlr == id, ]$c_7000 <- typicalrlc[11]
  # cat(id)
  # cat("\n")
}
end_time <- Sys.time()
(end_time - start_time)
#> Time difference of 15.0332 mins
# Time difference of 18.11518 mins

# Convert the sf result to stars
st_nh_h_c <- st_as_stars(sf_nh_h_c)

# Plot some combined results

attributesnames <- c("c_700", "c_1700", "c_3000")
plot(st_nh_h_c[attributesnames, ], border = NA)

alexyshr commented 4 years ago

I wrote this new code using the long version (via reshape2::melt) of a data.table and executing the new function extrapola once per unique value of the field name (the repeated cell identifier in the long table), for instance: full_hch_df <- hch_df[, extrapola(x=x_rl_h, y=y_pa_h, xout=rl600), by=name]. At the end, the long table is returned to the wide format with stats::reshape. The processing time improved a lot (3 seconds vs. 18 minutes).
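
The by-group mechanics, shown on toy (made-up) data, with base approx() standing in for Hmisc::approxExtrap:

library(data.table)
dt <- data.table(name = rep(1:2, each = 3), x = c(1, 2, 3, 1, 2, 3), y = c(2, 4, 6, 3, 6, 9))
f <- function(x, y, xout) list(x = xout, y = approx(x, y, xout = xout)$y)
dt[, f(x = x, y = y, xout = c(1.5, 2.5)), by = name] # f runs once per unique 'name'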

I would appreciate recommendations to optimize the way I work with stars (and sf).

Best,

Note: I ran this new code with an updated R version (4.0.2, 2020-06-22). It seems that the way plot.stars (stars 0.4-4) defines default breaks differs from the previous versions I used (R 3.6.2, 2019-12-12, with stars 0.4-3). I compared the results (arrays) and both are equal, yet the final maps look very different: there is an unexpected difference in the maximum value of the scales!
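
If the two versions choose default class breaks differently, passing an explicit breaks vector to plot.stars should make the maps comparable across versions; the break values below are only an example:

# plot(st_nh_h_c[attributesnames, ], border = NA, breaks = seq(0, 250, by = 25))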

library(stars)
#> Loading required package: abind
#> Loading required package: sf
#> Linking to GEOS 3.8.0, GDAL 3.0.4, PROJ 6.3.1
library(sf)
library(xlsx)
library(stringr)
library(reshape2)
library(data.table)
#> 
#> Attaching package: 'data.table'
#> The following objects are masked from 'package:reshape2':
#> 
#>     dcast, melt
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:data.table':
#> 
#>     between, first, last
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
path <- "./"
ncname <- "10fg_jan1_2017"
ncfile = paste0(path, ncname, ".nc")

#Create stars object from netcdf file
#isdcolraster.st1 = read_stars(ncfile) #This changes attribute names
st1 = read_ncdf(ncfile)
#> no 'var' specified, using fg10
#> other available variables:
#>  longitude, latitude, time
#> No projection information found in nc file. 
#>  Coordinate variable units found to be degrees, 
#>  assuming WGS84 Lat/Lon.

#Add the id to stars cells: top left to right, then down in zigzag
lon <- read_ncdf(ncfile, var = c("longitude"))
#> No projection information found in nc file. 
#>  Coordinate variable units found to be degrees, 
#>  assuming WGS84 Lat/Lon.
nlon = dim(lon)
lat <- read_ncdf(ncfile, var = c("latitude"))
#> Warning in .get_nc_projection(meta$attribute, rep_var, all_coord_var): No
#> projection information found in nc file.
nlat = dim(lat)
st1$cellindexlr =1:(nlon*nlat)

#Create sf object from stars
sf1 = st_as_sf(st1)
names(sf1) = c("fg10_2017-01-01", "cellindexlr", "geometry")

#NON HURRICANE info
#Read non hurricane return levels into nhrl, and add to lonlat.unstacklr
nhrl = read.xlsx(file="rlera5_original.xlsx", sheetName = "pp_pintar")
nhrl$cellindexlr = as.integer(nhrl$cellindexlr)

#Join both st1 and nhrl in a sf object
sf_nh = st_sf(sf1[,1:2, drop=TRUE], nhrl[,2:12], geometry=sf1$geometry)

#View return level nh_700
plot(sf_nh[,"nh_700"], key.pos = 1, reset=FALSE)


#stars version of non-hurricanes info
st_nh = st_as_stars(sf_nh)

#HURRICANE INFO
#Read hurricane rasters (with grd there is a problem in the return level 500 file COL_CV-WIND_TR500_INT1.grd)
rasterfiles = list.files(path="./", pattern = "\\.tif$")
rasterfiles = paste0("./",rasterfiles)
rasterfiles = str_sort(rasterfiles, numeric = TRUE)
st_h = read_stars(rasterfiles, quiet = TRUE)
mynames = names(st_h)
start = str_locate(mynames, "_")
stop = str_locate(mynames, "\\.tif")
mynames = str_sub(mynames, start=start[,1]+1, end=stop[,1]-1)
st_h = setNames(st_h, mynames)
#sf version of hurricanes info
sf_h = st_as_sf(st_h, as_points=FALSE, merge =FALSE)

#View return level h_700
plot(sf_h[,"h_700"], key.pos = 1, reset=FALSE)


#JOIN NON-HURRICANES AND HURRICANES INFO in DATA.FRAME
df_nh_h = cbind(sf_nh[ ,2:13, drop=TRUE], sf_h[, 2:12, drop=TRUE])

#ADD EMPTY COLUMNS FOR COMBINED(h integrated with nh) WIND INFO
df_nh_h$c_10 = NA
df_nh_h$c_20 = NA
df_nh_h$c_50 = NA
df_nh_h$c_100 = NA
df_nh_h$c_250 = NA
df_nh_h$c_500 = NA
df_nh_h$c_700 = NA
df_nh_h$c_1000 = NA
df_nh_h$c_1700 = NA
df_nh_h$c_3000 = NA
df_nh_h$c_7000 = NA

#sf for non hurricanes (nh), hurricanes (h) and empty combined (c) info
sf_nh_h_c = st_sf(df_nh_h, geometry=sf_nh$geometry)

#Function to combine hurricane (h_) and non-hurricane (nh_)
#listofids=1:3
combined_columns <- function(sf=sf_nh_h_c, 
                             listofids=listofids, #Identifiers of sf to filter the dataset
                             filtersf=sf$cellindexlr %in% listofids, 
                             columns_rl_h=c("h_10","h_20","h_50","h_100","h_250","h_500","h_700","h_1000","h_1700","h_3000","h_7000"), 
                             columns_rl_nh=c("nh_10","nh_20","nh_50","nh_100","nh_250","nh_500","nh_700","nh_1000","nh_1700","nh_3000","nh_7000"),
                             typicalmri= c(10,20,50,100,250,500,700,1000,1700,3000,7000)) {
  #HURRICANE info applying 'filtersf', 
  #converting to long table and sorting
  #Repeat the id in the field 'name'
  rl_h = cbind(name=listofids, as.data.frame(sf[filtersf, ])[columns_rl_h])
  #Long table version of 'rl_h' with fields 'name', 'h' and 'value'
  rl_h = reshape2::melt(rl_h, id.vars='name', variable.name='h')
  #Sorting
  rl_h = rl_h[order(rl_h$name,rl_h$h),]

  #NON HURRICANE info applying filtersf,
  #converting to long table and sorting
  #Repeat the id in the field 'name'
  rl_nh = cbind(name=listofids, as.data.frame(sf[filtersf, ])[columns_rl_nh])
  #Long table version of 'rl_nh' with fields 'name', 'nh' and 'value'
  rl_nh = reshape2::melt(rl_nh, id.vars='name', variable.name='nh')
  #Sorting
  rl_nh = rl_nh[order(rl_nh$name,rl_nh$nh),]

  #Create a long table version of typical mean return intervals,
  #with fields 'name', 'mri' and 'value'
  typicalmri.df <- do.call(rbind, rep(list(typicalmri), length(listofids))) #matrix with one row of typicalmri per id
  colnames(typicalmri.df) <- paste0("mri", typicalmri)
  typicalmri.df = cbind(name=listofids,data.frame(typicalmri.df))
  typicalmri.df = reshape2::melt(typicalmri.df, id.vars='name', variable.name='mri')
  typicalmri.df = typicalmri.df[order(typicalmri.df$name,typicalmri.df$mri),]

  #Create a long table version of probabilities for typical mean return intervals  
  typicalpa.df = typicalmri.df
  typicalpa.df$value = 1/typicalpa.df$value

  #Hazard curve long table for non-hurricanes,
  #storing the return levels corresponding to typical return periods,
  #with field names 'name', 'x_rl_nh' and 'y_pa_nh'
  hcnh_df = data.frame(name=rl_nh$name, x_rl_nh=rl_nh$value, y_pa_nh=typicalpa.df$value)
  hcnh_df = as.data.table(hcnh_df)

  #Non-hurricane return levels are assumed to have no NA values
  #
  #Check that hurricane return levels have no NA values
  if (sum(is.na(rl_h$value)) == 0) {#all values must be non-NA
    #Detailed return levels from 1 kph to 600 kph
    rl600 = seq(from=1, to=600, by=1)

    #Create a long table version of detailed return levels,
    #with fields 'name', 'rl1_600' and 'value'
    rl600.mx <- do.call(rbind, rep(list(rl600), length(listofids))) #matrix with one row of rl600 per id
    colnames(rl600.mx) <- paste0("rl",1:600)
    rl600.mx = cbind(name=listofids, data.frame(rl600.mx))
    rl600.mx = reshape2::melt(rl600.mx, id.vars='name', variable.name='rl1_600')
    rl600.mx = rl600.mx[order(rl600.mx$name,rl600.mx$rl1_600),]    

    #Hazard curve long table for hurricanes,
    #storing the return levels corresponding to typical mean recurrence intervals probabilities,
    #with field names 'name', 'x_rl_h' and 'y_pa_h'
    hch_df = data.frame(name=rl_h$name, x_rl_h=rl_h$value, y_pa_h=typicalpa.df$value)
    hch_df = as.data.table(hch_df)

    #Function to extrapolate (and interpolate between 10 and 7000 years)
    extrapola <- function(x, y, xout){
      #Be aware that the interpolated/extrapolated values always
      #correspond to the y axis (so swap the inputs if you need x axis outputs)
      list(x=xout, y=Hmisc::approxExtrap(x=x, y=y, xout=xout)$y)
    }

    #Full (means it includes rl from 1 to 600) hazard curve long table for hurricanes,
    #storing the DETAILED return levels (1 to 600 kph) corresponding 
    #to detailed mean recurrence intervals probabilities (not only the typical ones),
    #with field names 'name' (id), 'x' (return level) and 'y' (probability)
    #Be aware that extrapola function will run in a long table by name using package data.table (once for each different name)
    full_hch_df <- hch_df[, extrapola(x=x_rl_h, y=y_pa_h, xout=rl600), by=name] 
    full_hch_df$y[full_hch_df$y<0] = 0
    full_hch_df$y[full_hch_df$y>1] = 1

    #Full (means it includes rl from 1 to 600) hazard curve long table for non-hurricanes,
    #storing the DETAILED return levels (1 to 600 kph) corresponding 
    #to detailed mean recurrence intervals probabilities (not only the typical ones),
    #with field names 'name' (id), 'x' (return level) and 'y' (probability)
    #Be aware that extrapola function will run in a long table by name using package data.table (once for each different name)
    full_hcnh_df <- hcnh_df[, extrapola(x=x_rl_nh, y=y_pa_nh, xout=rl600), by=name]
    full_hcnh_df$y[full_hcnh_df$y<0] = 0
    full_hcnh_df$y[full_hcnh_df$y>1] = 1

    #Combined probability using ASCE7-16  equation C26.5-2 
    pa_c = (1 - ((1-full_hch_df$y)*(1-full_hcnh_df$y)))

    #Full (means it includes rl from 1 to 600) hazard curve long table for combined winds,
    #storing the DETAILED return levels (1 to 600 kph) corresponding 
    #to detailed mean recurrence intervals probabilities (not only the typical ones),
    #with field names 'name' (id), 'x_rl_c' (return level) and 'y_pa_c' (probability)      
    full_hcc_df = data.frame(name=full_hcnh_df$name, x_rl_c=full_hcnh_df$x, y_pa_c=pa_c)
    full_hcc_df = as.data.table(full_hcc_df)

    #From full hazard curve long table for combined winds, get (interpolate)
    #return levels for typical mean return interval probabilities, and
    #convert from long table to wide table version
    #long table version
    #Be aware that extrapola function will run in a long table by name using package data.table (once for each different name)
    hcc_df <- full_hcc_df[, extrapola(x=y_pa_c, y=x_rl_c, xout=1/typicalmri), by=name]
    hcc_df$y[hcc_df$y<0] = 0
    hcc_df = cbind(hcc_df[,"name"], c_rl=str_replace(typicalpa.df[,"mri"], "mri", "c_"), hcc_df[,"y"])
    #wide table version with proper column names (c_*)
    hcc_df = reshape(hcc_df, idvar = "name", timevar = "c_rl", direction = "wide")
    mynames = names(hcc_df)[-1]
    startstop = str_locate(mynames, "y\\.")
    mynames = str_sub(mynames, start=startstop[,2]+1, end=nchar(mynames))
    mynames = c("name",mynames)
    names(hcc_df) = mynames
  } else { #some hurricane return levels are NA
    #If hurricane data is not available, the combined rl equals the non-hurricane rl
    hcc_df = cbind(hcnh_df[,"name"], c_rl=str_replace(typicalpa.df[,"mri"], "mri", "c_"), hcnh_df[,"x_rl_nh"])
    hcc_df = reshape(hcc_df, idvar = "name", timevar = "c_rl", direction = "wide")
    mynames = names(hcc_df)[-1]
    startstop = str_locate(mynames, "x_rl_nh\\.")
    mynames = str_sub(mynames, start=startstop[,2]+1, end=nchar(mynames))
    mynames = c("name",mynames)
    names(hcc_df) = mynames
  }
  return (hcc_df)
}

#First, perform the extrapolation process in all NON-NA values
listofids1 = sf_nh_h_c[!is.na(sf_nh_h_c$h_10), ]$cellindexlr

start_time <- Sys.time()
#Combined results for NON-NA values
hcc_df1 = combined_columns(sf=sf_nh_h_c, listofids=listofids1, 
                           filtersf=sf_nh_h_c$cellindexlr %in% listofids1)

#Second, all NA values
listofids2 = sf_nh_h_c[is.na(sf_nh_h_c$h_10), ]$cellindexlr
#Combined results for NA values
hcc_df2 = combined_columns(sf=sf_nh_h_c, listofids=listofids2, 
                           filtersf=sf_nh_h_c$cellindexlr %in% listofids2)

#Third, integrate NON-NA and NA results
hcc_df = rbind(hcc_df1,hcc_df2)

#Then, integrate results in sf object 'sf_nh_h_c'
#Organize column names
mycolnameshcc_df = names(hcc_df)
mycolnameshcc_df = str_replace(mycolnameshcc_df, "name", "cellindexlr")
#combined_cols = paste0("c_", c(10,20,50,100,250,500,700,1000,1700,3000,7000))
#Remove data.table class for the object
hcc_df = as.data.frame(hcc_df)
#Change column names
names(hcc_df) = mycolnameshcc_df

#Remove spatial column from sf,
#leave only combined columns (c_*), and change to data.frame
df_c = sf_nh_h_c[, c(1, 24:34), drop=TRUE]
#Note: to be able to integrate the interpolated results (hcc_df)
#      with the sf 'sf_nh_h_c', it is necessary to work only
#      with the combined columns (c_*) in a separate data frame, because
#      the summarise() call below (which uses na.omit)
#      will not work when there are NA values in the same column
#      of the rows to integrate!

#Integrate rows with the same id cellindexlr, replacing
#NA values with NON-NA values
#Note: this procedure does not integrate the rows
#      if two rows with the same id have NA values in the same column!
#Variable 'df_usedids_c' will hold a tibble dataframe with 
#  ids used to extrapolate!
df_usedids_c = df_c %>% 
  full_join(hcc_df) %>%
  group_by(cellindexlr) %>%
  summarise(across(.cols = everything(), .fns = na.omit))
#> Joining, by = c("cellindexlr", "c_10", "c_20", "c_50", "c_100", "c_250", "c_500", "c_700", "c_1000", "c_1700", "c_3000", "c_7000")
#> `summarise()` ungrouping output (override with `.groups` argument)
df_usedids_colnames = names(df_usedids_c)

#Define 'notin' operator, to look for non_used_ids
`%notin%` <- Negate(`%in%`)

listof_usedids = c(listofids1, listofids2)

#Tibble dataframe with ids not used to extrapolate
df_nonusedids_c = as_tibble(df_c[df_c$cellindexlr %notin% listof_usedids, df_usedids_colnames])

#Create the tibble dataframe that integrates all rows (used and non_used ids)
#We use 'bind_rows' (instead of rbind) because one data frame (df_usedids_c) is a tibble
#In this example 0 rows
df_allids_c = bind_rows(df_nonusedids_c, df_usedids_c)

#Sort final tibble dataframe (for combined fields) by id
df_allids_c = df_allids_c %>%
  arrange(cellindexlr)#arrange(x, desc(y))

#Integrate other columns (h_*, nh_*) with combined columns results in a tibble dataframe
df_nh_h_c = bind_cols(sf_nh_h_c[, c(1:23), drop=TRUE], df_allids_c[ ,2:12])

#Reassign sf 'sf_nh_h_c' with combined information
sf_nh_h_c = st_sf(df_nh_h_c, geometry=sf_nh_h_c$geometry)

end_time <- Sys.time()
(end_time - start_time)
#> Time difference of 3.360209 secs

#Finally, convert to stars and plot!  
#Convert to stars
st_nh_h_c = st_as_stars(sf_nh_h_c)
st_crs(st_nh_h_c) = 4326

#Plot some combined results
attributesnames = c("c_700", "c_1700", "c_3000")
plot(st_nh_h_c[attributesnames,], border=NA)

alexyshr commented 3 years ago

Here is the solution with st_apply. It takes one additional second, but the code is shorter and easier to follow!
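
For context, st_apply calls the function once per cell over the dimensions that are kept, handing it the cell's values along the collapsed dimension as a plain vector; a returned vector becomes a new dimension of the result. A toy sketch:

library(stars)
toy <- st_as_stars(array(1:24, dim = c(x = 2, y = 3, band = 4)))
# the function is called once per (x, y) cell and receives the 4 band values as a vector;
# returning a vector adds a new dimension to the result
st_apply(toy, c("x", "y"), function(v) c(min(v), max(v)))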

library(stars)
#> Loading required package: abind
#> Loading required package: sf
#> Linking to GEOS 3.8.0, GDAL 3.0.4, PROJ 6.3.1
library(xlsx)
library(stringr)
library(ggplot2)
library(viridis)
#> Loading required package: viridisLite
library(ggthemes)
path <- "./"
ncname <- "10fg_jan1_2017"
ncfile = paste0(path, ncname, ".nc")

#Create stars object from netcdf file
st1 = read_ncdf(ncfile)
#> no 'var' specified, using fg10
#> other available variables:
#>  longitude, latitude, time
#> No projection information found in nc file. 
#>  Coordinate variable units found to be degrees, 
#>  assuming WGS84 Lat/Lon.

#Get lat and long values
lon <- read_ncdf(ncfile, var = c("longitude"))
#> No projection information found in nc file. 
#>  Coordinate variable units found to be degrees, 
#>  assuming WGS84 Lat/Lon.
nlon = dim(lon)
lat <- read_ncdf(ncfile, var = c("latitude"))
#> Warning in .get_nc_projection(meta$attribute, rep_var, all_coord_var): No
#> projection information found in nc file.
nlat = dim(lat)

#Coordinates of all cells in a dataframe (x, y)
lonlat.unstacklr <- expand.grid(x=as.numeric(lon$longitude), y=as.numeric(lat$latitude))

#NON HURRICANE info from excel
  #Read non hurricane return levels into nhrl, 
  nhrl = read.xlsx(file="rlera5_original.xlsx", sheetName = "pp_pintar")

  #Create stars object of non-hurricanes
  st_nh = st_as_stars(cbind(lonlat.unstacklr, nhrl))
  #Change from attributes to band dimension
  st_nh = merge(st_nh, name="band") #Non-hurricane bands 2:12 (the first one is the index)
  names(st_nh) = "wind"

#HURRICANE INFO
  #Read hurricane rasters
  rasterfiles = list.files(path="./", pattern = "\\.tif$")
  rasterfiles = paste0("./",rasterfiles)
  rasterfiles = str_sort(rasterfiles, numeric = TRUE)
  st_h = read_stars(rasterfiles, quiet = TRUE)
  mynames = names(st_h)
  start = str_locate(mynames, "_")
  stop = str_locate(mynames, "\\.tif")
  mynames = str_sub(mynames, start=start[,1]+1, end=stop[,1]-1)
  st_h = setNames(st_h, mynames)
  #Change from attributes to band dimension
  st_h = merge(st_h, name="band") #Hurricane Bands from 2:12 (first one is the index)
  names(st_h) = "wind"

#INTEGRATED stars of Hurricanes and Non-Hurricanes
  st_h_nh = c(st_h, st_nh, along="band")

  #Band names are in dim values
  attr(st_h_nh, "dimensions")$band$values #From 2 to 12 hurricane, from 14 to 24 non-hurricane
#>  [1] "cellindexlr" "h_10"        "h_20"        "h_50"        "h_100"      
#>  [6] "h_250"       "h_500"       "h_700"       "h_1000"      "h_1700"     
#> [11] "h_3000"      "h_7000"      "cellindexlr" "nh_10"       "nh_20"      
#> [16] "nh_50"       "nh_100"      "nh_250"      "nh_500"      "nh_700"     
#> [21] "nh_1000"     "nh_1700"     "nh_3000"     "nh_7000"

#CHECK the content with ggplot
ggplot() +
  geom_stars(data = st_h_nh[1, , , c(2:12,14:24)]) + 
  facet_wrap("band") + 
  scale_fill_viridis() +
  coord_equal() +
  theme_map() +
  theme(legend.position = "bottom") +
  theme(legend.key.width = unit(2, "cm"))


#FUNCTION to perform hurricane and non-hurricane integration with st_apply
#mycuenta = 0
combined_columns_st_apply <- function(st,  #stars object with band dimension
                                      bands_rl_h, mri_h,   #bands_rl_h: bands holding return levels for hurricanes,
                                                           #mri_h: return periods for hurricanes
                                      bands_rl_nh, mri_nh, #bands_rl_nh: bands holding return levels for non-hurricanes,
                                                           #mri_nh: return periods for non-hurricanes
                                      mri_c                #Output return periods for combined objects
                                      ){
  #mycuenta <<- mycuenta + 1
  #print(paste0("Cell ID: ",mycuenta))

  #Wind velocities (return levels from 1 to 600 km/h)
  rl600 = seq(from=1, to=600, by=1)

  #Function to extrapolate (and interpolate between the min and max values)
  extrapola <- function(x, y, xout){
    y=Hmisc::approxExtrap(x=x, y=y, xout=xout)$y
  }

  #Hurricanes
  #Data frame with rl and probabilities for hurricanes (for each cell)
  rl_h = NULL
  for (band in bands_rl_h) {
    rl_h = c(rl_h, st[band])  #st[]: the value of each cell in st for specific band
    #why? because this function will be used in st_apply (be aware of previous merge: attributes to band dimension)
  }

  #Check for problems in the hurricane data, then replace by NA
  if (any(!is.na(rl_h))){
    if (sum(rl_h) <= 0) {
      rl_h = rep(NA, length(bands_rl_h))
    }
  }    

  h = data.frame(rl=rl_h, prob = 1/mri_h)

  #create hurricane hazard curve (detailed >>> 1:600)
  if (any(is.na(h$rl)) | any(is.na(h$prob))){ # check if dataframe hurricane columns have NA values
    hc_h_prob = rep(NA, length(rl600))
  } else{
    hc_h_prob = extrapola(x=h$rl, y=h$prob, xout=rl600)  
  }

  hc_h = data.frame(rl=rl600, prob=hc_h_prob)

  #Do not forget to check extreme values in probs
  hc_h$prob[hc_h$prob < 0] = 0
  hc_h$prob[hc_h$prob > 1] = 1

  #Non-Hurricanes  
  #data frame with rl and probabilities for non-hurricanes (for each cell)
  rl_nh = NULL
  for (band in bands_rl_nh) {
    rl_nh = c(rl_nh, st[band])  #st[]: the value of each cell in st for specific band
    #why? because this function will be used in st_apply (be aware of previous merge: attributes to band dimension)
  }

  #Check for problems in the non-hurricane data, then replace by NA
  if (any(!is.na(rl_nh))){
    if (sum(rl_nh) <= 0) {
      rl_nh = rep(NA, length(bands_rl_nh))
    }
  }

  nh = data.frame(rl=rl_nh, prob = 1/mri_nh)

  #create non-hurricane hazard curve (detailed >>> 1:600)
  if (any(is.na(nh$rl)) | any(is.na(nh$prob))){ # check if the non-hurricane columns have NA values
    hc_nh_prob = rep(NA, length(rl600))
  } else{
    hc_nh_prob = extrapola(x=nh$rl, y=nh$prob, xout=rl600) #every cell is expected to have non-hurricane rl values
  }

  hc_nh = data.frame(rl=rl600, prob=hc_nh_prob)

  #Do not forget to check extreme values in probs
  hc_nh$prob[hc_nh$prob < 0] = 0
  hc_nh$prob[hc_nh$prob > 1] = 1

  #create combined hazard curve (detailed >>> 1:600)
  if (any(is.na(hc_h$prob)) | any(is.na(hc_h$rl))){
    #If there is no hurricane data, the probability comes from non-hurricane alone
    hc_c_prob = hc_nh$prob
  } else {
    #pec = (1 - ((1-peh)*(1-penh)))
    hc_c_prob = (1 - ((1-hc_h$prob)*(1-hc_nh$prob)))     
  }
  hc_c = data.frame(rl=rl600, prob=hc_c_prob)

  #Do not forget to check extreme lower values in probs
  hc_c$prob[hc_c$prob<0] = 0

  #Interpolate rl in combined hazard curve based on 1/mri_c
  prob_c = 1/mri_c
  if (any(is.na(hc_c$prob)) | any(is.na(hc_h$rl))){ #Do not interpolate if any NA
    rl_c_output = rep(NA, length(prob_c))
  } else {
    rl_c_output = extrapola(x=hc_c$prob, y=hc_c$rl, xout=prob_c)  
  }
  return (rl_c_output)
}

mri_c = c(10,20,50,100,250,500,700,1000,1700,3000,7000)
start_time <- Sys.time()
combined_st = st_apply(st_h_nh, c("x", "y"), combined_columns_st_apply, 
                       bands_rl_h = c(2:12), mri_h = c(10,20,50,100,250,500,700,1000,1700,3000,7000),
                       bands_rl_nh = c(14:24), mri_nh = c(10,20,50,100,250,500,700,1000,1700,3000,7000), 
                       mri_c = mri_c)
end_time <- Sys.time()
(end_time - start_time)
#> Time difference of 5.449278 secs

#Change combined_columns_st_apply values
combined_st = st_set_dimensions(combined_st, "combined_columns_st_apply", values= paste0("c_", mri_c))
combined_st = st_set_dimensions(combined_st, names = c("band", "x", "y"))

#Check combined result
ggplot()+
  geom_stars(data=combined_st) + 
  facet_wrap("band") +
  scale_fill_viridis() +
  coord_equal() +
  theme_map() +
  theme(legend.position = "bottom") +
  theme(legend.key.width = unit(2, "cm"))

edzer commented 3 years ago

Could you please try to bring this issue down to a point, e.g. in less than 10 lines text and less than 50 lines of code?

alexyshr commented 3 years ago

I summarized the first approach. The checks I am performing inside the function combined_columns_st_apply did not allow me to reduce the number of code lines further. I am leaving the code comments between the lines.

library(stars)
#> Loading required package: abind
#> Loading required package: sf
#> Linking to GEOS 3.8.0, GDAL 3.0.4, PROJ 6.3.1
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(xlsx)
library(units)
#> udunits system database from E:/Documents/R/win-library/4.0/units/share/udunits
library(stringr)
library(ggplot2)
library(viridis)
#> Loading required package: viridisLite
library(ggthemes)

#NON HURRICANE return levels RL info from excel
nhrl = read.xlsx(file="rlera5_original.xlsx", sheetName = "pp_pintar")

#Create stars object with non-hurricane info (joining nhrl data frame)
  sf = "./10fg_jan1_2017.nc" %>% read_ncdf() %>% 
    adrop() %>% st_xy2sfc(as_points = TRUE) %>% drop_units() %>% st_as_sf()
#> no 'var' specified, using fg10
#> other available variables:
#>  longitude, latitude, time
#> No projection information found in nc file. 
#>  Coordinate variable units found to be degrees, 
#>  assuming WGS84 Lat/Lon.
  sf$cellindexlr = 1:length(sf$fg10)
  #stars non-hurricanes
  st_nh = sf %>% merge(nhrl, by="cellindexlr") %>% st_sfc2xy() %>% select(c(-2))

#Change from attributes to band dimension
st_nh <- st_nh %>% merge(name="band") %>% setNames(c("wind")) #Non Hurricane Bands from 2:12 (first one is the index)

#HURRICANE INFO
#Read hurricane rasters
st_h <-  "./" %>% list.files(pattern = "\\.tif$", full.names=TRUE) %>% 
        str_sort(numeric=TRUE) %>% read_stars(quiet = TRUE)

start = names(st_h) %>% str_locate("_")
end = names(st_h) %>% str_locate("\\.tif")
st_h = st_h %>% names() %>% str_sub(start = start[,1]+1, end = end[,1]-1) %>% 
       setNames(object=st_h) %>% merge(name="band") %>% setNames(c("wind")) #Hurricane Bands from 2:12 (first one is the index)

#INTEGRATED stars of Hurricanes and Non-Hurricanes
st_h_nh = c(st_h, st_nh, along="band")

#Band names are in dim values
attr(st_h_nh, "dimensions")$band$values #From 2 to 12 hurricane, from 14 to 24 non-hurricane
#>  [1] "cellindexlr" "h_10"        "h_20"        "h_50"        "h_100"      
#>  [6] "h_250"       "h_500"       "h_700"       "h_1000"      "h_1700"     
#> [11] "h_3000"      "h_7000"      "cellindexlr" "nh_10"       "nh_20"      
#> [16] "nh_50"       "nh_100"      "nh_250"      "nh_500"      "nh_700"     
#> [21] "nh_1000"     "nh_1700"     "nh_3000"     "nh_7000"

#CHECK the content with ggplot
ggplot() +
  geom_stars(data = st_h_nh[1, , , c(2:12,14:24)]) + 
  facet_wrap("band") + 
  scale_fill_viridis() +
  coord_equal() +
  theme_map() +
  theme(legend.position = "bottom") +
  theme(legend.key.width = unit(2, "cm"))

#Function to perform hurricane and non-hurricane integration with st_apply
#Description: First, for each cell, gets hurricane and non-hurricane information for specific return periods in years,
#             then, constructs detailed hazard curves (x: return levels from 1 to 600; y: corresponding probabilities) for hurricane and non-hurricanes,
#             next, integrates hazard curves into a detailed combined hazard curve representing both types of events simultaneously,
#             finally, interpolates return levels using combined hazard curve and user defined output return periods
combined_columns_st_apply <- function(st,  #stars object with band dimension
                                      bands_rl_h = c(2:12), mri_h = c(10,20,50,100,250,500,700,1000,1700,3000,7000),   #bands_rl_h: bands holding return levels for hurricanes,
                                      #mri_h: return periods for hurricanes
                                      bands_rl_nh = c(14:24), mri_nh = c(10,20,50,100,250,500,700,1000,1700,3000,7000), #bands_rl_nh: bands holding return levels for non-hurricanes,
                                      #mri_nh: return periods for non-hurricanes
                                      mri_c = c(10,20,50,100,250,500,700,1000,1700,3000,7000) #Output return periods for combined events
){

  #Wind velocities (return levels from 1 to 600 km/h)
  rl600 = seq(from=1, to=600, by=1)

  #Function to extrapolate (and interpolate between the min and max values)
  extrapola <- function(x, y, xout){y=Hmisc::approxExtrap(x=x, y=y, xout=xout)$y}

  #Hurricanes
    #h = data frame with rl and probabilities for hurricanes (for each cell)
      rl_h = NULL
      for (band in bands_rl_h) {rl_h = c(rl_h, st[band])}  #st[]: the value of each cell in st for specific band

      #Check the hurricane data, then replace by NA if the data has problems
      if (any(!is.na(rl_h))){if (sum(rl_h) <= 0) {rl_h = rep(NA, length(bands_rl_h))}}

      h = data.frame(rl=rl_h, prob = 1/mri_h)

    #Create hurricane hazard curve (detailed >>> 1:600)
      if (any(is.na(h$rl)) | any(is.na(h$prob))){ # Check if the hurricane columns have NA values
        hacu_h_prob = rep(NA, length(rl600))
      } else{
        hacu_h_prob = extrapola(x=h$rl, y=h$prob, xout=rl600)
      }

      hacu_h = data.frame(rl=rl600, prob=hacu_h_prob)

    #Do not forget to check extreme values in the probabilities because Hmisc::approxExtrap is extrapolating,
    #which means it can go below 0 or above 1
      hacu_h$prob[hacu_h$prob < 0] = 0
      hacu_h$prob[hacu_h$prob > 1] = 1

  #Non-Hurricanes  
    #nh = data frame with rl and probabilities for non-hurricanes (for each cell)
      rl_nh = NULL
      for (band in bands_rl_nh) {rl_nh = c(rl_nh, st[band])}  #st[]: the value of each cell in st for specific band

      #Check the non-hurricane data, then replace by NA if the data has problems
      if (any(!is.na(rl_nh))){if (sum(rl_nh) <= 0) {rl_nh = rep(NA, length(bands_rl_nh))}}

      nh = data.frame(rl=rl_nh, prob = 1/mri_nh)

    #create non-hurricane hazard curve (detailed >>> 1:600)
      if (any(is.na(nh$rl)) | any(is.na(nh$prob))){ # check if the non-hurricane columns have NA values
        hacu_nh_prob = rep(NA, length(rl600))
      } else{
        hacu_nh_prob = extrapola(x=nh$rl, y=nh$prob, xout=rl600) #every cell is expected to have non-hurricane RL values
      }

      hacu_nh = data.frame(rl=rl600, prob=hacu_nh_prob)

    #Do not forget to check extreme values in probs
      hacu_nh$prob[hacu_nh$prob < 0] = 0
      hacu_nh$prob[hacu_nh$prob > 1] = 1

  #Integration of hurricane with non-hurricanes (combined)
    #hacu_c: Create combined hazard curve in a data frame (detailed >>> 1:600)
      if (any(is.na(hacu_h$prob)) | any(is.na(hacu_h$rl))){
        #If there is no hurricane data, the probability comes from non-hurricane alone
        hacu_c_prob = hacu_nh$prob
      } else {
        #pec = (1 - ((1-peh)*(1-penh)))
        hacu_c_prob = (1 - ((1-hacu_h$prob)*(1-hacu_nh$prob)))     
      }
      hacu_c = data.frame(rl=rl600, prob=hacu_c_prob)

    #Do not forget to check extreme lower values in probs
      hacu_c$prob[hacu_c$prob<0] = 0

  #Obtain return levels using combined hazard curve
    #Interpolate rl in combined hazard curve based on 1/mri_c
    if (any(is.na(hacu_c$prob)) | any(is.na(hacu_h$rl))){ #Do not interpolate if any NA
      rl_c_output = rep(NA, length(1/mri_c))
    } else {
      rl_c_output = extrapola(x=hacu_c$prob, y=hacu_c$rl, xout=1/mri_c)  
    }
    return (rl_c_output)
}

mri_c = c(10,20,50,100,250,500,700,1000,1700,3000,7000)
start_time <- Sys.time()
combined_st = st_apply(st_h_nh, c("x", "y"), combined_columns_st_apply, mri_c = mri_c)
end_time <- Sys.time()
(end_time - start_time)
#> Time difference of 4.989475 secs

#Change combined_columns_st_apply values and name
combined_st = combined_st %>% st_set_dimensions(names = c("band", "x", "y")) %>% 
  st_set_dimensions("band", values = paste0("c_", mri_c))

#Check combined result
ggplot()+
  geom_stars(data=combined_st) + 
  facet_wrap("band") +
  scale_fill_viridis() +
  coord_equal() +
  theme_map() +
  theme(legend.position = "bottom") +
  theme(legend.key.width = unit(2, "cm"))