mcglinnlab / soar

SOAR: species occurrence aggregator in R
2 stars 0 forks source link

Density Plot Issues #22

Open AshleyWoods opened 5 years ago

AshleyWoods commented 5 years ago

Issues with making the raster layers match for plotting the density of the two. Code:

#get the data in here
dat <- occ_download_import(key = "0009491-190621201848488")
sp_key <- name_suggest(q ="Caretta caretta", rank = "species")$key[1]
comparison <- map_fetch(taxonKey = sp_key, srs = "EPSG:3857")

#visualize it
#plot(dat)
#plot(comparison)

#prep for later

  #Turns dat into a raster to match comparison 
  crds = data.frame(long = dat$decimalLongitude, lat = dat$decimalLatitude)
  crds = subset(crds, subset= lat != -90)
  crds_sp = SpatialPoints(crds, proj4string = CRS("+proj=longlat +ellps=WGS84"))
  #r_template = comparison
  #r = raster(nrows=512, ncols=512, xmn=-20037.51, xmx=20002.49, ymn=-6396.115,
  #           ymx=6363.885, crs = CRS("+proj=cea +units=km +ellps=WGS84"))
  #crds_cea = spTransform(crds_sp, CRSobj = CRS(proj4string(r)))
  #ct = rasterize(crds_cea, r, fun='count')

  crds_sp_reproj = spTransform(crds_sp, CRSobj = CRS(projection(comparison)))
  ct = rasterize(crds_sp_reproj, comparison, fun='count')
  max_ct = maxValue(ct)
  prop = calc(ct, function(x) ifelse(is.na(x), 0, x / max_ct))
  #this will turn the full numbers in to percentages of the max to match the values
  #from the map_fetch() functions ( values on a scale of 0 to 1)
  int <- ct@data@values

#plot the thing -- how to show the density of the points
plot(comparison, prop, 
     xlab = "Density of Refrence Points (Comparison dataset)",
     ylab = "Density of Species of dat", maxpixels = 262144)

#add best fit line
abline(lm(getValues(prop) ~ getValues(comparison)))
AshleyWoods commented 5 years ago

Updated code for attempt:

#get the data in here
dat <- occ_download_import(key = "0009491-190621201848488")
sp_key <- name_suggest(q ="Caretta caretta", rank = "species")$key[1]
comparison <- map_fetch(taxonKey = sp_key, srs = "EPSG:3857")

#visualize it
#plot(dat)
#plot(comparison)

#prep for later

#somethings funky here, so yeah
#comparison2 = projectRaster(comparison, comparison)#crs = "+proj=merc +a=6378137 +b=6378137 +lat_ts=0.0 +lon_0=0.0 +x_0=0.0 +y_0=0 +k=1.0 +units=m +nadgrids=@null +wktext  +no_defs")
comparison3 <- setExtent(comparison, extent(-20037508, 20037508, -20037508, 20037508))

  #Turns dat into a raster to match comparison
  crds = data.frame(long = dat$decimalLongitude, lat = dat$decimalLatitude)
  crds = subset(crds, subset= lat > -85.06)
  crds = subset(crds, subset = lat < 85.06)

  #crds2 = dat$data[c("decimalLongitude","decimalLatitude")]
  #crds2 = subset(crds2, subset= "decimalLatitude" > -85.06)
  #crds2 = subset(crds2, subset = "decimalLatitude" < 85.06)

#  crds_sp = SpatialPoints(crds, proj4string = CRS("+proj=merc +a=6378137 +b=6378137 +lat_ts=0.0 +lon_0=0.0 +x_0=0.0 +y_0=0 +k=1.0 +units=m +nadgrids=@null +wktext  +no_defs")) #CRS(projection(comparison3)))#
  #r_template = comparison
  #r = raster(nrows=512, ncols=512, xmn=-20037.51, xmx=20002.49, ymn=-6396.115,
  #           ymx=6363.885, crs = CRS("+proj=cea +units=km +ellps=WGS84"))
  #crds_cea = spTransform(crds_sp, CRSobj = CRS(proj4string(r)))
  #ct = rasterize(crds_cea, r, fun='count')

#  crds_sp_reproj = spTransform(crds_sp, CRSobj = CRS("+proj=merc +a=6378137 +b=6378137 +lat_ts=0.0 +lon_0=0.0 +x_0=0.0 +y_0=0 +k=1.0 +units=m +nadgrids=@null +wktext  +no_defs"))
    crds = sp::SpatialPoints(crds, proj4string = CRS("+proj=longlat"))
    crds_3857 = sp::spTransform(crds, CRSobj = CRS(projection(comparison3)))
   ct = rasterize(crds_3857, comparison3, fun='count')
  max_ct = maxValue(ct)
  prop = calc(ct, function(x) ifelse(is.na(x), 0, x / max_ct))
  #this will turn the full numbers in to percentages of the max to match the values
  #from the map_fetch() functions ( values on a scale of 0 to 1)
  int <- ct@data@values

  #prop = projectRaster(prop, crs = "+proj=merc +a=6378137 +b=6378137 +lat_ts=0.0 +lon_0=0.0 +x_0=0.0 +y_0=0 +k=1.0 +units=m +nadgrids=@null +wktext  +no_defs")
#  crds = sp::SpatialPoints(crds, proj4string = CRS("+proj=longlat"))
#  crds_3857 = sp::spTransform(crds, CRSobj = CRS(projection(comparison3)))
  plot(comparison3)
  points(crds_3857, pch='.', col='red')

#plot the thing -- how to show the density of the points
plot(comparison3, prop, 
     xlab = "Density of Refrence Points (Comparison dataset)",
     ylab = "Density of Species of Interest", maxpixels = 262144)

#add best fit line
abline(lm(getValues(prop) ~ getValues(comparison)))

This is very close to the desired output, but the map projection is still just slightly off. In the image below, look at the east coast of Australia for the most obvious example.

image

The graph this outputs looks like this:

image

dmcglinn commented 5 years ago

Hmm, I think their patch is actually working. Try this:

den <- map_fetch(taxonKey = 2480498, srs = "EPSG:3857")
occ <- occ_search(taxonKey = 2480498)
crds <- occs$data[c("decimalLongitude","decimalLatitude")] ## Extract only the coordinates
crds_3857 <- rgdal::project(as.matrix(crds), projection(den))

r <- setExtent(den, extent(-20037508, 20037508, -20037508, 20037508))

pdf('test.pdf')
plot(r)
mp <- maps::map(plot = FALSE)
xy <- rgdal::project(cbind(mp$x, mp$y), projection(den))
lines(xy)
points(crds_3857, col='red', pch = '.')
dev.off()
AshleyWoods commented 5 years ago

I see that now, I did some messing around with the zoom window from RStudio and it turns out that the dimensions of the zoom window can affect how the points line up for some reason, they scale differently. (But that's a problem for RStudio to solve)

While that helps a lot, we are still losing points while turning crds into a raster. I believe it may happen during the rasterize() step.

#get the data in here
dat <- occ_download_import(key = "0009491-190621201848488")
sp_key <- name_suggest(q ="Caretta caretta", rank = "species")$key[1]
comparison <- map_fetch(taxonKey = sp_key, srs = "EPSG:3857")

comparison3 <- setExtent(comparison, extent(-20037508, 20037508, -20037508, 20037508))

  #Throw out invalid crds + reproject it
  crds = data.frame(long = dat$decimalLongitude, lat = dat$decimalLatitude)
  crds = subset(crds, subset= lat > -85.06)
  crds = subset(crds, subset = lat < 85.06)
  crds_3857 <- rgdal::project(as.matrix(crds), projection(comparison))

#  I don't think these are needed anymore? I'm not sure  
#  crds2 = sp::SpatialPoints(crds, proj4string = CRS("+proj=longlat"))
#  crds_38572 = sp::spTransform(crds2, CRSobj = CRS(projection(comparison3)))

  #Rasterize the crds
  #I believe the Error happens here!!!!!!!!!!!!!!!!!!!!!!!
  ct = rasterize(crds_3857, comparison, fun='count')

  #turn the numeric values into density values
  max_ct = maxValue(ct)
  prop = calc(ct, function(x) ifelse(is.na(x), 0, x / max_ct))

  #grab data for best fit line
  #int <- ct@data@values

  #visualize
  plot(comparison3)
  points(crds_3857, pch='.', col='red')
# If you could get this to work I think it would support my point about losing points
  #points(x = prop, pch = '.', col = 'blue')

#plot the thing -- how to show the density of the points
plot(comparison3, prop, 
     xlab = "Density of Refrence Points (Comparison dataset)",
     ylab = "Density of Species of Interest", maxpixels = 262144)

#add best fit line
abline(lm(getValues(prop) ~ getValues(comparison)))
dmcglinn commented 5 years ago

I looked into the points dropping and I'm not seeing it on my end. Good news is that I think this is working for the most part now. I think maybe you should get a fresh update of your caretta caretta data and you might see that the two overlap.

test.pdf

AshleyWoods commented 5 years ago

This code now grabs the newest Caretta caretta dataset (once you replace the pound signs with your login)

#get the data in here
#dat <- occ_download_import(key = "0009491-190621201848488")
sp_key <- name_suggest(q ="Caretta caretta", rank = "species")$key[1]
res = occ_download(paste("taxonKey =", sp_key), 'hasCoordinate = TRUE',
                   user = '#######', pwd = '#########', 
                   email = '#########################')
continue = TRUE
while(continue){
  meta = occ_download_meta(res)
  if (meta$status == "SUCCEEDED"){
    continue = FALSE
    dat <- occ_download_get(res[1], overwrite = TRUE) %>%
      occ_download_import()
  }     
  if (meta$status == "KILLED"){
    continue = FALSE
    #Returns: ERROR:Bad Request(HTTP 400)
  }
  #30 second delay --  Extend?
  Sys.sleep(30)
}

comparison <- map_fetch(taxonKey = sp_key, srs = "EPSG:3857")

comparison3 <- setExtent(comparison, extent(-20037508, 20037508, -20037508, 20037508))

  #Throw out invalid crds + reproject it
  crds = data.frame(long = dat$decimalLongitude, lat = dat$decimalLatitude)
  crds = subset(crds, subset= lat > -85.06)
  crds = subset(crds, subset = lat < 85.06)
  crds_3857 <- rgdal::project(as.matrix(crds), projection(comparison3))

#  I don't think these are needed anymore? I'm not sure  
#  crds2 = sp::SpatialPoints(crds, proj4string = CRS("+proj=longlat"))
#  crds_38572 = sp::spTransform(crds2, CRSobj = CRS(projection(comparison3)))

  #Rasterize the crds
  #I believe the Error happens here!!!!!!!!!!!!!!!!!!!!!!!
  ct = rasterize(crds_3857, comparison3, fun='count')

  #turn the numeric values into density values
  max_ct = maxValue(ct)
  prop = calc(ct, function(x) ifelse(is.na(x), 0, x / max_ct))

  #grab data for best fit line
  #int <- ct@data@values

  #visualize
  plot(comparison3)
  points(crds_3857, pch='.', col='red')
  #If you could get this to work I think it would support my point about loosing points
  #points(x = prop, pch = '.', col = 'blue')

#plot the thing -- how to show the density of the points
plot(comparison3, prop, 
     xlab = "Density of Refrence Points (Comparison dataset)",
     ylab = "Density of Species of Interest", maxpixels = 262144)

#add best fit line
abline(lm(getValues(prop) ~ getValues(comparison3)))

That gives this plot, even though it should be giving a 1:1 with the exception of a few (4) points that were removed due to proximity to the poles. image

AshleyWoods commented 5 years ago

I added a section at the end of the code that shows how many coordinates the GBIF data has at the beginning of the code vs. the end after it has been rasterized. Remember to replace the '#' with your login information. When I ran the last section and checked the value of the 'errors' variable it had a value of 20765. This means that 20765 coordinate points were not transferred to the raster layer and that is why the end graph does not appear as a 1:1.

#get the data in here
#dat <- occ_download_import(key = "0009491-190621201848488")
sp_key <- name_suggest(q ="Caretta caretta", rank = "species")$key[1]
res = occ_download(paste("taxonKey =", sp_key), 'hasCoordinate = TRUE',
                   user = "#######", pwd = "########", 
                   email = "##################")
continue = TRUE
while(continue){
  meta = occ_download_meta(res)
  if (meta$status == "SUCCEEDED"){
    continue = FALSE
    dat <- occ_download_get(res[1], overwrite = TRUE) %>%
      occ_download_import()
  }     
  if (meta$status == "KILLED"){
    continue = FALSE
    #Returns: ERROR:Bad Request(HTTP 400)
  }
  #30 second delay --  Extend?
  Sys.sleep(30)
}

comparison <- map_fetch(taxonKey = sp_key, srs = "EPSG:3857")

comparison3 <- setExtent(comparison, extent(-20037508, 20037508, -20037508, 20037508))

  #Throw out invalid crds + reproject it
  crds = data.frame(long = dat$decimalLongitude, lat = dat$decimalLatitude)
  crds = subset(crds, subset= lat > -85.06)
  crds = subset(crds, subset = lat < 85.06)
  crds_3857 <- rgdal::project(as.matrix(crds), projection(comparison3))

#  I don't think these are needed anymore? I'm not sure  
#  crds2 = sp::SpatialPoints(crds, proj4string = CRS("+proj=longlat"))
#  crds_38572 = sp::spTransform(crds2, CRSobj = CRS(projection(comparison3)))

  #Rasterize the crds
  #I believe the Error happens here!!!!!!!!!!!!!!!!!!!!!!!
  ct = rasterize(crds_3857, comparison3, fun='count')

  #turn the numeric values into density values
  max_ct = maxValue(ct)
  prop = calc(ct, function(x) ifelse(is.na(x), 0, x / max_ct))
  test = calc(ct, function(x) ifelse(is.na(x), NA, x/max_ct))

  #grab data for best fit line
  #int <- ct@data@values

  #visualize
  plot(comparison3)
  points(crds_3857, pch='.', col='red')
  #If you could get this to work I think it would support my point about loosing points
  #points(x = prop, pch = '.', col = 'blue')

#plot the thing -- how to show the density of the points
plot(comparison3, prop, 
     xlab = "Density of Refrence Points (Comparison dataset)",
     ylab = "Density of Species of Interest", maxpixels = 262144, xlim = c(0,1), ylim = c(0,1))

#add best fit line
abline(lm(getValues(prop) ~ getValues(comparison3)))

# checking for problems with the coordinates and losing them when creating the raster
# these coordinates are lost from the GBIF data, not the mapfetch data
raster_count = 0
crds_count = length(crds_3857)

for (i in 1:262144) {
  if (!is.na(test[i])) {
    raster_count = raster_count + 1
  }
}

errors = crds_count - raster_count #if this is not zero, then it is the number of coordinates that were lost
errors
dmcglinn commented 5 years ago

Hey @AshleyWoods I don't have time right now to go through your code but an even simpler check on my end shows that rasterize is not making a mistake

sum(getValues(ct), na.rm=T)
[1] 10795
dim(crds_3857)
[1] 10795     2

So every coordinate in crds_3857 has been converted into a unit of count in ct. Can you confirm this behavior on your end as well please

AshleyWoods commented 5 years ago

I see what you mean. I'm not sure where it is happening then, but something is going wrong if my graph still looks like that. I'll through the code again and see what I can find.

> sum(getValues(ct), na.rm=T)
[1] 10801
> dim(crds_3857)
[1] 10801     2
AshleyWoods commented 5 years ago

Here's something interesting. I plotted prop against prop and got this: image But when I plotted comparison3 against comparison3, I got this: image

The density values are fundamentally different. I think that some of the coordinates are recording more than one occurrence. I need to find what column that value is stored in and take that into account to see if that fixes the issue. I have two suspicions for what columns this value is stored in, but it sometimes contains a null value and sometimes contains a zero. When I figure out what column the data is stored in, do you know how I could factor this in?

dmcglinn commented 5 years ago

That's a good call to think about duplicates. I'm not sure if the GBIF folks filtered those out before computing the densities. To identify duplicate records the easiest thing to do is filter if they have the exact same long and lat. This is easy to do in R

  # remove duplicates
  crds = crds[!duplicated(crds), ]

Unfortunately this doesn't solve the problem you have identified. I appears that the density values have been binned very coarsely into 4 values (at least in this dataset - this may be general not sure). As you have noted the bin classes don't match up with what we're seeing in the raw point data. I think in this case we need to file another bug report over at rgbif with a simple reproducible example so that we can better understand how GBIF has computed these pixel colors that we're capturing with map_fetch. What do you think?

dmcglinn commented 5 years ago

My thought is the last time we were getting strange stuff it was because of an upstream bug and this case may the same

AshleyWoods commented 5 years ago

I wasn't talking about duplicates. I meant that in some occurrences, there are columns that allow the reporter of the occurrence to input how many individuals there were. Some people left it blank, some have it filled in with a zero, but some have around 200 occurrences. I think gbif may be counting those and this is not an error. At the very least, I think we should test it first and try the course binning to see if it eventually matches up or if there is a but somewhere.

AshleyWoods commented 5 years ago

I finally found a page on the Gbif website that told me which columns contain this and how to read them.

AshleyWoods commented 5 years ago

I added code to make sure every occurrence was counted for every individual included in the observation.

#get the data in here
#dat <- occ_download_import(key = "0009491-190621201848488")
sp_key <- name_suggest(q ="Caretta caretta", rank = "species")$key[1]
res = occ_download(paste("taxonKey =", sp_key), 'hasCoordinate = TRUE',
                   user = "ashwood", pwd = "Voidfish13", 
                   email = "woodsy913@comcast.net")
continue = TRUE
while(continue){
  meta = occ_download_meta(res)
  if (meta$status == "SUCCEEDED"){
    continue = FALSE
    dat <- occ_download_get(res[1], overwrite = TRUE) %>%
      occ_download_import()
  }     
  if (meta$status == "KILLED"){
    continue = FALSE
    #Returns: ERROR:Bad Request(HTTP 400)
  }
  #30 second delay --  Extend?
  Sys.sleep(30)
}

comparison <- map_fetch(taxonKey = sp_key, srs = "EPSG:3857")

comparison3 <- setExtent(comparison, extent(-20037508, 20037508, -20037508, 20037508))

  #Throw out invalid crds + reproject it
  crds = data.frame(long = dat$decimalLongitude, lat = dat$decimalLatitude)
  crds = subset(crds, subset= lat > -85.06)
  crds = subset(crds, subset = lat < 85.06)

  #for the sake of having the counting cols (but still weeding out the same ones that have been taken from crds)
  crds2 = data.frame(long = dat$decimalLongitude, lat = dat$decimalLatitude, count = dat$individualCount, quantity = dat$organismQuantity)
  crds2 = subset(crds2, subset= lat > -85.06)
  crds2 = subset(crds2, subset = lat < 85.06)

  #and now you make sure EVERY occurrence is counted
  for (i in 1:length(rownames(crds2))) {
    if (!is.na(crds2[i,3]) && crds2[i,3] > 1) {
      #it has more than 1 occurrence in the individualCount column, append it however many times to the end of crds
      for (j in 1:crds2[i,3]) {
        crds <- rbind(crds, c(crds2[i,1], crds2[i,2]))
      }

    }else if (!is.na(crds2[i,4]) && is.numeric(crds2[i,4]) && crds2[i,4] > 1) {
      #it has more than 1 occurrence in the organismQuantity column, append it however many times to the end of crds
      for (j in 1:crds2[i,4]) {
        crds <- rbind(crds, c(crds2[i,1], crds2[i,2]))
      }

    }
  }

  crds_3857 <- rgdal::project(as.matrix(crds), projection(comparison3))

#  I don't think these are needed anymore? I'm not sure  
#  crds2 = sp::SpatialPoints(crds, proj4string = CRS("+proj=longlat"))
#  crds_38572 = sp::spTransform(crds2, CRSobj = CRS(projection(comparison3)))

  #Rasterize the crds
  #I believe the Error happens here!!!!!!!!!!!!!!!!!!!!!!!
  ct = rasterize(crds_3857, comparison3, fun='count')

  #turn the numeric values into density values
  max_ct = maxValue(ct)
  prop = calc(ct, function(x) ifelse(is.na(x), 0, x / max_ct))
  test = calc(ct, function(x) ifelse(is.na(x), NA, x/max_ct))

  #grab data for best fit line
  #int <- ct@data@values

  #visualize
  plot(comparison3)
  points(crds_3857, pch='.', col='red')
  #If you could get this to work I think it would support my point about loosing points
  #points(x = prop, pch = '.', col = 'blue')

#plot the thing -- how to show the density of the points
plot(comparison3, prop, 
     xlab = "Density of Refrence Points (Comparison dataset)",
     ylab = "Density of Species of Interest", maxpixels = 262144, xlim = c(0,1), ylim = c(0,1))

#add best fit line
abline(lm(getValues(prop) ~ getValues(comparison3)))

This is the solution it gives: image

It's different, but it is still not 1:1

AshleyWoods commented 5 years ago

I'm trying to get a package called rprotobuf (suggested by Tim Robertson on this thread) installed to try and work with the mvt packages. However, it gives me these errors and I can't figure out what it wants me to do. (or if I have permission to do so on Uniola)

configure: WARNING: Protobuf headers not found with default CXXFLAGS and CPPFLAGS, manually trying /usr/local/include
configure: WARNING: Unsetting ac_cv_header_google_protobuf_stubs_common_h

and

configure: error: ERROR: ProtoBuf headers required; use '-Iincludedir' in CXXFLAGS for unusual locations.
ERROR: configuration failed for package ‘RProtoBuf’
* removing ‘/home/woodsae/R/x86_64-pc-linux-gnu-library/3.5/RProtoBuf’

Thoughts?

MattBlissett commented 5 years ago

You would need to install the protobuf library.

https://github.com/eddelbuettel/rprotobuf#installation has instructions for Debian/Ubuntu, "with similar commands on other operating systems or distributions". It's the kind of thing that typically requires permissions.