Closed maurolepore closed 5 years ago
dependent_variable_biomass_component may be important to have in both tables to make them interpretable as stand-alone units.
If you have a normalized collection of tables you can always reconstruct a master table from the pieces. See master()
in the example below.
A normalized database avoids mistakes that will surely come from trying to update mulitple tables in the exact same way. Even a typo will make the merge brake -- resulting in a missing value somewhere.
library(tidyverse)
library(allodb)
redundant <- c(
"dependent_variable_biomass_component",
"allometry_specificity",
"dbh_min_cm",
"dbh_max_cm"
)
sitespecies_normalized <- sitespecies[ , setdiff(names(sitespecies), redundant)]
names(sitespecies_normalized)
#> [1] "site" "family" "species"
#> [4] "species_code" "life_form" "equation_group"
#> [7] "equation_id" "equation_taxa" "notes_on_species"
#> [10] "wsg_id" "wsg_specificity"
all <- full_join(equations, sitespecies_normalized, by = "equation_id")
names(all)
#> [1] "ref_id"
#> [2] "equation_allometry"
#> [3] "equation_id"
#> [4] "equation_form"
#> [5] "dependent_variable_biomass_component"
#> [6] "independent_variable"
#> [7] "allometry_specificity"
#> [8] "geographic_area"
#> [9] "dbh_min_cm"
#> [10] "dbh_max_cm"
#> [11] "sample_size"
#> [12] "dbh_units_original"
#> [13] "biomass_units_original"
#> [14] "allometry_development_method"
#> [15] "regression_model"
#> [16] "other_equations_tested"
#> [17] "log_biomass"
#> [18] "bias_corrected"
#> [19] "bias_correction_factor"
#> [20] "notes_fitting_model"
#> [21] "original_data_availability"
#> [22] "warning"
#> [23] "site"
#> [24] "family"
#> [25] "species"
#> [26] "species_code"
#> [27] "life_form"
#> [28] "equation_group"
#> [29] "equation_taxa"
#> [30] "notes_on_species"
#> [31] "wsg_id"
#> [32] "wsg_specificity"
# My approach is to build a wrapper that reconstructs a master table from
# the normalized tables (with no redundant columns)
master <- function() {
eqn_site_sp <- full_join(equations, sitespecies_normalized, by = "equation_id")
full_join(eqn_site_sp, sites_info, by = "site")
}
glimpse(master())
#> Observations: 755
#> Variables: 43
#> $ ref_id <chr> "jenkins_2004_cdod", "jen...
#> $ equation_allometry <chr> "10^(1.1891+1.419*(log10(...
#> $ equation_id <chr> "2060ea", "2060ea", "a4d8...
#> $ equation_form <chr> "10^(a+b*(log10(dbh^c)))"...
#> $ dependent_variable_biomass_component <chr> "Total aboveground biomas...
#> $ independent_variable <chr> "DBH", "DBH", "DBH", "DBH...
#> $ allometry_specificity <chr> "Species", "Species", "Sp...
#> $ geographic_area <chr> "Ohio, USA", "Ohio, USA",...
#> $ dbh_min_cm <chr> "0.21", "0.21", "0.19", "...
#> $ dbh_max_cm <chr> "5.73", "5.73", "3.86", "...
#> $ sample_size <chr> NA, NA, NA, NA, NA, NA, N...
#> $ dbh_units_original <chr> "cm", "cm", "cm", "cm", "...
#> $ biomass_units_original <chr> "g", "g", "g", "g", "g", ...
#> $ allometry_development_method <chr> "harvest", "harvest", "ha...
#> $ regression_model <chr> NA, NA, NA, NA, NA, NA, N...
#> $ other_equations_tested <chr> NA, NA, NA, NA, NA, NA, N...
#> $ log_biomass <chr> NA, NA, NA, NA, NA, NA, N...
#> $ bias_corrected <chr> "1", "1", "1", "1", "1", ...
#> $ bias_correction_factor <chr> "1.056", "1.056", "1.016"...
#> $ notes_fitting_model <chr> NA, NA, NA, NA, NA, NA, N...
#> $ original_data_availability <chr> NA, NA, NA, NA, NA, NA, N...
#> $ warning <chr> NA, NA, NA, NA, NA, NA, N...
#> $ site <chr> "Lilly Dicky", "Tyson", "...
#> $ family <chr> "Sapindaceae", "Sapindace...
#> $ species <chr> "Acer rubrum", "Acer rubr...
#> $ species_code <chr> "316", "acerub", "318", "...
#> $ life_form <chr> "Tree", "Tree", "Tree", "...
#> $ equation_group <chr> "Expert", "Expert", "Expe...
#> $ equation_taxa <chr> "Acer rubrum", "Acer rubr...
#> $ notes_on_species <chr> NA, NA, NA, NA, NA, NA, N...
#> $ wsg_id <chr> NA, NA, NA, NA, NA, NA, N...
#> $ wsg_specificity <chr> NA, NA, NA, NA, NA, NA, N...
#> $ id <chr> NA, NA, NA, NA, NA, NA, N...
#> $ Site <chr> NA, NA, NA, NA, NA, NA, N...
#> $ lat <chr> NA, NA, NA, NA, NA, NA, N...
#> $ long <chr> NA, NA, NA, NA, NA, NA, N...
#> $ UTM_Zone <chr> NA, NA, NA, NA, NA, NA, N...
#> $ UTM_X <chr> NA, NA, NA, NA, NA, NA, N...
#> $ UTM_Y <chr> NA, NA, NA, NA, NA, NA, N...
#> $ intertropical <chr> NA, NA, NA, NA, NA, NA, N...
#> $ size.ha <chr> NA, NA, NA, NA, NA, NA, N...
#> $ E <chr> NA, NA, NA, NA, NA, NA, N...
#> $ wsg.site.name <chr> NA, NA, NA, NA, NA, NA, N...
Created on 2019-03-15 by the reprex package (v0.2.1)
Let me show a toy example for clarity
library(tidyverse)
eqn <- tribble(
~id, ~eqn, ~component,
01, "a + b", "whole",
02, "a + c", "part",
)
site <- tribble(
~id, ~site,
01, "scbi",
01, "bci",
02, "luquillo",
)
master <- function() full_join(eqn, site, by = "id")
eqn
#> # A tibble: 2 x 3
#> id eqn component
#> <dbl> <chr> <chr>
#> 1 1 a + b whole
#> 2 2 a + c part
site
#> # A tibble: 3 x 2
#> id site
#> <dbl> <chr>
#> 1 1 scbi
#> 2 1 bci
#> 3 2 luquillo
master()
#> # A tibble: 3 x 4
#> id eqn component site
#> <dbl> <chr> <chr> <chr>
#> 1 1 a + b whole scbi
#> 2 1 a + b whole bci
#> 3 2 a + c part luquillo
Created on 2019-03-15 by the reprex package (v0.2.1)
@teixeirak
allometry_specifity
is in the equation table described as:
Specific taxonomic level for which the equation was developed (species, genus, family or plant group)
equation_group
is the sitespecies table, defined as:
Allometric equation category that can be selected by the user. "Generic" equations are the current best equations or equation groups that can be applied to any site in a broad geographic area (e.g., pantropical, continental) and to any taxa at that site. "Expert" equations are the current best equations for specific taxa at a specific site, as identified by a botanist or forest ecologist. "Other" equations are potentially relevant (i.e., may be selected as best under different criteria) or were previously classified as best available.
@maurolepore I get your point....it is nice to have the taxa specificity in both tables but we also want to keep the best practice as you recommend.
it is nice to have the taxa specificity in both tables
Most users will need only the warpper, say master()
, that contains all info. master()
is equivalent to View
in an SQL database. The individual normalized tables (e.g. equations
, sitespecies
, etc.) are useful only for the database managers -- as it helps them maintain the database with minimum storage and minimum chances of mistakes.
For example, here are a bunch of tables from the SQL-database of ForestGEO. Most of them users don't need to worry about because they can use "joint" tables such as ViewFullTable.
library(tidyverse)
fgeo.data::data_dictionary %>%
as_tibble() %>%
pull(table) %>%
unique()
#> [1] "Census" "CensusQuadrat"
#> [3] "Coordinates" "Country"
#> [5] "CurrentObsolete" "DataCollection"
#> [7] "DBH" "DBHAttributes"
#> [9] "Family" "Features"
#> [11] "FeatureTypes" "Genus"
#> [13] "Log" "Measurement"
#> [15] "MeasurementAttributes" "MeasurementType"
#> [17] "Personnel" "PersonnelRole"
#> [19] "Quadrat" "Reference"
#> [21] "RemeasAttribs" "Remeasurement"
#> [23] "RoleReference" "Site"
#> [25] "Species" "SpeciesInventory"
#> [27] "Specimen" "Stem"
#> [29] "SubSpecies" "Tree"
#> [31] "TreeAttributes" "TreeTaxChange"
#> [33] "TSMAttributes" "ViewFullTable"
#> [35] "ViewTaxonomy"
Created on 2019-03-15 by the reprex package (v0.2.1)
Here is an example of the problem I am trying to show.
The column allometry_specifity
seems to mean the same (and it should) in both the equations
and sitespecies
tables:
library(tidyverse)
allodb::sitespecies_metadata %>%
filter(Field == "allometry_specificity") %>%
pull(Description)
#> [1] "Refers to the specific taxonomic level for which the biomass equation was developed (species, genus, family or plant group)"
allodb::equations_metadata %>%
filter(Field == "allometry_specificity") %>%
pull(Description)
#> [1] "Specific taxonomic level for which the equation was developed (species, genus, family or plant group)"
.
I understant that each equation_id
should then have a single value of allometry_specificity
. Yet, this isn't the case here:
library(tidyverse)
library(allodb)
shared_cols <- c(
"equation_id",
"dependent_variable_biomass_component",
"allometry_specificity",
"dbh_min_cm",
"dbh_max_cm"
)
bad_id <- "390a5a"
equations %>%
filter(equation_id == bad_id) %>%
select(shared_cols)
#> # A tibble: 1 x 5
#> equation_id dependent_variable_b~ allometry_specif~ dbh_min_cm dbh_max_cm
#> <chr> <chr> <chr> <chr> <chr>
#> 1 390a5a Total aboveground bi~ Species 0.3 1
sitespecies %>%
filter(equation_id == bad_id) %>%
select(shared_cols)
#> # A tibble: 1 x 5
#> equation_id dependent_variable_b~ allometry_specif~ dbh_min_cm dbh_max_cm
#> <chr> <chr> <chr> <chr> <chr>
#> 1 390a5a Total aboveground bi~ Genus 0.3 1
.
Where the two tables missmatch, NA
gets inserted:
library(tidyverse)
library(allodb)
shared_cols <- c(
"equation_id",
"dependent_variable_biomass_component",
"allometry_specificity",
"dbh_min_cm",
"dbh_max_cm"
)
bad_id <- "390a5a"
equations %>%
full_join(sitespecies) %>%
filter(equation_id == bad_id) %>%
# filter(is.na(dbh_units_original)) %>%
select(equation_id, allometry_specificity, matches("unit"))
#> Joining, by = c("equation_id", "dependent_variable_biomass_component", "allometry_specificity", "dbh_min_cm", "dbh_max_cm")
#> # A tibble: 2 x 4
#> equation_id allometry_specificity dbh_units_origin~ biomass_units_origin~
#> <chr> <chr> <chr> <chr>
#> 1 390a5a Species cm g
#> 2 390a5a Genus <NA> <NA>
.
With missing values in dbh_units*
or bioimass_units*
, the resulting biomass
is also NA
.
There was an error in the eq table. I just fixed it.
If that's the only shared column betwen equation
and sitespecies
table, then I will eliminate it from the equation table
Thanks!
It's practially impossible to not make mistakes if the data is duplicated.
If that's the only shared column betwen equation and sitespecies table ...
Here are all the duplicated columns, the only one that should be in the two tables is the key, i.e. equation_id
.
intersect(
names(allodb::equations),
names(allodb::sitespecies)
)
#> [1] "equation_id"
#> [2] "dependent_variable_biomass_component"
#> [3] "allometry_specificity"
#> [4] "dbh_min_cm"
#> [5] "dbh_max_cm"
I eliminated the redundant columns form the sitespecies tables.
Awesome, thanks!
I just finished an exploration of the problems. It should now be irrelevant, but I have it ready so I'll share it so the problem you have just solved becomes clearer.
I'll update allodb and use this analysis to check it's now all good.
library(tidyverse)
#> Warning: package 'purrr' was built under R version 3.5.3
library(allodb)
pick_different <- function(matches) {
equations %>%
full_join(sitespecies, by = "equation_id") %>%
select(equation_id, matches(matches)) %>%
mutate(is_different = !map2_lgl(.[[2]], .[[3]], identical)) %>%
filter(is_different) %>%
unique()
}
intersect(names(equations), names(sitespecies))
#> [1] "equation_id"
#> [2] "dependent_variable_biomass_component"
#> [3] "allometry_specificity"
#> [4] "dbh_min_cm"
#> [5] "dbh_max_cm"
# Good
pick_different("allometry_specificity")
#> # A tibble: 0 x 4
#> # ... with 4 variables: equation_id <chr>, allometry_specificity.x <chr>,
#> # allometry_specificity.y <chr>, is_different <lgl>
# Bad
pick_different("dependent_variable_biomass_component")
#> # A tibble: 3 x 4
#> equation_id dependent_variable_biom~ dependent_variable_bio~ is_different
#> <chr> <chr> <chr> <lgl>
#> 1 333c34 Stem (wood and bark) Stem and branches (liv~ TRUE
#> 2 e9d686 Stem (wood and bark) Stem and branches (liv~ TRUE
#> 3 8eca60 Whole tree (above stump) Stem and branches (liv~ TRUE
# Bad
pick_different("dbh_min_cm")
#> # A tibble: 1 x 4
#> equation_id dbh_min_cm.x dbh_min_cm.y is_different
#> <chr> <chr> <chr> <lgl>
#> 1 122ba6 25 2.5 TRUE
# Bad
pick_different("dbh_max_cm")
#> # A tibble: 1 x 4
#> equation_id dbh_max_cm.x dbh_max_cm.y is_different
#> <chr> <chr> <chr> <lgl>
#> 1 122ba6 550 55 TRUE
Created on 2019-03-18 by the reprex package (v0.2.1)
# Awesome!
intersect(names(allodb::equations), names(allodb::sitespecies))
#> [1] "equation_id"
Created on 2019-03-18 by the reprex package (v0.2.1)
@gonzalezeb,
Some columns seem redundant -- they appear in more than one table. Can you confirm the redundance and try eliminate it? Keeping the duplicated columns in sync is prone to errors that result in missing values inserted unexpectedly when two tables are joint together.
For example, here I expected the only shared column to be
equation_id
:This example shows that merging by
equation_id
(only) results in duplicated columns:Created on 2019-03-15 by the reprex package (v0.2.1)