traitecoevo / austraits.build

Source for AusTraits
Other
16 stars 2 forks source link

Function to reshape data frame from long to wide #99

Closed dfalster closed 6 years ago

dfalster commented 6 years ago

(and give correct value types: character, integer, etc)

SamCAndrew commented 6 years ago

I wrote a function to convert austraits into a matrix with each species as a row and each column as a trait. I also divided the traits into types so max and min values for traits have separate columns (this can be easily changed by editing the "value_type"column to be the same for all values for a trait). for numeric traits I took the mean of the values for each species with data. For categorical traits I took the first trait value for each species. It looks like for most traits there are many species with multiple records reporting different trait values. Below is a table summarizing this problem (zero is for species with no data and one means there is only one value reported for these species etc.).

What can we do about this?

image

The function outputs a list with four data frames. The first data frame is the trait values for each species The second data frame is the number of data points for each species The third data frame is the number of values per species for categorical traits The fourth data frame is the SD values for the means from numeric traits

below is the code for the function.

#### function for Long to wide Format using austraits ####
## set up data
#rm(list = ls())
library(plyr)
library(tidyr)

austraits <- remake::make('austraits')

austraits_data <- austraits$data

list_traits <- unique(austraits_data$trait_name)

## the Function

long_to_wide <- function(list_traits, austraits_data) {

  ### Prep data
  austraits_data[austraits_data == "NA"] <- NA
  austraits_data1 <- austraits_data[!is.na(austraits_data$value),]
  austraits_data1 <- austraits_data1[austraits_data1$trait_name %in% list_traits, ]
  ### make trait names
  austraits_data1$trait_type <- paste(austraits_data1$trait_name, austraits_data1$value_type, sep = "_")
  list_traits2 <- as.data.frame(unique(paste(austraits_data1$trait_name, austraits_data1$value_type, sep = "/")))

  continuous_traits <- c("leaf_area", "leaf_length", "leaf_width", "leaf_area_per_sapwood_area", "leaf_area_ratio",
                         "seed_mass", "seed_volume", "seed_length", "seed_width", "wood_density",
                         "seed_breadth", "leaf_C_per_dry_mass", "leaf_cell_wall_fraction", "leaf_cell_wall_N",
                         "bark_mass_area", "bark_thickness", "branch_mass_fraction", "diaspore_mass", "flowering_time",
                         "fruit_diameter", "fruit_length", "fruiting_time", "genome_size", "glaucous",
                         "leaf_cell_wall_N_fraction", "leaf_chlorophyll_per_dry_mass", "leaf_CN_ratio", "leaf_dark_respiration_per_area", "leaf_dark_respiration_per_dry_mass",
                         "leaf_delta13C", "leaf_delta15N", "leaf_dry_mass", "leaf_dry_matter_content", "leaf_elastic_modulus",
                         "leaf_fracture_force", "leaf_K_per_area", "leaf_K_per_dry_mass", "lignotuber", "leaf_lifespan",
                         "leaf_lobed", "leaf_mass_fraction", "leaf_N_per_area", "leaf_N_per_dry_mass", "leaf_P_per_area",
                         "leaf_P_per_dry_mass", "leaf_photosynthetic_nitrogen_use_efficiency", "leaf_photosynthetic_water_use_efficiency", "leaf_saturated_water_content_per_mass", "leaf_specific_conductivity",
                         "leaf_thickness", "leaf_toughness", "leaf_water_content_per_area", "leaf_water_content_per_mass", "nitrogen_fixing",
                         "photosynthetic_rate_per_area", "photosynthetic_rate_per_dry_mass", "plant_height", "root_wood_density", "sapwood_specific_conductivity",
                         "seed_mass_reserve", "seed_P_concentration", "seed_size_less_than_10mm", "seed_size_over_10mm", "serotiny",
                         "specific_leaf_area", "specific_leaf_area_SE", "stem_cross_section_area", "stem_cross_section_area_no_bark", "stem_density",
                         "stem_tissue_density", "stomatal_conductance_per_area", "storage_organ_present", "succulent", "water_use_efficiency")

  Categorical_traits <- c("aquatic", "dispersal_syndrome", "fire_response", "flower_colour", "fruit_type",
                          "fruit_type_botany", "fruit_type_function", "growth_habit", "leaf_arrangement", "leaf_compoundness",
                          "leaf_margin", "leaf_phenology", "leaf_shape", "leaf_type", "life_history",
                          "lifespan", "lineage", "photosynthetic_pathway", "plant_growth_form", "regen_strategy",
                          "root_structure", "seed_shape", "woodiness", "dispersal_appendage")

  Species_list <- as.data.frame(unique(austraits_data1$species_name))
  colnames(Species_list) <- "sp_name"

  aus_continuous <- austraits_data1[austraits_data1$trait_name %in% continuous_traits,]
  aus_continuous$value1 <- as.numeric(aus_continuous$value)
  #### same thing for all birds just without "sex_M.1" as a extra subsetting factor
  Trait_by_species<-ddply(aus_continuous,c("trait_type", "species_name"),summarise,
                    N_pop=sum(!is.na(value1)), trait_mean=mean(value1),
                    SD_trait=sd(value1))

  ### Get missing data
  missing_Species_1 <- as.data.frame(Species_list[!Species_list$sp_name %in% Trait_by_species$species_name,])
  colnames(missing_Species_1) <- "species_name"
  matrix_1 <- as.data.frame(matrix( ,nrow(missing_Species_1), length(unique(Trait_by_species$trait_type))))
  colnames(matrix_1) <- c(unique(Trait_by_species$trait_type))
  missing_Species_1 <- cbind(missing_Species_1, matrix_1)

  ## mean
  Trait_mean <- Trait_by_species[,c("trait_type", "species_name", "trait_mean")]
  mean_wide <- spread(Trait_mean, trait_type, trait_mean)
  mean_wide <- rbind(mean_wide, missing_Species_1)
  mean_wide <- mean_wide[order(mean_wide$species_name),]
  ## n
  Trait_n <- Trait_by_species[,c("trait_type", "species_name", "N_pop")]
  n_wide <- spread(Trait_n, trait_type, N_pop)
  n_wide <- rbind(n_wide, missing_Species_1)
  n_wide <- n_wide[order(n_wide$species_name),]
  ## SD
  Trait_SD <- Trait_by_species[,c("trait_type", "species_name", "SD_trait")]
  SD_wide <- spread(Trait_SD, trait_type, SD_trait)
  SD_wide <- rbind(SD_wide, missing_Species_1)
  SD_wide <- n_wide[order(SD_wide$species_name),]
  SD_for_values <- SD_wide

  rm(Trait_mean, Trait_n, Trait_SD, missing_Species_1, matrix_1)

  aus_categorical <- austraits_data1[austraits_data1$trait_name %in% Categorical_traits,]

  Trait_by_species2<-ddply(aus_categorical,c("trait_name", "species_name"),summarise,
                          N_pop=sum(!is.na(value)))#, trait_mean=paste0(unique(value)))

  missing_Species_2 <- as.data.frame(Species_list[!Species_list$sp_name %in% Trait_by_species2$species_name,])
  colnames(missing_Species_2) <- "species_name"
  matrix_2 <- as.data.frame(matrix( ,nrow(missing_Species_2), length(unique(Trait_by_species2$trait_name))))
  colnames(matrix_2) <- c(unique(Trait_by_species2$trait_name))
  missing_Species_2 <- cbind(missing_Species_2, matrix_2)

  ## n
  n_wide_2 <- spread(Trait_by_species2, trait_name, N_pop)
  n_wide_2 <- rbind(n_wide_2, missing_Species_2)
  n_wide_2 <- n_wide_2[order(n_wide_2$species_name),]

  Species_list2 <- as.character(Species_list$sp_name)
  Cat_traits<- c(unique(aus_categorical$trait_name))

  all_traits <- list()
  for(i in 1:length(Cat_traits)){
    trait <- aus_categorical[aus_categorical$trait_name == Cat_traits[i],]
    df <- list()
    for(ii in 1:length(Species_list2)){
      species <- trait[trait$species_name == Species_list2[ii],]
      sp <- as.data.frame(Cat_traits[i])
      colnames(sp) <- "trait_name"
      sp$species_name <- Species_list2[ii]
      sp$trait_value <- if(nrow(species)>0)  { paste0(unique(species$value)[1]) } else {NA} 
      sp$no_trait_values <- length(unique(species$value))
      df[[ii]] <- sp
    }

    all_traits[[i]] <- do.call("rbind",df)
  }

  rm(i, ii, list_traits, Categorical_traits, continuous_traits)

  Trait_mean2 <- do.call("rbind", all_traits)
  Trait_mean3 <- Trait_mean2[,c("trait_name", "species_name", "trait_value")]
  mean_wide_2 <- spread(Trait_mean3, trait_name, trait_value)
  mean_wide_2 <- mean_wide_2[order(mean_wide_2$species_name),]

  Trait_mean4 <- Trait_mean2[,c("trait_name", "species_name", "no_trait_values")]
  mean_n_trait_2 <- spread(Trait_mean4, trait_name, no_trait_values)
  mean_n_trait_2 <- mean_n_trait_2[order(mean_n_trait_2$species_name),]

  rm(Species_list2, all_traits, missing_Species_2, matrix_2, trait, df, species, sp,
     Trait_mean3, Trait_mean4)

  trait_values <- cbind(mean_wide, mean_wide_2[,2:length(mean_wide_2)])
  number_of_observations <- cbind(n_wide, n_wide_2[,2:length(n_wide_2)])
  number_of_catergorical_values <- mean_n_trait_2

  rm(mean_wide, mean_wide_2, n_wide, n_wide_2, mean_n_trait_2, SD_wide,
     Trait_by_species, Trait_by_species2, Trait_mean2, Species_list)

  output <- list()
  output[[1]] <- trait_values ## trait values for each species
  output[[2]] <- number_of_observations ## number of data points for each species
  output[[3]] <- number_of_catergorical_values ## number of values per species for categorical traits
  output[[4]] <- SD_for_values ## SD for means calculated for numeric traits
  return(output)

}
SamCAndrew commented 6 years ago

I made some edits and corrections for the long to wide function. For categorical traits the function now selects the value that has been reported the most times when more than one value has been reported for a species. Before it was just giving the top value from the list of reported values. This changed the output quite a bit for plant growth form but it is still probably not perfect.

## set up data

austraits <- remake::make('austraits')

austraits_data <- austraits$data

list_traits <- unique(austraits_data$trait_name)

## the Function

long_to_wide <- function(list_traits, austraits_data) {
  require(plyr)
  require(tidyr)
  ### Prep data
  austraits_data[austraits_data == "NA"] <- NA
  austraits_data1 <- austraits_data[!is.na(austraits_data$value),]
  austraits_data1 <- austraits_data1[austraits_data1$trait_name %in% list_traits, ]
  ### make trait names
  austraits_data1$trait_type <- paste(austraits_data1$trait_name, austraits_data1$value_type, sep = "_")
  list_traits2 <- as.data.frame(unique(paste(austraits_data1$trait_name, austraits_data1$value_type, sep = "/")))

  continuous_traits <- c("leaf_area", "leaf_length", "leaf_width", "leaf_area_per_sapwood_area", "leaf_area_ratio",
                         "seed_mass", "seed_volume", "seed_length", "seed_width", "wood_density",
                         "seed_breadth", "leaf_C_per_dry_mass", "leaf_cell_wall_fraction", "leaf_cell_wall_N",
                         "bark_mass_area", "bark_thickness", "branch_mass_fraction", "diaspore_mass", "flowering_time",
                         "fruit_diameter", "fruit_length", "fruiting_time", "genome_size", "glaucous",
                         "leaf_cell_wall_N_fraction", "leaf_chlorophyll_per_dry_mass", "leaf_CN_ratio", "leaf_dark_respiration_per_area", "leaf_dark_respiration_per_dry_mass",
                         "leaf_delta13C", "leaf_delta15N", "leaf_dry_mass", "leaf_dry_matter_content", "leaf_elastic_modulus",
                         "leaf_fracture_force", "leaf_K_per_area", "leaf_K_per_dry_mass", "lignotuber", "leaf_lifespan",
                         "leaf_lobed", "leaf_mass_fraction", "leaf_N_per_area", "leaf_N_per_dry_mass", "leaf_P_per_area",
                         "leaf_P_per_dry_mass", "leaf_photosynthetic_nitrogen_use_efficiency", "leaf_photosynthetic_water_use_efficiency", "leaf_saturated_water_content_per_mass", "leaf_specific_conductivity",
                         "leaf_thickness", "leaf_toughness", "leaf_water_content_per_area", "leaf_water_content_per_mass", "nitrogen_fixing",
                         "photosynthetic_rate_per_area", "photosynthetic_rate_per_dry_mass", "plant_height", "root_wood_density", "sapwood_specific_conductivity",
                         "seed_mass_reserve", "seed_P_concentration", "seed_size_less_than_10mm", "seed_size_over_10mm", "serotiny",
                         "specific_leaf_area", "specific_leaf_area_SE", "stem_cross_section_area", "stem_cross_section_area_no_bark", "stem_density",
                         "stem_tissue_density", "stomatal_conductance_per_area", "storage_organ_present", "succulent", "water_use_efficiency")

  Categorical_traits <- c("aquatic", "dispersal_syndrome", "fire_response", "flower_colour", "fruit_type",
                          "fruit_type_botany", "fruit_type_function", "growth_habit", "leaf_arrangement", "leaf_compoundness",
                          "leaf_margin", "leaf_phenology", "leaf_shape", "leaf_type", "life_history",
                          "lifespan", "lineage", "photosynthetic_pathway", "plant_growth_form", "regen_strategy",
                          "root_structure", "seed_shape", "woodiness", "dispersal_appendage")

  Species_list <- as.data.frame(unique(austraits_data1$species_name))
  colnames(Species_list) <- "species_name"

  aus_continuous <- austraits_data1[austraits_data1$trait_name %in% continuous_traits,]
  aus_continuous$value1 <- as.numeric(aus_continuous$value)
  #### same thing for all birds just without "sex_M.1" as a extra subsetting factor
  if(nrow(aus_continuous) > 0 ) {  
    Trait_by_species<-ddply(aus_continuous,c("trait_type", "species_name"),summarise,
                      N_pop=sum(!is.na(value1)), trait_mean=mean(value1),
                      SD_trait=sd(value1))

    ### Get missing data
    missing_Species_1 <- as.data.frame(Species_list[!Species_list$species_name %in% Trait_by_species$species_name,])
    colnames(missing_Species_1) <- "species_name"
    matrix_1 <- as.data.frame(matrix( ,nrow(missing_Species_1), length(unique(Trait_by_species$trait_type))))
    colnames(matrix_1) <- c(unique(Trait_by_species$trait_type))
    missing_Species_1 <- cbind(missing_Species_1, matrix_1)

    ## mean
    Trait_mean <- Trait_by_species[,c("trait_type", "species_name", "trait_mean")]
    mean_wide <- spread(Trait_mean, trait_type, trait_mean)
    mean_wide <- rbind(mean_wide, missing_Species_1)
    mean_wide <- mean_wide[order(mean_wide$species_name),]
    ## n
    Trait_n <- Trait_by_species[,c("trait_type", "species_name", "N_pop")]
    n_wide <- spread(Trait_n, trait_type, N_pop)
    n_wide <- rbind(n_wide, missing_Species_1)
    n_wide <- n_wide[order(n_wide$species_name),]
    ## SD
    Trait_SD <- Trait_by_species[,c("trait_type", "species_name", "SD_trait")]
    SD_wide <- spread(Trait_SD, trait_type, SD_trait)
    SD_wide <- rbind(SD_wide, missing_Species_1)
    SD_wide <- SD_wide[order(SD_wide$species_name),]
    SD_for_values <- SD_wide

    rm(Trait_mean, Trait_n, Trait_SD, missing_Species_1, matrix_1)
  } else { 
    n_wide <- Species_list
    mean_wide <- Species_list
    SD_for_values <- Species_list
    }

  aus_categorical <- austraits_data1[austraits_data1$trait_name %in% Categorical_traits,]

  if(nrow(aus_categorical) > 0) {  
    Trait_by_species2<-ddply(aus_categorical,c("trait_name", "species_name"),summarise,
                            N_pop=sum(!is.na(value)))#, trait_mean=paste0(unique(value)))

    missing_Species_2 <- as.data.frame(Species_list[!Species_list$species_name %in% Trait_by_species2$species_name,])
    colnames(missing_Species_2) <- "species_name"
    matrix_2 <- as.data.frame(matrix( ,nrow(missing_Species_2), length(unique(Trait_by_species2$trait_name))))
    colnames(matrix_2) <- c(unique(Trait_by_species2$trait_name))
    missing_Species_2 <- cbind(missing_Species_2, matrix_2)

    ## n
    n_wide_2 <- spread(Trait_by_species2, trait_name, N_pop)
    n_wide_2 <- rbind(n_wide_2, missing_Species_2)
    n_wide_2 <- n_wide_2[order(n_wide_2$species_name),]

    Species_list2 <- as.character(Species_list$species_name)
    Cat_traits<- c(unique(aus_categorical$trait_name))

    all_traits <- list()
    for(i in 1:length(Cat_traits)){
      trait <- aus_categorical[aus_categorical$trait_name == Cat_traits[i],]
      df <- list()
      for(ii in 1:length(Species_list2)){
        species <- trait[trait$species_name == Species_list2[ii],]
        species2 <- as.data.frame(table(species[,"value"]))
        if(nrow(species2)>0)  { species2 <- species2[order(species2$Freq, decreasing = TRUE),]}

        sp <- as.data.frame(Cat_traits[i])
        colnames(sp) <- "trait_name"
        sp$species_name <- Species_list2[ii]
        sp$trait_value <- if(nrow(species2)>0)  { paste0(unique(species2$Var1)[1]) } else {NA} 
        sp$no_trait_values <- length(unique(species$value))
        df[[ii]] <- sp
      }

      all_traits[[i]] <- do.call("rbind",df)
    }

    rm(i, ii, list_traits, Categorical_traits, continuous_traits)

    Trait_mean2 <- do.call("rbind", all_traits)
    Trait_mean3 <- Trait_mean2[,c("trait_name", "species_name", "trait_value")]
    mean_wide_2 <- spread(Trait_mean3, trait_name, trait_value)
    mean_wide_2 <- mean_wide_2[order(mean_wide_2$species_name),]

    Trait_mean4 <- Trait_mean2[,c("trait_name", "species_name", "no_trait_values")]
    mean_n_trait_2 <- spread(Trait_mean4, trait_name, no_trait_values)
    mean_n_trait_2 <- mean_n_trait_2[order(mean_n_trait_2$species_name),]

    rm(Species_list2, all_traits, missing_Species_2, matrix_2, trait, df, species, sp,
       Trait_mean3, Trait_mean4)
  }
  trait_values <- cbind(mean_wide, mean_wide_2[,2:length(mean_wide_2), drop=FALSE])
  number_of_observations <- cbind(n_wide, n_wide_2[,2:length(n_wide_2), drop=FALSE])
  number_of_catergorical_values <- mean_n_trait_2

  rm(mean_wide, mean_wide_2, n_wide, n_wide_2, mean_n_trait_2, SD_wide,
     Trait_by_species, Trait_by_species2, Trait_mean2, Species_list)

  output <- list()
  output[[1]] <- trait_values ## trait values for each species
  output[[2]] <- number_of_observations ## number of data points for each species
  output[[3]] <- number_of_catergorical_values ## number of values per species for categorical traits
  output[[4]] <- SD_for_values ## SD for means calculated for numeric traits
  return(output)

}
dfalster commented 6 years ago

Hi @SamCAndrew ,

Thanks for this! So there are two key steps here:

  1. convert from long to wide, and
  2. take species averages.

You've gone down the path of species average first, but i wonder if the other way around is more useful? The reason is that there are cases where we may want to go from long to wide, without taking means.

In any case, it is easier once #94 is implemented (unique identifiers).

dfalster commented 6 years ago

Here'a a prototype:

options(width=200)
library(tidyverse)
source("R/austraits.R")
spread_traits_data <- function(data) {

  vars <- c("value", "unit", "value_type", "replicates", "precision", "methodology_ids")
  ret <- list()
  for(v in vars) {
    ret[[v]] <-
      data %>% rename(to_spread = !!v) %>%
        select(dataset_id, species_name, site_name, observation_id, trait_name, to_spread, original_name) %>%
        spread(trait_name, to_spread)
  }
  ret
}

austraits <- remake::make("austraits")
## <  MAKE > austraits
x <- extract_dataset(austraits, "Falster_2005_1")$data %>% spread_traits_data()

We then a list with elements giving the values:

x$value
## # A tibble: 45 x 11
##    dataset_id     species_name               site_name        observation_id    original_name             branch_mass_fraction leaf_area leaf_N_per_dry_mass seed_mass specific_leaf_area wood_density
##    <chr>          <chr>                      <chr>            <chr>             <chr>                     <chr>                <chr>     <chr>               <chr>     <chr>              <chr>       
##  1 Falster_2005_1 Acronychia acidula         Atherton         Falster_2005_1_2  Acronychia acidula        0.48                 14302     24.3                <NA>      11.7647058823529   0.525       
##  2 Falster_2005_1 Aleurites rockinghamensis  Cape Tribulation Falster_2005_1_12 Aleurites rockinghamensis 0.02                 73984     18.4                7077      8.84955752212389   0.28        
##  3 Falster_2005_1 Alphitonia petriei         Atherton         Falster_2005_1_3  Alphitonia petriei        0.42                 6820      16.3                27.2      6.71140939597315   0.413       
##  4 Falster_2005_1 Alstonia scholaris         Cape Tribulation Falster_2005_1_13 Alstonia scholaris        0.46                 6182      22.3                1.53      9.34579439252336   0.361       
##  5 Falster_2005_1 Argyrodendron peralatum    Cape Tribulation Falster_2005_1_14 Argyrodendron peralatum   0.71                 3201      11.9                433       4.0983606557377    0.726       
##  6 Falster_2005_1 Atractocarpus hirtus       Cape Tribulation Falster_2005_1_15 Atractocarpus hirtus      0.28                 11374     15.7                <NA>      13.8888888888889   0.804       
##  7 Falster_2005_1 Cardwellia sublimis        Cape Tribulation Falster_2005_1_17 Cardwellia sublimis       0.56                 4352      12.8                582.3     7.8740157480315    0.603       
##  8 Falster_2005_1 Castanospermum australe    Cape Tribulation Falster_2005_1_18 Castanospermum australe   0.37                 2765      23.7                14851     8.54700854700855   0.587       
##  9 Falster_2005_1 Cleistanthus oblongifolius Cape Tribulation Falster_2005_1_19 Cleistanthus myrianthus   0.56                 4828      22.8                <NA>      12.8205128205128   0.588       
## 10 Falster_2005_1 Cryptocarya laevigata      Cape Tribulation Falster_2005_1_20 Cryptocarya laevigata     0.65                 2682      15.6                1281      11.4942528735632   0.64        
## # ... with 35 more rows

the units

x$unit
## # A tibble: 45 x 11
##    dataset_id     species_name               site_name        observation_id    original_name             branch_mass_fraction leaf_area leaf_N_per_dry_mass seed_mass specific_leaf_area wood_density
##    <chr>          <chr>                      <chr>            <chr>             <chr>                     <chr>                <chr>     <chr>               <chr>     <chr>              <chr>       
##  1 Falster_2005_1 Acronychia acidula         Atherton         Falster_2005_1_2  Acronychia acidula        dimensionless        mm2       mg/g                <NA>      mm2/mg             mg/mm3      
##  2 Falster_2005_1 Aleurites rockinghamensis  Cape Tribulation Falster_2005_1_12 Aleurites rockinghamensis dimensionless        mm2       mg/g                mg        mm2/mg             mg/mm3      
##  3 Falster_2005_1 Alphitonia petriei         Atherton         Falster_2005_1_3  Alphitonia petriei        dimensionless        mm2       mg/g                mg        mm2/mg             mg/mm3      
##  4 Falster_2005_1 Alstonia scholaris         Cape Tribulation Falster_2005_1_13 Alstonia scholaris        dimensionless        mm2       mg/g                mg        mm2/mg             mg/mm3      
##  5 Falster_2005_1 Argyrodendron peralatum    Cape Tribulation Falster_2005_1_14 Argyrodendron peralatum   dimensionless        mm2       mg/g                mg        mm2/mg             mg/mm3      
##  6 Falster_2005_1 Atractocarpus hirtus       Cape Tribulation Falster_2005_1_15 Atractocarpus hirtus      dimensionless        mm2       mg/g                <NA>      mm2/mg             mg/mm3      
##  7 Falster_2005_1 Cardwellia sublimis        Cape Tribulation Falster_2005_1_17 Cardwellia sublimis       dimensionless        mm2       mg/g                mg        mm2/mg             mg/mm3      
##  8 Falster_2005_1 Castanospermum australe    Cape Tribulation Falster_2005_1_18 Castanospermum australe   dimensionless        mm2       mg/g                mg        mm2/mg             mg/mm3      
##  9 Falster_2005_1 Cleistanthus oblongifolius Cape Tribulation Falster_2005_1_19 Cleistanthus myrianthus   dimensionless        mm2       mg/g                <NA>      mm2/mg             mg/mm3      
## 10 Falster_2005_1 Cryptocarya laevigata      Cape Tribulation Falster_2005_1_20 Cryptocarya laevigata     dimensionless        mm2       mg/g                mg        mm2/mg             mg/mm3      
## # ... with 35 more rows

and so forth

names(x)
## [1] "value"           "unit"            "value_type"      "replicates"      "precision"       "methodology_ids"
dfalster commented 6 years ago

The challenge is that the above does not work when there are two rows with the same observation_id and trait_name. this can occur when we have min and max values, which come from original columns like leaf_width_min & leaf_width_max.

Connects with #124

dfalster commented 6 years ago

As a result of the above, I'm a bit stumped about how to handle range values just now. Wondering whether we instead encode a range, e.g. 20-30, instead of splitting it into min and max, as we have mostly done.

dfalster commented 6 years ago

Here's example of duplication which causes the above to break:

df <- austraits$data %>% 
        select(dataset_id, species_name, site_name, observation_id, trait_name, original_name)
i <- duplicated(df) | duplicated(df, fromLast = TRUE)
austraits$data[i,]
# A tibble: 63,542 x 12
   dataset_id species_name        site_name observation_id trait_name   value unit  value_type replicates precision methodology_ids original_name
   <chr>      <chr>               <chr>     <chr>          <chr>        <chr> <chr> <chr>      <chr>      <chr>     <chr>           <chr>
 1 Bean_1997  Baeckea leptocaulis NA        Bean_1997_3    plant_height 0.3   m     min        unknown    unknown   unknown         Baeckea leptocaulis
 2 Bean_1997  Baeckea leptocaulis NA        Bean_1997_3    plant_height 1     m     max        unknown    unknown   unknown         Baeckea leptocaulis
 3 Bean_1997  Baeckea frutescens  NA        Bean_1997_1    leaf_length  5.5   mm    min        unknown    unknown   unknown         Baeckea frutescens
 4 Bean_1997  Baeckea linifolia   NA        Bean_1997_2    leaf_length  6     mm    min        unknown    unknown   unknown         Baeckea linifolia
 5 Bean_1997  Baeckea leptocaulis NA        Bean_1997_3    leaf_length  4.5   mm    min        unknown    unknown   unknown         Baeckea leptocaulis
 6 Bean_1997  Baeckea imbricata   NA        Bean_1997_4    leaf_length  2.4   mm    min        unknown    unknown   unknown         Baeckea imbricata
 7 Bean_1997  Baeckea diosmifolia NA        Bean_1997_5    leaf_length  2.1   mm    min        unknown    unknown   unknown         Baeckea diosmifolia
 8 Bean_1997  Baeckea trapeza     NA        Bean_1997_6    leaf_length  3     mm    min        unknown    unknown   unknown         Baeckea trapeza
 9 Bean_1997  Baeckea omissa      NA        Bean_1997_7    leaf_length  2     mm    min        unknown    unknown   unknown         Baeckea omissa
10 Bean_1997  Baeckea gunniana    NA        Bean_1997_8    leaf_length  1.9   mm    min        unknown    unknown   unknown         Baeckea gunniana

And here's where the errors are araising from:

> austraits$data[i,] %>% pull(trait_name) %>% table()

.
      leaf_length        leaf_width plant_growth_form      plant_height      seed_breadth       seed_length        seed_width
            18110             16512                10             13854               530              8300              6226

or by dataset_id

> austraits$data[i,] %>% pull(dataset_id) %>% table() %>% sort()
.
Keighery_2004   Wilson_2008   Craven_2010    Crisp_1984  Trudgen_2005    Rye_2013_2    Rye_2009_1     Bean_1997      Rye_2002    Rye_2009_2    Rye_2013_1      Rye_2006  Trudgen_2014  Toelken_1996
            8            17            28            30            38            40            48            62            64            78            80            84            94           195
     Rye_2015 Thompson_2001   Craven_1987 Chandler_2002     TMAG_2009    Brock_1993 Chinnock_2007      NTH_2014     CPRR_2002  Wheeler_2002      SAH_2014      WAH_1998  Kooyman_2011
          220           299           408           672          1088          1688          2683          3866          5330          5544          9456         10620         20802
dfalster commented 6 years ago

We could then replace the min/max/quantile values with a median?

here's the original (rows with duplicate keys only)

> austraits$data[i,] %>% arrange(dataset_id, species_name)
# A tibble: 63,532 x 12
   dataset_id species_name        site_name observation_id trait_name  value unit  value_type replicates precision methodology_ids original_name
   <chr>      <chr>               <chr>     <chr>          <chr>       <chr> <chr> <chr>      <chr>      <chr>     <chr>           <chr>
 1 Bean_1997  Baeckea brevifolia  NA        Bean_1997_12   leaf_length 1     mm    min        unknown    unknown   unknown         Baeckea brevifolia
 2 Bean_1997  Baeckea brevifolia  NA        Bean_1997_12   leaf_length 2.3   mm    max        unknown    unknown   unknown         Baeckea brevifolia
 3 Bean_1997  Baeckea brevifolia  NA        Bean_1997_12   leaf_width  0.7   mm    min        unknown    unknown   unknown         Baeckea brevifolia
 4 Bean_1997  Baeckea brevifolia  NA        Bean_1997_12   leaf_width  1     mm    max        unknown    unknown   unknown         Baeckea brevifolia
 5 Bean_1997  Baeckea diosmifolia NA        Bean_1997_5    leaf_length 2.1   mm    min        unknown    unknown   unknown         Baeckea diosmifolia
 6 Bean_1997  Baeckea diosmifolia NA        Bean_1997_5    leaf_length 4.5   mm    max        unknown    unknown   unknown         Baeckea diosmifolia
 7 Bean_1997  Baeckea diosmifolia NA        Bean_1997_5    leaf_width  0.5   mm    min        unknown    unknown   unknown         Baeckea diosmifolia
 8 Bean_1997  Baeckea diosmifolia NA        Bean_1997_5    leaf_width  1.5   mm    max        unknown    unknown   unknown         Baeckea diosmifolia
 9 Bean_1997  Baeckea diosmifolia NA        Bean_1997_5    seed_length 0.5   mm    min        unknown    unknown   unknown         Baeckea diosmifolia
10 Bean_1997  Baeckea diosmifolia NA        Bean_1997_5    seed_length 0.5   mm    max        unknown    unknown   unknown         Baeckea diosmifolia
# ... with 63,522 more rows

And now the median

> austraits$data[i,] %>%
+   group_by(dataset_id, species_name, site_name, observation_id, trait_name) %>%
+   summarise(value = as.character(mean(as.numeric(value))), value_type = "median") %>%
+   arrange(dataset_id, species_name)
# A tibble: 30,916 x 7
# Groups:   dataset_id, species_name, site_name, observation_id [14,195]
   dataset_id species_name        site_name observation_id trait_name  value value_type
   <chr>      <chr>               <chr>     <chr>          <chr>       <chr> <chr>
 1 Bean_1997  Baeckea brevifolia  NA        Bean_1997_12   leaf_length 1.65  median
 2 Bean_1997  Baeckea brevifolia  NA        Bean_1997_12   leaf_width  0.85  median
 3 Bean_1997  Baeckea diosmifolia NA        Bean_1997_5    leaf_length 3.3   median
 4 Bean_1997  Baeckea diosmifolia NA        Bean_1997_5    leaf_width  1     median
 5 Bean_1997  Baeckea diosmifolia NA        Bean_1997_5    seed_length 0.5   median
 6 Bean_1997  Baeckea frutescens  NA        Bean_1997_1    leaf_length 8.5   median
 7 Bean_1997  Baeckea frutescens  NA        Bean_1997_1    leaf_width  0.6   median
 8 Bean_1997  Baeckea frutescens  NA        Bean_1997_1    seed_length 0.5   median
 9 Bean_1997  Baeckea gunniana    NA        Bean_1997_8    leaf_length 2.85  median
10 Bean_1997  Baeckea gunniana    NA        Bean_1997_8    leaf_width  1.15  median
dfalster commented 6 years ago

Ok, after thinking about this and looking at @SamCAndrew and my approaches, it is obvious there is no single best approach. Depending on use case, we might want to