make it easy to convert between our output format and what is needed to submit to earthchem database

I worked on this on 2023-07-26. Copied from Teams:

R-script I used to re-order our raw data into their columns! May not be useful for most of you but could be a starting point to make this easier in the future?

sample means

# from world ocean atlas processing script
u1411lat <- 41.618333
u1411lon <- -49
# get SDs
simple_summ <- readd(filtsamps) |> # this gets me my filtered samples without any standards and/or bad measurements
  summarize(
    d13C_simple_mean = mean(d13C_PDB_mean),
    d18O_simple_mean = mean(d18O_PDB_mean),
    sd_d13C = sd(d13C_PDB_mean),
    sd_d18O = sd(d18O_PDB_mean),
    N = n(),
    simple_seD47 = sd(D47_final) / sqrt(N),
    sd_d13C_final = sd(d13C_offset_corrected, na.rm = TRUE),
    sd_d18O_final = sd(d18O_offset_corrected, na.rm = TRUE),
    .by = c(smp, dwelling_depth)
  )
# we can also use the bootstrapped things to calculate SE
# according to StatQuest https://www.youtube.com/watch?v=Xz0x-8-cgaQ
# the sd(bootstrapped_means) = SE
boot_summ <- readd(sample_boots) |> # these are bootstrapped averages for each sample
  ungroup() |>
  summarize(seD47 = sd(D47),
            seT = sd(temp),
            .by = c(smp, dwelling_depth)) |>
  mutate(smp = as.integer(smp))
# thesea are the bootstrapped averages for each sample
# in my case, binned by dwelling_depth and sample number
readd(sample_averages) |>
  filter(.width == .95) |>
  mutate(Bin = 1:n(),
         IGSN = NA_character_,
         Latitude = u1411lat,
         Longitude = u1411lon,
         Mineralogy = "C", FormT = NA_real_, erFormT = NA_real_,
         Site = "U1411", Location = "Newfoundland Margin, North Atlantic",
         ) |>
  left_join(simple_summ) |>
  left_join(boot_summ) |>
  select(SampName = grp,
         IGSN, Latitude, Longitude, Age = age, Bin,
         ## Description, Species, SampSubCategory,
         ## SampNum,
         Mineralogy, FormT, erFormT, Site, Location, N = n,
         d13C = d13C_simple_mean, sd13 = sd_d13C,
         d18O = d18O_simple_mean, sd18 = sd_d18O,
         Final_d13Ccarb = d13C,
         sd_d13C_final,
         ## Final_d18Ocarb_VPDB,
         Final_d18Ocarb_VSMOW = d18O,
         sd_d18O_final,
         D47rfac = D47,
         seD47,
         ## extSE = ,
         Temp = temp,
         seT,
         ## ext_seT
         ) |>
  ## glimpse()
  writexl::write_xlsx("out/2023-07-25_sample-averages.xlsx")

replicates

# allout holds all standard + sample replicates, is the output of the clumped-processing pipeline
# maybe with some extra metadata added for my use-case
more_summary_stats <- readd(allout) |>
  group_by(etf_grp) |>
  separate(etf_stds, sep = " ", into=c("std1","std2","std3","std4")) |>
  tidylog::filter(broadid ==  std1 | broadid == std2 | broadid == std3 | broadid == std4) |>
  summarize(No_Stds = n())
# SOMEDAY: get actual number of replicates for d13C and d18O offset correction?
## summ_stats <- readd(allout) |>
##     No_Stds_13C_18O,
readd(allout) |>
  ungroup() |>
  mutate(
    SampNum = 1:n(),
    SampCategory = ifelse(broadid == "other", "sample", "carbSTD"),
    SampSubCategory = ifelse(broadid == "other", "biogenic", broadid),
    Reference = NA_character_,
    Mineralogy = "C",
    Mineralogy2 = NA_character_,
    Bad = ifelse(outlier, 1, 0),
    Date = lubridate::date(file_datetime),
    Time = format(as.POSIXct(file_datetime), format = "%H:%M:%S"),
    FormT = NA_real_,
    erFormT = NA_real_,
    d18O_wg_VSMOW = NA_real_,
    se_d45 = d45_sd / sqrt(n_ok),
    se_d46 = d46_sd / sqrt(n_ok),
    se_d47 = d47_sd / sqrt(n_ok),
    se_d48 = d48_sd / sqrt(n_ok),
    se_d49 = d49_sd / sqrt(n_ok),
    SampYN = ifelse(broadid == "other", "Y", "N"),
    RefYN = ifelse(broadid == "other", "N", "Y"),
    d18O_VSMOW = NA_real_,
    sd_18_VSMOW = NA_real_,
    se_18_VSMOW = NA_real_,
    AFF_d18O = NA_real_,
    Final_d18Ocarb_VSMOW = NA_real_,
    se_D49 = D49_raw_sd / sqrt(n_ok),
    SlopeEGL = NA_real_,
    Bin = ifelse(!is.na(smp), paste(smp, dwelling_depth, sep = "_"), NA_character_),
    ) |>
  group_by(etf_grp) |>
  mutate(
    ARF_ID2 = first(Analysis),
    ARF_ID3 = last(Analysis),
    ) |>
  left_join(
    more_summary_stats # this gives us No_Stds
  ) |>
  ungroup() |>
  select(
    SampName = identifier_1,
    Bin,
    SampCategory,
    SampSubCategory,
    SampNum,
    Reference,
    Mineralogy,
    Mineralogy2,
    Date,
    Time,
    AnalysisID = Analysis,
    ReplicateID = file_id,
    MassSpec = masspec,
    FormT,
    erFormT,
    rxnTemp = acid_temperature,
    SampYN,
    RefYN,
    Bad,
    D47TE = expected_D47,
    AFF = acid_fractionation_factor,
    ARF_ID1 = etf_grp,
    ARF_ID2,
    ARF_ID3,
    Stds_used = etf_stds,
    ## No_Stds = etf_width,
    No_Stds,
    Run = preparation,
    d45 = d45_mean,
    sd_d45 = d45_sd,
    se_d45,
    d46 = d46_mean,
    sd_d46 = d46_sd,
    se_d46,
    d47 = d47_mean,
    sd_d47 = d47_sd,
    se_d47,
    d48 = d48_mean,
    sd_d48 = d48_sd,
    se_d48,
    d49 = d49_mean,
    sd_d49 = d49_sd,
    se_d49,
    d13C_wg_VPDB = d13C_PDB_wg,
    d18O_wg_VPDB = d18O_PDBCO2_wg,
    d18O_wg_VSMOW,
    d13C = d13C_PDB_mean,
    sd_13 = d13C_PDB_sd,
    se_13 = d13C_PDB_sem,
    d18O_VPDB = d18O_PDB_mean,
    sd_18_VPDB = d18O_PDB_sd,
    se_18_VPDB = d18O_PDB_sem,
    d18O_VSMOW,
    sd_18_VSMOW,
    se_18_VSMOW,
    Stds_used_13C_18O = off_d13C_stds,
    No_Stds_13C_18O = off_d13C_width, # TODO: make offset_correction report N
    AFF_d18O,
    Final_d13Ccarb = d13C_offset_corrected,
    Final_d18Ocarb_VPDB = d18O_offset_corrected,
    Final_d18Ocarb_VSMOW,
    D47 = D47_raw_mean,
    sd_D47 = D47_raw_sd,
    se_D47 = D47_raw_sem,
    D48 = D48_raw_mean,
    sd_D48 = D47_raw_sd,
    se_D48 = D47_raw_sem,
    D49 = D49_raw_mean,
    sd_D49 = D49_raw_sd,
    se_D49,
    SlopeEGL,
    SlopeETF = etf_slope,
    IntETF = etf_intercept,
    D47rfac = D47_final
  ) |>
  writexl::write_xlsx("out/2023-07-25_sample-standard-replicates.xlsx")

Oh and just to make it clear: after running these scripts I copy pasted everything over to the correct tab in the template (including the header to make sure it's the same for every column) and then deleted that double row.

UtrechtUniversity / clumped-processing

make it easy to convert between our output format and what is needed to submit to earthchem database #16

sample means

replicates