Open japhir opened 1 year ago
I worked on this on 2023-07-26. Copied from Teams:
The R script I used to re-order our raw data into the template's columns. It may not be useful for most of you, but it could be a starting point to make this easier in the future.
# from world ocean atlas processing script
# IODP Site U1411 coordinates (Newfoundland Margin, North Atlantic);
# used below to fill the template's Latitude/Longitude columns.
u1411lat <- 41.618333
u1411lon <- -49
# get SDs
# Plain (non-bootstrapped) per-sample summary statistics, keyed by sample
# number and dwelling depth. na.rm = TRUE is now used throughout so a single
# NA replicate does not turn a whole sample's summary into NA (the *_final
# columns already did this; the other columns now match).
simple_summ <- readd(filtsamps) |> # this gets me my filtered samples without any standards and/or bad measurements
  summarize(
    d13C_simple_mean = mean(d13C_PDB_mean, na.rm = TRUE),
    d18O_simple_mean = mean(d18O_PDB_mean, na.rm = TRUE),
    sd_d13C = sd(d13C_PDB_mean, na.rm = TRUE),
    sd_d18O = sd(d18O_PDB_mean, na.rm = TRUE),
    N = n(),
    # NOTE(review): N counts all replicates, including any with NA D47_final;
    # if NAs occur, sd()'s denominator and sqrt(N) disagree slightly — confirm
    # whether filtsamps can contain NA D47_final.
    simple_seD47 = sd(D47_final, na.rm = TRUE) / sqrt(N),
    sd_d13C_final = sd(d13C_offset_corrected, na.rm = TRUE),
    sd_d18O_final = sd(d18O_offset_corrected, na.rm = TRUE),
    .by = c(smp, dwelling_depth)
  )
# we can also use the bootstrapped things to calculate SE
# according to StatQuest https://www.youtube.com/watch?v=Xz0x-8-cgaQ
# the sd(bootstrapped_means) = SE
# Bootstrap-based standard errors per sample: the SD of the bootstrapped
# per-sample means estimates the standard error of the mean.
boot_summ <- readd(sample_boots) |> # these are bootstrapped averages for each sample
  ungroup() |>
  summarize(seD47 = sd(D47),
            seT = sd(temp),
            .by = c(smp, dwelling_depth)) |>
  # Cast smp to integer so the implicit left_join() keys below match the
  # other tables — presumably smp is character here; TODO confirm.
  mutate(smp = as.integer(smp))
# these are the bootstrapped averages for each sample
# in my case, binned by dwelling_depth and sample number
# Assemble the sample-averages sheet in the template's column order and write
# it to xlsx. The left_join()s are implicit natural joins on the shared
# columns (presumably smp and dwelling_depth) — TODO: spell out `by =` to be
# safe against accidental extra shared columns.
readd(sample_averages) |>
  filter(.width == .95) |> # keep only the 95% interval rows
  mutate(Bin = seq_len(n()), # seq_len() is safe for 0 rows, unlike 1:n()
         IGSN = NA_character_,
         Latitude = u1411lat,
         Longitude = u1411lon,
         # template columns we have no data for are filled with NA constants
         Mineralogy = "C", FormT = NA_real_, erFormT = NA_real_,
         Site = "U1411", Location = "Newfoundland Margin, North Atlantic",
  ) |>
  left_join(simple_summ) |>
  left_join(boot_summ) |>
  # rename/reorder into the template's column layout
  select(SampName = grp,
         IGSN, Latitude, Longitude, Age = age, Bin,
         ## Description, Species, SampSubCategory,
         ## SampNum,
         Mineralogy, FormT, erFormT, Site, Location, N = n,
         d13C = d13C_simple_mean, sd13 = sd_d13C,
         d18O = d18O_simple_mean, sd18 = sd_d18O,
         Final_d13Ccarb = d13C,
         sd_d13C_final,
         ## Final_d18Ocarb_VPDB,
         Final_d18Ocarb_VSMOW = d18O,
         sd_d18O_final,
         D47rfac = D47,
         seD47,
         ## extSE = ,
         Temp = temp,
         seT,
         ## ext_seT
  ) |>
  ## glimpse()
  writexl::write_xlsx("out/2023-07-25_sample-averages.xlsx")
# allout holds all standard + sample replicates, is the output of the clumped-processing pipeline
# maybe with some extra metadata added for my use-case
# Count, per ETF group, the replicates whose broadid is one of the standards
# used for that group's ETF (etf_stds is a space-separated string).
# strsplit() + %in% replaces separate() into 4 fixed columns: the old
# approach warned when fewer than 4 standards were present and silently
# dropped (i.e. never matched) a 5th or later standard.
more_summary_stats <- readd(allout) |>
  group_by(etf_grp) |>
  tidylog::filter(mapply(
    \(id, stds) id %in% strsplit(stds, " ", fixed = TRUE)[[1]],
    broadid, etf_stds
  )) |>
  summarize(No_Stds = n())
# SOMEDAY: get actual number of replicates for d13C and d18O offset correction?
## summ_stats <- readd(allout) |>
## No_Stds_13C_18O,
# Build the full replicate-level sheet (sample + standard replicates) in the
# template's column order and write it to xlsx.
readd(allout) |>
  ungroup() |>
  mutate(
    SampNum = seq_len(n()), # seq_len() is safe for 0 rows, unlike 1:n()
    # replicates with broadid "other" are samples; everything else is a
    # carbonate standard
    SampCategory = ifelse(broadid == "other", "sample", "carbSTD"),
    SampSubCategory = ifelse(broadid == "other", "biogenic", broadid),
    Reference = NA_character_,
    Mineralogy = "C",
    Mineralogy2 = NA_character_,
    Bad = ifelse(outlier, 1, 0), # template wants 1/0, not TRUE/FALSE
    Date = lubridate::date(file_datetime),
    Time = format(as.POSIXct(file_datetime), format = "%H:%M:%S"),
    # template columns we have no data for are filled with NA constants
    FormT = NA_real_,
    erFormT = NA_real_,
    d18O_wg_VSMOW = NA_real_,
    # standard errors of the raw delta means, using the number of
    # measurement cycles that passed QC (n_ok)
    se_d45 = d45_sd / sqrt(n_ok),
    se_d46 = d46_sd / sqrt(n_ok),
    se_d47 = d47_sd / sqrt(n_ok),
    se_d48 = d48_sd / sqrt(n_ok),
    se_d49 = d49_sd / sqrt(n_ok),
    SampYN = ifelse(broadid == "other", "Y", "N"),
    RefYN = ifelse(broadid == "other", "N", "Y"),
    d18O_VSMOW = NA_real_,
    sd_18_VSMOW = NA_real_,
    se_18_VSMOW = NA_real_,
    AFF_d18O = NA_real_,
    Final_d18Ocarb_VSMOW = NA_real_,
    se_D49 = D49_raw_sd / sqrt(n_ok),
    SlopeEGL = NA_real_,
    # bin label "<sample>_<dwelling depth>"; standards have NA smp and get NA
    Bin = ifelse(!is.na(smp), paste(smp, dwelling_depth, sep = "_"), NA_character_),
  ) |>
  group_by(etf_grp) |>
  mutate(
    # first and last analysis number of each ETF window
    ARF_ID2 = first(Analysis),
    ARF_ID3 = last(Analysis),
  ) |>
  left_join(
    more_summary_stats # this gives us No_Stds; implicit natural join, presumably on etf_grp
  ) |>
  ungroup() |>
  # rename/reorder into the template's column layout
  select(
    SampName = identifier_1,
    Bin,
    SampCategory,
    SampSubCategory,
    SampNum,
    Reference,
    Mineralogy,
    Mineralogy2,
    Date,
    Time,
    AnalysisID = Analysis,
    ReplicateID = file_id,
    MassSpec = masspec,
    FormT,
    erFormT,
    rxnTemp = acid_temperature,
    SampYN,
    RefYN,
    Bad,
    D47TE = expected_D47,
    AFF = acid_fractionation_factor,
    ARF_ID1 = etf_grp,
    ARF_ID2,
    ARF_ID3,
    Stds_used = etf_stds,
    ## No_Stds = etf_width,
    No_Stds,
    Run = preparation,
    d45 = d45_mean,
    sd_d45 = d45_sd,
    se_d45,
    d46 = d46_mean,
    sd_d46 = d46_sd,
    se_d46,
    d47 = d47_mean,
    sd_d47 = d47_sd,
    se_d47,
    d48 = d48_mean,
    sd_d48 = d48_sd,
    se_d48,
    d49 = d49_mean,
    sd_d49 = d49_sd,
    se_d49,
    d13C_wg_VPDB = d13C_PDB_wg,
    d18O_wg_VPDB = d18O_PDBCO2_wg,
    d18O_wg_VSMOW,
    d13C = d13C_PDB_mean,
    sd_13 = d13C_PDB_sd,
    se_13 = d13C_PDB_sem,
    d18O_VPDB = d18O_PDB_mean,
    sd_18_VPDB = d18O_PDB_sd,
    se_18_VPDB = d18O_PDB_sem,
    d18O_VSMOW,
    sd_18_VSMOW,
    se_18_VSMOW,
    Stds_used_13C_18O = off_d13C_stds,
    No_Stds_13C_18O = off_d13C_width, # TODO: make offset_correction report N
    AFF_d18O,
    Final_d13Ccarb = d13C_offset_corrected,
    Final_d18Ocarb_VPDB = d18O_offset_corrected,
    Final_d18Ocarb_VSMOW,
    D47 = D47_raw_mean,
    sd_D47 = D47_raw_sd,
    se_D47 = D47_raw_sem,
    D48 = D48_raw_mean,
    sd_D48 = D48_raw_sd, # was D47_raw_sd: copy-paste bug reported D47's sd for D48
    se_D48 = D48_raw_sem, # was D47_raw_sem: same copy-paste bug
    D49 = D49_raw_mean,
    sd_D49 = D49_raw_sd,
    se_D49,
    SlopeEGL,
    SlopeETF = etf_slope,
    IntETF = etf_intercept,
    D47rfac = D47_final
  ) |>
  writexl::write_xlsx("out/2023-07-25_sample-standard-replicates.xlsx")
Oh, and just to make it clear: after running these scripts I copy-pasted everything over to the correct tab in the template (including the header, to make sure the column order matched) and then deleted the duplicated header row.