Here's what we get from running the PERMANOVA/NMDS multivariate statistics on them, looking for depth as an explanatory factor:
```r
library(tidyverse)
library(vegan)

# Sample metadata: depth for each of the 180821 MS3000 sample files
MS3000_metadata <- file_data %>%
  filter(samp_type == "Smp") %>%
  filter(str_detect(filename, "180821")) %>%
  select(filename, depth) %>%
  column_to_rownames("filename")
norm_area_data <- read_csv("manuscript/MS3000_norm_areas.csv")

# Fit a logistic regression on the manually-labeled features from both
# datasets, predicting "Good" vs "Bad" from median correlation and median SNR
both_min_model <- rbind(FT2040_features, MS3000_features) %>%
  select(feat_class, med_cor, med_SNR) %>%
  filter(feat_class %in% c("Good", "Bad")) %>%
  mutate(feat_class = feat_class == "Good") %>%
  glm(formula = feat_class ~ ., family = binomial)

# Predicted probability of being a "Good" feature for every MS3000 feature
pred_probs <- MS3000_features %>%
  mutate(pred_prob = predict(object = both_min_model, newdata = ., type = "response")) %>%
  select(feature, feat_class, pred_prob)

# Scale each feature's normalized areas, attach the predictions, and pivot
# into a feature-by-filename matrix (zero-variance features are dropped)
wide_data <- norm_area_data %>%
  group_by(feature) %>%
  filter(var(norm_area) > 0) %>%
  mutate(norm_area = scale(norm_area)[, 1]) %>%
  left_join(pred_probs, by = "feature") %>%
  pivot_wider(names_from = "filename", values_from = norm_area) %>%
  column_to_rownames("feature")

# Subset the features at several predicted-probability thresholds
mat_all  <- wide_data
mat_good <- wide_data %>% filter(feat_class == "Good")
mat_90   <- wide_data %>% filter(pred_prob > 0.9)
mat_50   <- wide_data %>% filter(pred_prob > 0.5)
mat_10   <- wide_data %>% filter(pred_prob > 0.1)
mat_01   <- wide_data %>% filter(pred_prob > 0.01)
mat_list <- lst(mat_good, mat_90, mat_50, mat_10, mat_01, mat_all)

# Run the PERMANOVA and NMDS on each subset
thresh_tests <- lapply(mat_list, function(mat_given){
  mat_data <- mat_given %>%
    select(-feat_class, -pred_prob) %>%
    data.matrix() %>%
    t()
  perm_output <- adonis2(mat_data ~ depth, data = MS3000_metadata, method = "manhattan")
  nmds_output <- metaMDS(mat_data, k = 2, distance = "manhattan", trace = 0,
                         autotransform = FALSE, noshare = FALSE, wascores = FALSE)
  nmds_gp <- nmds_output$points %>%
    as.data.frame() %>%
    cbind(MS3000_metadata) %>%
    rownames_to_column("filename") %>%
    select(filename, starts_with("MDS"), depth) %>%
    mutate(basename = str_extract(filename, "MS\\d+C\\d+.*(?=_[A-C])")) %>%
    ggplot(aes(x = MDS1, y = MDS2)) +
    geom_polygon(aes(fill = depth, color = depth, group = basename), alpha = 0.5) +
    annotate(geom = "label", x = Inf, y = Inf,
             label = paste("Stress =", round(nmds_output$stress, 2)),
             hjust = 1, vjust = 1, label.r = unit(0, "in")) +
    theme_minimal()
  list(perm_output, nmds_output, nmds_gp)
}) %>% set_names(names(mat_list))

# Collect the depth term (first row) of each PERMANOVA table
imap(thresh_tests, function(thresh_i, thresh_name){
  as.data.frame(thresh_i[[1]][1, ]) %>% mutate(thresh_name)
}) %>% bind_rows() %>%
  select(thresh_name, R2, `F`, `Pr(>F)`) %>%
  `rownames<-`(NULL) %>%
  column_to_rownames("thresh_name")

# Arrange the six NMDS plots in a grid
lapply(thresh_tests, `[[`, 3) %>%
  do.call(what = gridExtra::grid.arrange)
```
PERMANOVA output:
| subset | R2 | F | Pr(>F) |
| --- | --- | --- | --- |
| mat_good | 0.4424214 | 38.086514 | 0.001 |
| mat_90 | 0.4668699 | 42.034306 | 0.001 |
| mat_50 | 0.3455154 | 25.340154 | 0.001 |
| mat_10 | 0.2500523 | 16.004461 | 0.001 |
| mat_01 | 0.1515176 | 8.571594 | 0.001 |
| mat_all | 0.1035519 | 5.544648 | 0.001 |
NMDS figures:
Most of this behaves as expected: including a bunch of noise peaks dilutes the signal, so the NMDS and PERMANOVA explain progressively less variance as the threshold loosens. However, one thing to note is that the manually-labeled "Good" peaks actually perform worse than the >0.9 predicted subset (R2 = 0.44 vs 0.47)! This is super neat and I don't really know how to interpret it. Remember that the p-value is NOT one of our predictors, so we haven't pre-selected features based on significance.
One way to show that this matters is to run all of the multivariate and univariate tests we'd normally do, but repeat each full analysis on different data subsets: maybe the first subset is all the peaks, the next is only the manually-labeled "Good" ones, and then the predicted-probability thresholds of >0.9, >0.5, >0.1, and >0.01 (a rough sketch of the univariate side is below).
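A minimal sketch of what that univariate loop could look like, reusing `mat_list` and `MS3000_metadata` from the code above. The per-feature Kruskal-Wallis test (treating depth as a grouping factor) and the FDR cutoff are stand-in assumptions for whatever univariate test we actually settle on:

```r
# Hypothetical sketch: what fraction of individual features test significant
# for depth at each threshold? Reuses mat_list and MS3000_metadata from above;
# the Kruskal-Wallis test is a stand-in for the real univariate test.
sapply(mat_list, function(mat_given){
  feat_mat <- mat_given %>%
    select(-feat_class, -pred_prob) %>%
    data.matrix()
  # Align depth groups to the filename columns of the feature matrix
  depth_groups <- factor(MS3000_metadata[colnames(feat_mat), "depth"])
  pvals <- apply(feat_mat, 1, function(feat_row){
    kruskal.test(feat_row, g = depth_groups)$p.value
  })
  # Fraction of features still significant after FDR correction
  mean(p.adjust(pvals, method = "fdr") < 0.05)
})
```

If this tracks the PERMANOVA table, the fraction of significant features should peak around the >0.9 subset rather than the hand-labeled one, which would make the same point feature-by-feature.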