How does the biological significance change with different thresholds?

Here's what we get from running the PERMANOVA/NMDS multivariate statistics on them, looking for depth as an explanatory factor:

library(vegan)
MS3000_metadata <- file_data %>%
  filter(samp_type=="Smp") %>%
  filter(str_detect(filename, "180821")) %>%
  select(filename, depth) %>%
  column_to_rownames("filename")
norm_area_data <- read_csv("manuscript/MS3000_norm_areas.csv")

both_min_model <- rbind(FT2040_features, MS3000_features) %>%
  select(feat_class, med_cor, med_SNR) %>%
  filter(feat_class%in%c("Good", "Bad")) %>%
  mutate(feat_class=feat_class=="Good") %>%
  glm(formula=feat_class~., family = binomial)
pred_probs <- MS3000_features %>%
  mutate(pred_prob=predict(object=both_min_model,newdata = ., type = "response")) %>%
  select(feature, feat_class, pred_prob)

wide_data <- norm_area_data %>%
  group_by(feature) %>%
  filter(var(norm_area)>0) %>%
  mutate(norm_area=scale(norm_area)[,1]) %>%
  left_join(pred_probs) %>% 
  pivot_wider(names_from = "filename", values_from = norm_area) %>%
  column_to_rownames("feature")

mat_all <- wide_data
mat_good <- wide_data %>% filter(feat_class=="Good")
mat_90 <- wide_data %>% filter(pred_prob>0.9)
mat_50 <- wide_data %>% filter(pred_prob>0.5)
mat_10 <- wide_data %>% filter(pred_prob>0.1)
mat_01 <- wide_data %>% filter(pred_prob>0.01)

mat_list <- lst(mat_good, mat_90, mat_50, mat_10, mat_01, mat_all)
thresh_tests <- lapply(mat_list, function(mat_given){
  mat_data <- mat_given %>%
    select(-feat_class, -pred_prob) %>%
    data.matrix() %>%
    t()
  perm_output <- adonis2(mat_data~depth, data = MS3000_metadata, method = "manhattan")
  nmds_output <- metaMDS(mat_data, k = 2, distance = "manhattan", trace = 0,
                         autotransform = FALSE, noshare = FALSE, wascores = FALSE)
  nmds_gp <- nmds_output$points %>%
    as.data.frame() %>%
    cbind(MS3000_metadata) %>%
    rownames_to_column("filename") %>%
    select(filename, starts_with("MDS"), depth) %>%
    mutate(basename=str_extract(filename, "MS\\d+C\\d+.*(?=_[A-C])")) %>%
    ggplot(aes(x=MDS1, y=MDS2)) +
    geom_polygon(aes(fill=depth, color=depth, group=basename), alpha=0.5) +
    annotate(geom = "label", x=Inf, y=Inf, 
             label=paste("Stress =", round(nmds_output$stress, 2)),
             hjust=1, vjust=1, label.r=unit(0, "in")) +
    theme_minimal()
  list(perm_output, nmds_output, nmds_gp)
}) %>% set_names(names(mat_list))

imap(thresh_tests, function(thresh_i, thresh_name){
  as.data.frame(thresh_i[[1]][1,]) %>% mutate(thresh_name)
}) %>% bind_rows() %>%
  select(thresh_name, R2, `F`, `Pr(>F)`) %>%
  `rownames<-`(NULL) %>%
  column_to_rownames("thresh_name")

lapply(thresh_tests, `[[`, 3) %>%
  do.call(what=gridExtra::grid.arrange)

PERMANOVA output:

	R2	F	Pr(>F)
mat_good	0.4424214	38.086514	0.001
mat_90	0.4668699	42.034306	0.001
mat_50	0.3455154	25.340154	0.001
mat_10	0.2500523	16.004461	0.001
mat_01	0.1515176	8.571594	0.001
mat_all	0.1035519	5.544648	0.001

NMDS figures:

Most of this behaves as expected - including a bunch of noise peaks reduces our power significantly so the NMDS and PERMANOVA struggle to explain as much variance. However, one thing to note is that the "Good only" peaks actually perform worse than the 90% predicted! This is super neat and I don't really know how to interpret it. Remember that p-value is NOT one of our predictors so we haven't pre-selected based on the p-value.

wkumler / MS_metrics

How does the biological significance change with different thresholds? #17