Arcadia-Science / sourmashconsumr

Working with the outputs of sourmash in R
https://arcadia-science.github.io/sourmashconsumr/
Other
21 stars 3 forks source link

code i used to figure out how to the make the sankey plot #33

Closed taylorreiter closed 1 year ago

taylorreiter commented 1 year ago

no promises that it runs, but recording here so it's somewhere

library(ggalluvial)
library(magrittr)
library(sourmashconsumr)

taxonomy_annotate_df <- read_taxonomy_annotate(Sys.glob("tests/testthat/SRR19*lineage*.csv"), separate_lineage = T) %>%
  dplyr::select(f_unique_to_query, f_unique_weighted, domain, phylum, class, family, order, genus, species) %>%
  dplyr::group_by(domain, phylum, class, family, order, genus, species) %>%
  dplyr::summarize(sum_f_unique_weighted = sum(f_unique_weighted))

ggalluvial::is_alluvia_form(taxonomy_annotate_df)

ggplot2::ggplot(taxonomy_annotate_df,
       ggplot2::aes(y = sum_f_unique_weighted, axis1 = domain, axis2 = phylum, axis3 = class, axis4 = order, axis5 = family)) +
  #ggalluvial::geom_alluvium(aes(fill = order), width = 1/12) +
  ggalluvial::geom_flow() +
  ggalluvial::geom_stratum(width = 1/10, alpha = .5, aes(fill = c(family))) +
  ggplot2::geom_text(stat = "stratum", aes(label = after_stat(stratum)),
                     size = 2, hjust = -0.25) +
  theme_classic() +
  labs(x = "tanomic rank", y = "abundance-weighted unique fraction\ntotaled across all samples") +
  scale_x_continuous(labels = c("domain", "phylum", "class", "order", "family"),
                     breaks = c(1, 2, 3, 4, 5))

ggplot(taxonomy_annotate_df,
       aes(x = survey, stratum = response, alluvium = subject,
           y = freq,
           fill = response, label = response)) +
  scale_x_discrete(expand = c(.1, .1)) +
  geom_flow() +
  geom_stratum(alpha = .5) +
  geom_text(stat = "stratum", size = 3) +
  theme(legend.position = "none") +
  ggtitle("vaccination survey responses at three points in time")

taxonomy_annotate_df <- read_taxonomy_annotate(Sys.glob("tests/testthat/*lineage*.csv"), separate_lineage = T) %>%
  dplyr::select(query_name, f_unique_to_query, f_unique_weighted, domain, phylum, class, family, order, genus, species) %>%
  dplyr::group_by(query_name, domain, phylum, class, family, order, genus, species) %>%
  dplyr::summarize(sum_f_unique_weighted = sum(f_unique_weighted))
to_lodes_form(taxonomy_annotate_df_long)

# create a fill variable --> it will be based on alphabetical order (which is how the alluvial plot is ordered)
# and it will be for each level of taxonomy
# probably needs to switch to long format
taxonomy_annotate_df_long <- taxonomy_annotate_df %>%
  tidyr::pivot_longer(cols = domain:species, names_to = "taxonomic_rank", values_to = "taxonomic_label")

taxonomy_annotate_df_long <- transform(taxonomy_annotate_df_long, taxonomic_label = factor(taxonomic_label))
to_lodes_form(taxonomy_annotate_df_long)
ggplot(taxonomy_annotate_df_long,
       aes(x = taxonomic_rank, stratum = taxonomic_label, alluvium = query_name,
           y = sum_f_unique_weighted,
           fill = taxonomic_label, label = taxonomic_label)) +
  scale_x_discrete(expand = c(.1, .1)) +
  geom_flow() +
  geom_stratum(alpha = .5) +
  geom_text(stat = "stratum", size = 3) +
  theme(legend.position = "none") +
  ggtitle("alluvial plot")

# test data ---------------------------------------------------------------

data(vaccinations)
vaccinations <- transform(vaccinations,
                          response = factor(response, rev(levels(response))))
ggplot(vaccinations,
       aes(x = survey, stratum = response, alluvium = subject,
           y = freq,
           fill = response, label = response)) +
  scale_x_discrete(expand = c(.1, .1)) +
  geom_flow() +
  geom_stratum(alpha = .5) +
  geom_text(stat = "stratum", size = 3) +
  theme(legend.position = "none") +
  ggtitle("vaccination survey responses at three points in time")

# try parallel sets -------------------------------------------------------

data <- reshape2::melt(Titanic)
data <- gather_set_data(data, 1:4)
data

data <- gather_set_data(taxonomy_annotate_df, 1:7)
palette <- colorRampPalette(RColorBrewer::brewer.pal(8, "Set2"))(length(unique(data$y)))
ggplot(data, aes(x, id = id, split = y, value = sum_f_unique_weighted)) +
  geom_parallel_sets(alpha = 0.3, axis.width = 0.1) +
  geom_parallel_sets_axes(axis.width = 0.2, aes(fill = y)) +
  geom_parallel_sets_labels(colour = 'black', angle = 360, size = 2, hjust = -0.25) +
  theme_classic() +
  theme(axis.line.y = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.ticks.x = element_blank(),
        legend.position = "None") +
  labs(x = "tanomic rank") +
  scale_x_continuous(labels = c("domain", "phylum", "class", "order", "family", "genus", "species", ""),
                     breaks = c(1, 2, 3, 4, 5, 6, 7, 8),
                     limits = c(.75, 8)) +
  scale_fill_manual(values = palette)