Open raissameyer opened 3 months ago
Let's follow that data trail!
####################################
## prep
####################################
# Install and load necessary packages
if (!require(readxl)) install.packages("readxl")
library(readxl)
library(dplyr)
# Set working directory
setwd("/Users/rameyer/Documents/_P3/P3dataAnalysis/P3_metadata/public-marine-omics-metadata/")
####################################
## go
####################################
# Define the URL of the Excel file associated with the BGS data paper
# https://doi.org/10.1038/s41597-021-00889-9 in the Supplementary material
url <- "https://static-content.springer.com/esm/art%3A10.1038%2Fs41597-021-00889-9/MediaObjects/41597_2021_889_MOESM1_ESM.xlsx"
# Download the Excel file
download.file(url, destfile = "data/BGSpubMetadata.xlsx", mode = "wb")
# Read the third sheet from the Excel file
BGS_pub_metadata <- read_excel("data/BGSpubMetadata.xlsx", sheet = 1)
head(BGS_pub_metadata)
dim(BGS_pub_metadata)
# [1] 971 19
colnames(BGS_pub_metadata)
colnames(BGS_meta_dataframe_100)
# update column names
# Custom renaming function
rename_column <- function(col_name) {
col_name %>%
tolower() %>% # Convert to lower case
gsub(" ", "_", .) %>% # Replace spaces with underscores
gsub("-", "_", .) %>% # Replace dashes with underscores
gsub("\\.", "_", .) %>% # Replace periods with underscores
gsub("\\[", "", .) %>% # Remove opening parentheses
gsub("\\]", "", .) %>% # Remove closing parentheses
gsub("/","_", .) %>% # Replace slashes with underscores
gsub("_+", "_", .) %>% # Replace multiple underscores with a single underscore
gsub("_+$", "", .) # Remove trailing underscores
}
# use on the publication table
BGS_pub_metadata <- BGS_pub_metadata %>%
rename_with(rename_column)
colnames(BGS_pub_metadata)
# Perform the left join between BGS_pub_metadata and BGS_meta_dataframe_100
BGS_pub_merged <- BGS_meta_dataframe_100 %>%
left_join(BGS_pub_metadata, by = c("biosample" = "sra_accession_number"))
dim(BGS_meta_dataframe_100)
# [1] 971 57
dim(BGS_pub_merged)
# [1] 971 75
Download the highlighted file
Has a bunch of metadata on top
And then the env metadata we are interested in
I am manually deleting the metadata on top and then reading it into R
################################################################################
### Get first metadata file as described in ...
metadataI07N_BGS <- read.csv(
"data/33RO20180423_hy1.csv",
stringsAsFactors = FALSE,
check.names = FALSE
)
colnames(metadataI07N_BGS)
head(metadataI07N_BGS)
dim(metadataI07N_BGS)
# [1] 3004 65
[!NOTE] How to link the dataframes?
sample_alias
in theBGS_meta_dataframe_100
consists of the cruise ID and sample ID (and year): I7N2018_328sect_id
andsampno
have the cruise ID (I7N) and sample ID (328) as well
#########################################
# Separate the sample_alias into sect_id and sampno
BGS_meta_dataframe_100 <- BGS_meta_dataframe_100 %>%
separate(sample_alias, into = c("sect_id_year", "sampno"), sep = "_", convert = TRUE) %>%
mutate(
sect_id = substr(sect_id_year, 1, 3), # Extract the sect_id
sampno = as.integer(sampno) # Convert sampno to integer for matching
) %>%
select(-sect_id_year) # Remove the temporary column
# Rename columns in metadataI07N_BGS to add suffix
metadataI07N_BGS <- metadataI07N_BGS %>%
rename_with(~ paste0(., "_cchdo"), -c(sect_id, sampno))
colnames(metadataI07N_BGS)
# Join the dataframes
linked_dataframe <- BGS_meta_dataframe_100 %>%
left_join(metadataI07N_BGS, by = c("sect_id", "sampno"))
colnames(metadataI07N_BGS)
colnames(linked_dataframe)
head(linked_dataframe)
dim(linked_dataframe)
Same as above
Is this why MGnify only had 444 samples instead of 971?
C13
shows up 1255 times across 5 columns: so 251 samples are from C13 and don't show up in MGnify records. However, added to the 444 BGS samples on MGnify that only amounts to 695, so still some more missing. same as above
Bargery A.(2021). AMT28 (JR18001) CTD profiles (pressure, temperature, salinity, potential temperature, density, fluorescence, transmittance, downwelling PAR, dissolved oxygen concentration) calibrated and binned to 1 dbar. British Oceanographic Data Centre, National Oceanography Centre, NERC, UK. doi:10.5285/b8c6056a-db74-7453-e053-6c86abc00ce5
Click on download
Downloads a folder with 63 .txt files and some other files.
Let's get them into R (but skip the metadata stuff at the top of each file
################################################################################
# Get fifth AMT-28 https://doi.org/10/fqkd
# ( this one is a bit different)
# Load necessary packages
library(dplyr)
install.packages("readr")
library(readr)
# Define the directory containing the files
directory <- "data/RN-20240730141451_B8C6056ADB747453E0536C86ABC00CE5"
# Get the list of all .txt files in the directory
file_list <- list.files(path = directory, pattern = "\\.txt$", full.names = TRUE)
# Function to read and clean a single file
clean_file <- function(file_path) {
# Read the file
lines <- read_lines(file_path)
# Find the line number where "Cruise" appears
start_line <- grep("^Cruise", lines)
# If the "Cruise" line is found, read the data from that line onwards
if (length(start_line) > 0) {
cleaned_data <- read.table(text = lines[start_line:length(lines)], header = TRUE, sep = "\t")
return(cleaned_data)
} else {
warning(paste("No 'Cruise' line found in", file_path))
return(NULL)
}
}
# Apply the cleaning function to all files and store the results in a list
cleaned_data_list <- lapply(file_list, clean_file)
# Combine all cleaned data into a single data frame
metadataAMT28_BGS <- bind_rows(cleaned_data_list)
dim(metadataAMT28_BGS)
# [1] 43117 43
head(metadataAMT28_BGS)
# rename column headers
metadataAMT28_BGS <- metadataAMT28_BGS %>%
rename_with(rename_column)
colnames(metadataAMT28_BGS)
# Filter out rows with NA or empty strings in 'Cruise' and 'Station' columns
metadataAMT28_BGS <- metadataAMT28_BGS %>%
filter(!is.na(cruise) & cruise != "" & !is.na(station) & station != "")
dim(metadataAMT28_BGS)
# [1] 63 43
# rename the columns it has in common with the other dfs and that I want to match the others
metadataAMT28_BGS <- metadataAMT28_BGS %>%
rename(
ctdprs = pres_z_dbar,
salinity_pss = p_sal_ctd2_dmnless,
)
# make it such that the columns that we will use to map to between dfs are of the same structure and name as those in the dfs above
# add a column called `sect_id` which adds the value `AMT` and a add a column called `sampno` which takes the part of `station ` that comes after _CTD (so in JR18001_CTD001 that would be 001)
metadataAMT28_BGS <- metadataAMT28_BGS %>%
mutate(
sect_id = "AMT",
sampno = sub(".*_CTD(.*)", "\\1", station)
)
metadataAMT28_BGS$station
metadataAMT28_BGS$sect_id
metadataAMT28_BGS$sampno
[!NOTE] In the df, the cruise is called JR18001 instead of AMT28.
Lomas, M. W., & Martiny, A. (2020). Depth profile data from R/V New Horizons NH1418 in the tropical Pacific from Sept-Oct. 2014 (Version 1) [Data set]. Biological and Chemical Oceanography Data Management Office (BCO-DMO). https://doi.org/10.26008/1912/BCO-DMO.829895.1
Let's download the dataset .tsv file under Files
(the https://darchive.mblwhoilibrary.org/bitstreams/29d11253-100e-5851-87b1-de6079b5fed8/download has some info on units)
Get it into R
################################################################################
# Get sixth NH1418 https://doi.org/10.26008/1912/bco-dmo.829895.1
metadataNH1418_BGS <- read.csv(
"data/download_NH1418",
stringsAsFactors = FALSE,
sep = "\t",
check.names = FALSE
)
colnames(metadataNH1418_BGS)
head(metadataNH1418_BGS)
dim(metadataNH1418_BGS)
# [1] 191 35
# rename column headers on the publication table
metadataNH1418_BGS <- metadataNH1418_BGS %>%
rename_with(rename_column)
colnames(metadataNH1418_BGS)
View(metadataP18_BGS)
[!NOTE] No way to match with sample_alias (as is possible for the other datasets), can use time and depth...
Lomas, M. W., & Martiny, A. (2020). Depth profile data from R/V Atlantic Explorer AE1319 in the NW Atlantic from Aug-Sept. 2013 (Version 1) [Data set]. Biological and Chemical Oceanography Data Management Office (BCO-DMO). https://doi.org/10.26008/1912/BCO-DMO.829797.1
[!NOTE] As you can see, this has two DOIs associated with it
First DOI: https://doi.org/10.26008/1912/bco-dmo.829797.1
Second DOI: https://doi.org/10.26008/1912/bco-dmo.538091.2
[!NOTE] The first DOI contains the info we want
Download the dataset under the first DOI and get it into R
################################################################################
# Get seventh AE1319 https://doi.org/10.26008/1912/bco-dmo.829797.1
metadataAE1319_BGS <- read.csv(
"data/downloadAE1319",
stringsAsFactors = FALSE,
sep = "\t",
check.names = FALSE
)
colnames(metadataAE1319_BGS)
head(metadataAE1319_BGS)
dim(metadataAE1319_BGS)
# [1] 128 13
# rename column headers on the publication table
metadataAE1319_BGS <- metadataAE1319_BGS %>%
rename_with(rename_column)
colnames(metadataAE1319_BGS)
Lomas, M. W., & Martiny, A. (2020). Depth profile data from Bermuda Atlantic Time-Series Validation cruise 46 (BVAL46) in the Sargasso Sea from Sept-Oct. 2011 (Version 1) [Data set]. Biological and Chemical Oceanography Data Management Office (BCO-DMO). https://doi.org/10.26008/1912/BCO-DMO.829843.1
[!NOTE] As you can see, this has two DOIs associated with it. Again, the first is the one we need
Same as above.
In the Publication
The Supplementary material table contains
Table 2 of the main text then shows how to get more metadata variables for the different campaigns (called section_ID in the Supplementary material table).