turbomam opened 8 months ago
```python
import json
import csv

# Load JSON data
with open('all-gold-studies.json', 'r') as f:
    data = json.load(f)

print("LOADED")

# Open TSV file for writing
with open('output.tsv', 'w', newline='') as tsvfile:
    # Define TSV writer with tab as the delimiter
    writer = csv.writer(tsvfile, delimiter='\t')

    # Write header row with reordered columns
    writer.writerow(['studyGoldId', 'biosampleGoldId', 'projectGoldId',
                     'ncbiBioSampleAccession', 'ncbiBioProjectAccession', 'sraExperimentIds'])

    # Iterate over studies
    for study in data:
        study_gold_id = study.get('studyGoldId', '')

        # Iterate over biosamples
        biosamples = study.get('biosamples', [])
        for biosample in biosamples:
            biosample_gold_id = biosample.get('biosampleGoldId', '')

            # Iterate over projects
            projects = biosample.get('projects', [])
            for project in projects:
                project_gold_id = project.get('projectGoldId', '')
                print(f"{study_gold_id}: {biosample_gold_id} {project_gold_id}")

                ncbi_bio_project_accession = project.get('ncbiBioProjectAccession', '')
                ncbi_bio_sample_accession = project.get('ncbiBioSampleAccession', '')

                # Extract sraExperimentIds and pipe-delimit them
                sra_experiment_ids = '|'.join(project.get('sraExperimentIds', []))

                # Write row to TSV with reordered columns and sraExperimentIds
                writer.writerow([study_gold_id, biosample_gold_id, project_gold_id,
                                 ncbi_bio_sample_accession, ncbi_bio_project_accession,
                                 sra_experiment_ids])

print("TSV file generated successfully.")
```
- Add code for generating `all-gold-studies.json` (a sketch follows this list).
- ~188,000 mappings, ~186,000 of them to `ncbiBioSampleAccession`, via JSON.
- Determined in Excel, from columns on the "Sequencing Project" tab within `goldData.xlsx`.
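Here is a rough sketch of how `all-gold-studies.json` could be generated. The base URL, endpoint path, pagination parameters, and token handling below are placeholder assumptions and would need to be checked against the actual GOLD API documentation:

```python
import json
import requests

# Placeholder values -- the real GOLD API base URL, endpoint, auth scheme,
# and pagination parameters must be taken from the GOLD API documentation.
BASE_URL = "https://gold.jgi.doe.gov/rest"   # assumption, not verified
STUDIES_ENDPOINT = f"{BASE_URL}/studies"     # hypothetical endpoint
API_TOKEN = "<SECRET>"


def fetch_all_studies(page_size=100):
    """Page through the (hypothetical) studies endpoint and collect all records."""
    studies = []
    offset = 0
    while True:
        resp = requests.get(
            STUDIES_ENDPOINT,
            headers={"Authorization": f"Bearer {API_TOKEN}"},
            params={"offset": offset, "limit": page_size},  # hypothetical paging params
            timeout=60,
        )
        resp.raise_for_status()
        batch = resp.json()
        if not batch:
            break
        studies.extend(batch)
        offset += page_size
    return studies


if __name__ == "__main__":
    all_studies = fetch_all_studies()
    with open("all-gold-studies.json", "w") as f:
        json.dump(all_studies, f, indent=2)
    print(f"Wrote {len(all_studies)} studies to all-gold-studies.json")
```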
```python
import pandas as pd
from sqlalchemy import create_engine

# Input Excel file and sheet
xlsx_file = "goldData.xlsx"
sheet_name = "Sequencing Project"

# Define the output TSV file paths
filtered_output_tsv_file = "sequencing_project_filtered.tsv"
intact_output_tsv_file = "sequencing_project.tsv"

# Define database connection parameters
DB_USER = 'postgres'
DB_PASSWORD = '<SECRET>'
DB_HOST = 'localhost'
DB_PORT = '15432'
DB_NAME = 'ncbi_biosamples_feb26'

# Define table name
table_name = 'gold_sequencing_project'

# Read the Excel file (add nrows=1000 to limit rows while testing)
df = pd.read_excel(xlsx_file, sheet_name=sheet_name)
# df = df.head(100)

# Trim off newline characters and everything after them,
# then lowercase column names and replace whitespace with underscores
df.columns = df.columns.str.split('\n').str[0].str.lower().str.replace(' ', '_')

df.to_csv(intact_output_tsv_file, sep='\t', index=False)

# Filter rows with non-blank values in both columns
df_filtered = df.dropna(subset=["ncbi_biosample_accession", "biosample_gold_id"])

# Select only the desired columns
df_filtered = df_filtered[["ncbi_biosample_accession", "biosample_gold_id"]]

# Remove duplicate rows
df_filtered = df_filtered.drop_duplicates()

# Save the DataFrame as TSV
df_filtered.to_csv(filtered_output_tsv_file, sep='\t', index=False)
print(f"The filtered data have been saved as '{filtered_output_tsv_file}'.")

# Create SQLAlchemy engine
engine = create_engine(f'postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}')

# Insert DataFrame into PostgreSQL
df.to_sql(table_name, engine, if_exists='replace', index=False)
print(f"The data have been inserted into the '{table_name}' table in the '{DB_NAME}' database.")
```
- Should possibly be inserting the data into Postgres chunk-wise (see the sketch below).
- Might have to grant permissions on this table to other users, since it was created as the `postgres` user.
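A rough sketch of both ideas, continuing from the load script above (it reuses `df`, `table_name`, and `engine`); the `chunksize` value and the `readonly_role` grantee are placeholder assumptions:

```python
from sqlalchemy import text

# Insert in chunks instead of a single large write; 5000 rows per chunk is an
# arbitrary placeholder that should be tuned for the real data volume.
df.to_sql(
    table_name,
    engine,
    if_exists='replace',
    index=False,
    chunksize=5000,
    method='multi',
)

# Grant read access to other users; "readonly_role" is a hypothetical role name.
with engine.begin() as conn:
    conn.execute(text(f'grant select on {table_name} to readonly_role'))
```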
We can get 185,570 unique NCBI Biosample mappings from `goldData.xlsx`, without having to run through the API.
```sql
select
    COUNT(*) as distinct_count
from
    (
        select distinct
            "ncbi_biosample_accession",
            "biosample_gold_id"
        from
            gold_sequencing_project
        where
            "ncbi_biosample_accession" is not null
            and "biosample_gold_id" is not null
    ) as distinct_rows;
```

Result: 185,569
```sql
select
    "project_gold_id",
    COUNT(*) as num_duplicates
from
    gold_sequencing_project
group by
    "project_gold_id"
having
    COUNT(*) > 1;
```

Result: empty (no duplicated `project_gold_id` values)
Could add a unique/primary-key index on `project_gold_id`, but we are much more likely to search on `ncbi_biosample_accession` (see the sketch below).

Do we want to load the GOLD Biosamples' metadata into Postgres? That would probably have to come from the API.
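A rough sketch of both indexes, reusing the `engine` from the load script above; the index names are arbitrary placeholders:

```python
from sqlalchemy import text

with engine.begin() as conn:
    # project_gold_id has no duplicates (see the query above), so a unique index should succeed.
    conn.execute(text(
        'create unique index if not exists gold_sequencing_project_project_gold_id_uidx '
        'on gold_sequencing_project ("project_gold_id")'
    ))
    # Non-unique index for the more common lookups by ncbi_biosample_accession.
    conn.execute(text(
        'create index if not exists gold_sequencing_project_ncbi_biosample_accession_idx '
        'on gold_sequencing_project ("ncbi_biosample_accession")'
    ))
```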
Possibly available from https://gold.jgi.doe.gov/downloads, specifically the Public Studies/Biosamples/SPs/APs/Organisms Excel link, which downloads `goldData.xlsx`.

The "Sequencing Project" tab within `goldData.xlsx` looks most promising, but it might be beneficial to try other approaches, too.
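A download step could look roughly like this; the direct URL below is a placeholder, since the actual link would have to be copied from the downloads page:

```python
import requests

# Placeholder URL -- the real link for the Public Studies/Biosamples/SPs/APs/Organisms
# Excel export must be taken from https://gold.jgi.doe.gov/downloads.
GOLD_DATA_URL = "https://gold.jgi.doe.gov/downloads/goldData.xlsx"  # assumption, not verified

resp = requests.get(GOLD_DATA_URL, timeout=300)
resp.raise_for_status()

with open("goldData.xlsx", "wb") as f:
    f.write(resp.content)

print(f"Downloaded {len(resp.content)} bytes to goldData.xlsx")
```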