turbomam / biosample-xmldb-sqldb

Tools for loading NCBI Biosample into an XML database and then transforming that into a SQL database
MIT License
0 stars 1 forks source link

Add INSDC missing value table #34

Closed turbomam closed 6 months ago

turbomam commented 6 months ago

https://www.insdc.org/submitting-standards/missing-value-reporting/

turbomam commented 6 months ago
import requests
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO

import sqlalchemy
from sqlalchemy import create_engine, MetaData

# URL of the webpage
url = "https://www.insdc.org/submitting-standards/missing-value-reporting/"

engine = create_engine(
    'postgresql://postgres:<SECRET>@localhost:15432/ncbi_biosamples_feb26')

destination_table_name = "insdc_missing_data_table"

# Send a GET request to fetch the webpage content
response = requests.get(url)

# Parse HTML content
soup = BeautifulSoup(response.content, 'html.parser')

# Find all tables on the webpage
tables = soup.find_all('table')

# Initialize a dictionary to store data from all tables
table_data = {}

for index, table in enumerate(tables):
    # Get HTML content of the table
    table_html = str(table)

    # Read HTML table into a Pandas DataFrame
    df = pd.read_html(StringIO(table_html))[0]

    # Add DataFrame to dictionary with a key
    table_data[f"Table_{index+1}"] = df

# # Summarize tables
# for table_name, df in table_data.items():
#     print(f"Summary for {table_name}:")
#     print(df.describe())  # Summary statistics for numerical columns
#     print('-' * 50)  # Separator

tidy_frame = table_data['Table_1']

tidy_frame.columns = [
"top_level",
"lower_level",
"lower_level_definition",
"reporting_level",
"reporting_definition",
]

print(tidy_frame)

tidy_frame.to_sql(destination_table_name, engine, index=False, if_exists='replace')
turbomam commented 6 months ago
-- Step 1: Create a new table with a single column to store unique values
create table insdc_missing_data_list (
    missing_value_term VARCHAR
);
-- Step 2: Insert unique, non-null values from the specified columns of the original table into the new table
insert
    into
    insdc_missing_data_list (missing_value_term)
select
    top_level as missing_value_term
from
    insdc_missing_data_table
where
    top_level is not null
union
select
    lower_level as missing_value_term
from
    insdc_missing_data_table
where
    lower_level is not null
union
select
    reporting_level as missing_value_term
from
    insdc_missing_data_table
where
    reporting_level is not null;