Closed turbomam closed 8 months ago
import requests
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO
import sqlalchemy
from sqlalchemy import create_engine, MetaData
# URL of the webpage
url = "https://www.insdc.org/submitting-standards/missing-value-reporting/"
engine = create_engine(
'postgresql://postgres:<SECRET>@localhost:15432/ncbi_biosamples_feb26')
destination_table_name = "insdc_missing_data_table"
# Send a GET request to fetch the webpage content
response = requests.get(url)
# Parse HTML content
soup = BeautifulSoup(response.content, 'html.parser')
# Find all tables on the webpage
tables = soup.find_all('table')
# Initialize a dictionary to store data from all tables
table_data = {}
for index, table in enumerate(tables):
# Get HTML content of the table
table_html = str(table)
# Read HTML table into a Pandas DataFrame
df = pd.read_html(StringIO(table_html))[0]
# Add DataFrame to dictionary with a key
table_data[f"Table_{index+1}"] = df
# # Summarize tables
# for table_name, df in table_data.items():
# print(f"Summary for {table_name}:")
# print(df.describe()) # Summary statistics for numerical columns
# print('-' * 50) # Separator
tidy_frame = table_data['Table_1']
tidy_frame.columns = [
"top_level",
"lower_level",
"lower_level_definition",
"reporting_level",
"reporting_definition",
]
print(tidy_frame)
tidy_frame.to_sql(destination_table_name, engine, index=False, if_exists='replace')
-- Step 1: Create a new table with a single column to store unique values
create table insdc_missing_data_list (
missing_value_term VARCHAR
);
-- Step 2: Insert unique, non-null values from the specified columns of the original table into the new table
insert
into
insdc_missing_data_list (missing_value_term)
select
top_level as missing_value_term
from
insdc_missing_data_table
where
top_level is not null
union
select
lower_level as missing_value_term
from
insdc_missing_data_table
where
lower_level is not null
union
select
reporting_level as missing_value_term
from
insdc_missing_data_table
where
reporting_level is not null;
https://www.insdc.org/submitting-standards/missing-value-reporting/