turbomam / biosample-xmldb-sqldb

Tools for loading NCBI Biosample into an XML database and then transforming that into a SQL database
MIT License
0 stars 1 forks source link

Add INSDC country code list #33

Closed turbomam closed 8 months ago

turbomam commented 8 months ago

‘Country’ will transition to ‘Geographic Location’ effective June 2024

https://ncbiinsights.ncbi.nlm.nih.gov/2023/12/14/update-genbank-qualifier/

turbomam commented 8 months ago

https://www.insdc.org/submitting-standards/country-qualifier-vocabulary/

turbomam commented 8 months ago
import requests
from bs4 import BeautifulSoup
import sqlalchemy
from sqlalchemy import create_engine

import pandas as pd

url = 'https://www.insdc.org/submitting-standards/country-qualifier-vocabulary/'

engine = create_engine(
    'postgresql://postgres:<SECRET>@localhost:15432/ncbi_biosamples_feb26')

destination_table_name = "insdc_country_values"

# Send a GET request to the URL
response = requests.get(url)

# Create a BeautifulSoup object and parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

# Find the <ul> tags containing the lists
ul_tags = soup.find_all('ul')

# Extract the lists containing "Thailand" or "Siam" and create a union set
union_set = set()
for ul in ul_tags:
    if any("Thailand" in li.text or "Siam" in li.text for li in ul.find_all('li')):
        for li in ul.find_all('li'):
            union_set.add(li.text.strip())

# Convert the set to a sorted list
sorted_list = sorted(union_set)

# Print the sorted list
print("Sorted Union List:")
for item in sorted_list:
    print(item)

# Create a DataFrame with the sorted list
df = pd.DataFrame({'insdc_country': sorted_list})

# Insert the DataFrame into a PostgreSQL table
df.to_sql(destination_table_name, engine, if_exists='replace', index=False)

print(f"Data inserted into the '{destination_table_name}' table successfully.")