Closed turbomam closed 8 months ago
import requests
from bs4 import BeautifulSoup
import sqlalchemy
from sqlalchemy import create_engine
import pandas as pd
url = 'https://www.insdc.org/submitting-standards/country-qualifier-vocabulary/'
engine = create_engine(
'postgresql://postgres:<SECRET>@localhost:15432/ncbi_biosamples_feb26')
destination_table_name = "insdc_country_values"
# Send a GET request to the URL
response = requests.get(url)
# Create a BeautifulSoup object and parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')
# Find the <ul> tags containing the lists
ul_tags = soup.find_all('ul')
# Extract the lists containing "Thailand" or "Siam" and create a union set
union_set = set()
for ul in ul_tags:
if any("Thailand" in li.text or "Siam" in li.text for li in ul.find_all('li')):
for li in ul.find_all('li'):
union_set.add(li.text.strip())
# Convert the set to a sorted list
sorted_list = sorted(union_set)
# Print the sorted list
print("Sorted Union List:")
for item in sorted_list:
print(item)
# Create a DataFrame with the sorted list
df = pd.DataFrame({'insdc_country': sorted_list})
# Insert the DataFrame into a PostgreSQL table
df.to_sql(destination_table_name, engine, if_exists='replace', index=False)
print(f"Data inserted into the '{destination_table_name}' table successfully.")
‘Country’ will transition to ‘Geographic Location’ effective June 2024
https://ncbiinsights.ncbi.nlm.nih.gov/2023/12/14/update-genbank-qualifier/