turbomam / biosample-xmldb-sqldb

Tools for loading NCBI Biosample into an XML database and then transforming that into a SQL database
MIT License
0 stars 1 forks source link

NCBI Package/Attribute use #31

Closed turbomam closed 6 months ago

turbomam commented 6 months ago
import xml.etree.ElementTree as ET
import sqlalchemy
from sqlalchemy import create_engine, MetaData
import csv
import pandas as pd

INPUT_FILE = 'ncbi_biosample_package_definitions.xml'
OUTPUT_FILE = 'ncbi_biosample_package_definitions.tsv'   

headers = ['Name', 'DisplayName','EnvPackage', 'EnvPackageDisplay', 'Example', 'NotAppropriateFor', 'ShortName', 'group', 'Description', 'attributes']

# df = pd.DataFrame(columns=headers)

lod = []

destination_table_name = 'ncbi_package_definitions'

engine = create_engine(
    'postgresql://postgres:<SECRET>@localhost:15432/ncbi_biosamples_feb26')

# connection = engine.connect()

with open(INPUT_FILE) as f:
    xml = f.read()

root = ET.fromstring(xml)

with open(OUTPUT_FILE, 'w', newline='') as f:
    # dict_writer = csv.DictWriter(f, delimiter='\t', fieldnames=headers)
    # dict_writer.writeheader()

    for package in root.findall('Package'):
        row = {}
        for field in headers:
            if field == 'group':
                row[field] = package.get('group')

            elif field == 'attributes':
                attrs = []
                for key, value in package.items():  
                    if key != 'group':
                        attrs.append(f"{key}={value}")
                row[field] = "|".join(attrs)

            else:
                row[field] = package.findtext(field)

            lod.append(row)

lod_frame = pd.DataFrame(lod)

print(lod_frame)

lod_frame.to_sql(destination_table_name, engine, index=False, if_exists='replace')
turbomam commented 6 months ago
select
    use ,
    count(1)
from
    ncbi_package_attribute_use npau
group by
    use
order by
    count(1) desc ;
use count
optional 17,239
mandatory 2,605
either_one_mandatory 318

either_one_mandatory ?

  <Attribute>
    <Name>age</Name>
    <HarmonizedName>age</HarmonizedName>
    <Description>age at the time of sampling; relevant scale depends on species and study, e.g. could be seconds for amoebae or centuries for trees</Description>
    <Format>{float} {unit}</Format>
    <Package use="mandatory">Human.1.0</Package>
    <Package use="optional">Invertebrate.1.0</Package>
    <Package use="either_one_mandatory" group_name="Age/stage">Model.organism.animal.1.0</Package>
    <Package use="either_one_mandatory" group_name="Age/stage">Plant.1.0</Package>
  </Attribute>