turbomam / biosample-xmldb-sqldb

Tools for loading NCBI Biosample into an XML database and then transforming that into a SQL database
MIT License
0 stars 1 forks source link

Repair of MIxS checklist/extension like values #3

Open turbomam opened 7 months ago

turbomam commented 7 months ago

I made a curated lookup in the past.

Use an LLM now?

turbomam commented 7 months ago
-- Count biosamples per (model, package, package_name) combination,
-- listing the most common combinations first.
SELECT
    model,
    package,
    package_name,
    COUNT(*) AS biosample_count
FROM non_attribute_metadata nam
GROUP BY
    model,
    package,
    package_name
ORDER BY biosample_count DESC;

Slow — it would benefit from an index on the grouped columns (model, package, package_name).

turbomam commented 7 months ago

observed_models_packages.tsv.txt

turbomam commented 7 months ago

see

turbomam commented 7 months ago
"""Load MIxS class definitions from a schemasheets TSV into Postgres.

Reads mixs-schemasheets-concise.tsv, keeps only class-definition rows
(a "class" value present, no "slot" value), drops schemasheets comment
rows and all-null columns, then writes the result both to a TSV snapshot
and to the mixs_class_definitions table.
"""
import pandas as pd
from sqlalchemy import create_engine

# NOTE(review): credentials are inlined in the URL; move to env/config.
pg_engine = create_engine(
    'postgresql://postgres:<SECRET>@localhost:15432/ncbi_biosamples_feb26')
target_table = 'mixs_class_definitions'

frame = pd.read_csv('mixs-schemasheets-concise.tsv', sep='\t')

# Drop schemasheets comment rows: first-column values beginning with ">".
comment_mask = frame.iloc[:, 0].astype(str).str.startswith(">")
frame = frame[~comment_mask]

# A class-definition row has a "class" value but no "slot" value.
frame = frame[frame['class'].notnull() & frame['slot'].isnull()]

# Discard columns that carry no data at all.
frame = frame.dropna(axis=1, how='all')

# Snapshot the filtered rows to TSV alongside the database load.
frame.to_csv('mixs-schemasheets-concise-class-definitions.tsv', sep='\t', index=False)

# Replace the Postgres table with the filtered frame's columns and rows.
frame.to_sql(target_table, pg_engine, if_exists='replace', index=False)

# Release pooled connections (to_sql has already committed its own work;
# dispose() does not itself commit anything).
pg_engine.dispose()
turbomam commented 7 months ago

It looks like the new Extension annotations haven't been merged in

turbomam commented 7 months ago
"""Extract NCBI BioSample package definitions from XML into Postgres and TSV.

Parses the <Package> elements of INPUT_FILE, flattens each one into a dict
of selected child-element texts plus XML attributes, de-duplicates, prints
the frame, writes it to OUTPUT_FILE as TSV, and loads it into the
ncbi_package_definitions table.
"""
import xml.etree.ElementTree as ET
import sqlalchemy
from sqlalchemy import create_engine, MetaData
import csv
import pandas as pd

INPUT_FILE = 'ncbi_biosample_package_definitions.xml'
OUTPUT_FILE = 'ncbi_biosample_package_definitions.tsv'

# Column order for the flattened package records.
headers = ['Name', 'DisplayName', 'EnvPackage', 'EnvPackageDisplay', 'Example',
           'NotAppropriateFor', 'ShortName', 'group', 'Description', 'attributes']

destination_table_name = 'ncbi_package_definitions'

# NOTE(review): credentials are inlined in the URL; move to env/config.
engine = create_engine(
    'postgresql://postgres:<SECRET>@localhost:15432/ncbi_biosamples_feb26')


def _package_to_row(package):
    """Flatten one <Package> element into a dict keyed by ``headers``."""
    row = {}
    for field in headers:
        if field == 'group':
            # "group" is an XML attribute, not a child element.
            row[field] = package.get('group')
        elif field == 'attributes':
            # All remaining XML attributes, serialized as key=value pairs.
            attrs = [f"{key}={value}" for key, value in package.items()
                     if key != 'group']
            row[field] = "|".join(attrs)
        else:
            # Everything else is the text of a same-named child element
            # (None when the element is absent).
            row[field] = package.findtext(field)
    return row


with open(INPUT_FILE) as f:
    root = ET.fromstring(f.read())

# BUG FIX: the original appended `row` inside the per-field loop, pushing
# ~10 references to the same dict per package; drop_duplicates() later
# hid the redundancy. Append exactly once per package instead.
lod = [_package_to_row(package) for package in root.findall('Package')]

lod_frame = pd.DataFrame(lod)
lod_frame.drop_duplicates(inplace=True)

print(lod_frame)

# FIX: the original opened OUTPUT_FILE for writing but never wrote to it
# (the DictWriter was commented out), leaving an empty file. Write the
# de-duplicated rows as TSV, as the filename implies.
lod_frame.to_csv(OUTPUT_FILE, sep='\t', index=False)

lod_frame.to_sql(destination_table_name, engine, index=False, if_exists='replace')