Open — turbomam opened this issue 7 months ago
-- Count biosamples per (model, package, package_name) combination,
-- most frequent combinations first.
SELECT model,
       package,
       package_name,
       count(1) AS biosample_count
FROM non_attribute_metadata nam
GROUP BY model,
         package,
         package_name
ORDER BY biosample_count DESC;
This query is slow; it would benefit from an index on (model, package, package_name).
import pandas as pd
from sqlalchemy import create_engine
# Load the MIxS schemasheet, keep only the class-definition rows, and publish
# them both as a TSV and as the `mixs_class_definitions` Postgres table.
#
# NOTE(review): credentials are hard-coded in the connection URL; consider
# reading them from the environment instead — confirm deployment expectations.
engine = create_engine(
    'postgresql://postgres:<SECRET>@localhost:15432/ncbi_biosamples_feb26')
destination_table_name = 'mixs_class_definitions'

try:
    df = pd.read_csv('mixs-schemasheets-concise.tsv', sep='\t')

    # Drop schemasheets directive rows: first-column values starting with ">".
    df = df[~df.iloc[:, 0].astype(str).str.startswith(">")]

    # Keep class-definition rows: a value for "class" but no value for "slot".
    df = df[df['class'].notnull() & df['slot'].isnull()]

    # Drop columns that carry no data at all.
    df = df.dropna(axis=1, how='all')

    df.to_csv('mixs-schemasheets-concise-class-definitions.tsv',
              sep='\t', index=False)

    # Replace (or create) the destination table from the filtered frame.
    # to_sql() commits internally; no explicit commit is needed.
    df.to_sql(destination_table_name, engine, if_exists='replace', index=False)
finally:
    # dispose() only releases pooled connections — it does NOT commit.
    # Run it in `finally` so connections are released even when a step fails.
    engine.dispose()
It looks like the new Extension annotations haven't been merged in yet.
import xml.etree.ElementTree as ET
import sqlalchemy
from sqlalchemy import create_engine, MetaData
import csv
import pandas as pd
# Parse the NCBI BioSample package-definition XML and publish one row per
# <Package> element, both as a TSV and as the `ncbi_package_definitions`
# Postgres table.
INPUT_FILE = 'ncbi_biosample_package_definitions.xml'
OUTPUT_FILE = 'ncbi_biosample_package_definitions.tsv'

# Column order for the TSV and the database table.
headers = ['Name', 'DisplayName', 'EnvPackage', 'EnvPackageDisplay', 'Example',
           'NotAppropriateFor', 'ShortName', 'group', 'Description', 'attributes']

lod = []
destination_table_name = 'ncbi_package_definitions'
engine = create_engine(
    'postgresql://postgres:<SECRET>@localhost:15432/ncbi_biosamples_feb26')

with open(INPUT_FILE) as f:
    root = ET.fromstring(f.read())

for package in root.findall('Package'):
    row = {}
    for field in headers:
        if field == 'group':
            # 'group' is an XML attribute on the <Package> element itself.
            row[field] = package.get('group')
        elif field == 'attributes':
            # All remaining XML attributes, packed as "key=value" pairs.
            attrs = [f"{key}={value}"
                     for key, value in package.items() if key != 'group']
            row[field] = "|".join(attrs)
        else:
            # Everything else comes from a same-named child element.
            row[field] = package.findtext(field)
    lod.append(row)

lod_frame = pd.DataFrame(lod)
lod_frame.drop_duplicates(inplace=True)
print(lod_frame)

# BUG FIX: the original opened OUTPUT_FILE for writing but never wrote to it
# (the csv.DictWriter calls were commented out), leaving an empty file.
# Write the deduplicated rows as the TSV the filename promises.
lod_frame.to_csv(OUTPUT_FILE, sep='\t', index=False)

try:
    lod_frame.to_sql(destination_table_name, engine,
                     index=False, if_exists='replace')
finally:
    # Release pooled connections even if the database write fails.
    engine.dispose()
I made a curated lookup table in the past.
Should we use an LLM for this now?