MobleyLab / FreeSolv

Experimental and calculated small molecule hydration free energies
http://www.escholarship.org/uc/item/6sd403pz
101 stars 53 forks source link

Sanitize SDF files #48

Open proteneer opened 4 years ago

proteneer commented 4 years ago

About 40 of the molecules in the current SDF files are non-neutral (they carry a nonzero net formal charge). Here's a script that regenerates correct ones.

import csv
import os
from rdkit import Chem
from rdkit.Chem import AllChem

def is_neutral(mol):
    """Return True when the formal charges of all atoms in *mol* sum to zero.

    *mol* must expose ``GetAtoms()``, yielding atoms that each expose
    ``GetFormalCharge()``. A molecule with no atoms counts as neutral.
    """
    total_formal_charge = sum(atom.GetFormalCharge() for atom in mol.GetAtoms())
    return total_formal_charge == 0

# Rebuild 'freesolv.sdf' from 'database.txt': for every data row, parse the
# SMILES, add explicit hydrogens, embed a 3D conformer, run an MMFF
# optimization, attach the experimental values as SD properties, and collect
# the molecule for writing.
mols = []

# Number of molecules for which MMFFOptimizeMolecule returned a non-zero code.
mmff_fail_count = 0

with open('database.txt', newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=';', quotechar='|')
    for line, row in enumerate(spamreader):
        # The first three lines (indices 0-2) are skipped, as are blank rows,
        # which would otherwise fail with an IndexError below.
        if line <= 2 or not row:
            continue

        # Fields may carry padding around the ';' delimiter; strip it so the
        # stored name and the parsed SMILES are clean.
        name = row[0].strip()
        smiles = row[1].strip()

        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None; fail
            # here with context instead of crashing inside AddHs.
            raise ValueError(
                "line {}: could not parse SMILES {!r} for {!r}".format(
                    line + 1, smiles, name))
        mol = Chem.AddHs(mol)

        print(smiles)
        res = AllChem.EmbedMolecule(mol)
        # Explicit check rather than `assert`, which is stripped under -O.
        if res != 0:
            raise RuntimeError(
                "line {}: 3D embedding failed for {!r} ({!r}), code {}".format(
                    line + 1, name, smiles, res))

        # A non-zero return means the optimization did not converge or the
        # force field could not be set up; the molecule is kept but counted.
        res = AllChem.MMFFOptimizeMolecule(mol)
        if res != 0:
            mmff_fail_count += 1

        exp_dG = float(row[3])
        exp_dG_err = float(row[4])

        mol.SetProp('_Name', name)
        mol.SetProp('dG', str(exp_dG))
        mol.SetProp('dG_err', str(exp_dG_err))

        # The whole point of the regeneration is neutral molecules, so a
        # charged one is a hard error (and must not vanish under -O).
        if not is_neutral(mol):
            raise ValueError(
                "line {}: molecule {!r} ({!r}) is not neutral".format(
                    line + 1, name, smiles))

        mols.append(mol)

print("mm_fail", mmff_fail_count)

w = Chem.SDWriter('freesolv.sdf')
try:
    for m in mols:
        w.write(m)
finally:
    # close() flushes and releases the file handle even if a write fails.
    w.close()

print("wrote", len(mols), "mols")