openmm / pdbfixer

PDBFixer fixes problems in PDB files
Other
453 stars 114 forks source link

one case cannot fix missing residue #241

Closed yhgon closed 2 years ago

yhgon commented 2 years ago

I evaluate one chain from complex assembly pdb file(6HQV).

when I select chain A with biopython, it detect discountinous but it cannot detect missing residue with PDBFixer. and it pass on energy minimizing step but cause the NaN position error in OpenMM simulation iteration.

However, when I select chain A with PDBTools, it detect lots of missing residues with PDB Fixer, but it also fail on energy minimizing step with error ValueError: No template found for residue 1590 (GLU). The set of atoms is similar to DC5, but it is missing 10 atoms.

How could I fix this issue? thanks

below is detail log. custom script is also attached in the bottom.

## configure 6hqv with chain A 
pdb_id       = '6hqv'
chain        = 'A'
pdb_original = '{}.pdb'.format(pdb_id)
pdb_chain    = '{}_{}.pdb'.format(pdb_id,chain)
pdb_fixed    = '{}_{}_fixed.pdb'.format(pdb_id,chain)
pdb_water    = '{}_{}_water.pdb'.format(pdb_id,chain)
pdb_output   = '{}_{}_out.pdb'.format(pdb_id,chain)

savepdb(pdb_id=pdb_id)
pdb2fasta(pdb_chain)

0 A 1555 EPTRIAILGKEDIIVDHGIWLNFVAHDLLQTLPSSTYVLITDTNLYTTYVPPFQAVFEAAAPRDVRLLTYAIPPGEYSKSRETKAEIEDWMLSHACTRDTVIIALGGGVIGDMIGYVAATFMRGVRFVQVPTTLLAMVDSSIGGKTAIDTPMGKNLIGAFWQPRRIYIDLAFLETLPVREFINGMAEVIKTAAIWNETEFTALEENAAAILEAVRSKASSPAARLAPIRHILKRIVLGSARVKAEVVSADEREGGLRNLLNFGHSIGHAYEAILAPQVLHGECVAIGMVKEAELARYLGVLRPSAVARLTKLIASYDLPTSVHDKRIAKLSAGKECPVDVLLQKMAVDKKNEGRKKKIVLLSAIGKTYEKKATVVDDRAIRLVLSPSVRVTPGVPKGLSVTVTPPGSKSISNRALVLAALGEGTTRIHGLLHSDDVQYMLAAIEQLHGADFSWEDAGEILVVTGKGGKLQASKEPLYLGNAGTASRFLTSVVALCAPSAVSSTVLTGNARMKVRPIGALVDALRANGVGVKYLEKEKSLPVEVDAAGGFAGGVIELAATVSSQYVSSILMAAPYAHQPVTLRLVGGKPISQPYIDMTIAMMASFGIKVERSAEDPNTYLIPKGVYKNPPEYVVESDASSATYPLAVAAITGTTCTIPNIGSESLQGDARFAVEVLRPMGCAVEQTATSTTVTGPPIGTLKAIPHVDMEPMTDAFLTAAVLAAVADGTTQITGIANQRVKECNRIAAMKDQLAKFGVQCNELEDGIEVIGKPYQELRNPVEGIYCYDDHRVAMSHSVLSTISPHPVLILERECTAKTWPGWWDILSQFFKVQLDGEEDPTGTDRSIFIVGMRGAGKSTAGRWMSELLKRPLVDLDAELERREGMTIPEIIRGERGWEGFRQAELELLQDVIKNQSKGYIFSCGGGIVETEAARKLLIDYHKNGGPVLLVHRDTDQVVEYLMRDKTRPAYSENIREVYERRKPWFYECSNLQYHSPHEDGSEALLQPPADFARFVKLIAGQSTHLEDVRAKKHSFFVSLTVPNVADALDIIPRVVVGSDAVELRVDLLESYEPEFVARQVALLRAAAQVPIVYTVRTQSQGGKFPDEDYDLALRLYQTGLRSGVEYLDLEMTMPDHILQAVTDAKGFTSIIASHHDPQCKLSWKSGSWIPFYNKALQYGDVIKLVGVAREMADNFALTNFKAKMLAAHDNKPMIALNMGTAGKLSRVLNGFLTPVSHPALPSKAAPGQLSATEIRQALSLIGEIEPKSFYLFGKPISASRSPALHNTLFYKTGLPHHYSRFETDEASKALESLIRSPDFGGASVTIPLKLDIMPLLDSATDAARTIGAVNTIIPQTRDGSTTTLVGDNTDWRGMVHALLHSSGSGSVVQRTAAPRGAAMVVGSGGTARAAIYALHDLGFAPIWIVARSEERVAELVRGFDGYDLRRMTSPHQGKDNMPSVVISTIPATQPIDPSMREVIVEVLKHGHPSAEGKVLLEMAYQPPRTPLMTLAEDQGWRTVGGLEVLAAQGWYQFQLWTGITPLYEEARAAVMGEDSVE
from simtk.openmm.app import *
from simtk.openmm import *
from simtk.unit import *
from sys import stdout

from pdbfixer import PDBFixer
import mdtraj as md

pdb = PDBFile(pdb_water)
platform = Platform.getPlatformByName('CUDA')
properties = {'DeviceIndex': '0,1,2,3,4,5,6,7', 'Precision': 'single'}
forcefield = ForceField('amber99sb.xml', 'tip3p.xml')
system = forcefield.createSystem(pdb.topology, nonbondedMethod=PME, 
                                 nonbondedCutoff=1.0*nanometer, constraints=HBonds)
integrator = LangevinIntegrator(300*kelvin, 1/picosecond, 0.002*picoseconds) # temperature, friction, timestep 
simulation1 = Simulation(pdb.topology, system, integrator, platform, properties)
simulation1.context.setPositions(pdb.positions)
simulation1.minimizeEnergy(maxIterations=25)

steps = 4
saveInterval = 1
printInterval = 1

simulation1.reporters.append(PDBReporter( file = pdb_output, reportInterval = saveInterval, enforcePeriodicBox =False))
simulation1.reporters.append(StateDataReporter(stdout, printInterval, 
                                              step=True,  
                                              totalSteps = steps, 
                                              potentialEnergy=True, 
                                              kineticEnergy =True, 
                                              totalEnergy =True, 
                                              temperature=True,
                                              elapsedTime =True,
                                              remainingTime =True,
                                              progress =True,
                                              speed=True ))

simulation1.step(steps)

error log NaN postion

#"Progress (%)","Step","Potential Energy (kJ/mole)","Kinetic Energy (kJ/mole)","Total Energy (kJ/mole)","Temperature (K)","Speed (ns/day)","Elapsed Time (s)","Time Remaining"
25.0%,1,4276352.3766489215,1706178409.6887324,1710454762.0653813,755882.022021287,0,0.0008809566497802734,--
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<timed eval> in <module>

/opt/conda/lib/python3.8/site-packages/simtk/openmm/app/simulation.py in step(self, steps)
    130     def step(self, steps):
    131         """Advance the simulation by integrating a specified number of time steps."""
--> 132         self._simulate(endStep=self.currentStep+steps)
    133 
    134     def runForClockTime(self, time, checkpointFile=None, stateFile=None, checkpointInterval=None):

/opt/conda/lib/python3.8/site-packages/simtk/openmm/app/simulation.py in _simulate(self, endStep, endTime)
    234                     self._generate_reports(wrapped, True)
    235                 if len(unwrapped) > 0:
--> 236                     self._generate_reports(unwrapped, False)
    237 
    238     def _generate_reports(self, reports, periodic):

/opt/conda/lib/python3.8/site-packages/simtk/openmm/app/simulation.py in _generate_reports(self, reports, periodic)
    254                                       groups=self.context.getIntegrator().getIntegrationForceGroups())
    255         for reporter, next in reports:
--> 256             reporter.report(self, state)
    257 
    258     def saveCheckpoint(self, file):

/opt/conda/lib/python3.8/site-packages/simtk/openmm/app/pdbreporter.py in report(self, simulation, state)
     97             self._topology = simulation.topology
     98             self._nextModel += 1
---> 99         PDBFile.writeModel(simulation.topology, state.getPositions(), self._out, self._nextModel)
    100         self._nextModel += 1
    101         if hasattr(self._out, 'flush') and callable(self._out.flush):

/opt/conda/lib/python3.8/site-packages/simtk/openmm/app/pdbfile.py in writeModel(topology, positions, file, modelIndex, keepIds, extraParticleIdentifier)
    333             positions = positions.value_in_unit(angstroms)
    334         if any(math.isnan(norm(pos)) for pos in positions):
--> 335             raise ValueError('Particle position is NaN')
    336         if any(math.isinf(norm(pos)) for pos in positions):
    337             raise ValueError('Particle position is infinite')

ValueError: Particle position is NaN
$ pdb_selchain -A 6hqv.pdb > 6hqv_A.pdb
fix_missing(pdb_chain, pdb_fixed)

{(0, 0): ['MET', 'ALA', 'THR', 'ALA', 'ASN', 'VAL', 'ALA', 'GLY', 'ALA', 'GLY', 'GLY', 'SER', 'GLY', 'SER'], (0, 839): ['LYS', 'ARG', 'THR', 'THR', 'GLN', 'SER', 'THR', 'GLN', 'GLN', 'VAL', 'ARG', 'LYS'], (0, 1555): ['LEU', 'GLU', 'HIS', 'HIS', 'HIS', 'HIS', 'HIS', 'HIS']}

after fix, I could see the added residues as reported.

before : 0 A 1555                                EPTRIAILGKEDIIVDHG  ...  MGEDSVE
after    : 0 A 1589 MATANVAGAGGSGSEPTRIAILGKEDIIVDHG  ...  MGEDSVELEHHHHHH

it's odd error becuase, it has 1589 residues and end is not GLU(E) but HIS(H) in the fixed PDB. . detail error below :

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<timed exec> in <module>

/opt/conda/lib/python3.8/site-packages/simtk/openmm/app/forcefield.py in createSystem(self, topology, nonbondedMethod, nonbondedCutoff, constraints, rigidWater, removeCMMotion, hydrogenMass, residueTemplates, ignoreExternalBonds, switchDistance, flexibleConstraints, drudeMass, **args)
   1177         # Find the template matching each residue and assign atom types.
   1178 
-> 1179         templateForResidue = self._matchAllResiduesToTemplates(data, topology, residueTemplates, ignoreExternalBonds)
   1180         for res in topology.residues():
   1181             if res.name == 'HOH':

/opt/conda/lib/python3.8/site-packages/simtk/openmm/app/forcefield.py in _matchAllResiduesToTemplates(self, data, topology, residueTemplates, ignoreExternalBonds, ignoreExtraParticles, recordParameters)
   1391                             break
   1392             if matches is None:
-> 1393                 raise ValueError('No template found for residue %d (%s).  %s' % (res.index+1, res.name, _findMatchErrors(self, res)))
   1394             else:
   1395                 if recordParameters:

ValueError: No template found for residue 1590 (GLU).  The set of atoms is similar to DC5, but it is missing 10 atoms.

when I check the whole sequence, fixed pdb have two chain. second chain include only residue E. it seems that cause the error.

pdb2fasta(pdb_fixed)
0 A 1589 MATANVAGAGGSGSEPTRIAILGKEDIIVDHGIWLNFVAHDLLQTLPSSTYVLITDTNLYTTYVPPFQAVFEAAAPRDVRLLTYAIPPGEYSKSRETKAEIEDWMLSHACTRDTVIIALGGGVIGDMIGYVAATFMRGVRFVQVPTTLLAMVDSSIGGKTAIDTPMGKNLIGAFWQPRRIYIDLAFLETLPVREFINGMAEVIKTAAIWNETEFTALEENAAAILEAVRSKASSPAARLAPIRHILKRIVLGSARVKAEVVSADEREGGLRNLLNFGHSIGHAYEAILAPQVLHGECVAIGMVKEAELARYLGVLRPSAVARLTKLIASYDLPTSVHDKRIAKLSAGKECPVDVLLQKMAVDKKNEGRKKKIVLLSAIGKTYEKKATVVDDRAIRLVLSPSVRVTPGVPKGLSVTVTPPGSKSISNRALVLAALGEGTTRIHGLLHSDDVQYMLAAIEQLHGADFSWEDAGEILVVTGKGGKLQASKEPLYLGNAGTASRFLTSVVALCAPSAVSSTVLTGNARMKVRPIGALVDALRANGVGVKYLEKEKSLPVEVDAAGGFAGGVIELAATVSSQYVSSILMAAPYAHQPVTLRLVGGKPISQPYIDMTIAMMASFGIKVERSAEDPNTYLIPKGVYKNPPEYVVESDASSATYPLAVAAITGTTCTIPNIGSESLQGDARFAVEVLRPMGCAVEQTATSTTVTGPPIGTLKAIPHVDMEPMTDAFLTAAVLAAVADGTTQITGIANQRVKECNRIAAMKDQLAKFGVQCNELEDGIEVIGKPYQELRNPVEGIYCYDDHRVAMSHSVLSTISPHPVLILERECTAKTWPGWWDILSQFFKVQLDGEEDPTKRTTQSTQQVRKGTDRSIFIVGMRGAGKSTAGRWMSELLKRPLVDLDAELERREGMTIPEIIRGERGWEGFRQAELELLQDVIKNQSKGYIFSCGGGIVETEAARKLLIDYHKNGGPVLLVHRDTDQVVEYLMRDKTRPAYSENIREVYERRKPWFYECSNLQYHSPHEDGSEALLQPPADFARFVKLIAGQSTHLEDVRAKKHSFFVSLTVPNVADALDIIPRVVVGSDAVELRVDLLESYEPEFVARQVALLRAAAQVPIVYTVRTQSQGGKFPDEDYDLALRLYQTGLRSGVEYLDLEMTMPDHILQAVTDAKGFTSIIASHHDPQCKLSWKSGSWIPFYNKALQYGDVIKLVGVAREMADNFALTNFKAKMLAAHDNKPMIALNMGTAGKLSRVLNGFLTPVSHPALPSKAAPGQLSATEIRQALSLIGEIEPKSFYLFGKPISASRSPALHNTLFYKTGLPHHYSRFETDEASKALESLIRSPDFGGASVTIPLKLDIMPLLDSATDAARTIGAVNTIIPQTRDGSTTTLVGDNTDWRGMVHALLHSSGSGSVVQRTAAPRGAAMVVGSGGTARAAIYALHDLGFAPIWIVARSEERVAELVRGFDGYDLRRMTSPHQGKDNMPSVVISTIPATQPIDPSMREVIVEVLKHGHPSAEGKVLLEMAYQPPRTPLMTLAEDQGWRTVGGLEVLAAQGWYQFQLWTGITPLYEEARAAVMGEDSVELEHHHHHH
1 B 1 E

it's google drive link for all PDB_files

-------------------- below is utility script what I test --------------------

from simtk.openmm.app import *
from simtk.openmm import *
from simtk.unit import *
from sys import stdout

from pdbfixer import PDBFixer
import mdtraj as md

def savepdb(pdb_id='6btc'):
    from pypdb import get_pdb_file
    pdb_data = get_pdb_file(pdb_id, filetype='pdb', compression=False)
    filename = str(pdb_id)+'.pdb'
    textfile = open(filename, 'w')
    textfile.write(pdb_data)
    textfile.close()

def select_chain(filename='/content/6btc.pdb', chain='A'):
    import os
    from Bio.PDB import Select, PDBIO
    from Bio.PDB.PDBParser import PDBParser

    dir, file = os.path.split(filename)
    filebody, file_ext=os.path.splitext(file)
    write_filename = os.path.join(dir, filebody+'_'+str(chain)+file_ext)

    p = PDBParser()
    structure  = p.get_structure('new', filename )

    class ChainSelect(Select):
        def __init__(self, chain):
            self.chain = chain

        def accept_chain(self, chain):
            if chain.get_id() == self.chain:
                return 1
            else:          
                return 0
    new = PDBIO()
    new.set_structure(structure)
    new.save(write_filename, ChainSelect(chain) )

    print("force to write PDB with only Chain", chain)    

def pdb2fasta(filename='/content/6btc.pdb'):
    import sys
    import os
    import re

    letters = {'ALA':'A','ARG':'R','ASN':'N','ASP':'D','CYS':'C','GLU':'E','GLN':'Q','GLY':'G','HIS':'H',
           'ILE':'I','LEU':'L','LYS':'K','MET':'M','PHE':'F','PRO':'P','SER':'S','THR':'T','TRP':'W','MSE':'M',
           'TYR':'Y','VAL':'V'}

    patterns=re.compile("^ATOM\s{2,6}\d{1,5}\s{2}CA\s[\sA]([A-Z]{3})\s([\s\w])|^HETATM\s{0,4}\d{1,5}\s{2}CA\s[\sA](MSE)\s([\s\w])")
    fp=open(filename,'r') 

    chain_dict=dict()
    chain_list=[]    

    for i, line in enumerate(fp.read().splitlines()):
        if line.startswith("ENDMDL"):
            break

        match_list=patterns.findall(line)

        if match_list:
            #print(i,line)
            resn  = match_list[0][0]+match_list[0][2]
            chain = match_list[0][1]+match_list[0][3]
            #print(i, chain, resn, letters[resn],  line)
            if chain in chain_dict:
                chain_dict[chain] += letters[resn]
            else:
                chain_dict[chain]  = letters[resn]
                chain_list.append(chain)

    fp.close()         

    for i,chain in enumerate(chain_list):
         print(i, chain, len(chain_dict[chain]), chain_dict[chain] )

def fix_missing(input, output ):
    from pdbfixer import PDBFixer
    fixer = PDBFixer(filename=input)
    fixer.findMissingResidues()
    print(fixer.missingResidues )
    fixer.findNonstandardResidues()
    fixer.replaceNonstandardResidues()
    fixer.removeHeterogens(False)
    fixer.findMissingAtoms()
    fixer.addMissingAtoms()
    print(fixer.missingResidues )
    fixer.addMissingHydrogens(7.0)
    #fixer.addSolvent(fixer.topology.getUnitCellDimensions())
    #fixer.addSolvent(padding=1.0*nanometer) #at least 1 nm of water surrounding the solute on all sides.
    PDBFile.writeFile(fixer.topology, fixer.positions, open(output, 'w'))

def add_water(input, output ):
    from pdbfixer import PDBFixer
    fixer = PDBFixer(filename=input)
    fixer.findMissingResidues()
    fixer.findNonstandardResidues()
    fixer.replaceNonstandardResidues()
    fixer.removeHeterogens(False)
    fixer.findMissingAtoms()
    fixer.addMissingAtoms()
    fixer.addMissingHydrogens(7.0)
    #fixer.addSolvent(fixer.topology.getUnitCellDimensions())
    fixer.addSolvent(padding=1.0*nanometer) #at least 1 nm of water surrounding the solute on all sides.
    PDBFile.writeFile(fixer.topology, fixer.positions, open(output, 'w'))    
peastman commented 2 years ago

Can you create a simpler example that uses only PDBFixer, without Biopython or PDBTools? It can download the PDB file for you, so you don't need anything else to do that:

fixer = PDBFixer(pdbid='6hqv')

It also can remove the unwanted chains with fixer.removeChains() so you don't need anything else for that either.

yhgon commented 2 years ago

I evaluate it with only PDBFixer.
and I found the reason.

fixer.removeChains cannot seperate chainA. I guess biopython and PDBtools also have problem for these issue.

def download_pdb(pdbid, chain_id='A'):
    from openmm.app import PDBFile
    from pdbfixer import PDBFixer
    fixer = PDBFixer(pdbid=pdbid)
    ## select first chain
    fixer.removeChains(chainIds=['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I' ]) 
    fixer.removeHeterogens(False)    
    output_pdb = '{}_{}.pdb'.format(pdbid,chain_id)
    PDBFile.writeFile(fixer.topology, fixer.positions, open(output_pdb, 'w'))

download_pdb(pdbid='6hqv')

output have below results and ATOM 11471 N GLU B is problem.

ATOM  11459  CG1 VAL A1516     104.078 112.253  12.457  1.00  0.00           C  
ATOM  11460  CG2 VAL A1516     105.582 113.311  10.763  1.00  0.00           C  
ATOM  11461  N   GLU A1517     105.519 109.079   9.275  1.00  0.00           N  
ATOM  11462  CA  GLU A1517     105.696 108.512   7.923  1.00  0.00           C  
ATOM  11463  C   GLU A1517     105.135 107.093   7.863  1.00  0.00           C  
ATOM  11464  O   GLU A1517     103.927 106.903   7.742  1.00  0.00           O  
ATOM  11465  CB  GLU A1517     107.175 108.493   7.505  1.00  0.00           C  
ATOM  11466  CG  GLU A1517     108.157 108.167   8.604  1.00  0.00           C  
ATOM  11467  CD  GLU A1517     109.597 108.351   8.173  1.00  0.00           C  
ATOM  11468  OE1 GLU A1517     110.497 107.939   8.942  1.00  0.00           O  
ATOM  11469  OE2 GLU A1517     109.820 108.910   7.070  1.00  0.00           O  
TER   11470      GLU A1517
ATOM  11471  N   GLU B   1      84.985 174.967  28.676  1.00  0.00           N  
ATOM  11472  CA  GLU B   1      84.095 174.330  27.668  1.00  0.00           C  
ATOM  11473  C   GLU B   1      84.611 172.951  27.233  1.00  0.00           C  
ATOM  11474  O   GLU B   1      83.897 172.195  26.567  1.00  0.00           O  
ATOM  11475  CB  GLU B   1      83.955 175.248  26.458  1.00  0.00           C  
ATOM  11476  CG  GLU B   1      82.570 175.251  25.831  1.00  0.00           C  
ATOM  11477  CD  GLU B   1      81.630 176.243  26.486  1.00  0.00           C  
ATOM  11478  OE1 GLU B   1      81.035 177.065  25.760  1.00  0.00           O  
ATOM  11479  OE2 GLU B   1      81.492 176.198  27.723  1.00  0.00           O  
ATOM  11480  OXT GLU B   1      85.739 172.548  27.540  1.00  0.00           O  
TER   11481      GLU B   1
END

when I check original PDB file 6hqv,
TOM 11471 N GLU B 1 , it came from HETATM23437 ~ HETATM23446

HETATM23437  N   GLU A1603      84.985 174.967  28.676  1.00 97.46           N  
ANISOU23437  N   GLU A1603    12736  12114  12179    840   -451   -515       N  
HETATM23438  CA  GLU A1603      84.095 174.330  27.668  1.00 97.25           C  
ANISOU23438  CA  GLU A1603    12712  12128  12110    812   -426   -445       C  
HETATM23439  C   GLU A1603      84.611 172.951  27.233  1.00 98.81           C  
ANISOU23439  C   GLU A1603    12895  12346  12302    802   -445   -493       C  
HETATM23440  O   GLU A1603      83.897 172.195  26.567  1.00101.12           O  
ANISOU23440  O   GLU A1603    13197  12671  12551    775   -433   -457       O  
HETATM23441  CB  GLU A1603      83.955 175.248  26.458  1.00 98.77           C  
ANISOU23441  CB  GLU A1603    12875  12337  12313    752   -377   -390       C  
HETATM23442  CG  GLU A1603      82.570 175.251  25.831  1.00 98.68           C  
ANISOU23442  CG  GLU A1603    12873  12356  12262    730   -368   -275       C  
HETATM23443  CD  GLU A1603      81.630 176.243  26.486  1.00 94.26           C  
ANISOU23443  CD  GLU A1603    12315  11747  11751    772   -357   -210       C  
HETATM23444  OE1 GLU A1603      81.035 177.065  25.760  1.00 90.98           O  
ANISOU23444  OE1 GLU A1603    11872  11324  11369    742   -345   -112       O  
HETATM23445  OE2 GLU A1603      81.492 176.198  27.723  1.00 91.28           O  
ANISOU23445  OE2 GLU A1603    11964  11336  11383    828   -359   -258       O  
HETATM23446  OXT GLU A1603      85.739 172.548  27.540  1.00 94.34           O  
ANISOU23446  OXT GLU A1603    12299  11757  11788    819   -475   -573       O  

so when I check the sequence, it show two chain.

0 A 1517 EPTRIAILGKEDIIVDHGIWLNFVAHDLLQTLPSSTYVLITDTNLYTTYVPPFQAVFEAAAPRDVRLLTYAIPPGEYSKSRETKAEIEDWLSHACTRDTVIIALGGGVIGDIGYVAATFRGVRFVQVPTTLLAVDSSIGGKTAIDTPGKNLIGAFWQPRRIYIDLAFLETLPVREFINGAEVIKTAAIWNETEFTALEENAAAILEAVRSKASSPAARLAPIRHILKRIVLGSARVKAEVVSADEREGGLRNLLNFGHSIGHAYEAILAPQVLHGECVAIGVKEAELARYLGVLRPSAVARLTKLIASYDLPTSVHDKRIAKLSAGKECPVDVLLQKAVDKKNEGRKKKIVLLSAIGKTYEKKATVVDDRAIRLVLSPSVRVTPGVPKGLSVTVTPPGSKSISNRALVLAALGEGTTRIHGLLHSDDVQYLAAIEQLHGADFSWEDAGEILVVTGKGGKLQASKEPLYLGNAGTASRFLTSVVALCAPSAVSSTVLTGNARKVRPIGALVDALRANGVGVKYLEKEKSLPVEVDAAGGFAGGVIELAATVSSQYVSSILAAPYAHQPVTLRLVGGKPISQPYIDTIAASFGIKVERSAEDPNTYLIPKGVYKNPPEYVVESDASSATYPLAVAAITGTTCTIPNIGSESLQGDARFAVEVLRPGCAVEQTATSTTVTGPPIGTLKAIPHVDEPTDAFLTAAVLAAVADGTTQITGIANQRVKECNRIAAKDQLAKFGVQCNELEDGIEVIGKPYQELRNPVEGIYCYDDHRVASHSVLSTISPHPVLILERECTAKTWPGWWDILSQFFKVQLDGEEDPTGTDRSIFIVGRGAGKSTAGRWSELLKRPLVDLDAELERREGTIPEIIRGERGWEGFRQAELELLQDVIKNQSKGYIFSCGGGIVETEAARKLLIDYHKNGGPVLLVHRDTDQVVEYLRDKTRPAYSENIREVYERRKPWFYECSNLQYHSPHEDGSEALLQPPADFARFVKLIAGQSTHLEDVRAKKHSFFVSLTVPNVADALDIIPRVVVGSDAVELRVDLLESYEPEFVARQVALLRAAAQVPIVYTVRTQSQGGKFPDEDYDLALRLYQTGLRSGVEYLDLETPDHILQAVTDAKGFTSIIASHHDPQCKLSWKSGSWIPFYNKALQYGDVIKLVGVAREADNFALTNFKAKLAAHDNKPIALNGTAGKLSRVLNGFLTPVSHPALPSKAAPGQLSATEIRQALSLIGEIEPKSFYLFGKPISASRSPALHNTLFYKTGLPHHYSRFETDEASKALESLIRSPDFGGASVTIPLKLDIPLLDSATDAARTIGAVNTIIPQTRDGSTTTLVGDNTDWRGVHALLHSSGSGSVVQRTAAPRGAAVVGSGGTARAAIYALHDLGFAPIWIVARSEERVAELVRGFDGYDLRRTSPHQGKDNPSVVISTIPATQPIDPSREVIVEVLKHGHPSAEGKVLLEAYQPPRTPLTLAEDQGWRTVGGLEVLAAQGWYQFQLWTGITPLYEEARAAVGEDSVE
1 B 1 E

I rerun load PDB file and select chain A, it eliminate GLU in chain B. however, It cannot detect missing residues after run.

yhgon commented 2 years ago

I implement it with redundant script. it make output what I expected.

download 1818.9ms
select chain 190.8ms
detect missing residues  1.9ms
{(0, 0): ['MET', 'ALA', 'THR', 'ALA', 'ASN', 'VAL', 'ALA', 'GLY', 'ALA', 'GLY', 'GLY', 'SER', 'GLY', 'SER'], 
(0, 839): ['LYS', 'ARG', 'THR', 'THR', 'GLN', 'SER', 'THR', 'GLN', 'GLN', 'VAL', 'ARG', 'LYS'], 
(0, 1555): ['LEU', 'GLU', 'HIS', 'HIS', 'HIS', 'HIS', 'HIS', 'HIS']}
remove heterogens 580.8ms
add Missing Atoms 97680.9ms
save tmp file and load 791.8ms
select chain 323.3ms
remove small molecules  0.0ms
save file 128.2ms
0 A 1589 MATANVAGAGGSGSEPTRIAILGKEDIIVDHGIWLNFVAHDLLQTLPSSTYVLITDTNLYTTYVPPFQAVFEAAAPRDVRLLTYAIPPGEYSKSRETKAEIEDWMLSHACTRDTVIIALGGGVIGDMIGYVAATFMRGVRFVQVPTTLLAMVDSSIGGKTAIDTPMGKNLIGAFWQPRRIYIDLAFLETLPVREFINGMAEVIKTAAIWNETEFTALEENAAAILEAVRSKASSPAARLAPIRHILKRIVLGSARVKAEVVSADEREGGLRNLLNFGHSIGHAYEAILAPQVLHGECVAIGMVKEAELARYLGVLRPSAVARLTKLIASYDLPTSVHDKRIAKLSAGKECPVDVLLQKMAVDKKNEGRKKKIVLLSAIGKTYEKKATVVDDRAIRLVLSPSVRVTPGVPKGLSVTVTPPGSKSISNRALVLAALGEGTTRIHGLLHSDDVQYMLAAIEQLHGADFSWEDAGEILVVTGKGGKLQASKEPLYLGNAGTASRFLTSVVALCAPSAVSSTVLTGNARMKVRPIGALVDALRANGVGVKYLEKEKSLPVEVDAAGGFAGGVIELAATVSSQYVSSILMAAPYAHQPVTLRLVGGKPISQPYIDMTIAMMASFGIKVERSAEDPNTYLIPKGVYKNPPEYVVESDASSATYPLAVAAITGTTCTIPNIGSESLQGDARFAVEVLRPMGCAVEQTATSTTVTGPPIGTLKAIPHVDMEPMTDAFLTAAVLAAVADGTTQITGIANQRVKECNRIAAMKDQLAKFGVQCNELEDGIEVIGKPYQELRNPVEGIYCYDDHRVAMSHSVLSTISPHPVLILERECTAKTWPGWWDILSQFFKVQLDGEEDPTKRTTQSTQQVRKGTDRSIFIVGMRGAGKSTAGRWMSELLKRPLVDLDAELERREGMTIPEIIRGERGWEGFRQAELELLQDVIKNQSKGYIFSCGGGIVETEAARKLLIDYHKNGGPVLLVHRDTDQVVEYLMRDKTRPAYSENIREVYERRKPWFYECSNLQYHSPHEDGSEALLQPPADFARFVKLIAGQSTHLEDVRAKKHSFFVSLTVPNVADALDIIPRVVVGSDAVELRVDLLESYEPEFVARQVALLRAAAQVPIVYTVRTQSQGGKFPDEDYDLALRLYQTGLRSGVEYLDLEMTMPDHILQAVTDAKGFTSIIASHHDPQCKLSWKSGSWIPFYNKALQYGDVIKLVGVAREMADNFALTNFKAKMLAAHDNKPMIALNMGTAGKLSRVLNGFLTPVSHPALPSKAAPGQLSATEIRQALSLIGEIEPKSFYLFGKPISASRSPALHNTLFYKTGLPHHYSRFETDEASKALESLIRSPDFGGASVTIPLKLDIMPLLDSATDAARTIGAVNTIIPQTRDGSTTTLVGDNTDWRGMVHALLHSSGSGSVVQRTAAPRGAAMVVGSGGTARAAIYALHDLGFAPIWIVARSEERVAELVRGFDGYDLRRMTSPHQGKDNMPSVVISTIPATQPIDPSMREVIVEVLKHGHPSAEGKVLLEMAYQPPRTPLMTLAEDQGWRTVGGLEVLAAQGWYQFQLWTGITPLYEEARAAVMGEDSVELEHHHHHH

def fixing_pdb(pdbid, chain_id='A'):
    from openmm.app import PDBFile
    from pdbfixer import PDBFixer
    import time
    tic = time.time()
    fixer = PDBFixer(pdbid=pdbid)
    toc = time.time()
    dur = toc-tic     
    print("download {:4.1f}ms".format(dur*1000))

    ## select first chain
    tic = time.time()
    fixer.removeChains(chainIds=['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I' ]) 
    toc = time.time()
    dur = toc-tic 
    print("select chain {:4.1f}ms".format(dur*1000))

    ## detect missing residues
    tic = time.time()    
    fixer.findMissingResidues()
    toc = time.time()
    dur = toc-tic 
    print("detect missing residues {:4.1f}ms".format(dur*1000))

    print(fixer.missingResidues )
    tic = time.time()   
    fixer.findNonstandardResidues()
    fixer.replaceNonstandardResidues()
    fixer.removeHeterogens(False)
    toc = time.time()
    dur = toc-tic 
    print("remove heterogens {:4.1f}ms".format(dur*1000))    

    tic = time.time()    
    fixer.findMissingAtoms()
    fixer.addMissingAtoms()
    toc = time.time()
    dur = toc-tic     
    print("add Missing Atoms {:4.1f}ms".format(dur*1000))    

    tic = time.time()
    PDBFile.writeFile(fixer.topology, fixer.positions, open('tmp.pdb', 'w'))
    fixer = PDBFixer(filename='tmp.pdb')
    toc = time.time()
    dur = toc-tic     
    print("save tmp file and load {:4.1f}ms".format(dur*1000))

    ## select first chain
    tic = time.time()
    fixer.removeChains(chainIds=['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I' ]) 
    toc = time.time()
    dur = toc-tic 
    print("select chain {:4.1f}ms".format(dur*1000))

    ## remove heterogens
    tic = time.time()
    #fixer.removeHeterogens(False)    
    toc = time.time()
    dur = toc-tic 
    print("remove small molecules {:4.1f}ms".format(dur*1000))

    tic = time.time()    
    output_pdb = '{}_{}.pdb'.format(pdbid,chain_id)
    PDBFile.writeFile(fixer.topology, fixer.positions, open(output_pdb, 'w'))
    toc = time.time()
    dur = toc-tic     
    print("save file {:4.1f}ms".format(dur*1000))
yhgon commented 2 years ago

I evaluate OpenMM simulation and the output works well.