pinder-org / pinder

PINDER: The Protein INteraction Dataset and Evaluation Resource
https://pinder-org.github.io/pinder/
Apache License 2.0
89 stars 6 forks source link

pdb_kind in sequence dataset #24

Open Leo-T-Zang opened 2 days ago

Leo-T-Zang commented 2 days ago

Hi,

The sequence dataset returned by get_sequence_database (from pinder.core.index.utils import get_sequence_database) has a pdb_kind column with the following types:

How can we find the interacting sequence/partner for each row or each type?

Thanks a lot!

danielkovtun commented 1 day ago

Hi @Leo-T-Zang, the sequence database is in long form: it contains the sequence of every single PDB file that exists in the database. Depending on your use case, if you want to map this onto the pinder index (where everything is paired to partners/interactions), I think the following should work:

import pandas as pd

from pinder.core import get_index
from pinder.core.index.utils import get_sequence_database

# Long-form sequence table (one row per PDB file) and the pinder index that the
# per-kind sequences get attached to as new columns.
seq_db = get_sequence_database()
seq_index = get_index()

for pdb_kind, sub_db in seq_db.groupby("pdb_kind", observed=True):
    # Build a new frame instead of dropping in place: the group is a slice of
    # seq_db, and in-place edits on it can trigger SettingWithCopyWarning.
    sub_db = sub_db.drop(columns="pdb_kind")
    if pdb_kind in ("receptor", "ligand"):
        # Holo R/L: join on holo_R_pdb / holo_L_pdb.
        map_col = f"holo_{pdb_kind[0].upper()}_pdb"
        sub_db = sub_db.rename(columns={"pdb": map_col, "sequence": f"{pdb_kind}_sequence"})
        seq_index = pd.merge(seq_index, sub_db, how="left", on=map_col)
    elif pdb_kind == "dimer":
        # ID column doesn't contain the .pdb suffix, so remove it before joining
        # on the pinder ID. removesuffix (pandas >= 1.4) drops only the literal
        # ".pdb"; rstrip(".pdb") would strip any trailing run of the characters
        # ".", "p", "d" and "b" and could eat into the ID itself.
        sub_db = sub_db.assign(pdb=sub_db["pdb"].str.removesuffix(".pdb"))
        sub_db = sub_db.rename(columns={"pdb": "id", "sequence": "dimer_sequence"})
        seq_index = pd.merge(seq_index, sub_db, how="left", on="id")
    elif pdb_kind in ("predicted", "monomer"):
        # predicted -> predicted_R/L columns (might be the same for both if
        # uniprot IDs are identical).
        # monomer -> apo_R/L columns. Can only get the canonical apo_R + apo_L
        # like this. See below for how to get exhaustive mappings of apo
        # sequences (many to one).
        prefix = "predicted" if pdb_kind == "predicted" else "apo"
        for side in ("R", "L"):
            map_col = f"{prefix}_{side}_pdb"
            renamed = sub_db.rename(columns={"pdb": map_col, "sequence": f"{prefix}_{side}_sequence"})
            seq_index = pd.merge(seq_index, renamed, how="left", on=map_col)
    else:
        raise ValueError(f"Unknown pdb_kind: {pdb_kind}")

# Show a few random rows with every sequence column that was attached above.
print(
    seq_index[["split", "id", "pdb_id", "dimer_sequence", "receptor_sequence", "ligand_sequence", "predicted_R_sequence", "predicted_L_sequence", "apo_R_sequence", "apo_L_sequence"]]
    .sample(3)
    .to_markdown(index=False)
)
split id pdb_id dimer_sequence receptor_sequence ligand_sequence predicted_R_sequence predicted_L_sequence apo_R_sequence apo_L_sequence
train 6la5__A24_Q2LJ73--6la5__A25_Q2LJ73 6la5 VVEAVENAVARVADTISSGPSNSQAVPALTAVETGHTSQVTPSDTIQTRHVRNYHSRSESSIENFLCRSACVYMGEYHTTNTDTSKLFASWTINARRMVQMRRKLELFTYVRFDMEVTFVITSKQDQGTQLGQDMPPLTHQIMYIPPGGPIPKSVTDYTWQTSTNPSIFWTEGNAPPRMSIPFISIGNAYSNFYDGWSHFSQNGVYGYNTLNHMGQIYVRHVNGSSPLPMTSTVRMYFKPKHVKVWVPRPPRLCQYKNASTVNFTPTNITEKRQSINYIPETVKPVVEAVENAVARVADTISSGPSNSQAVPALTAVETGHTSQVTPSDTIQTRHVRNYHSRSESSIENFLCRSACVYMGEYHTTNTDTSKLFASWTINARRMVQMRRKLELFTYVRFDMEVTFVITSKQDQGTQLGQDMPPLTHQIMYIPPGGPIPKSVTDYTWQTSTNPSIFWTEGNAPPRMSIPFISIGNAYSNFYDGWSHFSQNGVYGYNTLNHMGQIYVRHVNGSSPLPMTSTVRMYFKPKHVKVWVPRPPRLCQYKNASTVNFTPTNITEKRQSINYIPETVKP VVEAVENAVARVADTISSGPSNSQAVPALTAVETGHTSQVTPSDTIQTRHVRNYHSRSESSIENFLCRSACVYMGEYHTTNTDTSKLFASWTINARRMVQMRRKLELFTYVRFDMEVTFVITSKQDQGTQLGQDMPPLTHQIMYIPPGGPIPKSVTDYTWQTSTNPSIFWTEGNAPPRMSIPFISIGNAYSNFYDGWSHFSQNGVYGYNTLNHMGQIYVRHVNGSSPLPMTSTVRMYFKPKHVKVWVPRPPRLCQYKNASTVNFTPTNITEKRQSINYIPETVKP VVEAVENAVARVADTISSGPSNSQAVPALTAVETGHTSQVTPSDTIQTRHVRNYHSRSESSIENFLCRSACVYMGEYHTTNTDTSKLFASWTINARRMVQMRRKLELFTYVRFDMEVTFVITSKQDQGTQLGQDMPPLTHQIMYIPPGGPIPKSVTDYTWQTSTNPSIFWTEGNAPPRMSIPFISIGNAYSNFYDGWSHFSQNGVYGYNTLNHMGQIYVRHVNGSSPLPMTSTVRMYFKPKHVKVWVPRPPRLCQYKNASTVNFTPTNITEKRQSINYIPETVKP nan nan nan nan
invalid 1ruf__B54_P03303--1ruf__D9_P03303 1ruf INYYKDAASTSSAGQSLSMDPSKFTEPVKDLMLKGAPALNGYSDRVQQITLGNSTITTQEAANAVVCYAEWPEYLPDVDASDVNKTSKPDTSVCRFYTLDSKTWTTGSKGWCWKLPDALKDMGVFGQNMFFHSLGRSGYTVHVQCNATKFHSGCLLVVVIPEHQLASHEGGNVSVKYTFTHPGERGIDLSSANEVGGPVKDVLYNMNGTLLGNLLIFPHQFINLRTNNTATIVIPYINSVPIDSMTRHNNVSLMVIPIAPLTVPTGATPSLPITVTIAPMCTEFSGIRSKSIVPQ GYSDRVQQITLGNSTITTQEAANAVVCYAEWPEYLPDVDASDVNKTSKPDTSVCRFYTLDSKTWTTGSKGWCWKLPDALKDMGVFGQNMFFHSLGRSGYTVHVQCNATKFHSGCLLVVVIPEHQLASHEGGNVSVKYTFTHPGERGIDLSSANEVGGPVKDVLYNMNGTLLGNLLIFPHQFINLRTNNTATIVIPYINSVPIDSMTRHNNVSLMVIPIAPLTVPTGATPSLPITVTIAPMCTEFSGIRSKSIVPQ INYYKDAASTSSAGQSLSMDPSKFTEPVKDLMLKGAPALN nan nan nan nan
invalid 2c8i__E31_P08174--2c8i__C32_P29813 2c8i QDCGLPPDVPNAQPALEGRTSFPEDTVITYKCEESFVKIPGEKDSVICLKGSQWSDIEEFCNRSCEVPTRLNSASLKQPYITQNYFPVGTVVEYECRPGYRREPSLSPKLTCLQNLKWSTAVEFCKKKSCPNPGEIRNGQIDVPGGILFGATISFSCNTGYKLFGSTSSFCLISGSSVQWSDPLPECREIYCPAPPQIDNGIIQGERDHYGYRQSVTYACNKGFTMIGEHSIYCTVNNDEGEWSGPPPECRGGLPVINTPGSNQFLTSDDFQSPSAMPQFDVTPELNIPGEVQNLMEIAEVDSVVPVNNVAGNLETMDIYRIPVQSGNHQSSQVFGFQVQPGLDGVFKHTLLGEILNYYAHWSGSIKLTFVFCGSAMATGKFLLAYAPPGANAPKSRKDAMLGTHIIWDVGLQSSCVLCIPWISQTHYRLVQQDEYTSAGNVTCWYQTGIVVPAGTPTSCSIMCFVSACNDFSVRLLKDTPFIQQAALLQ QDCGLPPDVPNAQPALEGRTSFPEDTVITYKCEESFVKIPGEKDSVICLKGSQWSDIEEFCNRSCEVPTRLNSASLKQPYITQNYFPVGTVVEYECRPGYRREPSLSPKLTCLQNLKWSTAVEFCKKKSCPNPGEIRNGQIDVPGGILFGATISFSCNTGYKLFGSTSSFCLISGSSVQWSDPLPECREIYCPAPPQIDNGIIQGERDHYGYRQSVTYACNKGFTMIGEHSIYCTVNNDEGEWSGPPPECRG GLPVINTPGSNQFLTSDDFQSPSAMPQFDVTPELNIPGEVQNLMEIAEVDSVVPVNNVAGNLETMDIYRIPVQSGNHQSSQVFGFQVQPGLDGVFKHTLLGEILNYYAHWSGSIKLTFVFCGSAMATGKFLLAYAPPGANAPKSRKDAMLGTHIIWDVGLQSSCVLCIPWISQTHYRLVQQDEYTSAGNVTCWYQTGIVVPAGTPTSCSIMCFVSACNDFSVRLLKDTPFIQQAALLQ MTVARPSVPAALPLLGELPRLLLLVLLCLPAVWGDCGLPPDVPNAQPALEGRTSFPEDTVITYKCEESFVKIPGEKDSVICLKGSQWSDIEEFCNRSCEVPTRLNSASLKQPYITQNYFPVGTVVEYECRPGYRREPSLSPKLTCLQNLKWSTAVEFCKKKSCPNPGEIRNGQIDVPGGILFGATISFSCNTGYKLFGSTSSFCLISGSSVQWSDPLPECREIYCPAPPQIDNGIIQGERDHYGYRQSVTYACNKGFTMIGEHSIYCTVNNDEGEWSGPPPECRGKSLTSKVPPTVQKPTTVNVPTTEVSPTSQKTTTKTTTPNAQATRSTPVSRTTKHFHETTPNKGSGTTSGTTRLLSGHTCFTLTGLLGTLVTMGLLT nan nan nan

If you want the full apo pairings mapped in long form to the holo dimer IDs, you can extend the code above like this:

def _long_form_apo(index, column, side):
    """Explode one ';'-joined apo column of the index into one row per (id, apo_pdb)."""
    pairs = index[["id", column]].query(f"{column} != ''").reset_index(drop=True)
    # Each non-empty cell holds ';'-separated PDB names; turn it into a list so
    # explode() yields one row per name.
    pairs[column] = [joined.split(";") for joined in pairs[column]]
    pairs = pairs.explode(column).rename(columns={column: "apo_pdb"})
    pairs["apo_side"] = side
    return pairs


apoR = _long_form_apo(seq_index, "apo_R_pdbs", "receptor")
apoL = _long_form_apo(seq_index, "apo_L_pdbs", "ligand")

# Monomer rows of the sequence table, renamed so they join on apo_pdb.
monomer_seqs = (
    seq_db.query("pdb_kind == 'monomer'")
    .drop("pdb_kind", axis=1)
    .rename(columns={"pdb": "apo_pdb", "sequence": "apo_sequence"})
)

# Stack receptor + ligand pairings, drop exact duplicates, then attach sequences.
apo_pairings = pd.concat([apoR, apoL], ignore_index=True).drop_duplicates().reset_index(drop=True)
apo_pairings = pd.merge(apo_pairings, monomer_seqs, how="left")

print(apo_pairings.head(5).to_markdown(index=False))
id apo_pdb apo_side apo_sequence
6ikj__A1_Q9I4L6--6ikj__B1_Q9I4L6 4zhw__A1_Q9I4L6.pdb receptor GMSSKVLFGNNLDRLNPDSRNTLTKIARALLAVDIDKVRLEGHTDNYGDEGYNQKLSERRAESVAAVFREAGMPAANIEVRGLGMSKPVADNKTRAGRSENRRVAIIVPA
3k1i__D1_O25709--3k1i__A1_O25448 3k1h__A1_O25709.pdb receptor FSRDMKNINESVGALQVLQIACKKLFNKSMGLEDKDALQASIIKQELREIVENCQFLASPLFDTQLNIAINDEIFSMIVVNPLDLLENVGEFQAYLEEKLNEIKELLGYLSESLS
1sc1__A1_P29466--1sc1__B1_P29466 8wra__A1_P29466.pdb receptor EGNVKLCSLEEAQRIWKQKSAEIYPIMDKSSRTRLALIICNEEFDSIPRRTGAEVDITGMTMLLQNLGYSVDVKKNLTASDMTTELEAFAHRPEHKTSDSTFLVFMSHGIREGICGKKHSEQVPDILQLNAIFNMLNTKNCPSLKDKPKVIIIQACRGDSPGVVWFKDAIKKAHIEKDFIAFCSSTPDGSVFIGRLIEHMQEYACSCDVEEIFRKVRFSFEQPDGRAQMPATERVTLTRCFYLFPGH
2qh0__A1_Q97H22--2qh0__A2_Q97H22 3hdp__A1_Q97H22.pdb receptor SHMSLKVHHIGYAVKNIDSALKKFKRLGYVEESEVVRDEVRKVYIQFVINGGYRVELVAPDGEDSPINKTIKKGSTPYHICYEVEDIQKSIEEMSQIGYTLFKKAEIAPAIDNRKVAFLFSTDIGLIELLEK
5hs5__A1_Q2G0D1--5hs5__B1_Q2G0D1 5ywj__A1_Q2G0D1.pdb receptor LLGFYKQYKALSEYIDKKYKLSLNDLAVLDLTMKHCKDEKVLMQSFLKTAMDELDLSRTKLLVSIRRLIEKERLSKVRSSKDERKIYIYLNNDDISKFNALFEDVEQFLN

Let me know if this works!