Closed Leo-T-Zang closed 1 month ago
Hi @Leo-T-Zang, the sequence database is in long form: it contains the sequence for every single PDB file that exists in the database. Depending on your use case, if you want to map this onto the pinder index (where everything is paired to partners/interactions), I think the following should work:
import pandas as pd

from pinder.core import get_index
from pinder.core.index.utils import get_sequence_database
# Attach the sequences stored in the long-form sequence database to the pinder
# index, one group of `pdb_kind` at a time. Each merge is a left join, so index
# rows without a matching sequence entry are kept and get NaN in the new column.
seq_db = get_sequence_database()
seq_index = get_index()
for pdb_kind, sub_db in seq_db.groupby("pdb_kind", observed=True):
    # Use the non-mutating drop/rename/assign variants: `sub_db` is a slice
    # produced by groupby, and modifying it in place can trigger pandas'
    # SettingWithCopyWarning.
    sub_db = sub_db.drop(columns="pdb_kind")
    if pdb_kind in ["receptor", "ligand"]:
        # Holo R/L
        map_col = f"holo_{pdb_kind[0].upper()}_pdb"
        sub_db = sub_db.rename(columns={"pdb": map_col, "sequence": f"{pdb_kind}_sequence"})
        # No explicit `on=`: pandas joins on the columns the two frames share
        # (presumably just `map_col` after the rename -- verify against the index schema).
        seq_index = pd.merge(seq_index, sub_db, how="left")
    elif pdb_kind == "dimer":
        # ID column doesn't contain .pdb suffix, just strip it from sub_db and rename to pinder ID.
        # An anchored regex removes only the literal ".pdb" suffix; `str.rstrip(".pdb")`
        # would instead strip ANY trailing run of the characters '.', 'p', 'd', 'b'
        # and could truncate IDs that happen to end in one of those letters.
        sub_db = sub_db.assign(pdb=sub_db["pdb"].str.replace(r"\.pdb$", "", regex=True))
        sub_db = sub_db.rename(columns={"pdb": "id", "sequence": "dimer_sequence"})
        seq_index = pd.merge(seq_index, sub_db, how="left")
    elif pdb_kind == "predicted":
        # Merge with both receptor and ligand (might be the same for both if uniprot IDs identical)
        seq_index = pd.merge(
            seq_index,
            sub_db.rename(columns={"pdb": f"{pdb_kind}_R_pdb", "sequence": f"{pdb_kind}_R_sequence"}),
            how="left",
        )
        seq_index = pd.merge(
            seq_index,
            sub_db.rename(columns={"pdb": f"{pdb_kind}_L_pdb", "sequence": f"{pdb_kind}_L_sequence"}),
            how="left",
        )
    elif pdb_kind == "monomer":
        # Can only get the canonical apo_R + apo_L like this. See below for how to get
        # exhaustive mappings of apo sequences (many to one).
        seq_index = pd.merge(
            seq_index,
            sub_db.rename(columns={"pdb": "apo_R_pdb", "sequence": "apo_R_sequence"}),
            how="left",
        )
        seq_index = pd.merge(
            seq_index,
            sub_db.rename(columns={"pdb": "apo_L_pdb", "sequence": "apo_L_sequence"}),
            how="left",
        )
    else:
        # Fail loudly on a kind this mapping does not know how to attach.
        raise ValueError(f"Unknown pdb_kind: {pdb_kind}")

# Show three random index rows with every sequence column that was attached above.
print(
    seq_index[
        [
            "split",
            "id",
            "pdb_id",
            "dimer_sequence",
            "receptor_sequence",
            "ligand_sequence",
            "predicted_R_sequence",
            "predicted_L_sequence",
            "apo_R_sequence",
            "apo_L_sequence",
        ]
    ]
    .sample(3)
    .to_markdown(index=False)
)
split | id | pdb_id | dimer_sequence | receptor_sequence | ligand_sequence | predicted_R_sequence | predicted_L_sequence | apo_R_sequence | apo_L_sequence |
---|---|---|---|---|---|---|---|---|---|
train | 6la5__A24_Q2LJ73--6la5__A25_Q2LJ73 | 6la5 | VVEAVENAVARVADTISSGPSNSQAVPALTAVETGHTSQVTPSDTIQTRHVRNYHSRSESSIENFLCRSACVYMGEYHTTNTDTSKLFASWTINARRMVQMRRKLELFTYVRFDMEVTFVITSKQDQGTQLGQDMPPLTHQIMYIPPGGPIPKSVTDYTWQTSTNPSIFWTEGNAPPRMSIPFISIGNAYSNFYDGWSHFSQNGVYGYNTLNHMGQIYVRHVNGSSPLPMTSTVRMYFKPKHVKVWVPRPPRLCQYKNASTVNFTPTNITEKRQSINYIPETVKPVVEAVENAVARVADTISSGPSNSQAVPALTAVETGHTSQVTPSDTIQTRHVRNYHSRSESSIENFLCRSACVYMGEYHTTNTDTSKLFASWTINARRMVQMRRKLELFTYVRFDMEVTFVITSKQDQGTQLGQDMPPLTHQIMYIPPGGPIPKSVTDYTWQTSTNPSIFWTEGNAPPRMSIPFISIGNAYSNFYDGWSHFSQNGVYGYNTLNHMGQIYVRHVNGSSPLPMTSTVRMYFKPKHVKVWVPRPPRLCQYKNASTVNFTPTNITEKRQSINYIPETVKP | VVEAVENAVARVADTISSGPSNSQAVPALTAVETGHTSQVTPSDTIQTRHVRNYHSRSESSIENFLCRSACVYMGEYHTTNTDTSKLFASWTINARRMVQMRRKLELFTYVRFDMEVTFVITSKQDQGTQLGQDMPPLTHQIMYIPPGGPIPKSVTDYTWQTSTNPSIFWTEGNAPPRMSIPFISIGNAYSNFYDGWSHFSQNGVYGYNTLNHMGQIYVRHVNGSSPLPMTSTVRMYFKPKHVKVWVPRPPRLCQYKNASTVNFTPTNITEKRQSINYIPETVKP | VVEAVENAVARVADTISSGPSNSQAVPALTAVETGHTSQVTPSDTIQTRHVRNYHSRSESSIENFLCRSACVYMGEYHTTNTDTSKLFASWTINARRMVQMRRKLELFTYVRFDMEVTFVITSKQDQGTQLGQDMPPLTHQIMYIPPGGPIPKSVTDYTWQTSTNPSIFWTEGNAPPRMSIPFISIGNAYSNFYDGWSHFSQNGVYGYNTLNHMGQIYVRHVNGSSPLPMTSTVRMYFKPKHVKVWVPRPPRLCQYKNASTVNFTPTNITEKRQSINYIPETVKP | nan | nan | nan | nan |
invalid | 1ruf__B54_P03303--1ruf__D9_P03303 | 1ruf | INYYKDAASTSSAGQSLSMDPSKFTEPVKDLMLKGAPALNGYSDRVQQITLGNSTITTQEAANAVVCYAEWPEYLPDVDASDVNKTSKPDTSVCRFYTLDSKTWTTGSKGWCWKLPDALKDMGVFGQNMFFHSLGRSGYTVHVQCNATKFHSGCLLVVVIPEHQLASHEGGNVSVKYTFTHPGERGIDLSSANEVGGPVKDVLYNMNGTLLGNLLIFPHQFINLRTNNTATIVIPYINSVPIDSMTRHNNVSLMVIPIAPLTVPTGATPSLPITVTIAPMCTEFSGIRSKSIVPQ | GYSDRVQQITLGNSTITTQEAANAVVCYAEWPEYLPDVDASDVNKTSKPDTSVCRFYTLDSKTWTTGSKGWCWKLPDALKDMGVFGQNMFFHSLGRSGYTVHVQCNATKFHSGCLLVVVIPEHQLASHEGGNVSVKYTFTHPGERGIDLSSANEVGGPVKDVLYNMNGTLLGNLLIFPHQFINLRTNNTATIVIPYINSVPIDSMTRHNNVSLMVIPIAPLTVPTGATPSLPITVTIAPMCTEFSGIRSKSIVPQ | INYYKDAASTSSAGQSLSMDPSKFTEPVKDLMLKGAPALN | nan | nan | nan | nan |
invalid | 2c8i__E31_P08174--2c8i__C32_P29813 | 2c8i | QDCGLPPDVPNAQPALEGRTSFPEDTVITYKCEESFVKIPGEKDSVICLKGSQWSDIEEFCNRSCEVPTRLNSASLKQPYITQNYFPVGTVVEYECRPGYRREPSLSPKLTCLQNLKWSTAVEFCKKKSCPNPGEIRNGQIDVPGGILFGATISFSCNTGYKLFGSTSSFCLISGSSVQWSDPLPECREIYCPAPPQIDNGIIQGERDHYGYRQSVTYACNKGFTMIGEHSIYCTVNNDEGEWSGPPPECRGGLPVINTPGSNQFLTSDDFQSPSAMPQFDVTPELNIPGEVQNLMEIAEVDSVVPVNNVAGNLETMDIYRIPVQSGNHQSSQVFGFQVQPGLDGVFKHTLLGEILNYYAHWSGSIKLTFVFCGSAMATGKFLLAYAPPGANAPKSRKDAMLGTHIIWDVGLQSSCVLCIPWISQTHYRLVQQDEYTSAGNVTCWYQTGIVVPAGTPTSCSIMCFVSACNDFSVRLLKDTPFIQQAALLQ | QDCGLPPDVPNAQPALEGRTSFPEDTVITYKCEESFVKIPGEKDSVICLKGSQWSDIEEFCNRSCEVPTRLNSASLKQPYITQNYFPVGTVVEYECRPGYRREPSLSPKLTCLQNLKWSTAVEFCKKKSCPNPGEIRNGQIDVPGGILFGATISFSCNTGYKLFGSTSSFCLISGSSVQWSDPLPECREIYCPAPPQIDNGIIQGERDHYGYRQSVTYACNKGFTMIGEHSIYCTVNNDEGEWSGPPPECRG | GLPVINTPGSNQFLTSDDFQSPSAMPQFDVTPELNIPGEVQNLMEIAEVDSVVPVNNVAGNLETMDIYRIPVQSGNHQSSQVFGFQVQPGLDGVFKHTLLGEILNYYAHWSGSIKLTFVFCGSAMATGKFLLAYAPPGANAPKSRKDAMLGTHIIWDVGLQSSCVLCIPWISQTHYRLVQQDEYTSAGNVTCWYQTGIVVPAGTPTSCSIMCFVSACNDFSVRLLKDTPFIQQAALLQ | MTVARPSVPAALPLLGELPRLLLLVLLCLPAVWGDCGLPPDVPNAQPALEGRTSFPEDTVITYKCEESFVKIPGEKDSVICLKGSQWSDIEEFCNRSCEVPTRLNSASLKQPYITQNYFPVGTVVEYECRPGYRREPSLSPKLTCLQNLKWSTAVEFCKKKSCPNPGEIRNGQIDVPGGILFGATISFSCNTGYKLFGSTSSFCLISGSSVQWSDPLPECREIYCPAPPQIDNGIIQGERDHYGYRQSVTYACNKGFTMIGEHSIYCTVNNDEGEWSGPPPECRGKSLTSKVPPTVQKPTTVNVPTTEVSPTSQKTTTKTTTPNAQATRSTPVSRTTKHFHETTPNKGSGTTSGTTRLLSGHTCFTLTGLLGTLVTMGLLT | nan | nan | nan |
If you want to get the full apo pairings mapped in long form to the holo dimer IDs, you can further extend the code above like this:
# Build a long-form table of apo monomer pairings per holo dimer ID: one row per
# (id, apo_pdb, apo_side), with the apo sequence attached where one is available.
apo_frames = {}
for side_code, side_name in (("R", "receptor"), ("L", "ligand")):
    pdbs_col = f"apo_{side_code}_pdbs"
    # Keep only index rows whose apo column is not the empty string.
    has_apo = seq_index[pdbs_col] != ""
    side_df = seq_index.loc[has_apo, ["id", pdbs_col]].reset_index(drop=True)
    # The column holds ';'-separated entries; split them and emit one row per entry.
    side_df[pdbs_col] = [entry.split(";") for entry in side_df[pdbs_col]]
    side_df = side_df.explode(pdbs_col).rename(columns={pdbs_col: "apo_pdb"})
    side_df["apo_side"] = side_name
    apo_frames[side_code] = side_df
apoR, apoL = apo_frames["R"], apo_frames["L"]

# Stack both sides, then drop exact duplicate rows and renumber the index.
stacked = pd.concat([apoR, apoL], ignore_index=True)
apo_pairings = stacked.drop_duplicates().reset_index(drop=True)

# Monomer entries of the sequence database, renamed so the merge below joins on "apo_pdb".
monomer_seqs = seq_db.query("pdb_kind == 'monomer'")
monomer_seqs = monomer_seqs.drop("pdb_kind", axis=1)
monomer_seqs = monomer_seqs.rename(columns={"pdb": "apo_pdb", "sequence": "apo_sequence"})

# Left join: pairings without a matching monomer sequence are kept with NaN.
apo_pairings = pd.merge(apo_pairings, monomer_seqs, how="left")
print(apo_pairings.head(5).to_markdown(index=False))
id | apo_pdb | apo_side | apo_sequence |
---|---|---|---|
6ikj__A1_Q9I4L6--6ikj__B1_Q9I4L6 | 4zhw__A1_Q9I4L6.pdb | receptor | GMSSKVLFGNNLDRLNPDSRNTLTKIARALLAVDIDKVRLEGHTDNYGDEGYNQKLSERRAESVAAVFREAGMPAANIEVRGLGMSKPVADNKTRAGRSENRRVAIIVPA |
3k1i__D1_O25709--3k1i__A1_O25448 | 3k1h__A1_O25709.pdb | receptor | FSRDMKNINESVGALQVLQIACKKLFNKSMGLEDKDALQASIIKQELREIVENCQFLASPLFDTQLNIAINDEIFSMIVVNPLDLLENVGEFQAYLEEKLNEIKELLGYLSESLS |
1sc1__A1_P29466--1sc1__B1_P29466 | 8wra__A1_P29466.pdb | receptor | EGNVKLCSLEEAQRIWKQKSAEIYPIMDKSSRTRLALIICNEEFDSIPRRTGAEVDITGMTMLLQNLGYSVDVKKNLTASDMTTELEAFAHRPEHKTSDSTFLVFMSHGIREGICGKKHSEQVPDILQLNAIFNMLNTKNCPSLKDKPKVIIIQACRGDSPGVVWFKDAIKKAHIEKDFIAFCSSTPDGSVFIGRLIEHMQEYACSCDVEEIFRKVRFSFEQPDGRAQMPATERVTLTRCFYLFPGH |
2qh0__A1_Q97H22--2qh0__A2_Q97H22 | 3hdp__A1_Q97H22.pdb | receptor | SHMSLKVHHIGYAVKNIDSALKKFKRLGYVEESEVVRDEVRKVYIQFVINGGYRVELVAPDGEDSPINKTIKKGSTPYHICYEVEDIQKSIEEMSQIGYTLFKKAEIAPAIDNRKVAFLFSTDIGLIELLEK |
5hs5__A1_Q2G0D1--5hs5__B1_Q2G0D1 | 5ywj__A1_Q2G0D1.pdb | receptor | LLGFYKQYKALSEYIDKKYKLSLNDLAVLDLTMKHCKDEKVLMQSFLKTAMDELDLSRTKLLVSIRRLIEKERLSKVRSSKDERKIYIYLNNDDISKFNALFEDVEQFLN |
Let me know if this works!
Hi,
For sequence dataset
from pinder.core.index.utils import get_sequence_database
, it has a pdb_kind column with the following types: How can we find the interacting sequence/partner for each row or each type?
Thanks a lot!