biothings / mygeneset.info

Apache License 2.0
5 stars 3 forks source link

SMPDB parser with genesets and metabolite sets #35

Closed ravila4 closed 3 years ago

ravila4 commented 3 years ago

Refers to issue #33. The metabolites parser queries MyChem.info using InChIKey identifiers to verify that each compound exists in the database. It also fetches two additional IDs, pubchem.cid and chembl.molecule_chembl_id, as these are fairly common and were not found in the downloaded data.

def parse_metabolites(data_folder):                                                                    
    all_compounds = set()                                                                              
    fields = ['InChI Key']                                                                             
    for f in glob(os.path.join(data_folder, "*_metabolites.csv")):                                     
        tmp_df = pd.read_csv(f, usecols=fields).fillna("")                                             
        # Skip empty files                                                                             
        if len(tmp_df) == 0:                                                                           
            continue                                                                                   
        all_compounds = all_compounds | set(tmp_df['InChI Key'])                                       
    # Query MyChem.info                                                                                
    mc = biothings_client.MyChemInfo()                                                                 
    resp = mc.getchems(all_compounds, fields='pubchem.cid, chembl.molecule_chembl_id', dotfield=True)  
...

Final ES mapping:

{
    "source": {
        "normalizer": "keyword_lowercase_normalizer",
        "type": "keyword"
    },
    "taxid": {
        "type": "integer"
    },
    "smpdb": {
        "properties": {
        "id": {
            "normalizer": "keyword_lowercase_normalizer",
            "type": "keyword",
            "copy_to": [
            "all"
            ]
        },
        "pw_id": {
            "normalizer": "keyword_lowercase_normalizer",
            "type": "keyword",
            "copy_to": [
            "all"
            ]
        },
        "pathway_subject": {
            "type": "keyword"
        }
        }
    },
    "genes": {
        "properties": {
        "mygene_id": {
            "normalizer": "keyword_lowercase_normalizer",
            "type": "keyword"
        },
        "symbol": {
            "normalizer": "keyword_lowercase_normalizer",
            "type": "keyword"
        },
        "ncbigene": {
            "normalizer": "keyword_lowercase_normalizer",
            "type": "keyword"
        },
        "ensemblgene": {
            "normalizer": "keyword_lowercase_normalizer",
            "type": "keyword"
        },
        "uniprot": {
            "normalizer": "keyword_lowercase_normalizer",
            "type": "keyword"
        },
        "name": {
            "type": "text"
        }
        }
    },
    "metabolites": {
        "properties": {
        "mychem_id": {
            "normalizer": "keyword_lowercase_normalizer",
            "type": "keyword"
        },
        "smpdb_metabolite": {
            "normalizer": "keyword_lowercase_normalizer",
            "type": "keyword"
        },
        "hmdb": {
            "normalizer": "keyword_lowercase_normalizer",
            "type": "keyword"
        },
        "kegg_cid": {
            "normalizer": "keyword_lowercase_normalizer",
            "type": "keyword"
        },
        "chebi": {
            "normalizer": "keyword_lowercase_normalizer",
            "type": "keyword"
        },
        "drugbank": {
            "normalizer": "keyword_lowercase_normalizer",
            "type": "keyword"
        },
        "smiles": {
            "normalizer": "keyword_lowercase_normalizer",
            "type": "keyword"
        },
        "inchi": {
            "normalizer": "keyword_lowercase_normalizer",
            "type": "keyword"
        },
        "inchikey": {
            "normalizer": "keyword_lowercase_normalizer",
            "type": "keyword"
        },
        "pubchem": {
            "type": "integer"
        },
        "chembl": {
            "normalizer": "keyword_lowercase_normalizer",
            "type": "keyword"
        },
        "cas": {
            "normalizer": "keyword_lowercase_normalizer",
            "type": "keyword"
        },
        "name": {
            "type": "text"
        },
        "iupac": {
            "type": "keyword",
            "normalizer": "keyword_lowercase_normalizer"
        }
        }
    },
    "name": {
        "type": "text",
        "copy_to": [
        "all"
        ]
    },
    "description": {
        "type": "text",
        "copy_to": [
        "all"
        ]
    }
    }