ucscXena / xena-analysis-grails

0 stars 0 forks source link

pull down more proper go slims and analyze #16

Closed nathandunn closed 3 years ago

nathandunn commented 3 years ago

https://xenademo.berkeleybop.io/xena/

ANALYZE


DUMP_DB


DOWNLOAD

CONVERT

REMOVE OLD

nathandunn commented 3 years ago

Putative python script:

# This script is experimental and is used to produce GMT files out of GO terms

import sys, getopt, os, json
import go_stats_utils as utils
from obo_parser import OBO_Parser, TermState

max_rows = 10000000

select_ontology = "select?fq=document_category:\"ontology_class\"&q=*:*&rows=" + str(max_rows) + "&wt=json&fq=idspace:\"GO\"&fq=is_obsolete:false&fl=annotation_class,annotation_class_label,source,regulates_closure,isa_closure,isa_partof_closure,regulates_closure"
select_annotations = "select?fq=document_category:\"annotation\"&q=*:*&rows=" + str(max_rows) + "&wt=json&fq=type:\"protein\"&fl=bioentity,annotation_class,evidence_type"

ASPECTS = {
    "GO:0003674" : "MF",
    "GO:0008150" : "BP",
    "GO:0005575" : "CC"
}

def create_ontology_map(golr_base_url):
    ontology = utils.golr_fetch(golr_base_url, select_ontology)
    ontology = ontology['response']['docs']
    map={}
    for item in ontology:
        map[item['annotation_class']] = item
    return map

def create_go_annotation_map(golr_base_url, taxa):
    """
    Create a Map { GO-Term -> [ annotations ] } using the direct annotation to the term (annotation_class)
    """
    annots = utils.golr_fetch_by_taxa(golr_base_url, select_annotations, taxa)
    annots = annots['response']['docs']
    map={}
    for item in annots:
        iclass = item['annotation_class']
        iannots = []
        if iclass in map:
            iannots = map[iclass]
        else:
            map[iclass] = iannots
        iannots.append(item)
    return map

def remap_go_annotation_map(go_annotation_map, ontology_map, closure):
    """
    Remap an existing go annotation map using a certain closure (see CLOSURE_LABELS)
    """
    new_map = {}
    for term in go_annotation_map:
        new_map[term] = []
        closure_terms = ontology_map[term][closure]

        for closure_term in closure_terms:
            # continue only if there is an annotation for that closure term
            if closure_term not in go_annotation_map:
                continue
            # discard annotation to root terms
            if closure_term in ASPECTS:
                continue
            new_map[term] = new_map[term] + go_annotation_map[closure_term]

    return new_map

def format_id(id):
    return id.replace("MGI:MGI:", "MGI:")
    # return id.replace("UniProtKB:", "")

def gmt(ontology_map, golr_base_url, taxa):
    print("\nCreating term annotation map for taxa ", taxa , " ...")
    go_annotation_map = create_go_annotation_map(golr_base_url, taxa)
    print("Term annotation map created with ", len(go_annotation_map) , " terms")

    closure = utils.CLOSURE_LABELS.REGULATES.value
    print("\nRemapping annotations using closure ", closure)
    go_annotation_map = remap_go_annotation_map(go_annotation_map, ontology_map, closure)
    print("Term annotation remapped using closure ", closure , " with ", len(go_annotation_map) , " terms")

    evidence_groups = [ "ALL", "EXPERIMENTAL", "COMPUTATIONAL" ]
    aspect_lists = [ "ALL", "BP", "MF", "CC" ]

    report = { }
    for aspect in aspect_lists:
        report[aspect] = { }

    count = 0

    for term_id, value in go_annotation_map.items():
        # do not consider aspect level terms (irrelevant: a gene supposedly always have at least 1 MF, 1 BP and 1 CC)
        if term_id in ASPECTS:
            continue

        term_label = ontology_map[term_id]['annotation_class_label']
        term_aspect = utils.aspect_from_source(ontology_map[term_id]['source'])

        # for each annotated term, we'll keep a list of all the genes associated based on their evidence groups
        id_sets = { }
        for evgroup in evidence_groups:
            id_set = set()
            id_sets[evgroup] = id_set

        # going through each annotation for the term considered
        for annot in value:
            bioentity = annot['bioentity']
            et = annot['evidence_type']

            # Don't annotate the gene to that term if ND !
            evgroup = utils.get_evidence_min_group(et)
            if(evgroup == "ND"):
                continue

            # Add all annotations (don't filter by evidence)
            id_sets["ALL"].add(bioentity)

            # Add the annotation for the specific group of evidence
            id_sets[evgroup].add(bioentity)

        # Building the report for that term; will add only the term to an evidence group report IF the term has at least one gene
        for evgroup in evidence_groups:
            id_set = id_sets[evgroup]
            if len(id_set) == 0:
                continue

            if evgroup not in report["ALL"]:
                report["ALL"][evgroup] = []         
            report["ALL"][evgroup].append(term_label + "%" + term_aspect + "%" + term_id + "\t" + "\t".join(id_set))

            if evgroup not in report[term_aspect]:
                report[term_aspect][evgroup] = []
            report[term_aspect][evgroup].append(term_label + "%" + term_aspect + "%" + term_id + "\t" + "\t".join(id_set))

        count += 1

        if count % 2000 == 0:
            print(str(count) + " terms map created...")
    print(str(count) + " terms map created...")

    # Transforming to text
    for aspect in report:
        for evgroup in report[aspect]:
            report[aspect][evgroup] = "\n".join(report[aspect][evgroup])

    return report

def filter_slim(report, terms):
    gmt_slim = { }
    for aspect in report:
        gmt_slim[aspect] = { }
        for evgroup in report[aspect]:
            gmt_aspect = report[aspect][evgroup]
            lines = gmt_aspect.split("\n")
            for line in lines:
                # test if the line contains any terms of the slim
                res = any(ele in line for ele in terms)
                if res:
                    if evgroup not in gmt_slim[aspect]:
                        gmt_slim[aspect][evgroup] = ""    
                    gmt_slim[aspect][evgroup] += line + "\n"
    return gmt_slim

def print_help():
    print('\nUsage: python go_gmt.py -g <golr_base_url> -o <output_rep> -s <slim_base_url>\n')

def main(argv):
    golr_base_url = ''
    output_rep = ''
    slim_base_url = ''

    if len(argv) < 6:
        print_help()
        sys.exit(2)

    try:
        opts, argv = getopt.getopt(argv,"g:o:s:",["golrurl=","orep=","slim="])
    except getopt.GetoptError:
        print_help()
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print_help()
            sys.exit()
        elif opt in ("-g", "--golrurl"):
            golr_base_url = arg
            if not golr_base_url.endswith("/"):
                golr_base_url = golr_base_url + "/"
        elif opt in ("-o", "--orep"):
            output_rep = arg
        elif opt in ("-s", "--slim"):
            slim_base_url = arg
            if not slim_base_url.endswith("/"):
                slim_base_url = slim_base_url + "/"

    if not output_rep.endswith("/"):
        output_rep += "/"

    if not os.path.exists(output_rep):
        os.mkdir(output_rep)

    print("\n1 - Creating ontology map...")
    ontology_map = create_ontology_map(golr_base_url)
    print("Ontology map created with ", len(ontology_map) , " terms")

    slims = [ "goslim_agr.obo", "goslim_generic.obo", "goslim_chembl.obo" ]
    print("\n2 - Loading ", len(slims), " slims to create the slim-specific GMTs...")
    slim_obos = { }

    for slim in slims:
        response = utils.fetch(slim_base_url + slim)
        obo = OBO_Parser(response.text)
        slim_obos[slim] = obo
    print("Slims loaded: ", len(slim_obos))

    # taxa = utils.REFERENCE_GENOME_IDS
    taxa = [ "NCBITaxon:9606", "NCBITaxon:10090" ]
    print("\n3 - Creating the GMTs for " , len(taxa) , " taxa")
    for taxon in taxa:
        taxon_id = taxon.split(":")[1]
        gmt_taxon = gmt(ontology_map, golr_base_url, taxon)

        output = output_rep + taxon_id

        for aspect in gmt_taxon:
            for evgroup in gmt_taxon[aspect]:
                if len(gmt_taxon[aspect][evgroup]) > 0:
                    utils.write_text(output + "-" + aspect.lower() + "-" + evgroup.lower() + ".gmt", gmt_taxon[aspect][evgroup])

        for slim_obo in slim_obos:
            oterms = slim_obos[slim_obo].get_terms(TermState.VALID)
            terms = oterms.keys()
            gmt_taxon_slim = filter_slim(gmt_taxon, terms)
            slim_key = slim_obo.replace(".obo", "")

            for aspect in gmt_taxon_slim:
                for evgroup in gmt_taxon_slim[aspect]:
                    if len(gmt_taxon_slim[aspect][evgroup]) > 0:
                        utils.write_text(output + "-" + slim_key + "-" + aspect.lower() + "-" + evgroup.lower() + ".gmt", gmt_taxon_slim[aspect][evgroup])

if __name__ == "__main__":
   main(sys.argv[1:])
nathandunn commented 3 years ago

All files in: https://geneontology-archive.s3.amazonaws.com/list.html

Filename format: {taxon}-{aspect}-{all/experimental/computational}.gmt

1 - using the regulates relationship to propagate annotations (by far the largest set) => gmts/ folder

For example, https://geneontology-archive.s3.amazonaws.com/gmts/9606-all-all.gmt is 400mo and we consider all aspects and all annotations propagated through the regulates relationship

2 - using the isa_partof relationship to propagate annotations (I personally think it's better but that's a long lasting discussion in GO as you may know) => gmts_isa_partof

The list of genes are usually very big using those mapping through ontology relationships. They are much smaller if we only use the direct annotation to the term.

3 - direct annotations (I am not using any mapping through an ontology relationship) => gmts_direct/

I won't have much more time to work on this for the moment, but I hope this will be useful. I also created those sets for 3 slims (the chembl details info on drug targets)

I will mention it briefly during the group call, but as it wasn't my priority I won't insist. Please feel free to voice your interest if you want more work on it ;)

nathandunn commented 3 years ago

Files from here: https://geneontology-archive.s3.amazonaws.com/list.html

9606 taxon.

Recommendation is to use the generic, and AGR slims as well as other aspects. Should only use ispartof.

From here:

https://geneontology-archive.s3.amazonaws.com/gmts_isa_partof/index.html

Probably just use and convert everything but the chembl.

nathandunn commented 3 years ago

ok, I have simplified the file structure a lot since i think we kind of know which closure we want to use (isa_partof) and since I don’t really have access to the evidence that support a { gene - association } when using the closures in golr; I would then have to write something completely different (e.g. full parsing of GAF) to get that; I will still think about it and will let you know; in the mean time: https://s3.console.aws.amazon.com/s3/object/geneontology-public?region=us-east-1&prefix=gmts/9606-bp-all.gmt https://s3.console.aws.amazon.com/s3/object/geneontology-public?region=us-east-1&prefix=gmts/9606-mf-all.gmt https://s3.console.aws.amazon.com/s3/object/geneontology-public?region=us-east-1&prefix=gmts/9606-cc-all.gmt 1:24 so here: using isa_partof closure (I think it’s the best and Paul T was of this opinion too) and using all annotations (independently of EXP or INFERRED) 1:29 and based on that, I also generated the slims AGR GO SLIM: https://s3.console.aws.amazon.com/s3/object/geneontology-public?region=us-east-1&prefix=gmts/9606-goslim_agr-bp-all.gmt https://s3.console.aws.amazon.com/s3/object/geneontology-public?region=us-east-1&prefix=gmts/9606-goslim_agr-mf-all.gmt https://s3.console.aws.amazon.com/s3/object/geneontology-public?region=us-east-1&prefix=gmts/9606-goslim_agr-cc-all.gmt GO GENERIC SLIM: https://s3.console.aws.amazon.com/s3/object/geneontology-public?region=us-east-1&prefix=gmts/9606-goslim_generic-bp-all.gmt https://s3.console.aws.amazon.com/s3/object/geneontology-public?region=us-east-1&prefix=gmts/9606-goslim_generic-mf-all.gmt https://s3.console.aws.amazon.com/s3/object/geneontology-public?region=us-east-1&prefix=gmts/9606-goslim_generic-cc-all.gmt CHEMBL SLIM: https://s3.console.aws.amazon.com/s3/object/geneontology-public?region=us-east-1&prefix=gmts/9606-goslim_chembl-bp-all.gmt https://s3.console.aws.amazon.com/s3/object/geneontology-public?region=us-east-1&prefix=gmts/9606-goslim_chembl-mf-all.gmt https://s3.console.aws.amazon.com/s3/object/geneontology-public?region=us-east-1&prefix=gmts/9606-goslim_chembl-cc-all.gmt

nathandunn commented 3 years ago
 aws s3 sync  s3://geneontology-public/gmts/ . 
 for FILE in `ls *.gmt` ; do groovy  convert_gmt_file.groovy -inputfile $FILE  ; done