ebi-pf-team / interproscan

Genome-scale protein function classification
Apache License 2.0
303 stars 67 forks source link

Interproscan XML produces go-xref without a category #334

Closed aldendirks closed 1 year ago

aldendirks commented 1 year ago

I am attempting to annotate my fungal genomes with Funannotate, which makes use of InterProScan. When parsing the interproscan.xml file, Funannotate gives me the error `Error parsing XML GO terms: None is not a valid term`. Looking more closely at the XML file, I see that within the panther-match tags there are go-xref lines without any category information. For example, the first protein of an interproscan.xml file is pasted below. Is this list of GO IDs without any further information (towards the end) normal?

<?xml version="1.0" encoding="UTF-8"?><protein-matches xmlns="http://www.ebi.ac.uk/interpro/resources/schemas/interproscan5" interproscan-version="5.60-92.0">
  <protein>
    <sequence md5="9e4b4cc8d93c10ef376100d7ebfa07d0">MAPTKYTPLTLHFSDAVTNVYPRQVEKLVANDGSYEYFRALGENEQKDILWRSKIAKALVEKYLKNAKGDRLTETDTAKDYIFKTLPENYKLYEHVKGKRDEKSGGTISERRDTYLFGHPTGKRFRSPAEFVPHILHLAAQDDRPCECWICTGSKHGNPPTSVKKPTKRETEVTQARKVVALEERQREQETAGWVLRKGEVVWVWLSDNPEAEEASDDALIDGDGGLWVAGVVAERPSFTPPYQKVRKTTGNAFADIDMDDTPPTWQQEGGNVPEKTYIIQLCSDPPKLGQILKGVPQHHVKPWLSRQECAQAPPSYSGKIEHPSIPRARRVAETFSLFDRVSEPSDPPSASDPSPDAPKIANFQGVFLGAEKIYIHEPVRISSANEDEIEDVLVVDKIYTCTTTSESASSGSDGKKKTLTTTQFRGNVYTAYPSTTCTPLSSHQFTELPFRMRRGSGTGEIIKWFIRNVPEERGECSLKMILGRWYEPQAVNEWIGSTGFSGGLPSSKETAMCQKDVKRWVKNRADALGLVSVNGIDLKSEGEVKIQPGKLTSPLKPKPADATAEAMDVDEPPQVTPERGFKSVNLRISSVTPGSASSLKITPRTEADDAGIDGGDIEEEEQVEGDEDEEDEDDEATMSDDKYHQPGPEVLSRSPTKRLSK</sequence>
    <xref id="FUN_001952-T1" name="FUN_001952-T1 FUN_001952"/>
    <matches>
      <hmmer3-match evalue="5.9E-22" score="78.3">
        <signature ac="PF16761" desc="Transcription-silencing protein, cryptic loci regulator Clr2" name="Clr2_transil">
          <entry ac="IPR031915" desc="Cryptic loci regulator 2, N-terminal" name="Clr2_N" type="DOMAIN"/>
          <signature-library-release library="PFAM" version="35.0"/>
        </signature>
        <model-ac>PF16761</model-ac>
        <locations>
          <hmmer3-location env-end="151" env-start="81" post-processed="true" score="77.3" evalue="1.2E-21" hmm-start="1" hmm-end="68" hmm-length="68" hmm-bounds="COMPLETE" start="81" end="151">
            <location-fragments>
              <hmmer3-location-fragment start="81" end="151" dc-status="CONTINUOUS"/>
            </location-fragments>
          </hmmer3-location>
        </locations>
      </hmmer3-match>
      <hmmer3-match evalue="5.1E-14" score="53.2">
        <signature ac="PF10383" desc="Transcription-silencing protein Clr2" name="Clr2">
          <entry ac="IPR018839" desc="Cryptic loci regulator 2, C-terminal" name="Tscrpt-silencing_Clr2_C" type="DOMAIN"/>
          <signature-library-release library="PFAM" version="35.0"/>
        </signature>
        <model-ac>PF10383</model-ac>
        <locations>
          <hmmer3-location env-end="488" env-start="363" post-processed="true" score="51.1" evalue="2.2E-13" hmm-start="2" hmm-end="143" hmm-length="143" hmm-bounds="C_TERMINAL_COMPLETE" start="364" end="488">
            <location-fragments>
              <hmmer3-location-fragment start="364" end="488" dc-status="CONTINUOUS"/>
            </location-fragments>
          </hmmer3-location>
        </locations>
      </hmmer3-match>
      <mobidblite-match>
        <signature ac="mobidb-lite" desc="consensus disorder prediction" name="disorder_prediction">
          <signature-library-release library="MOBIDB_LITE" version="2.0"/>
        </signature>
        <model-ac>mobidb-lite</model-ac>
        <locations>
          <mobidblite-location sequence-feature="" start="549" end="662">
            <location-fragments>
              <mobidblite-location-fragment start="549" end="662" dc-status="CONTINUOUS"/>
            </location-fragments>
          </mobidblite-location>
        </locations>
      </mobidblite-match>
      <mobidblite-match>
        <signature ac="mobidb-lite" desc="consensus disorder prediction" name="disorder_prediction">
          <signature-library-release library="MOBIDB_LITE" version="2.0"/>
        </signature>
        <model-ac>mobidb-lite</model-ac>
        <locations>
          <mobidblite-location sequence-feature="Polar" start="584" end="603">
            <location-fragments>
              <mobidblite-location-fragment start="584" end="603" dc-status="CONTINUOUS"/>
            </location-fragments>
          </mobidblite-location>
        </locations>
      </mobidblite-match>
      <mobidblite-match>
        <signature ac="mobidb-lite" desc="consensus disorder prediction" name="disorder_prediction">
          <signature-library-release library="MOBIDB_LITE" version="2.0"/>
        </signature>
        <model-ac>mobidb-lite</model-ac>
        <locations>
          <mobidblite-location sequence-feature="Negative Polyelectrolyte" start="613" end="638">
            <location-fragments>
              <mobidblite-location-fragment start="613" end="638" dc-status="CONTINUOUS"/>
            </location-fragments>
          </mobidblite-location>
        </locations>
      </mobidblite-match>
      <panther-match ac="PTHR38046:SF1" evalue="5.9E-54" graft-point="PTN002866222" name="CRYPTIC LOCI REGULATOR 2" score="195.5">
        <signature ac="PTHR38046" name="CRYPTIC LOCI REGULATOR 2">
          <entry ac="IPR038986" desc="Cryptic loci regulator 2" name="Clr2" type="FAMILY">
            <go-xref category="BIOLOGICAL_PROCESS" db="GO" id="GO:0031507" name="heterochromatin formation"/>
            <go-xref category="CELLULAR_COMPONENT" db="GO" id="GO:0070824" name="SHREC complex"/>
          </entry>
          <signature-library-release library="PANTHER" version="17.0"/>
        </signature>
        <model-ac>PTHR38046:SF1</model-ac>
        <locations>
          <panther-location env-start="2" env-end="546" hmm-start="16" hmm-end="548" hmm-length="0" hmm-bounds="INCOMPLETE" start="4" end="494">
            <location-fragments>
              <panther-location-fragment start="4" end="494" dc-status="CONTINUOUS"/>
            </location-fragments>
          </panther-location>
        </locations>
        <go-xref db="GO" id="GO:0040029"/>
        <go-xref db="GO" id="GO:0043226"/>
        <go-xref db="GO" id="GO:0006996"/>
        <go-xref db="GO" id="GO:0009987"/>
        <go-xref db="GO" id="GO:0043229"/>
        <go-xref db="GO" id="GO:0043170"/>
        <go-xref db="GO" id="GO:0019538"/>
        <go-xref db="GO" id="GO:0000792"/>
        <go-xref db="GO" id="GO:0098732"/>
        <go-xref db="GO" id="GO:0009892"/>
        <go-xref db="GO" id="GO:0016570"/>
        <go-xref db="GO" id="GO:0010467"/>
        <go-xref db="GO" id="GO:0006464"/>
        <go-xref db="GO" id="GO:1901564"/>
        <go-xref db="GO" id="GO:0065007"/>
        <go-xref db="GO" id="GO:0045814"/>
        <go-xref db="GO" id="GO:0071840"/>
        <go-xref db="GO" id="GO:0110165"/>
        <go-xref db="GO" id="GO:0008152"/>
        <go-xref db="GO" id="GO:0006325"/>
        <go-xref db="GO" id="GO:0044238"/>
        <go-xref db="GO" id="GO:0070828"/>
        <go-xref db="GO" id="GO:0043412"/>
        <go-xref db="GO" id="GO:0050789"/>
        <go-xref db="GO" id="GO:0048519"/>
        <go-xref db="GO" id="GO:0044237"/>
        <go-xref db="GO" id="GO:0019222"/>
        <go-xref db="GO" id="GO:0005622"/>
        <go-xref db="GO" id="GO:0006807"/>
        <go-xref db="GO" id="GO:0043232"/>
        <go-xref db="GO" id="GO:0016043"/>
        <go-xref db="GO" id="GO:0010629"/>
        <go-xref db="GO" id="GO:0071103"/>
        <go-xref db="GO" id="GO:0051276"/>
        <go-xref db="GO" id="GO:0071704"/>
        <go-xref db="GO" id="GO:0005694"/>
        <go-xref db="GO" id="GO:0031507"/>
        <go-xref db="GO" id="GO:0060255"/>
        <go-xref db="GO" id="GO:0006323"/>
        <go-xref db="GO" id="GO:0044260"/>
        <go-xref db="GO" id="GO:0010468"/>
        <go-xref db="GO" id="GO:0036211"/>
        <go-xref db="GO" id="GO:0006476"/>
        <go-xref db="GO" id="GO:0016575"/>
        <go-xref db="GO" id="GO:0035601"/>
        <go-xref db="GO" id="GO:0000785"/>
        <go-xref db="GO" id="GO:0022607"/>
        <go-xref db="GO" id="GO:0031497"/>
        <go-xref db="GO" id="GO:0006333"/>
        <go-xref db="GO" id="GO:0044267"/>
        <go-xref db="GO" id="GO:0044085"/>
        <go-xref db="GO" id="GO:0043228"/>
        <go-xref db="GO" id="GO:0010605"/>
      </panther-match>
    </matches>
  </protein>
matthiasblum commented 1 year ago

Hi @aldendirks,

Sorry for the late reply.

We published a new version of InterProScan last week (version 5.64-96.0), and PANTHER GO terms now include the category and the name, e.g.:

<?xml version="1.0" encoding="UTF-8"?><protein-matches xmlns="https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/schemas" interproscan-version="5.64-96.0">
  <protein>
    <sequence md5="9e4b4cc8d93c10ef376100d7ebfa07d0">MAPTKYTPLTLHFSDAVTNVYPRQVEKLVANDGSYEYFRALGENEQKDILWRSKIAKALVEKYLKNAKGDRLTETDTAKDYIFKTLPENYKLYEHVKGKRDEKSGGTISERRDTYLFGHPTGKRFRSPAEFVPHILHLAAQDDRPCECWICTGSKHGNPPTSVKKPTKRETEVTQARKVVALEERQREQETAGWVLRKGEVVWVWLSDNPEAEEASDDALIDGDGGLWVAGVVAERPSFTPPYQKVRKTTGNAFADIDMDDTPPTWQQEGGNVPEKTYIIQLCSDPPKLGQILKGVPQHHVKPWLSRQECAQAPPSYSGKIEHPSIPRARRVAETFSLFDRVSEPSDPPSASDPSPDAPKIANFQGVFLGAEKIYIHEPVRISSANEDEIEDVLVVDKIYTCTTTSESASSGSDGKKKTLTTTQFRGNVYTAYPSTTCTPLSSHQFTELPFRMRRGSGTGEIIKWFIRNVPEERGECSLKMILGRWYEPQAVNEWIGSTGFSGGLPSSKETAMCQKDVKRWVKNRADALGLVSVNGIDLKSEGEVKIQPGKLTSPLKPKPADATAEAMDVDEPPQVTPERGFKSVNLRISSVTPGSASSLKITPRTEADDAGIDGGDIEEEEQVEGDEDEEDEDDEATMSDDKYHQPGPEVLSRSPTKRLSK</sequence>
    <xref id="FUN_001952-T1" name="FUN_001952-T1 FUN_001952"/>
    <matches>
      <panther-match ac="PTHR38046:SF1" evalue="5.9E-54" graft-point="PTN002866222" name="CRYPTIC LOCI REGULATOR 2" score="195.5">
        <signature ac="PTHR38046" name="CRYPTIC LOCI REGULATOR 2">
          <entry ac="IPR038986" desc="Cryptic loci regulator 2" name="Clr2" type="FAMILY">
            <go-xref category="CELLULAR_COMPONENT" db="GO" id="GO:0070824" name="SHREC complex"/>
            <go-xref category="BIOLOGICAL_PROCESS" db="GO" id="GO:0031507" name="heterochromatin formation"/>
          </entry>
          <signature-library-release library="PANTHER" version="17.0"/>
        </signature>
        <model-ac>PTHR38046:SF1</model-ac>
        <locations>
          <panther-location env-start="2" env-end="546" hmm-start="16" hmm-end="548" hmm-length="0" hmm-bounds="INCOMPLETE" start="4" end="494">
            <location-fragments>
              <panther-location-fragment start="4" end="494" dc-status="CONTINUOUS"/>
            </location-fragments>
          </panther-location>
        </locations>
        <go-xref category="BIOLOGICAL_PROCESS" db="GO" id="GO:0016575" name="histone deacetylation"/>
        <go-xref category="BIOLOGICAL_PROCESS" db="GO" id="GO:0030466" name="silent mating-type cassette heterochromatin formation"/>
        <go-xref category="CELLULAR_COMPONENT" db="GO" id="GO:0033553" name="rDNA heterochromatin"/>
      </panther-match>
    </matches>
  </protein>
</protein-matches>

You can download the latest version of InterProScan here: https://www.ebi.ac.uk/interpro/download/InterProScan/.