aymara / lima

The Libre Multilingual Analyzer, a Natural Language Processing (NLP) C++ toolkit.
http://aymara.github.io/lima/
Other
107 stars 21 forks source link

Entity position errors in .mult files #115

Closed Mitaines closed 1 year ago

Mitaines commented 3 years ago

The .mult files generated from XML files with empty ORIGIN and TYPE tags contain entity position errors (see the pos-len values).

To reproduce: analyzeXml -l eng -p main file.xml

Extract of an XML file with empty ORIGIN/TYPE tags:

<?xml version="1.0" ?>
<DOCSET>
    <DOC id="001">
        <ORIGN></ORIGIN>
        <TYPE></TYPE>
        <TITLE>001_sccs_o_214</TITLE>
        <TEXT>
        Lorem ipsum ....
        </TEXT>                                                                                                                                                                                                                                                                              
    </DOC>
</DOCSET>
kleag commented 3 years ago

When I correct the example XML (wrong ORIGIN opening tag) and use the XML pipeline, I get correct results (see below). Maybe this problem has been fixed in the meantime?

<?xml-stylesheet type="text/xsl" href="bow.xslt"?>
<MultimediaDocuments>
  <node elementName="DOCSET">
    <node elementName="DOC">
      <node elementName="TITLE" indexingNode="yes">
        <content type="tokens">
          <tokens>
            <bowNamedEntity id="1" lemma="001_sccs_o_214" category="NOMBRE" position="114" length="14" type="Numex.NUMBER">
              <parts head="0">
                <bowToken id="2" lemma="001_sccs_o_214" category="NOMBRE" position="114" length="14"/>
              </parts>
              <feature name="numvalue" value="0"/>
              <feature name="value" value="001_sccs_o_214"/>
            </bowNamedEntity>
          </tokens>
          <properties>
            <property name="ContentId" type="int" value="1"/>
            <property name="type" type="string" value="tokens"/>
          </properties>
        </content>
        <properties>
          <property name="ContentId" type="int" value="1"/>
          <property name="NodeId" type="int" value="2"/>
          <property name="StructureId" type="int" value="2"/>
          <property name="offBegPrpty" type="int" value="114"/>
          <property name="offEndPrpty" type="int" value="128"/>
          <property name="encodPrpty" type="string" value="UTF8"/>
          <property name="langPrpty" type="string" value="eng"/>
          <property name="srcePrpty" type="string" value="/home/gael/Projets/Tests/TextesExemples/test-1.xml"/>
          <property name="indexDatePrpty" type="date" value="20210618"/>
        </properties>
      </node>
      <node elementName="TEXT" indexingNode="yes">
        <content type="tokens">
          <tokens>
            <bowToken id="3" lemma="Lorem" category="NP" position="160" length="5"/>
            <bowToken id="4" lemma="ipsum" category="NC" position="166" length="5"/>
          </tokens>
          <properties>
            <property name="ContentId" type="int" value="2"/>
            <property name="type" type="string" value="tokens"/>
          </properties>
        </content>
        <properties>
          <property name="ContentId" type="int" value="2"/>
          <property name="NodeId" type="int" value="3"/>
          <property name="StructureId" type="int" value="2"/>
          <property name="offBegPrpty" type="int" value="151"/>
          <property name="offEndPrpty" type="int" value="185"/>
          <property name="encodPrpty" type="string" value="UTF8"/>
          <property name="langPrpty" type="string" value="eng"/>
          <property name="srcePrpty" type="string" value="/home/gael/Projets/Tests/TextesExemples/test-1.xml"/>
          <property name="indexDatePrpty" type="date" value="20210618"/>
        </properties>
      </node>
      <properties>
        <property name="ContentId" type="int" value="0"/>
        <property name="NodeId" type="int" value="1"/>
        <property name="StructureId" type="int" value="2"/>
        <property name="offBegPrpty" type="int" value="50"/>
        <property name="offEndPrpty" type="int" value="467"/>
        <property name="encodPrpty" type="string" value="UTF8"/>
        <property name="identPrpty" type="string" value="001"/>
        <property name="langPrpty" type="string" value=""/>
        <property name="srcePrpty" type="string" value="/home/gael/Projets/Tests/TextesExemples/test-1.xml"/>
        <property name="titlePrpty" type="string" value="001_sccs_o_214"/>
        <property name="indexDatePrpty" type="date" value="20210618"/>
      </properties>
    </node>
    <properties>
      <property name="ContentId" type="int" value="0"/>
      <property name="NodeId" type="int" value="1"/>
      <property name="StructureId" type="int" value="1"/>
      <property name="offBegPrpty" type="int" value="31"/>
      <property name="offEndPrpty" type="int" value="474"/>
      <property name="encodPrpty" type="string" value="UTF8"/>
      <property name="identPrpty" type="string" value="001"/>
      <property name="srcePrpty" type="string" value="/home/gael/Projets/Tests/TextesExemples/test-1.xml"/>
      <property name="titlePrpty" type="string" value="001_sccs_o_214"/>
      <property name="indexDatePrpty" type="date" value="20210618"/>
    </properties>
  </node>
</MultimediaDocuments>
kleag commented 2 years ago

@Mitaines, could you check whether the problem is solved? Recent commits may also have helped here.

kleag commented 2 years ago

Ping @Mitaines

kleag commented 1 year ago

No activity. Closing.