Closed: Mitaines closed this issue 1 year ago.
When I correct the example XML (wrong ORIGIN opening tag) and use the XML pipeline, I get correct results (see below). Maybe this problem has been fixed in the meantime?
<?xml-stylesheet type="text/xsl" href="bow.xslt"?>
<MultimediaDocuments>
<node elementName="DOCSET">
<node elementName="DOC">
<node elementName="TITLE" indexingNode="yes">
<content type="tokens">
<tokens>
<bowNamedEntity id="1" lemma="001_sccs_o_214" category="NOMBRE" position="114" length="14" type="Numex.NUMBER">
<parts head="0">
<bowToken id="2" lemma="001_sccs_o_214" category="NOMBRE" position="114" length="14"/>
</parts>
<feature name="numvalue" value="0"/>
<feature name="value" value="001_sccs_o_214"/>
</bowNamedEntity>
</tokens>
<properties>
<property name="ContentId" type="int" value="1"/>
<property name="type" type="string" value="tokens"/>
</properties>
</content>
<properties>
<property name="ContentId" type="int" value="1"/>
<property name="NodeId" type="int" value="2"/>
<property name="StructureId" type="int" value="2"/>
<property name="offBegPrpty" type="int" value="114"/>
<property name="offEndPrpty" type="int" value="128"/>
<property name="encodPrpty" type="string" value="UTF8"/>
<property name="langPrpty" type="string" value="eng"/>
<property name="srcePrpty" type="string" value="/home/gael/Projets/Tests/TextesExemples/test-1.xml"/>
<property name="indexDatePrpty" type="date" value="20210618"/>
</properties>
</node>
<node elementName="TEXT" indexingNode="yes">
<content type="tokens">
<tokens>
<bowToken id="3" lemma="Lorem" category="NP" position="160" length="5"/>
<bowToken id="4" lemma="ipsum" category="NC" position="166" length="5"/>
</tokens>
<properties>
<property name="ContentId" type="int" value="2"/>
<property name="type" type="string" value="tokens"/>
</properties>
</content>
<properties>
<property name="ContentId" type="int" value="2"/>
<property name="NodeId" type="int" value="3"/>
<property name="StructureId" type="int" value="2"/>
<property name="offBegPrpty" type="int" value="151"/>
<property name="offEndPrpty" type="int" value="185"/>
<property name="encodPrpty" type="string" value="UTF8"/>
<property name="langPrpty" type="string" value="eng"/>
<property name="srcePrpty" type="string" value="/home/gael/Projets/Tests/TextesExemples/test-1.xml"/>
<property name="indexDatePrpty" type="date" value="20210618"/>
</properties>
</node>
<properties>
<property name="ContentId" type="int" value="0"/>
<property name="NodeId" type="int" value="1"/>
<property name="StructureId" type="int" value="2"/>
<property name="offBegPrpty" type="int" value="50"/>
<property name="offEndPrpty" type="int" value="467"/>
<property name="encodPrpty" type="string" value="UTF8"/>
<property name="identPrpty" type="string" value="001"/>
<property name="langPrpty" type="string" value=""/>
<property name="srcePrpty" type="string" value="/home/gael/Projets/Tests/TextesExemples/test-1.xml"/>
<property name="titlePrpty" type="string" value="001_sccs_o_214"/>
<property name="indexDatePrpty" type="date" value="20210618"/>
</properties>
</node>
<properties>
<property name="ContentId" type="int" value="0"/>
<property name="NodeId" type="int" value="1"/>
<property name="StructureId" type="int" value="1"/>
<property name="offBegPrpty" type="int" value="31"/>
<property name="offEndPrpty" type="int" value="474"/>
<property name="encodPrpty" type="string" value="UTF8"/>
<property name="identPrpty" type="string" value="001"/>
<property name="srcePrpty" type="string" value="/home/gael/Projets/Tests/TextesExemples/test-1.xml"/>
<property name="titlePrpty" type="string" value="001_sccs_o_214"/>
<property name="indexDatePrpty" type="date" value="20210618"/>
</properties>
</node>
</MultimediaDocuments>
@Mitaines, could you check whether the problem is solved? Recent commits may also have helped here.
Ping @Mitaines
No activity. Closing.
The .mult files generated from XML files with empty ORIGIN and TYPE tags contain entity position errors (see the pos-len value).
To reproduce: analyzeXml -l eng -p main file.xml
Extract of an XML file with empty ORIGIN/TYPE tags: